org.apache.spark.mllib.linalg.DenseVector Scala Examples
The following examples show how to use org.apache.spark.mllib.linalg.DenseVector.
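Before the project examples, here is a minimal, hypothetical sketch (not taken from any of the projects below; the object name DenseVectorQuickStart is made up) showing how a DenseVector is typically constructed and inspected, and how code can pattern match on the concrete vector type, as many of the examples below do:

import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}

object DenseVectorQuickStart {
  def main(args: Array[String]): Unit = {
    // Construct a DenseVector directly, or via the Vectors factory.
    val direct: DenseVector = new DenseVector(Array(1.0, 0.0, 3.0))
    val viaFactory: Vector = Vectors.dense(1.0, 0.0, 3.0)

    // Element access, size, and the backing array.
    println(direct(2))                     // 3.0
    println(direct.size)                   // 3
    println(direct.values.mkString(","))   // 1.0,0.0,3.0

    // DenseVector and SparseVector both implement the sealed Vector trait,
    // so callers can match on the concrete type.
    def describe(v: Vector): String = v match {
      case d: DenseVector  => s"dense with ${d.values.length} values"
      case s: SparseVector => s"sparse with ${s.indices.length} non-zeros"
    }
    println(describe(direct))
    println(describe(Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))))
  }
}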
Example 1
Source File: IDFSuite.scala From iolap with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = {
    dataSet.map {
      case data: DenseVector =>
        val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y }
        Vectors.dense(res)
      case data: SparseVector =>
        val res = data.indices.zip(data.values).map { case (id, value) =>
          (id, value * model(id))
        }
        Vectors.sparse(data.size, res)
    }
  }

  test("params") {
    ParamsSuite.checkParams(new IDF)
    val model = new IDFModel("idf", new OldIDFModel(Vectors.dense(1.0)))
    ParamsSuite.checkParams(model)
  }

  test("compute IDF with default parameter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    )
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((numOfData + 1.0) / (x + 1.0))
    })
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")

    val idfModel = new IDF()
      .setInputCol("features")
      .setOutputCol("idfValue")
      .fit(df)

    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }

  test("compute IDF with setter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    )
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0
    })
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")

    val idfModel = new IDF()
      .setInputCol("features")
      .setOutputCol("idfValue")
      .setMinDocFreq(1)
      .fit(df)

    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }
}
Example 2
Source File: NormalizerSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext {

  @transient var data: Array[Vector] = _
  @transient var dataFrame: DataFrame = _
  @transient var normalizer: Normalizer = _
  @transient var l1Normalized: Array[Vector] = _
  @transient var l2Normalized: Array[Vector] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))),
      Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))),
      Vectors.sparse(3, Seq())
    )
    l1Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.12765957, -0.23404255, -0.63829787),
      Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))),
      Vectors.dense(0.625, 0.07894737, 0.29605263),
      Vectors.sparse(3, Seq())
    )
    l2Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.184549876, -0.3383414, -0.922749378),
      Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))),
      Vectors.dense(0.897906166, 0.113419726, 0.42532397),
      Vectors.sparse(3, Seq())
    )

    val sqlContext = new SQLContext(sc)
    dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(NormalizerSuite.FeatureData))
    normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized_features")
  }

  // Collect the results
  def collectResult(result: DataFrame): Array[Vector] = {
    result.select("normalized_features").collect().map {
      case Row(features: Vector) => features
    }
  }

  // Assert the vector types
  def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
    assert((lhs, rhs).zipped.forall {
      case (v1: DenseVector, v2: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after normalization.")
  }

  // Assert the values
  def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
    assert((lhs, rhs).zipped.forall { (vector1, vector2) =>
      vector1 ~== vector2 absTol 1E-5
    }, "The vector value is not correct after normalization.")
  }

  // Normalization with the default parameter
  test("Normalization with default parameter") {
    // transform() turns one DataFrame into another DataFrame
    normalizer.transform(dataFrame).show()
    val result = collectResult(normalizer.transform(dataFrame))
    assertTypeOfVector(data, result)
    assertValues(result, l2Normalized)
  }

  // Normalization with setter
  test("Normalization with setter") {
    normalizer.setP(1)
    // transform() turns one DataFrame into another DataFrame
    normalizer.transform(dataFrame).show()
    val result = collectResult(normalizer.transform(dataFrame))
    assertTypeOfVector(data, result)
    assertValues(result, l1Normalized)
  }
}

private object NormalizerSuite {
  case class FeatureData(features: Vector)
}
Example 3
Source File: IDFSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

      assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }

  // Compute IDF with setter
  test("compute IDF with setter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    )
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0
    })
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")

    val idfModel = new IDF()
      .setInputCol("features")
      .setOutputCol("idfValue")
      .setMinDocFreq(1)
      .fit(df) // fit() turns a DataFrame into a Transformer
    // transform() turns one DataFrame into another DataFrame
    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }
}
Example 4
Source File: PolynomialExpansionSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.ml.param.ParamsSuite
import org.scalatest.exceptions.TestFailedException
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext {

  // Parameters
  test("params") {
    ParamsSuite.checkParams(new PolynomialExpansion)
  }

  // Polynomial expansion with the default parameter
  test("Polynomial expansion with default parameter") {
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())
    )

    val twoDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)),
      Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29),
      Vectors.dense(new Array[Double](9)),
      Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0),
      Vectors.sparse(9, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")

    // transform() turns one DataFrame into another DataFrame
    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
    }
  }

  // Polynomial expansion with setter
  test("Polynomial expansion with setter") {
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())
    )

    val threeDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8),
        Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)),
      Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17),
      Vectors.dense(new Array[Double](19)),
      Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8,
        -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0),
      Vectors.sparse(19, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)

    // transform() turns one DataFrame into another DataFrame
    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
    }
  }
}
Example 5
Source File: ChiSqSelector.scala From spark1.52 with Apache License 2.0
package org.apache.spark.mllib.feature

import scala.collection.mutable.ArrayBuilder

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

  @Since("1.3.0")
  def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
    val indices = Statistics.chiSqTest(data)
      .zipWithIndex.sortBy { case (res, _) => -res.statistic }
      .take(numTopFeatures)
      .map { case (_, indices) => indices }
      .sorted
    new ChiSqSelectorModel(indices)
  }
}
Example 6
Source File: Normalizer.scala From spark1.52 with Apache License 2.0
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}

  @Since("1.1.0")
  override def transform(vector: Vector): Vector = {
    val norm = Vectors.norm(vector, p)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.size
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.size
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          }
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
      }
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.
      vector
    }
  }
}
Example 7
Source File: IDFSuite.scala From iolap with Apache License 2.0
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors, Vector} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 8
Source File: ElementwiseProductSuite.scala From iolap with Apache License 2.0
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("elementwise (hadamard) product should properly apply vector to dense data set") {
    val denseData = Array(
      Vectors.dense(1.0, 4.0, 1.9, -9.0)
    )
    val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25)
    val transformer = new ElementwiseProduct(scalingVec)
    val transformedData = transformer.transform(sc.makeRDD(denseData))
    val transformedVecs = transformedData.collect()
    val transformedVec = transformedVecs(0)
    val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25)
    assert(transformedVec ~== expectedVec absTol 1E-5,
      s"Expected transformed vector $expectedVec but found $transformedVec")
  }

  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
    val sparseData = Array(
      Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
    )
    val dataRDD = sc.parallelize(sparseData, 3)
    val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
    val transformer = new ElementwiseProduct(scalingVec)
    val data2 = sparseData.map(transformer.transform)
    val data2RDD = transformer.transform(dataRDD)

    assert((sparseData, data2, data2RDD.collect()).zipped.forall {
      case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after hadamard product")

    assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
    assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
  }
}
Example 9
Source File: NormalizerSuite.scala From iolap with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext { @transient var data: Array[Vector] = _ @transient var dataFrame: DataFrame = _ @transient var normalizer: Normalizer = _ @transient var l1Normalized: Array[Vector] = _ @transient var l2Normalized: Array[Vector] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))), Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))), Vectors.sparse(3, Seq()) ) l1Normalized = Array( Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.12765957, -0.23404255, -0.63829787), Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))), Vectors.dense(0.625, 0.07894737, 0.29605263), Vectors.sparse(3, Seq()) ) l2Normalized = Array( Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.184549876, -0.3383414, -0.922749378), Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))), Vectors.dense(0.897906166, 0.113419726, 0.42532397), Vectors.sparse(3, Seq()) ) val sqlContext = new SQLContext(sc) dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(NormalizerSuite.FeatureData)) normalizer = new Normalizer() .setInputCol("features") .setOutputCol("normalized_features") } def collectResult(result: DataFrame): Array[Vector] = { result.select("normalized_features").collect().map { case Row(features: Vector) => features } } def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = { assert((lhs, rhs).zipped.forall { case (v1: DenseVector, v2: DenseVector) => true case (v1: SparseVector, v2: SparseVector) => true case _ => false }, "The vector type should be preserved after normalization.") } def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = { assert((lhs, rhs).zipped.forall { (vector1, vector2) => vector1 ~== vector2 absTol 1E-5 }, "The vector value is not correct after normalization.") } test("Normalization with default parameter") { val result = collectResult(normalizer.transform(dataFrame)) assertTypeOfVector(data, result) assertValues(result, l2Normalized) } test("Normalization with setter") { normalizer.setP(1) val result = collectResult(normalizer.transform(dataFrame)) assertTypeOfVector(data, result) assertValues(result, l1Normalized) } } private object NormalizerSuite { case class FeatureData(features: Vector) }
Example 10
Source File: ElementwiseProductSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext {

  // The scaling vector should be applied correctly to a dense data set
  test("elementwise (hadamard) product should properly apply vector to dense data set") {
    val denseData = Array(
      Vectors.dense(1.0, 4.0, 1.9, -9.0)
    )
    val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25)
    val transformer = new ElementwiseProduct(scalingVec)
    // Batch transformation and per-element transformation give the same result
    // transform() turns one data set into another data set
    val transformedData = transformer.transform(sc.makeRDD(denseData))
    val transformedVecs = transformedData.collect()
    val transformedVec = transformedVecs(0)
    val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25)
    assert(transformedVec ~== expectedVec absTol 1E-5,
      s"Expected transformed vector $expectedVec but found $transformedVec")
  }

  // The elementwise (Hadamard) product should apply the vector correctly to a sparse data set
  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
    val sparseData = Array(
      Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
    )
    val dataRDD = sc.parallelize(sparseData, 3)
    val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
    val transformer = new ElementwiseProduct(scalingVec)
    val data2 = sparseData.map(transformer.transform)
    // transform() turns one data set into another data set
    val data2RDD = transformer.transform(dataRDD)

    assert((sparseData, data2, data2RDD.collect()).zipped.forall {
      case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after hadamard product")

    assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
    assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
  }
}
Example 11
Source File: PolynomialExpansionSuite.scala From iolap with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.ml.param.ParamsSuite import org.scalatest.exceptions.TestFailedException import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new PolynomialExpansion) } test("Polynomial expansion with default parameter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val twoDegreeExpansion: Array[Vector] = Array( Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)), Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29), Vectors.dense(new Array[Double](9)), Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0), Vectors.sparse(9, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } test("Polynomial expansion with setter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val threeDegreeExpansion: Array[Vector] = Array( Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8), Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)), Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17), Vectors.dense(new Array[Double](19)), Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8, -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0), Vectors.sparse(19, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } }
Example 12
Source File: VectorAssemblerSuite.scala From iolap with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.col

class VectorAssemblerSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new VectorAssembler)
  }

  test("assemble") {
    import org.apache.spark.ml.feature.VectorAssembler.assemble
    assert(assemble(0.0) === Vectors.sparse(1, Array.empty, Array.empty))
    assert(assemble(0.0, 1.0) === Vectors.sparse(2, Array(1), Array(1.0)))
    val dv = Vectors.dense(2.0, 0.0)
    assert(assemble(0.0, dv, 1.0) === Vectors.sparse(4, Array(1, 3), Array(2.0, 1.0)))
    val sv = Vectors.sparse(2, Array(0, 1), Array(3.0, 4.0))
    assert(assemble(0.0, dv, 1.0, sv) ===
      Vectors.sparse(6, Array(1, 3, 4, 5), Array(2.0, 1.0, 3.0, 4.0)))
    for (v <- Seq(1, "a", null)) {
      intercept[SparkException](assemble(v))
      intercept[SparkException](assemble(1.0, v))
    }
  }

  test("assemble should compress vectors") {
    import org.apache.spark.ml.feature.VectorAssembler.assemble
    val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0))
    assert(v1.isInstanceOf[SparseVector])
    val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0)))
    assert(v2.isInstanceOf[DenseVector])
  }

  test("VectorAssembler") {
    val df = sqlContext.createDataFrame(Seq(
      (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L)
    )).toDF("id", "x", "y", "name", "z", "n")
    val assembler = new VectorAssembler()
      .setInputCols(Array("x", "y", "z", "n"))
      .setOutputCol("features")
    assembler.transform(df).select("features").collect().foreach {
      case Row(v: Vector) =>
        assert(v === Vectors.sparse(6, Array(1, 2, 4, 5), Array(1.0, 2.0, 3.0, 10.0)))
    }
  }

  test("ML attributes") {
    val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", "safari")
    val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0)
    val user = new AttributeGroup("user", Array(
      NominalAttribute.defaultAttr.withName("gender").withValues("male", "female"),
      NumericAttribute.defaultAttr.withName("salary")))
    val row = (1.0, 0.5, 1, Vectors.dense(1.0, 1000.0), Vectors.sparse(2, Array(1), Array(2.0)))
    val df = sqlContext.createDataFrame(Seq(row)).toDF("browser", "hour", "count", "user", "ad")
      .select(
        col("browser").as("browser", browser.toMetadata()),
        col("hour").as("hour", hour.toMetadata()),
        col("count"), // "count" is an integer column without ML attribute
        col("user").as("user", user.toMetadata()),
        col("ad")) // "ad" is a vector column without ML attribute
    val assembler = new VectorAssembler()
      .setInputCols(Array("browser", "hour", "count", "user", "ad"))
      .setOutputCol("features")
    val output = assembler.transform(df)
    val schema = output.schema
    val features = AttributeGroup.fromStructField(schema("features"))
    assert(features.size === 7)
    val browserOut = features.getAttr(0)
    assert(browserOut === browser.withIndex(0).withName("browser"))
    val hourOut = features.getAttr(1)
    assert(hourOut === hour.withIndex(1).withName("hour"))
    val countOut = features.getAttr(2)
    assert(countOut === NumericAttribute.defaultAttr.withName("count").withIndex(2))
    val userGenderOut = features.getAttr(3)
    assert(userGenderOut === user.getAttr("gender").withName("user_gender").withIndex(3))
    val userSalaryOut = features.getAttr(4)
    assert(userSalaryOut === user.getAttr("salary").withName("user_salary").withIndex(4))
    assert(features.getAttr(5) === NumericAttribute.defaultAttr.withIndex(5))
    assert(features.getAttr(6) === NumericAttribute.defaultAttr.withIndex(6))
  }
}
Example 13
Source File: Normalizer.scala From iolap with Apache License 2.0
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}

  override def transform(vector: Vector): Vector = {
    val norm = Vectors.norm(vector, p)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.size
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.size
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          }
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
      }
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.
      vector
    }
  }
}
Example 14
Source File: IDFSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 15
Source File: ElementwiseProductSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("elementwise (hadamard) product should properly apply vector to dense data set") {
    val denseData = Array(
      Vectors.dense(1.0, 4.0, 1.9, -9.0)
    )
    val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25)
    val transformer = new ElementwiseProduct(scalingVec)
    val transformedData = transformer.transform(sc.makeRDD(denseData))
    val transformedVecs = transformedData.collect()
    val transformedVec = transformedVecs(0)
    val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25)
    assert(transformedVec ~== expectedVec absTol 1E-5,
      s"Expected transformed vector $expectedVec but found $transformedVec")
  }

  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
    val sparseData = Array(
      Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
    )
    val dataRDD = sc.parallelize(sparseData, 3)
    val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
    val transformer = new ElementwiseProduct(scalingVec)
    val data2 = sparseData.map(transformer.transform)
    val data2RDD = transformer.transform(dataRDD)

    assert((sparseData, data2, data2RDD.collect()).zipped.forall {
      case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after hadamard product")

    assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
    assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
  }
}
Example 16
Source File: Normalizer.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}

  @Since("1.1.0")
  override def transform(vector: Vector): Vector = {
    val norm = Vectors.norm(vector, p)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          }
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
      }
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.
      vector
    }
  }
}
Example 17
Source File: AugmentedDickeyFullerSuite.scala From spark-timeseries with Apache License 2.0
package com.cloudera.sparkts.stats

import com.cloudera.sparkts.models.ARModel
import org.apache.commons.math3.random.MersenneTwister
import org.apache.spark.mllib.linalg.DenseVector
import org.scalatest.FunSuite

class AugmentedDickeyFullerSuite extends FunSuite {

  test("non-stationary AR model") {
    val rand = new MersenneTwister(10L)
    val arModel = new ARModel(0.0, .95)
    val sample = arModel.sample(500, rand)
    val (adfStat, pValue) = TimeSeriesStatisticalTests.adftest(sample, 1)
    assert(!java.lang.Double.isNaN(adfStat))
    assert(!java.lang.Double.isNaN(pValue))
    println("adfStat: " + adfStat)
    println("pValue: " + pValue)
  }

  test("iid samples") {
    val rand = new MersenneTwister(11L)
    val iidSample = Array.fill(500)(rand.nextDouble())
    val (adfStat, pValue) = TimeSeriesStatisticalTests.adftest(new DenseVector(iidSample), 1)
    assert(!java.lang.Double.isNaN(adfStat))
    assert(!java.lang.Double.isNaN(pValue))
    println("adfStat: " + adfStat)
    println("pValue: " + pValue)
  }
}
Example 18
Source File: PythonConnector.scala From spark-timeseries with Apache License 2.0
package com.cloudera.sparkts

import java.nio.ByteBuffer
import java.time._

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.mllib.linalg.{DenseVector, Vector}
import org.apache.spark.api.java.function.{PairFunction, Function}

import PythonConnector._

private object PythonConnector {
  val INT_SIZE = 4
  val DOUBLE_SIZE = 8
  val LONG_SIZE = 8

  def putVector(buf: ByteBuffer, vec: Vector): Unit = {
    buf.putInt(vec.size)
    var i = 0
    while (i < vec.size) {
      buf.putDouble(vec(i))
      i += 1
    }
  }

  def arrayListToSeq(list: java.util.ArrayList[Any]): Seq[Any] = {
    // implement with ArrayBuffer
    var result = ArrayBuffer[Any]()
    if (list != null) {
      result = ArrayBuffer[Any](list.toArray: _*)
    }
    result
  }
}

private class BytesToKeyAndSeries extends PairFunction[Array[Byte], String, Vector] {
  override def call(arr: Array[Byte]): (String, Vector) = {
    val buf = ByteBuffer.wrap(arr)
    val keySize = buf.getInt()
    val keyBytes = new Array[Byte](keySize)
    buf.get(keyBytes)

    val seriesSize = buf.getInt()
    val series = new Array[Double](seriesSize)
    var i = 0
    while (i < seriesSize) {
      series(i) = buf.getDouble()
      i += 1
    }
    (new String(keyBytes, "UTF8"), new DenseVector(series))
  }
}

private class KeyAndSeriesToBytes extends Function[(String, Vector), Array[Byte]] {
  override def call(keyVec: (String, Vector)): Array[Byte] = {
    val keyBytes = keyVec._1.getBytes("UTF-8")
    val vec = keyVec._2
    val arr = new Array[Byte](INT_SIZE + keyBytes.length + INT_SIZE + DOUBLE_SIZE * vec.size)
    val buf = ByteBuffer.wrap(arr)
    buf.putInt(keyBytes.length)
    buf.put(keyBytes)
    putVector(buf, vec)
    arr
  }
}

private class InstantToBytes extends Function[(ZonedDateTime, Vector), Array[Byte]] {
  override def call(instant: (ZonedDateTime, Vector)): Array[Byte] = {
    val arr = new Array[Byte](LONG_SIZE + INT_SIZE + DOUBLE_SIZE * instant._2.size)
    val buf = ByteBuffer.wrap(arr)
    buf.putLong(TimeSeriesUtils.zonedDateTimeToLong(instant._1))
    putVector(buf, instant._2)
    arr
  }
}
Example 19
Source File: ElementwiseProductSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("elementwise (hadamard) product should properly apply vector to dense data set") {
    val denseData = Array(
      Vectors.dense(1.0, 4.0, 1.9, -9.0)
    )
    val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25)
    val transformer = new ElementwiseProduct(scalingVec)
    val transformedData = transformer.transform(sc.makeRDD(denseData))
    val transformedVecs = transformedData.collect()
    val transformedVec = transformedVecs(0)
    val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25)
    assert(transformedVec ~== expectedVec absTol 1E-5,
      s"Expected transformed vector $expectedVec but found $transformedVec")
  }

  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
    val sparseData = Array(
      Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
    )
    val dataRDD = sc.parallelize(sparseData, 3)
    val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
    val transformer = new ElementwiseProduct(scalingVec)
    val data2 = sparseData.map(transformer.transform)
    val data2RDD = transformer.transform(dataRDD)

    assert((sparseData, data2, data2RDD.collect()).zipped.forall {
      case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after hadamard product")

    assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
    assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
  }
}
Example 20
Source File: Gradient.scala From zen with Apache License 2.0
package com.github.cloudml.zen.ml.optimization

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}

  def compute(
      iter: Iterator[(Double, Vector)],
      weights: Vector,
      cumGradient: Vector): (Long, Double) = {
    var loss = 0D
    var count = 0L
    iter.foreach { t =>
      loss += compute(t._2, t._1, weights, cumGradient)
      count += 1
    }
    (count, loss)
  }
}
Example 21
Source File: RunMTM.scala From Clustering4Ever with Apache License 2.0
package org.clustering4ever.spark.clustering.mtm import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.DenseVector import org.clustering4ever.math.distances.scalar.RawEuclidean import org.clustering4ever.math.distances.RawContinuousDistance object RunSom { def fit( sparkMaster: String, intputFile: RDD[Array[Double]], outputDir: String, metric: RawContinuousDistance = new RawEuclidean(false), execName: String = "RunMTM", nbRow: Int = 10, nbCol: Int = 10, tmin: Double = 0.9, tmax: Double = 8, convergeDist: Double = -0.001, maxIter: Int = 50, sep : String = ";", initMap: Int = 0, initMapFile : String = "", nbRealVars : Int = 10 ) = { exec( intputFile, outputDir, metric, nbRow, nbCol, tmin, tmax, convergeDist, maxIter, sep, initMap, initMapFile, nbRealVars, true ) } def exec( intputFile: RDD[Array[Double]], outputDir: String, metric: RawContinuousDistance = new RawEuclidean(false), nbRow: Int = 10, nbCol: Int = 10, tmin: Double = 0.9, tmax: Double = 8, convergeDist: Double = -0.001, maxIter: Int = 50, sep : String = ";", initMap: Int = 0, initMapFile : String = "", nbRealVars : Int = 10, stop: Boolean = false ) = { val somOptions = Map( "clustering.som.nbrow" -> nbRow.toString, "clustering.som.nbcol" -> nbCol.toString, "clustering.som.tmin" -> tmin.toString, "clustering.som.tmax" -> tmax.toString, "clustering.som.initMap" -> initMap.toString, "clustering.som.initMapFile" -> initMapFile, "clustering.som.separator" -> sep, "clustering.som.nbRealVars" -> nbRealVars.toString ) val trainingDataset = intputFile println(s"nbRow: ${trainingDataset.count()}") val som = new SomTrainerA(metric) val startLearningTime = System.currentTimeMillis val model = som.training(trainingDataset, Some(somOptions), maxIter, convergeDist) val somDuration = (System.currentTimeMillis - startLearningTime) / 1000D val time = Output.write(outputDir, trainingDataset, model, nbRow, nbCol) (model, time) } }
Example 22
Source File: Output.scala From Clustering4Ever with Apache License 2.0
package org.clustering4ever.spark.clustering.mtm import java.io._ import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.DenseVector import scala.sys.process._ import java.util.Calendar import java.text.SimpleDateFormat import java.io.File import java.io.FileWriter object Output extends Serializable { def saveStr(savingPath: String, value: String, fileName: String = "") = { s"mkdir -p ${savingPath}".! val finalPath = savingPath + fileName val fw = new FileWriter(finalPath, true) fw.write(value + "\n") fw.close() } def write(outputDir: String, datas: RDD[Array[Double]], model: AbstractModel, nbRowSOM:Int, nbColSOM: Int): String = { val now = Calendar.getInstance().getTime() val format = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss") val time = format.format(now) val dim = datas.first.size val datasWithIndex = datas.zipWithIndex.map(_.swap) val path: String = outputDir + "/EXP-" + time + "/" s"mkdir -p ${path}".! val mapMin = Array.fill[Byte](dim)(0).mkString(",") var header = "# mapDim=2 mapSize={"+ nbRowSOM +"," + nbColSOM + "}" header += " pointDim=" + dim + " pointRealDim=" + dim + " mapMin={" + mapMin + "}" val prototypes = model.prototypes.map( d => (d.id, d.point)).sortBy(_._1).map(_._2) println("Write Prototypes...") val protosString = prototypes.map( d => d.toArray.mkString(",")).mkString("\n") // Utiliser fileWriter saveStr(path, header + "\n" + protosString, "maps") val sumAffectedDatas = datas.map( d => (model.findClosestPrototype(d).id, 1)).reduceByKey{ case (sum1, sum2) => sum1 + sum2 }.collectAsMap // fill in all the prototypes that have 0 observations val card = (0 until prototypes.length).map( d => if (sumAffectedDatas.contains(d)) sumAffectedDatas(d) + "" else "0" ) println("Write Cardinalities...") var cardHeader = "# mapDim=2 mapSize={"+ nbRowSOM +"," + nbColSOM + "}" cardHeader += "pointDim=1 pointRealDim=0 mapMin={0} mapMax={0}" val cardStr = card.mkString("\n") saveStr(path, cardHeader + "\n" + cardStr, "cards") val affHeader = "# mapDim=1 mapSize={" + datas.count() + "} pointDim=1 pointRealDim=0 mapMin={0} mapMax={0}" val aff = datasWithIndex.map(d => (d._1, model.findClosestPrototype(d._2).id + "")).sortByKey().values.collect.mkString("\n") println("Write Affiliate...") saveStr(path, affHeader + "\n" + aff, "affs") println("Write Maps...") val maps = prototypes.zip(card).map(d => d._1.toArray.mkString(",") + "," + d._2).mkString("\n") saveStr(path, maps, "mapscard") println("Write successfully...") time } }
Example 23
Source File: WriterCluster.scala From Clustering4Ever with Apache License 2.0
package org.clustering4ever.spark.clustering.mtm

import org.apache.spark.mllib.linalg.DenseVector
import java.io._
import org.apache.spark.rdd.RDD

object WriterClusters {

  def js(data: RDD[NamedVector], model: AbstractModel, path: String) = {
    val writer = new PrintWriter(new File(path))
    val dataArray = data.collect
    var str = "var dataset = ["
    dataArray.foreach { d =>
      val closestNeuron = model.findClosestPrototype(d.elements)
      if (d != dataArray.head) str += ','
      str += d.toJSON(closestNeuron.id)
    }
    str += "];"
    writer.write(str)
    writer.close()
  }
}
Example 24
Source File: DataGenerator.scala From Clustering4Ever with Apache License 2.0
package org.clustering4ever.spark.clustering.mtm import org.apache.spark.mllib.linalg.DenseVector import util.Random import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import scala.Array import scala.annotation.meta.param object DataGen extends Serializable { class Center(val cls: Int, val rayon: Double, val elements: Array[Double]) extends Serializable { def this(cls: Int, dims: Int, a: Double, b: Double, rayon: Double) = this(cls, rayon, Array.fill(dims)(new Random(42).nextGaussian() * a + b)) } def generate( @(transient @param) sc: SparkContext, numPoints: Int, nbCls: Int, d: Int, numPartitions: Int = 2): RDD[NamedVector] = { // First, generate some centers val rand = new Random(42) val r = 1D val centers = Array.fill(nbCls)(Array.fill(d)(rand.nextGaussian() * r)) // Then generate points around each center sc.parallelize(0 until numPoints, numPartitions).map( idx => { val cls = idx % nbCls val center = centers(cls) val rand2 = new Random(42 + idx) new NamedVector(Array.tabulate(d)(i => center(i) + rand2.nextGaussian()), cls) }) } } object DataGenerator extends Serializable { private case class DModel(a: Double, b: Double) { def gen = a * Random.nextDouble() + b } private case class PModel(cls: Int, dmodels: Array[DModel]) { def genVector = new DenseVector(dmodels.map(_.gen)) def genNamedVector = new NamedVector(dmodels.map(_.gen), cls) } private def PModel2D(cls: Int, a: Double, b: Double, c: Double) = PModel(cls, Array(DModel(a, b), DModel(a, c))) private def PModelND(cls: Int, dims: Int, a: Double, b: Double) = PModel(cls, Array.fill(dims)(DModel(a, b))) class SModel(N: Int, pmodels: Array[PModel]) { private def nextVector(i: Int) = pmodels(Random.nextInt(pmodels.size)).genVector private def nextNamedVector(i: Int) = pmodels(Random.nextInt(pmodels.size)).genNamedVector def getVector = Array.tabulate(N)(nextVector) def getNamedVector = Array.tabulate(N)(nextNamedVector) } val CLS_1 = 1 val CLS_2 = 2 val CLS_3 = 3 val CLS_4 = 4 def genH2Dims(n: Int) = new SModel( n, Array( PModel2D(CLS_1, 1, 1, 1), PModel2D(CLS_1, 1, 1, 2), PModel2D(CLS_1, 1, 1, 3), PModel2D(CLS_1, 1, 2, 2), PModel2D(CLS_1, 1, 3, 1), PModel2D(CLS_1, 1, 3, 2), PModel2D(CLS_1, 1, 3, 3) ) ) def gen2Cls2Dims(n: Int) = new SModel( n, Array( PModel2D(CLS_1, 1, 1, 1), PModel2D(CLS_2, 2, 2, 2) ) ) def gen2ClsNDims(n: Int, dims: Int) = new SModel( n, Array( PModelND(CLS_1, dims, 1, 1), PModelND(CLS_2, dims, 2, 2) ) ) }
Example 25
Source File: AbstractTrainer.scala From Clustering4Ever with Apache License 2.0
package org.clustering4ever.spark.clustering.mtm

import org.apache.spark.mllib.linalg.DenseVector
import java.util.concurrent.TimeUnit._
import org.apache.spark.rdd.RDD
import scala.concurrent.duration.{FiniteDuration, Duration}

trait AbstractTrainer extends Serializable {

  private var iter = 0
  def getLastIt = iter

  private var converge = 1D
  def getLastConvergence() = converge

  private var trainingDuration = Duration.Zero
  def getLastTrainingDuration = trainingDuration

  protected def initModel(dataset: RDD[Array[Double]], modelOptions: Option[Map[String, String]])

  protected def trainingIteration(dataset: RDD[Array[Double]], currentIteration: Int, maxIteration: Int): Double

  protected def getModel: AbstractModel

  final def training(
    dataset: RDD[Array[Double]],
    modelOptions: Option[Map[String, String]] = None,
    maxIteration: Int = 100,
    endConvergeDistance: Double = 0.001
  ): AbstractModel = {
    val datasetSize = dataset.count()
    val startLearningTime = System.currentTimeMillis()

    val model = initModel(dataset, modelOptions)
    iter = 0
    converge = 1D

    while (converge > endConvergeDistance && iter < maxIteration) {
      // Training iteration
      val sumConvergence = trainingIteration(dataset, iter, maxIteration)
      // Process convergence
      converge = sumConvergence / datasetSize
      iter += 1
    }

    trainingDuration = Duration.create(System.currentTimeMillis() - startLearningTime, MILLISECONDS)
    println("The model after training is: " + getModel)

    // Return the model
    getModel
  }
}
Example 26
Source File: SparseFeaturization.scala From modelmatrix with Apache License 2.0
package com.collective.modelmatrix.cli.featurize import com.collective.modelmatrix.ModelMatrix.ModelMatrixCatalogAccess import com.collective.modelmatrix.cli.{Source, _} import com.collective.modelmatrix.transform.Transformer import com.collective.modelmatrix.{Featurization, Labeling, ModelMatrix} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector} import org.apache.spark.sql.Row import org.apache.spark.sql.types._ import org.slf4j.LoggerFactory import scalaz._ case class SparseFeaturization( modelInstanceId: Int, source: Source, sink: Sink, idColumn: String, repartitionSource: Option[Int], cacheSource: Boolean ) extends Script with SourceTransformation with ModelMatrixCatalogAccess with CliSparkContext { private val log = LoggerFactory.getLogger(classOf[ValidateInputData]) private def sparseSchema(idType: DataType) = StructType(Seq( StructField(idColumn, idType), StructField("column_id", IntegerType), StructField("value", DoubleType) )) import com.collective.modelmatrix.cli.ASCIITableFormat._ import com.collective.modelmatrix.cli.ASCIITableFormats._ def run(): Unit = { log.info(s"Run sparse featurization using Model Matrix instance: $modelInstanceId. " + s"Input source: $source. " + s"Featurized sink: $sink. " + s"Id column: $idColumn") implicit val sqlContext = ModelMatrix.hiveContext(sc) val features = blockOn(db.run(modelInstanceFeatures.features(modelInstanceId))) require(features.nonEmpty, s"No features are defined for model instance: $modelInstanceId. " + s"Ensure that this model instance exists") val featurization = new Featurization(features) val df = toDataFrame(source) val idLabeling = Labeling(idColumn, identity[Any]) val idDataType = df.schema.fields .find(_.name == idColumn) .map(_.dataType) .getOrElse(sys.error(s"Can't find id column: $idColumn")) Transformer.extractFeatures(df, features.map(_.feature), idLabeling) match { // Feature extraction failed case -\/(extractionErrors) => Console.out.println(s"Feature extraction failed:") extractionErrors.printASCIITable() // Extracted feature type validation failed case \/-(extracted) if featurization.validateLabeled(extracted).exists(_.isLeft) => val errors = featurization.validateLabeled(extracted).collect { case -\/(err) => err } Console.out.println(s"Input schema errors:") errors.printASCIITable() // Looks good, let's do featurization case \/-(extracted) => val featurized = featurization.featurize(extracted, idLabeling) // Switch from 0-based Vector index to 1-based ColumnId val rows = featurized.flatMap { case (id, sparse: SparseVector) => (sparse.values zip sparse.indices).map { case (value, idx) => Row(id, idx + 1, value) } case (id, dense: DenseVector) => dense.values.zipWithIndex.map { case (value, idx) => Row(id, idx + 1, value) } } // Apply schema and save sink.saveDataFrame(sqlContext.createDataFrame(rows, sparseSchema(idDataType))) Console.out.println(s"Featurized data set was successfully saved to: $sink") } } }
Example 27
Source File: IDFSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors, Vector} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 28
Source File: Autoregression.scala From spark-timeseries with Apache License 2.0 | 5 votes |
package com.cloudera.sparkts.models import com.cloudera.sparkts.Lag import com.cloudera.sparkts.MatrixUtil.{matToRowArrs, toBreeze} import org.apache.commons.math3.random.RandomGenerator import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression import org.apache.spark.mllib.linalg.{DenseVector, Vector} object Autoregression { def fitModel(ts: Vector, maxLag: Int, noIntercept: Boolean = false): ARModel = { // This is loosely based off of the implementation in statsmodels: // https://github.com/statsmodels/statsmodels/blob/master/statsmodels/tsa/ar_model.py // Make left hand side val Y = toBreeze(ts)(maxLag until ts.size) // Make lagged right hand side val X = Lag.lagMatTrimBoth(ts, maxLag) val regression = new OLSMultipleLinearRegression() regression.setNoIntercept(noIntercept) // drop intercept in regression regression.newSampleData(Y.toArray, matToRowArrs(X)) val params = regression.estimateRegressionParameters() val (c, coeffs) = if (noIntercept) (0.0, params) else (params.head, params.tail) new ARModel(c, coeffs) } } class ARModel(val c: Double, val coefficients: Array[Double]) extends TimeSeriesModel { def this(c: Double, coef: Double) = this(c, Array(coef)) def removeTimeDependentEffects( ts: Vector, destTs: Vector = null): Vector = { val dest = if (destTs == null) new Array[Double](ts.size) else destTs.toArray var i = 0 while (i < ts.size) { dest(i) = ts(i) - c var j = 0 while (j < coefficients.length && i - j - 1 >= 0) { dest(i) -= ts(i - j - 1) * coefficients(j) j += 1 } i += 1 } new DenseVector(dest) } def addTimeDependentEffects(ts: Vector, destTs: Vector): Vector = { val dest = if (destTs == null) new Array[Double](ts.size) else destTs.toArray var i = 0 while (i < ts.size) { dest(i) = c + ts(i) var j = 0 while (j < coefficients.length && i - j - 1 >= 0) { dest(i) += dest(i - j - 1) * coefficients(j) j += 1 } i += 1 } new DenseVector(dest) } def sample(n: Int, rand: RandomGenerator): Vector = { val vec = new DenseVector(Array.fill[Double](n)(rand.nextGaussian())) addTimeDependentEffects(vec, vec) } }
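A short sketch of fitting and applying the ARModel defined above on a local DenseVector series, assuming the spark-timeseries classes from this example are on the classpath:

import org.apache.spark.mllib.linalg.DenseVector
import com.cloudera.sparkts.models.Autoregression

// A small toy series held in a DenseVector.
val series = new DenseVector(Array(1.0, 1.2, 1.1, 1.3, 1.25, 1.4, 1.35, 1.5))

// Fit an AR(1) model; fitModel regresses each value on its single lagged value.
val arModel = Autoregression.fitModel(series, maxLag = 1)
println(s"intercept = ${arModel.c}, coefficients = ${arModel.coefficients.mkString(", ")}")

// Strip the fitted autoregressive structure; the result is again a DenseVector.
val filtered = arModel.removeTimeDependentEffects(series)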
Example 29
Source File: LibSVMRelationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import com.google.common.base.Charsets import com.google.common.io.Files import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { var tempDir: File = _ var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin tempDir = Utils.createTempDir() val file = new File(tempDir, "part-00000") Files.write(lines, file, Charsets.US_ASCII) path = tempDir.toURI.toString } override def afterAll(): Unit = { Utils.deleteRecursively(tempDir) super.afterAll() } test("select as sparse vector") { val df = sqlContext.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = sqlContext.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = sqlContext.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } }
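A compact sketch of the reader options this suite checks, assuming a SQLContext named sqlContext and a LIBSVM file at path:

import org.apache.spark.mllib.linalg.{DenseVector, SparseVector}

// By default the libsvm source produces sparse feature vectors ...
val sparseDF = sqlContext.read.format("libsvm").load(path)
val sparseFeatures = sparseDF.first().getAs[SparseVector]("features")

// ... while vectorType=dense materializes each row's features as a DenseVector.
val denseDF = sqlContext.read.format("libsvm").option("vectorType", "dense").load(path)
val denseFeatures = denseDF.first().getAs[DenseVector]("features")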
Example 30
Source File: IDFSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class IDFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = { dataSet.map { case data: DenseVector => val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y } Vectors.dense(res) case data: SparseVector => val res = data.indices.zip(data.values).map { case (id, value) => (id, value * model(id)) } Vectors.sparse(data.size, res) } } test("params") { ParamsSuite.checkParams(new IDF) val model = new IDFModel("idf", new OldIDFModel(Vectors.dense(1.0))) ParamsSuite.checkParams(model) } test("compute IDF with default parameter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((numOfData + 1.0) / (x + 1.0)) }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("compute IDF with setter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0 }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .setMinDocFreq(1) .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("IDF read/write") { val t = new IDF() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMinDocFreq(5) testDefaultReadWrite(t) } test("IDFModel read/write") { val instance = new IDFModel("myIDFModel", new OldIDFModel(Vectors.dense(1.0, 2.0))) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) assert(newInstance.idf === instance.idf) } }
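A brief sketch of the ml IDF estimator usage this suite validates, assuming a SQLContext named sqlContext:

import org.apache.spark.ml.feature.IDF
import org.apache.spark.mllib.linalg.Vectors

val docs = Seq(
  (0, Vectors.sparse(4, Array(1, 3), Array(1.0, 2.0))),
  (1, Vectors.dense(0.0, 1.0, 2.0, 3.0))
)
val df = sqlContext.createDataFrame(docs).toDF("id", "features")

// Fit the IDF weights on the features column and append the rescaled vectors.
val idfModel = new IDF()
  .setInputCol("features")
  .setOutputCol("tfidf")
  .setMinDocFreq(1)
  .fit(df)

idfModel.transform(df).select("id", "tfidf").show()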
Example 31
Source File: PolynomialExpansionSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.param.ParamsSuite import org.scalatest.exceptions.TestFailedException import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new PolynomialExpansion) } test("Polynomial expansion with default parameter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val twoDegreeExpansion: Array[Vector] = Array( Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)), Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29), Vectors.dense(new Array[Double](9)), Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0), Vectors.sparse(9, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } test("Polynomial expansion with setter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val threeDegreeExpansion: Array[Vector] = Array( Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8), Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)), Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17), Vectors.dense(new Array[Double](19)), Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8, -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0), Vectors.sparse(19, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } test("read/write") { val t = new PolynomialExpansion() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setDegree(3) testDefaultReadWrite(t) } }
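A minimal sketch of the PolynomialExpansion transformer this suite exercises, assuming a SQLContext named sqlContext:

import org.apache.spark.ml.feature.PolynomialExpansion
import org.apache.spark.mllib.linalg.{DenseVector, Vectors}

val df = sqlContext.createDataFrame(Seq(Tuple1(Vectors.dense(-2.0, 2.3)))).toDF("features")

val expander = new PolynomialExpansion()
  .setInputCol("features")
  .setOutputCol("polyFeatures")
  .setDegree(2)

// For a dense 2-element input (x, y) and degree 2, the output is the dense
// 5-element vector (x, x^2, y, x*y, y^2), matching the expected values above.
val expanded = expander.transform(df).select("polyFeatures").first()
  .getAs[DenseVector]("polyFeatures")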
Example 32
Source File: Normalizer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.size var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.size var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
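A small usage sketch of the Normalizer transform shown above:

import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.linalg.Vectors

// The default constructor normalizes with the L2 norm; passing p selects
// another norm. The vector type (dense or sparse) is preserved.
val l2 = new Normalizer()
val l1 = new Normalizer(1.0)

val dense = Vectors.dense(0.6, -1.1, -3.0)
val unitL2 = l2.transform(dense) // Vectors.norm(unitL2, 2) == 1.0
val unitL1 = l1.transform(dense) // absolute values sum to 1.0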
Example 33
Source File: IDFSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 34
Source File: ElementwiseProductSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
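A minimal sketch of the ElementwiseProduct transformer this suite covers:

import org.apache.spark.mllib.feature.ElementwiseProduct
import org.apache.spark.mllib.linalg.Vectors

// The scaling vector is multiplied entry-by-entry into each input vector;
// as the suite asserts, the dense/sparse type of the input is preserved.
val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25)
val transformer = new ElementwiseProduct(scalingVec)

val scaled = transformer.transform(Vectors.dense(1.0, 4.0, 1.9, -9.0))
// Vectors.dense(2.0, 2.0, 0.0, -2.25)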
Example 35
Source File: Normalizer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 36
Source File: LinopMatrixAdjoint.scala From spark-lp with Apache License 2.0 | 5 votes |
override def apply(x: DVector): DenseVector = { val n = this.n matrix.zipPartitions(x)((matrixPartition, xPartition) => Iterator.single( matrixPartition.checkedZip(xPartition.next.values.toIterator).aggregate( // NOTE A DenseVector result is assumed here (not sparse safe). Vectors.zeros(n).toDense)( seqop = (_, _) match { case (sum, (matrix_i, x_i)) => { // Multiply an element of x by its corresponding matrix row, and add to the // accumulation sum vector. BLAS.axpy(x_i, matrix_i, sum) sum } }, combop = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } )) ).treeAggregate(Vectors.zeros(n).toDense)( seqOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 }, combOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } , depth ) } }
Example 37
Source File: SolverSLP.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.apache.spark.mllib.linalg.{ BLAS, DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.mllib.optimization.tfocs.fs.dvector.double._ import org.apache.spark.mllib.optimization.tfocs.fs.dvectordouble.vector._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.double._ import org.apache.spark.mllib.optimization.tfocs.vs.dvector._ object SolverSLP { def run( c: DVector, A: DMatrix, b: DenseVector, mu: Double, x0: Option[DVector] = None, z0: Option[DenseVector] = None, numContinuations: Int = 10, tol: Double = 1e-4, initialTol: Double = 1e-3, dualTolCheckInterval: Int = 10): (DVector, Array[Double]) = { val minusB = b.copy BLAS.scal(-1.0, minusB) TFOCS_SCD.optimize(new ProxShiftRPlus(c), new LinopMatrixAdjoint(A, minusB), new ProxZero(), mu, x0.getOrElse(c.mapElements(_ => 0.0)), z0.getOrElse(Vectors.zeros(b.size).toDense), numContinuations, tol, initialTol, dualTolCheckInterval) } }
Example 38
Source File: SolverL1RLS.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.fs.dvector.double._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.double._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector._ import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.mllib.optimization.tfocs.vs.dvector._ import org.apache.spark.mllib.optimization.tfocs.vs.vector._ def run(A: DMatrix, b: DVector, lambda: Double, x0: Option[DenseVector] = None): (DenseVector, Array[Double]) = { val (x, TFOCS.OptimizationData(lossHistory, _, _)) = TFOCS.optimize(new SmoothQuad(b), new LinopMatrix(A), new ProxL1(lambda), x0.getOrElse(Vectors.zeros(A.first().size).toDense)) (x, lossHistory) } }
Example 39
Source File: TestLinearProgram.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.examples import scala.util.Random import org.apache.spark.mllib.linalg.DenseVector import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.SolverSLP import org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector.LinopMatrixAdjoint import org.apache.spark.mllib.random.{ RandomDataGenerator, RandomRDDs } import org.apache.spark.mllib.rdd.RandomVectorRDD import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.util.random.XORShiftRandom object TestLinearProgram { def main(args: Array[String]) { val rnd = new Random(34324) val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestLinearProgram") val sc = new SparkContext(sparkConf) val n = 5000 // Transpose constraint matrix row count. val m = n / 2 // Transpose constraint matrix column count. // Generate a starting 'x' vector, using normally generated values. val x = RandomRDDs.normalRDD(sc, n).map(_ + 10).glom.map(new DenseVector(_)) // Generate the transpose constraint matrix 'A' using sparse normally generated values. val A = new RandomVectorRDD(sc, n, m, sc.defaultMinPartitions, new SparseStandardNormalGenerator(0.01), rnd.nextLong) // Generate the cost vector 'c' using normally generated values. val c = RandomRDDs.normalRDD(sc, n, 0, rnd.nextLong).glom.map(new DenseVector(_)) // Compute 'b' using the starting 'x' vector. val b = new LinopMatrixAdjoint(A)(x) val mu = 1e-2 // Solve the linear program using SolverSLP, finding the optimal x vector 'optimalX'. val (optimalX, _) = SolverSLP.run(c, A, b, mu) println("optimalX: " + optimalX.collectElements.mkString(", ")) sc.stop() } }
Example 40
Source File: TestLASSO.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.examples import scala.util.Random import org.apache.spark.mllib.linalg.{ BLAS, DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.SolverL1RLS import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.{ SparkConf, SparkContext } object TestLASSO { def main(args: Array[String]) { val rnd = new Random(34324) val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestLASSO") val sc = new SparkContext(sparkConf) val n = 1024 // Design matrix column count. val m = n / 2 // Design matrix row count. val k = m / 5 // Count of nonzero weights. // Generate the design matrix using random normal values, then normalize the columns. val unnormalizedA = RandomRDDs.normalVectorRDD(sc, m, n, 0, rnd.nextLong) val AColumnNormSq = unnormalizedA.treeAggregate(Vectors.zeros(n).toDense)( seqOp = (sum, rowA) => { val rowASq = Vectors.dense(rowA.toArray.map(rowA_i => rowA_i * rowA_i)) BLAS.axpy(1.0, rowASq, sum) sum }, combOp = (sum1, sum2) => { BLAS.axpy(1.0, sum2, sum1) sum1 }) val A = unnormalizedA.map(rowA => Vectors.dense(rowA.toArray.zip(AColumnNormSq.toArray).map { case (rowA_i, normsq_i) => rowA_i / math.sqrt(normsq_i) })) // Generate the actual 'x' vector, including 'k' nonzero values. val x = Vectors.zeros(n).toDense for (i <- rnd.shuffle(0 to n - 1).take(k)) { x.values(i) = rnd.nextGaussian } // Generate the 'b' vector using the design matrix and weights, adding gaussian noise. val bOriginal = new DenseVector(A.map(rowA => BLAS.dot(rowA, x)).collect) val snr = 30 // SNR in dB val sigma = math.pow(10, ((10 * math.log10(math.pow(Vectors.norm(bOriginal, 2), 2) / n) - snr) / 20)) val b = sc.parallelize(bOriginal.values.map(_ + sigma * rnd.nextGaussian)) .glom .map(new DenseVector(_)) // Set 'lambda' using the noise standard deviation. val lambda = 2 * sigma * math.sqrt(2 * math.log(n)) // Solve the lasso problem using SolverL1RLS, finding the estimated x vector 'estimatedX'. val (estimatedX, _) = SolverL1RLS.run(A, b, lambda) println("estimatedX: " + estimatedX.values.mkString(", ")) sc.stop() } }
Example 41
Source File: TestMPSLinearProgram.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.examples import java.io.File import com.joptimizer.optimizers.LPStandardConverter import com.joptimizer.util.MPSParser import org.apache.spark.mllib.linalg.{ DenseVector, Vector, Vectors } import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.SolverSLP import org.apache.spark.{ SparkConf, SparkContext } object TestMPSLinearProgram { def main(args: Array[String]) { val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestMPSLinearProgram") val sc = new SparkContext(sparkConf) // Parse the provided MPS file. val parser = new MPSParser() var mpsFile = new File(args(0)) parser.parse(mpsFile) // Convert the parsed linear program to standard form. val converter = new LPStandardConverter(true) converter.toStandardForm(parser.getC, parser.getG, parser.getH, parser.getA, parser.getB, parser.getLb, parser.getUb) // Convert the parameters of the linear program to spark tfocs compatible formats. val c = sc.parallelize(converter.getStandardC.toArray).glom.map(new DenseVector(_)) val A = sc.parallelize(converter.getStandardA.toArray.transpose.map( Vectors.dense(_).toSparse: Vector)) val b = new DenseVector(converter.getStandardB.toArray) val n = converter.getStandardN val mu = 1e-2 // Solve the linear program using SolverSLP, finding the optimal x vector 'optimalX'. val (optimalX, _) = SolverSLP.run(c, A, b, mu) println("optimalX: " + optimalX.collectElements.mkString(", ")) sc.stop() } }
Example 42
Source File: LPSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp import org.scalatest.FunSuite import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.mllib.optimization.lp.VectorSpace._ import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace class LPSuite extends FunSuite with MLlibTestSparkContext { val numPartitions = 2 val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0) val BArray = Array( Array(12.0, 16.0, 30.0, 1.0, 0.0), Array(24.0, 16.0, 12.0, 0.0, 1.0), Array(-1.0, 0.0, 0.0, 0.0, 0.0), Array(0.0, -1.0, 0.0, 0.0, 0.0), Array(0.0, 0.0, -1.0, 0.0, 0.0), Array(0.0, 0.0, 0.0, 1.0, 0.0), Array(0.0, 0.0, 0.0, 0.0, 1.0)) val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0) lazy val c = sc.parallelize(cArray, numPartitions).glom.map(new DenseVector(_)) lazy val rows = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_)) lazy val b = new DenseVector(bArray) test("LP solve is implemented properly") { val (v, x) = LP.solve(c, rows, b, sc=sc) // solution obtained from scipy.optimize.linprog and octave glpk lpsolver with fun_val = 12.083 val expectedSol = Vectors.dense( Array(1.66666667, 5.83333333, 40.0, 0.0, 0.0, 13.33333333, 9.16666667)) val xx = Vectors.dense(x.flatMap(_.toArray).collect()) println(s"$xx") println("optimal min value: " + v) assert(xx ~== expectedSol absTol 1e-6, "LP.solve x should return the correct answer.") } }
Example 43
Source File: InitializeSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp import org.scalatest.FunSuite import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.mllib.optimization.lp.VectorSpace._ import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, _} import org.apache.spark.mllib.optimization.tfocs.VectorSpace.{DMatrix, DVector} class InitializeSuite extends FunSuite with MLlibTestSparkContext { val numPartitions = 2 val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0) val BArray = Array( Array(12.0, 16.0, 30.0, 1.0, 0.0), Array(24.0, 16.0, 12.0, 0.0, 1.0), Array(-1.0, 0.0, 0.0, 0.0, 0.0), Array(0.0, -1.0, 0.0, 0.0, 0.0), Array(0.0, 0.0, -1.0, 0.0, 0.0), Array(0.0, 0.0, 0.0, 1.0, 0.0), Array(0.0, 0.0, 0.0, 0.0, 1.0)) val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0) lazy val c: DVector = sc.parallelize(cArray, numPartitions).glom.map(new DenseVector(_)) lazy val rows: DMatrix = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_)) lazy val b: DenseVector = new DenseVector(bArray) val cBrz = new BDV[Double](cArray) val BBrz = new BDM[Double](7, 5, BArray.flatMap(x => x), offset = 0, majorStride = 5, isTranspose = true) val bBrz = new BDV[Double](bArray) // (BT * B) ^(-1) val BTBInv = inv(BBrz.t * BBrz) // xTilda = B * BTBInv * b val xTilda: BDV[Double] = BBrz * (BTBInv * bBrz) // lambdaTilda = BTBInv * (B^T * c) val lambdaTilda: BDV[Double] = BTBInv * (BBrz.t * cBrz) // sTilda = c - B * lambdaTilda val sTilda = cBrz - BBrz * lambdaTilda val deltax = Math.max(1.5 * max(xTilda), 0) val deltas = Math.max(1.5 * max(sTilda), 0) val xHat = xTilda :+ deltax val sHat = sTilda :+ deltas val deltaxHat: Double = 0.5 * (xHat.t * sHat) / sum(sHat) val deltasHat: Double = 0.5 * (xHat.t * sHat) / sum(xHat) // x = xHat + deltaxHat * e val expectedx: BDV[Double] = xHat :+ deltaxHat // val expectedLambda = lambdaTilda val expecteds: BDV[Double] = sHat :+ deltasHat test("Initialize.init is implemented properly") { val result = Initialize.init(c, rows, b) //println(LP.solve(c, rows, b, 1e-4, 1).collect()) assert(Vectors.dense(expectedx.toArray) ~= Vectors.dense(result._1.flatMap(_.toArray).collect()) relTol 1e-6, "Initialize.init x0 is not computed correctly.") assert(Vectors.dense(lambdaTilda.toArray) ~= Vectors.dense(result._2.toArray) relTol 1e-6, "Initialize.init lambda0 is not computed correctly.") assert(Vectors.dense(expecteds.toArray) ~= Vectors.dense(result._3.flatMap(_.toArray).collect()) relTol 1e-6, "Initialize.init s0 should return the correct answer.") } }
Example 44
Source File: SpLinopMatrixSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp import org.scalatest.FunSuite import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.lp.VectorSpace._ import org.apache.spark.mllib.optimization.lp.fs.dvector.dmatrix._ class SpLinopMatrixSuite extends FunSuite with MLlibTestSparkContext { test("SpLinopMatrix.apply is implemented properly") { val matrix: DMatrix = sc.parallelize(Array( Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)), 2) val vector: DVector = sc.parallelize(Array(2.0, 3.0), 2).glom.map(new DenseVector(_)) val expectApply: DMatrix = sc.parallelize(Array( Vectors.dense(2.0 * 1.0, 2.0 * 2.0, 2.0 * 3.0), Vectors.dense(3.0 * 4.0, 3.0 * 5.0, 3.0 * 6.0)), 2) assert((new SpLinopMatrix(vector))(matrix).collect().deep == expectApply.collect().deep, // or sameElements "SpLinopMatrix.apply should return the correct result.") } }
Example 45
Source File: package.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.vs import org.apache.spark.mllib.linalg.{ BLAS, DenseVector } import org.apache.spark.mllib.optimization.tfocs.VectorSpace package object vector { implicit object DenseVectorSpace extends VectorSpace[DenseVector] { override def combine(alpha: Double, a: DenseVector, beta: Double, b: DenseVector): DenseVector = { val ret = a.copy BLAS.scal(alpha, ret) BLAS.axpy(beta, b, ret) ret } override def dot(a: DenseVector, b: DenseVector): Double = BLAS.dot(a, b) } }
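A usage sketch for the DenseVectorSpace above, assuming the spark-tfocs package object is on the classpath:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.tfocs.vs.vector.DenseVectorSpace

val a = Vectors.dense(1.0, 2.0, 3.0).toDense
val b = Vectors.dense(4.0, 5.0, 6.0).toDense

// combine(alpha, a, beta, b) returns alpha * a + beta * b without mutating a or b.
val combined = DenseVectorSpace.combine(2.0, a, -1.0, b) // DenseVector(-2.0, -1.0, 0.0)
val inner = DenseVectorSpace.dot(a, b)                   // 32.0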
Example 46
Source File: package.scala From spark-lp with Apache License 2.0 | 5 votes |
implicit object DVectorSpace extends VectorSpace[DVector] { override def combine(alpha: Double, a: DVector, beta: Double, b: DVector): DVector = if (alpha == 1.0 && beta == 1.0) { a.zip(b).map { case (aPart, bPart) => { BLAS.axpy(1.0, aPart, bPart) // bPart += aPart bPart } } } else { a.zip(b).map { case (aPart, bPart) => // NOTE A DenseVector result is assumed here (not sparse safe). DenseVectorSpace.combine(alpha, aPart, beta, bPart).toDense } } override def dot(a: DVector, b: DVector): Double = a.dot(b) override def entrywiseProd(a: DVector, b: DVector): DVector = { a.zip(b).map { case (aPart, bPart) => DenseVectorSpace.entrywiseProd(aPart, bPart).toDense } } override def entrywiseNegDiv(a: DVector, b: DVector): DVector = { a.zip(b).map { case (aPart, bPart) => DenseVectorSpace.entrywiseNegDiv(aPart, bPart) } } override def sum(a: DVector): Double = a.aggregate(0.0)( seqOp = (acc: Double, v: DenseVector) => acc + v.values.sum, combOp = (acc1: Double, acc2: Double) => acc1 + acc2 ) override def min(a: DVector): Double = a.aggregate(Double.PositiveInfinity)( (mi, x) => Math.min(mi, x.values.min), Math.min ) override def max(a: DVector): Double = a.aggregate(Double.NegativeInfinity)( (ma, x) => Math.max(ma, x.values.max), Math.max ) override def cache(a: DVector): Unit = if (a.getStorageLevel == StorageLevel.NONE) { a.cache() } } }
Example 47
Source File: package.scala From spark-lp with Apache License 2.0 | 5 votes |
implicit object DenseVectorSpace extends VectorSpace[DenseVector] { override def combine(alpha: Double, a: DenseVector, beta: Double, b: DenseVector): DenseVector = { val ret = a.copy BLAS.scal(alpha, ret) BLAS.axpy(beta, b, ret) ret } override def dot(a: DenseVector, b: DenseVector): Double = BLAS.dot(a, b) override def entrywiseProd(a: DenseVector, b: DenseVector): DenseVector = { val c = a.values.zip(b.values).map { case (i: Double, j: Double) => i * j } new DenseVector(c) } override def entrywiseNegDiv(a: DenseVector, b: DenseVector): DenseVector = { val c = a.values.zip(b.values).map { case (ai, bi) if bi < 0 => ai / Math.max(Math.abs(bi), 1e-15) case (_, bi) if bi >= 0 => Double.PositiveInfinity // Make Infinity value to be neglected in min } new DenseVector(c) } override def sum(a: DenseVector): Double = a.values.sum override def max(a: DenseVector): Double = a.values.max override def min(a: DenseVector): Double = a.values.min } }
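A usage sketch for the spark-lp DenseVectorSpace above, assuming the project's package object is on the classpath:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace

val a = Vectors.dense(2.0, 4.0, 6.0).toDense
val b = Vectors.dense(1.0, -2.0, 0.5).toDense

val prod = DenseVectorSpace.entrywiseProd(a, b)     // DenseVector(2.0, -8.0, 3.0)
// Entries whose divisor is non-negative map to +Infinity, so min() only sees
// the ratios taken against strictly negative entries of b.
val negDiv = DenseVectorSpace.entrywiseNegDiv(a, b) // DenseVector(Inf, 2.0, Inf)
val smallest = DenseVectorSpace.min(negDiv)         // 2.0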
Example 48
Source File: TestLPSolver.scala From spark-lp with Apache License 2.0 | 5 votes |
object TestLPSolver { def main(args: Array[String]) { val rnd = new Random(12345) val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestLPSolver") val sc = new SparkContext(sparkConf) val n = 1000 // Transpose constraint matrix row count. val m = 100 // Transpose constraint matrix column count. val numPartitions = 2 // Generate the starting vector from uniform distribution U(3.0, 5.0) println("generate x") val x0 = RandomRDDs.uniformRDD(sc, n, numPartitions).map(v => 3.0 + 2.0 * v).glom.map(new DenseVector(_)) // Generate the transpose constraint matrix 'B' using sparse uniformly generated values. println("generate B") val B = new RandomVectorRDD(sc, n, m, numPartitions, new SparseStandardNormalGenerator(0.1), rnd.nextLong) // Generate the cost vector 'c' using uniformly generated values. println("generate c") val c = RandomRDDs.uniformRDD(sc, n, numPartitions, rnd.nextLong).glom.map(new DenseVector(_)) // Compute 'b' using the starting 'x' vector. println("generate b") val b = (new LinopMatrixAdjoint(B))(x0) // Solve the linear program using LP.solve, finding the optimal x vector 'optimalX'. println("Start solving ...") val (optimalVal, _) = LP.solve(c, B, b, sc=sc) println("optimalVal: " + optimalVal) //println("optimalX: " + optimalX.collectElements.mkString(", ")) sc.stop() } }
Example 49
Source File: TestMPSLinearProgramSolver.scala From spark-lp with Apache License 2.0 | 5 votes |
object TestMPSLinearProgramSolver { def main(args: Array[String]) { val conf = new SparkConf() .setMaster("local[2]") .setAppName("TestMPSLinearProgramSolver") val sc = new SparkContext(conf) // Parse the provided MPS file. val parser = new MPSParser() val mpsFile = new File(args(0)) parser.parse(mpsFile) // Convert the parsed linear program to standard form. val converter = new LPStandardConverter(true) converter.toStandardForm(parser.getC, parser.getG, parser.getH, parser.getA, parser.getB, parser.getLb, parser.getUb) // Convert the parameters of the linear program to spark lp compatible formats. val numPartitions = 2 val c: DVector = sc.parallelize(converter.getStandardC.toArray, numPartitions) .glom.map(new DenseVector(_)) val B: DMatrix = sc.parallelize(converter.getStandardA.toArray.transpose.map( Vectors.dense(_).toSparse: Vector), numPartitions) val b = new DenseVector(converter.getStandardB.toArray) println("Start solving ... ") val (optimalVal, optimalX) = LP.solve(c, B, b, sc=sc) println("optimalVal: " + optimalVal) //println("optimalX: " + optimalX.collectElements.mkString(", ")) sc.stop() } }
Example 50
Source File: BGRImgToImageVector.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dataset.image import com.intel.analytics.bigdl.dataset.Transformer import org.apache.log4j.Logger import org.apache.spark.mllib.linalg.DenseVector import scala.collection.Iterator object BGRImgToImageVector { val logger = Logger.getLogger(getClass) def apply(): BGRImgToImageVector = { new BGRImgToImageVector() } } class BGRImgToImageVector() extends Transformer[LabeledBGRImage, DenseVector] { private var featureData: Array[Float] = null override def apply(prev: Iterator[LabeledBGRImage]): Iterator[DenseVector] = { prev.map( img => { if (null == featureData) { featureData = new Array[Float](3 * img.height() * img.width()) } img.copyTo(featureData, 0, true) new DenseVector(featureData.map(_.toDouble)) } ) } }
Example 51
Source File: MlUtils.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.example.imageclassification import com.intel.analytics.bigdl.Module import com.intel.analytics.bigdl.dataset.Transformer import com.intel.analytics.bigdl.dataset.image.{BGRImage, LocalLabeledImagePath} import com.intel.analytics.bigdl.nn.Module import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.dataset.DataSet.SeqFileFolder import org.apache.hadoop.io.Text import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.DenseVector import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import scopt.OptionParser import scala.reflect.ClassTag object MlUtils { val testMean = (0.485, 0.456, 0.406) val testStd = (0.229, 0.224, 0.225) val imageSize = 224 case class ByteImage(data: Array[Byte], imageName: String) def transformDF(data: DataFrame, f: Transformer[Row, DenseVector]): DataFrame = { val vectorRdd = data.select("data").rdd.mapPartitions(f(_)) val dataRDD = data.rdd.zipPartitions(vectorRdd) { (a, b) => b.zip(a.map(_.getAs[String]("imageName"))) .map( v => DfPoint(v._1, v._2) ) } data.sqlContext.createDataFrame(dataRDD) } def imagesLoad(paths: Array[LocalLabeledImagePath], scaleTo: Int): Array[ByteImage] = { var count = 1 val buffer = paths.map(imageFile => { count += 1 ByteImage(BGRImage.readImage(imageFile.path, scaleTo), imageFile.path.getFileName.toString) }) buffer } def imagesLoadSeq(url: String, sc: SparkContext, classNum: Int): RDD[ByteImage] = { sc.sequenceFile(url, classOf[Text], classOf[Text]).map(image => { ByteImage(image._2.copyBytes(), SeqFileFolder.readName(image._1)) }) } }
Example 52
Source File: IDFSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 53
Source File: ElementwiseProductSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 54
Source File: LinearOperatorSuite.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.scalatest.FunSuite import org.apache.spark.SparkException import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector.LinopMatrix import org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector.LinopMatrixAdjoint import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvectordouble.{ LinopMatrix => LinopMatrixVector } import org.apache.spark.mllib.optimization.tfocs.fs.dvectordouble.vector.{ LinopMatrixAdjoint => LinopMatrixVectorAdjoint } import org.apache.spark.mllib.util.MLlibTestSparkContext class LinearOperatorSuite extends FunSuite with MLlibTestSparkContext { lazy val matrix = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)), 2) lazy val vector = new DenseVector(Array(2.2, 3.3, 4.4)) test("LinopMatrix multiplies properly") { val f = new LinopMatrix(matrix) val x = new DenseVector(Array(7.0, 8.0, 9.0)) val result = f(x) val expectedResult = Vectors.dense(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9) assert(Vectors.dense(result.collectElements) == expectedResult, "should return the correct product") } test("LinopMatrixAdjoint multiplies properly") { val f = new LinopMatrixAdjoint(matrix) val y = sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2) val result = f(y) val expectedResult = Vectors.dense(1 * 5 + 4 * 6, 2 * 5 + 5 * 6, 3 * 5 + 6 * 6) assert(result == expectedResult, "should return the correct product") } test("LinopMatrixAdjoint checks for mismatched partition vectors") { val f = new LinopMatrixAdjoint(matrix) val y = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2) intercept[SparkException] { f(y) } } test("LinopMatrixVector multiplies properly") { val f = new LinopMatrixVector(matrix, vector) val x = new DenseVector(Array(7.0, 8.0, 9.0)) val result = f(x) val expectedResult = (new DenseVector(Array(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9)), 7.0 * 2.2 + 8.0 * 3.3 + 9.0 * 4.4) assert(Vectors.dense(result._1.collectElements) == expectedResult._1, "should return the correct product") assert(result._2 == expectedResult._2, "should return the correct product") } test("LinopMatrixVectorAdjoint multiplies properly") { var f = new LinopMatrixVectorAdjoint(matrix, vector) val y = (sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2), 8.8) val result = f(y) val expectedResult = Vectors.dense(1 * 5 + 4 * 6 + 2.2, 2 * 5 + 5 * 6 + 3.3, 3 * 5 + 6 * 6 + 4.4) assert(result == expectedResult, "should return the correct product") } test("LinopMatrixVectorAdjoint checks for mismatched partition vectors") { val f = new LinopMatrixVectorAdjoint(matrix, vector) val y = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2), 8.8) intercept[SparkException] { f(y) } } }
Example 55
Source File: RandomProjection.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors.linalg import java.util.Random import breeze.stats.distributions.CauchyDistribution import org.apache.spark.mllib.linalg.{ DenseMatrix, Matrices } import org.apache.spark.mllib.linalg.{ DenseVector, Vector } def generateGaussian(originalDim: Int, projectedDim: Int, random: Random): RandomProjection = { val localMatrix = DenseMatrix.randn(projectedDim, originalDim, random) new RandomProjection(localMatrix) } def generateCauchy(originalDim: Int, projectedDim: Int, random: Random): RandomProjection = { def randc(numRows: Int, numCols: Int): DenseMatrix = { require( numRows.toLong * numCols <= Int.MaxValue, s"$numRows x $numCols dense matrix is too large to allocate" ) val cauchyDistribution = new CauchyDistribution(0, 1) new DenseMatrix(numRows, numCols, cauchyDistribution.drawMany(numRows * numCols)) } val localMatrix = randc(projectedDim, originalDim) new RandomProjection(localMatrix) } }
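To illustrate what the generated projection matrix does, here is a sketch using only the public mllib API (DenseMatrix.randn plus a matrix-vector multiply); it is not the project's own projection call, just the underlying operation:

import java.util.Random
import org.apache.spark.mllib.linalg.{DenseMatrix, DenseVector}

val random = new Random(42L)
val originalDim = 8
val projectedDim = 3

// A (projectedDim x originalDim) matrix of standard normal entries.
val projection: DenseMatrix = DenseMatrix.randn(projectedDim, originalDim, random)
val point = new DenseVector(Array.fill(originalDim)(random.nextGaussian()))

// Multiplying by the matrix embeds the point into the lower-dimensional space.
val embedded: DenseVector = projection.multiply(point)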
Example 56
Source File: IDFSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 57
Source File: ElementwiseProductSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 58
Source File: Normalizer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 59
Source File: HashFunctionsTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed.impl import org.scalacheck.Gen.{choose, oneOf, listOfN} import org.scalacheck.Arbitrary.arbitrary import org.scalatest.Matchers import org.scalatest.prop.GeneratorDrivenPropertyChecks import org.apache.spark.mllib.linalg.DenseVector class HashFunctionsTest extends ImplPropSpec with GeneratorDrivenPropertyChecks with Matchers { import org.scalactic.Tolerance._ property( "simhash returns hashed vector whose dimension is at most the specified signature length") { forAll(simhashGen) { case (vector, signatureLength, simhash) => val bucket = simhash(0L, 0, vector) assert(bucket === simhash(0L, 0, vector.toSparse)) assert(bucket.signature.length <= signatureLength) } } property( "minhash returns hashed vector whose dimension is the specified signature length") { forAll(minhashGen) { case (vector, signatureLength, minhash) => val bucket = minhash(0L, 0, vector) assert(bucket === minhash(0L, 0, vector.toSparse)) assert(bucket.signature.length === signatureLength) } } property( "pstable returns hashed vector whose dimension is the specified signature length") { forAll(pstableGen) { case (vector, signatureLength, pstableL1, pstableL2) => val bucketL1 = pstableL1(0L, 0, vector) val bucketL2 = pstableL2(0L, 0, vector) assert(bucketL1 === pstableL1(0L, 0, vector.toSparse)) assert(bucketL2 === pstableL2(0L, 0, vector.toSparse)) assert(bucketL1.signature.length === signatureLength) assert(bucketL2.signature.length === signatureLength) } } property( "bit sampling returns hashed vector whose dimension is at most the specified signature length") { forAll(bsampleGen) { case (vector, signatureLength, bsample) => val bucket = bsample(0L, 0, vector) assert(bucket === bsample(0L, 0, vector.toSparse)) assert(bucket.signature.length <= signatureLength) } } }
Example 60
Source File: DistributedPropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed

import scala.reflect.ClassTag

import org.scalacheck.Gen
import org.scalacheck.Gen.{choose, listOfN}
import org.scalatest.PropSpec
import org.apache.spark.mllib.linalg.DenseVector
import com.holdenkarau.spark.testing.SharedSparkContext

abstract class DistributedPropSpec extends PropSpec with SharedSparkContext {

  private def arraysOfNM[T: ClassTag](numRows: Int, numCols: Int, gen: Gen[T]): Gen[Array[Array[T]]] =
    Gen.listOfN(numRows * numCols, gen).map { square =>
      square.toArray.grouped(numCols).toArray
    }

  private def vectorsOfNM(numRows: Int, numCols: Int, gen: Gen[Double]): Gen[Array[DenseVector]] =
    for {
      arrays <- arraysOfNM(numRows, numCols, gen)
    } yield arrays.map(arr => new DenseVector(arr))

  val coordinateMatrixGen = for {
    lrow <- choose(5, 10)
    lcol <- choose(5, 10)
    lvecs <- vectorsOfNM(lrow, lcol, choose(-10.0, 10.0))
    rrow <- choose(5, 10)
    rcol <- choose(5, 10)
    rvecs <- vectorsOfNM(rrow, rcol, choose(-10.0, 10.0))
  } yield (
    new IndexedRowMatrix(sc.parallelize(lvecs.zipWithIndex.map {
      case (vector, i) => new IndexedRow(i, vector)
    })).toCoordinateMatrix,
    new IndexedRowMatrix(sc.parallelize(rvecs.zipWithIndex.map {
      case (vector, i) => new IndexedRow(i, vector)
    })).toCoordinateMatrix
  )
}
Example 61
Source File: SRAlgorithm.scala From pio-template-sr with Apache License 2.0 | 5 votes |
package org.template.sr

import org.apache.predictionio.controller.P2LAlgorithm
import org.apache.predictionio.controller.Params
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import grizzled.slf4j.Logger
import org.apache.spark.mllib.linalg.{Vectors, DenseVector}
import org.apache.spark.ml.feature.StandardScalerModel
import org.apache.spark.ml.regression.{AFTSurvivalRegression, AFTSurvivalRegressionModel}

case class AlgorithmParams(
  val quantileProbabilities: Array[Double],
  val fitIntercept: Boolean,
  val maxIter: Int,
  val convTolerance: Double
) extends Params

class SRModel(
  val aAFTSRModel: AFTSurvivalRegressionModel,
  val ssModel: org.apache.spark.mllib.feature.StandardScalerModel,
  val useStandardScaler: Boolean
) extends Serializable {}

class SRAlgorithm(val ap: AlgorithmParams)
  extends P2LAlgorithm[PreparedData, SRModel, Query, PredictedResult] {

  @transient lazy val logger = Logger[this.type]

  def train(sc: SparkContext, data: PreparedData): SRModel = {
    println("Training SR model.")
    val aft = new AFTSurvivalRegression()
      .setQuantileProbabilities(ap.quantileProbabilities)
      .setQuantilesCol("quantiles")
      .setFitIntercept(ap.fitIntercept)
      .setMaxIter(ap.maxIter)
      .setTol(ap.convTolerance)
    val model = aft.fit(data.rows)

    new SRModel(aAFTSRModel = model, ssModel = data.ssModel, useStandardScaler = data.dsp.useStandardScaler)
  }

  def predict(model: SRModel, query: Query): PredictedResult = {
    val qryRow0 = Vectors.dense(query.features)
    val qryRow = if (model.useStandardScaler) {
      model.ssModel.transform(qryRow0)
    } else {
      qryRow0
    }
    val score = model.aAFTSRModel.predict(qryRow)
    val quantilesVec = model.aAFTSRModel.predictQuantiles(qryRow)

    PredictedResult(
      coefficients = model.aAFTSRModel.coefficients.toArray,
      intercept = model.aAFTSRModel.intercept,
      scale = model.aAFTSRModel.scale,
      prediction = score,
      quantiles = quantilesVec.toArray)
  }
}
Example 62
Source File: TestFFM.scala From spark-ffm with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification._
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.rdd.RDD

object TestFFM extends App {

  override def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TESTFFM").setMaster("local[4]"))

    if (args.length != 8) {
      println("testFFM <train_file> <k> <n_iters> <eta> <lambda> " +
        "<normal> <random>")
    }

    val data = sc.textFile(args(0)).map(_.split("\\s")).map(x => {
      val y = if (x(0).toInt > 0) 1.0 else -1.0
      val nodeArray: Array[(Int, Int, Double)] = x.drop(1).map(_.split(":")).map(x => {
        (x(0).toInt, x(1).toInt, x(2).toDouble)
      })
      (y, nodeArray)
    }).repartition(4)

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (training: RDD[(Double, Array[(Int, Int, Double)])], testing) = (splits(0), splits(1))

    // The max feature/field number can differ between the training and testing datasets,
    // so use the whole dataset to determine the max feature/field number.
    val m = data.flatMap(x => x._2).map(_._1).collect.reduceLeft(_ max _) //+ 1
    val n = data.flatMap(x => x._2).map(_._2).collect.reduceLeft(_ max _) //+ 1

    val ffm: FFMModel = FFMWithAdag.train(training, m, n,
      dim = (args(6).toBoolean, args(7).toBoolean, args(1).toInt),
      n_iters = args(2).toInt,
      eta = args(3).toDouble,
      regParam = (args(4).toDouble, args(5).toDouble),
      normalization = false, false, "adagrad")

    val scores: RDD[(Double, Double)] = testing.map(x => {
      val p = ffm.predict(x._2)
      val ret = if (p >= 0.5) 1.0 else -1.0
      (ret, x._1)
    })

    val metrics = new BinaryClassificationMetrics(scores)
    val auROC = metrics.areaUnderROC
    val auPRC = metrics.areaUnderPR
    val accuracy = scores.filter(x => x._1 == x._2).count().toDouble / scores.count()
    println(s"accuracy = $accuracy, Area under ROC = $auROC, Area under precision-recall curve = $auPRC")
  }
}
Example 63
Source File: Normalizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}

@Since("1.1.0")
class Normalizer @Since("1.1.0") (p: Double) extends VectorTransformer {

  @Since("1.1.0")
  def this() = this(2)

  require(p >= 1.0)

  @Since("1.1.0")
  override def transform(vector: Vector): Vector = {
    val norm = Vectors.norm(vector, p)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          }
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
      }
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.
      vector
    }
  }
}
Example 64
Source File: VectorSpaceSuite.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs

import org.scalatest.FunSuite

import org.apache.spark.mllib.linalg.{DenseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._
import org.apache.spark.mllib.optimization.tfocs.VectorSpace._
import org.apache.spark.mllib.optimization.tfocs.vs.dvector.DVectorSpace
import org.apache.spark.mllib.optimization.tfocs.vs.dvectordouble.DVectorDoubleSpace
import org.apache.spark.mllib.optimization.tfocs.vs.vector.DenseVectorSpace

class VectorSpaceSuite extends FunSuite with MLlibTestSparkContext {

  test("DenseVectorSpace.combine is implemented properly") {
    val alpha = 1.1
    val a = new DenseVector(Array(2.0, 3.0))
    val beta = 4.0
    val b = new DenseVector(Array(5.0, 6.0))
    val expectedCombination = Vectors.dense(1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0)
    assert(DenseVectorSpace.combine(alpha, a, beta, b) == expectedCombination,
      "DenseVectorSpace.combine should return the correct result.")
  }

  test("DenseVectorSpace.dot is implemented properly") {
    val a = new DenseVector(Array(2.0, 3.0))
    val b = new DenseVector(Array(5.0, 6.0))
    val expectedDot = 2.0 * 5.0 + 3.0 * 6.0
    assert(DenseVectorSpace.dot(a, b) == expectedDot,
      "DenseVectorSpace.dot should return the correct result.")
  }

  test("DVectorSpace.combine is implemented properly") {
    val alpha = 1.1
    val a = sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2)
    val beta = 4.0
    val b = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2)
    val combination = DVectorSpace.combine(alpha, a, beta, b)
    val expectedCombination = Vectors.dense(
      1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0, 1.1 * 4.0 + 4.0 * 7.0)
    assert(Vectors.dense(combination.collectElements) == expectedCombination,
      "DVectorSpace.combine should return the correct result.")
  }

  test("DVectorSpace.dot is implemented properly") {
    val a = sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2)
    val b = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2)
    val expectedDot = 2.0 * 5.0 + 3.0 * 6.0 + 4.0 * 7.0
    assert(DVectorSpace.dot(a, b) == expectedDot,
      "DVectorSpace.dot should return the correct result.")
  }

  test("DVectorDoubleSpace.combine is implemented properly") {
    val alpha = 1.1
    val a = (sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2),
      9.9)
    val beta = 4.0
    val b = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2),
      11.11)
    val combination = DVectorDoubleSpace.combine(alpha, a, beta, b)
    val expectedCombination = (Vectors.dense(
      1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0, 1.1 * 4.0 + 4.0 * 7.0),
      1.1 * 9.9 + 4.0 * 11.11)
    assert(Vectors.dense(combination._1.collectElements) == expectedCombination._1,
      "DVectorVectorSpace.combine should return the correct result.")
    assert(combination._2 == expectedCombination._2,
      "DVectorVectorSpace.combine should return the correct result.")
  }

  test("DVectorDoubleSpace.dot is implemented properly") {
    val a = (sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2),
      9.9)
    val b = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2),
      11.11)
    val expectedDot = 2.0 * 5.0 + 3.0 * 6.0 + 4.0 * 7.0 + 9.9 * 11.11
    assert(DVectorDoubleSpace.dot(a, b) == expectedDot,
      "DVectorVectorSpace.dot should return the correct result.")
  }
}
Example 65
Source File: TFOCS_SCD.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs

import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.optimization.tfocs.VectorSpace._
import org.apache.spark.mllib.optimization.tfocs.fs.generic.double._
import org.apache.spark.mllib.optimization.tfocs.fs.dvectordouble.double._
import org.apache.spark.mllib.optimization.tfocs.vs.vector._
import org.apache.spark.mllib.optimization.tfocs.vs.dvectordouble._

object TFOCS_SCD {

  def optimize(
      objectiveF: ProxCapableFunction[DVector],
      affineF: LinearOperator[(DVector, Double), DenseVector],
      dualProxF: ProxCapableFunction[DenseVector],
      mu: Double,
      x0: DVector,
      z0: DenseVector,
      numContinuations: Int,
      tol: Double,
      initialTol: Double,
      dualTolCheckInterval: Int)(
      implicit cols: VectorSpace[DVector]): (DVector, Array[Double]) = {

    var x0Iter = x0
    var z0Iter = z0
    var x = x0
    var xOld = x0
    var L = 1.0
    var hist = new Array[Double](0)

    // Find betaTol, the factor by which to decrease the convergence tolerance on each iteration.
    val betaTol = math.exp(math.log(initialTol / tol) / (numContinuations - 1))

    // Find the initial convergence tolerance.
    var iterTol = tol * math.pow(betaTol, numContinuations)

    var hasConverged = false
    for (nIter <- 1 to numContinuations if !hasConverged) {

      // Run the convex optimizer until the iterTol tolerance is reached.
      iterTol = iterTol / betaTol
      val smoothFunction = new SmoothCombine(new SmoothDual(objectiveF, 1 / mu, x0Iter))
      val (z, optimizationData) = TFOCS.optimize(smoothFunction, affineF.t, dualProxF, z0Iter,
        TFOCSMaxIterations, iterTol, L, true, dualTolCheckInterval)

      // Update the optimization loop parameters.
      x = optimizationData.dual.get._1
      cols.cache(x)
      hist ++= optimizationData.lossHistory
      L = optimizationData.L

      // Update the prox center, applying acceleration to x.
      x0Iter = cols.combine(1.0 + (nIter - 1.0) / (nIter + 2.0), x,
        (1.0 - nIter) / (nIter + 2.0), xOld)
      z0Iter = z

      // Check for convergence.
      val dx = cols.combine(1, x, -1, xOld)
      val n1 = math.sqrt(cols.dot(dx, dx))
      val n2 = math.sqrt(cols.dot(xOld, xOld))
      hasConverged = n1 / n2 <= tol

      xOld = x
    }

    (x, hist)
  }

  private val TFOCSMaxIterations = 2000
}
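The continuation loop above tightens the tolerance geometrically: betaTol is chosen so the first pass runs at roughly initialTol and the final pass at tol. A quick standalone check of that schedule (plain Scala, no Spark needed; the numbers are illustrative):

val tol = 1e-4
val initialTol = 1e-1
val numContinuations = 4

val betaTol = math.exp(math.log(initialTol / tol) / (numContinuations - 1))   // ~= 10.0
var iterTol = tol * math.pow(betaTol, numContinuations)                       // ~= 1.0
val schedule = (1 to numContinuations).map { _ => iterTol = iterTol / betaTol; iterTol }
// schedule ~= Vector(0.1, 0.01, 0.001, 1.0E-4): starts near initialTol, ends at tol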
Example 66
Source File: LinopMatrixAdjoint.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector

import org.apache.spark.mllib.linalg.BLAS
import org.apache.spark.mllib.linalg.{DenseVector, Vectors}
import org.apache.spark.mllib.optimization.tfocs.CheckedIteratorFunctions._
import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector.LinopMatrix
import org.apache.spark.mllib.optimization.tfocs.LinearOperator
import org.apache.spark.mllib.optimization.tfocs.VectorSpace._
import org.apache.spark.storage.StorageLevel

class LinopMatrixAdjoint(@transient private val matrix: DMatrix)
  extends LinearOperator[DVector, DenseVector] {

  if (matrix.getStorageLevel == StorageLevel.NONE) {
    matrix.cache()
  }

  private lazy val n = matrix.first().size

  override def apply(x: DVector): DenseVector = {
    val n = this.n
    matrix.zipPartitions(x)((matrixPartition, xPartition) =>
      Iterator.single(
        matrixPartition.checkedZip(xPartition.next.values.toIterator).aggregate(
          // NOTE A DenseVector result is assumed here (not sparse safe).
          Vectors.zeros(n).toDense)(
          seqop = (_, _) match {
            case (sum, (matrix_i, x_i)) => {
              // Multiply an element of x by its corresponding matrix row, and add to the
              // accumulation sum vector.
              BLAS.axpy(x_i, matrix_i, sum)
              sum
            }
          },
          combop = (sum1, sum2) => {
            // Add the intermediate sum vectors.
            BLAS.axpy(1.0, sum2, sum1)
            sum1
          }
        ))
    ).treeAggregate(Vectors.zeros(n).toDense)(
      seqOp = (sum1, sum2) => {
        // Add the intermediate sum vectors.
        BLAS.axpy(1.0, sum2, sum1)
        sum1
      },
      combOp = (sum1, sum2) => {
        // Add the intermediate sum vectors.
        BLAS.axpy(1.0, sum2, sum1)
        sum1
      }
    )
  }

  override def t: LinearOperator[DenseVector, DVector] = new LinopMatrix(matrix)
}
Example 67
Source File: LinopMatrix.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector

import org.apache.spark.mllib.linalg.{BLAS, DenseVector}
import org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector.LinopMatrixAdjoint
import org.apache.spark.mllib.optimization.tfocs.LinearOperator
import org.apache.spark.mllib.optimization.tfocs.VectorSpace._
import org.apache.spark.storage.StorageLevel

class LinopMatrix(private val matrix: DMatrix) extends LinearOperator[DenseVector, DVector] {

  if (matrix.getStorageLevel == StorageLevel.NONE) {
    matrix.cache()
  }

  override def apply(x: DenseVector): DVector = {
    val bcX = matrix.context.broadcast(x)
    // Take the dot product of each matrix row with x.
    // NOTE A DenseVector result is assumed here (not sparse safe).
    matrix.mapPartitions(partitionRows =>
      Iterator.single(new DenseVector(partitionRows.map(row => BLAS.dot(row, bcX.value)).toArray)))
  }

  override def t: LinearOperator[DVector, DenseVector] = new LinopMatrixAdjoint(matrix)
}
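Examples 66 and 67 form an operator/adjoint pair: LinopMatrix computes A x as one dot product per row, while LinopMatrixAdjoint accumulates an x-weighted sum of rows to obtain A^T x. A tiny local sketch of that relationship, with plain DenseVectors standing in for the distributed DMatrix/DVector aliases, checks the defining identity <A x, y> = <x, A^T y>:

import org.apache.spark.mllib.linalg.DenseVector

// A 2x3 matrix stored row by row, mirroring the row-distributed DMatrix layout.
val rows = Seq(new DenseVector(Array(1.0, 2.0, 3.0)), new DenseVector(Array(4.0, 5.0, 6.0)))

// Forward operator: y = A x, one dot product per row (what LinopMatrix does per partition).
def applyA(x: DenseVector): DenseVector =
  new DenseVector(rows.map(r => r.values.zip(x.values).map { case (a, b) => a * b }.sum).toArray)

// Adjoint: z = A^T y, a y-weighted sum of rows (what LinopMatrixAdjoint accumulates via axpy).
def applyAt(y: DenseVector): DenseVector =
  new DenseVector(rows.zip(y.values).map { case (r, w) => r.values.map(_ * w) }
    .reduce((a, b) => a.zip(b).map { case (p, q) => p + q }))

val x = new DenseVector(Array(1.0, 0.5, -1.0))
val y = new DenseVector(Array(2.0, -1.0))
val lhs = applyA(x).values.zip(y.values).map { case (a, b) => a * b }.sum
val rhs = x.values.zip(applyAt(y).values).map { case (a, b) => a * b }.sum
assert(math.abs(lhs - rhs) < 1e-9)   // <A x, y> == <x, A^T y>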
Example 68
Source File: ProxL1.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.vector.double

import org.apache.spark.mllib.linalg.{DenseVector, Vectors}
import org.apache.spark.mllib.optimization.tfocs.{ProxCapableFunction, ProxMode, ProxValue}

class ProxL1(q: Double) extends ProxCapableFunction[DenseVector] {

  require(q > 0)

  override def apply(z: DenseVector, t: Double, mode: ProxMode): ProxValue[DenseVector] = {
    // NOTE DenseVectors are assumed here (not sparse safe).
    val shrinkage = q * t
    val minimizer = shrinkage match {
      case 0.0 => z
      case _ => new DenseVector(z.values.map(z_i =>
        z_i * (1.0 - math.min(shrinkage / math.abs(z_i), 1.0))))
    }
    val f = if (mode.f) Some(apply(minimizer)) else None
    ProxValue(f, Some(minimizer))
  }

  override def apply(x: DenseVector): Double = q * Vectors.norm(x, 1)
}
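The minimizer computed above is the usual soft-thresholding (shrinkage) operator for t * q * ||.||_1, i.e. prox(z)_i = sign(z_i) * max(|z_i| - t*q, 0). A small standalone check that the expression used in the code is equivalent (plain Scala, no Spark needed):

import org.apache.spark.mllib.linalg.DenseVector

// Soft-thresholding in its textbook form.
def softThreshold(z: DenseVector, shrinkage: Double): DenseVector =
  new DenseVector(z.values.map(zi => math.signum(zi) * math.max(math.abs(zi) - shrinkage, 0.0)))

// Equivalent to the ProxL1 form z_i * (1 - min(shrinkage / |z_i|, 1)).
softThreshold(new DenseVector(Array(3.0, -0.2, 0.5)), 0.4)   // [2.6, 0.0, 0.1]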
Example 69
Source File: ProjBox.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.vector.double

import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.optimization.tfocs.{ProxCapableFunction, ProxMode, ProxValue}

class ProjBox(l: DenseVector, u: DenseVector) extends ProxCapableFunction[DenseVector] {

  override def apply(z: DenseVector, t: Double, mode: ProxMode): ProxValue[DenseVector] = {
    val minimizer = if (mode.minimizer) {
      // NOTE DenseVectors are assumed here (not sparse safe).
      val ret = new Array[Double](z.size)
      var i = 0
      while (i < ret.size) {
        // Bound each element using the lower and upper limit for that element.
        ret(i) = math.min(u(i), math.max(l(i), z(i)))
        i += 1
      }
      Some(new DenseVector(ret))
    } else {
      None
    }
    ProxValue(Some(0.0), minimizer)
  }

  override def apply(x: DenseVector): Double = {
    // NOTE DenseVectors are assumed here (not sparse safe).
    var ret = 0.0
    var i = 0
    while (i < x.size) {
      // If an element is outside of that element's bounds, return infinity.
      if (x(i) > u(i) || x(i) < l(i)) {
        ret = Double.PositiveInfinity
      }
      i += 1
    }
    ret
  }
}
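ProjBox is the indicator function of the box [l, u], so its proximity operator is just coordinate-wise clipping, independent of the step size t. A minimal standalone equivalent of the clipping loop above:

import org.apache.spark.mllib.linalg.DenseVector

// Clip each coordinate of z into [l(i), u(i)].
def clipToBox(z: DenseVector, l: DenseVector, u: DenseVector): DenseVector =
  new DenseVector(z.values.indices.map(i => math.min(u(i), math.max(l(i), z(i)))).toArray)

val l = new DenseVector(Array(0.0, 0.0))
val u = new DenseVector(Array(1.0, 1.0))
clipToBox(new DenseVector(Array(-0.5, 2.0)), l, u)   // [0.0, 1.0]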
Example 70
Source File: LinopMatrixAdjoint.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.dvectordouble.vector

import org.apache.spark.mllib.linalg.{BLAS, DenseVector}
import org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector.{LinopMatrixAdjoint => Delegate}
import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvectordouble.LinopMatrix
import org.apache.spark.mllib.optimization.tfocs.LinearOperator
import org.apache.spark.mllib.optimization.tfocs.VectorSpace._

class LinopMatrixAdjoint(private val A: DMatrix, private val b: DenseVector)
  extends LinearOperator[(DVector, Double), DenseVector] {

  private val delegate = new Delegate(A)

  override def apply(x: (DVector, Double)): DenseVector = {
    val ret = delegate.apply(x._1)
    BLAS.axpy(1.0, b, ret)
    ret
  }

  override def t: LinearOperator[DenseVector, (DVector, Double)] = new LinopMatrix(A, b)
}