org.apache.spark.mllib.linalg.SparseVector Scala Examples
The following examples show how to use org.apache.spark.mllib.linalg.SparseVector.
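Before the examples, here is a minimal sketch of how a SparseVector is usually constructed and inspected. The variable names are illustrative only and are not taken from any project below.

import org.apache.spark.mllib.linalg.{SparseVector, Vectors}

// A vector of length 6 with non-zero entries at positions 0, 2 and 4.
val sv = Vectors.sparse(6, Array(0, 2, 4), Array(1.0, 2.0, 3.0)).asInstanceOf[SparseVector]

// The same vector built from (index, value) pairs, or via the public constructor.
val sv2 = Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))
val sv3 = new SparseVector(6, Array(0, 2, 4), Array(1.0, 2.0, 3.0))

sv.size      // 6
sv.indices   // Array(0, 2, 4)
sv.values    // Array(1.0, 2.0, 3.0)
sv(1)        // 0.0 -- entries that are not stored read as zero
sv.toDense   // DenseVector [1.0, 0.0, 2.0, 0.0, 3.0, 0.0]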
Example 1
Source File: IDFSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 2
Source File: IDFSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = { dataSet.map { case data: DenseVector => val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y } Vectors.dense(res) case data: SparseVector => val res = data.indices.zip(data.values).map { case (id, value) => (id, value * model(id)) } Vectors.sparse(data.size, res) } } test("params") { ParamsSuite.checkParams(new IDF) val model = new IDFModel("idf", new OldIDFModel(Vectors.dense(1.0))) ParamsSuite.checkParams(model) } test("compute IDF with default parameter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((numOfData + 1.0) / (x + 1.0)) }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("compute IDF with setter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0 }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .setMinDocFreq(1) .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } }
Example 3
Source File: NormalizerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext { @transient var data: Array[Vector] = _ @transient var dataFrame: DataFrame = _ @transient var normalizer: Normalizer = _ @transient var l1Normalized: Array[Vector] = _ @transient var l2Normalized: Array[Vector] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))), Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))), Vectors.sparse(3, Seq()) ) l1Normalized = Array( Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.12765957, -0.23404255, -0.63829787), Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))), Vectors.dense(0.625, 0.07894737, 0.29605263), Vectors.sparse(3, Seq()) ) l2Normalized = Array( Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.184549876, -0.3383414, -0.922749378), Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))), Vectors.dense(0.897906166, 0.113419726, 0.42532397), Vectors.sparse(3, Seq()) ) val sqlContext = new SQLContext(sc) dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(NormalizerSuite.FeatureData)) normalizer = new Normalizer() .setInputCol("features") .setOutputCol("normalized_features") } def collectResult(result: DataFrame): Array[Vector] = { result.select("normalized_features").collect().map { case Row(features: Vector) => features } } def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = { assert((lhs, rhs).zipped.forall { case (v1: DenseVector, v2: DenseVector) => true case (v1: SparseVector, v2: SparseVector) => true case _ => false }, "The vector type should be preserved after normalization.") } def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = { assert((lhs, rhs).zipped.forall { (vector1, vector2) => vector1 ~== vector2 absTol 1E-5 }, "The vector value is not correct after normalization.") } test("Normalization with default parameter") { val result = collectResult(normalizer.transform(dataFrame)) assertTypeOfVector(data, result) assertValues(result, l2Normalized) } test("Normalization with setter") { normalizer.setP(1) val result = collectResult(normalizer.transform(dataFrame)) assertTypeOfVector(data, result) assertValues(result, l1Normalized) } } private object NormalizerSuite { case class FeatureData(features: Vector) }
Example 4
Source File: ElementwiseProductSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 5
Source File: IDFSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors, Vector} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 6
Source File: Normalizer.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}

  @Since("1.1.0")
  override def transform(vector: Vector): Vector = {
    val norm = Vectors.norm(vector, p)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.size
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.size
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          }
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
      }
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.
      vector
    }
  }
}
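A short usage sketch for the transform method above, with values chosen so the L2 norm is exactly 5.0; the variable names are illustrative and not part of the original file.

import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.linalg.Vectors

val normalizer = new Normalizer()                        // p = 2 (L2 norm) by default
val v = Vectors.sparse(4, Array(0, 3), Array(3.0, 4.0))
normalizer.transform(v)                                  // sparse result: (4,[0,3],[0.6,0.8])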
Example 7
Source File: ChiSqSelector.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature

import scala.collection.mutable.ArrayBuilder

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

  @Since("1.3.0")
  def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
    val indices = Statistics.chiSqTest(data)
      .zipWithIndex.sortBy { case (res, _) => -res.statistic }
      .take(numTopFeatures)
      .map { case (_, indices) => indices }
      .sorted
    new ChiSqSelectorModel(indices)
  }
}
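A hedged usage sketch for the fit method above. The labeled points and the existing SparkContext sc are assumed for illustration and are not part of the original file.

import org.apache.spark.mllib.feature.ChiSqSelector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// Keep the 2 features with the highest chi-squared statistic.
val selector = new ChiSqSelector(2)
val labeled = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.sparse(4, Array(0, 3), Array(8.0, 7.0))),
  LabeledPoint(1.0, Vectors.sparse(4, Array(1, 2), Array(4.0, 5.0)))
))
val model = selector.fit(labeled)                                        // the method shown above
val reduced = model.transform(Vectors.sparse(4, Array(0, 3), Array(8.0, 7.0)))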
Example 8
Source File: PolynomialExpansionSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.ml.param.ParamsSuite
import org.scalatest.exceptions.TestFailedException

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") { // parameters
    ParamsSuite.checkParams(new PolynomialExpansion)
  }

  test("Polynomial expansion with default parameter") { // polynomial expansion with the default parameter
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())
    )

    val twoDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)),
      Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29),
      Vectors.dense(new Array[Double](9)),
      Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0),
      Vectors.sparse(9, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")

    // transform() turns one DataFrame into another DataFrame
    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
    }
  }

  // polynomial expansion with a setter
  test("Polynomial expansion with setter") {
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())
    )

    val threeDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8),
        Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)),
      Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17),
      Vectors.dense(new Array[Double](19)),
      Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8,
        -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0),
      Vectors.sparse(19, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)

    // transform() turns one DataFrame into another DataFrame
    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
    }
  }
}
Example 9
Source File: IDFSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }

  // compute IDF with a setter
  test("compute IDF with setter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    )
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0
    })
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")

    val idfModel = new IDF()
      .setInputCol("features")
      .setOutputCol("idfValue")
      .setMinDocFreq(1)
      .fit(df) // fit() turns a DataFrame into a Transformer

    // transform() turns one DataFrame into another DataFrame
    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }
}
Example 10
Source File: NormalizerSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext {

  @transient var data: Array[Vector] = _
  @transient var dataFrame: DataFrame = _
  @transient var normalizer: Normalizer = _
  @transient var l1Normalized: Array[Vector] = _
  @transient var l2Normalized: Array[Vector] = _

  override def beforeAll(): Unit = {
    super.beforeAll()

    data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))),
      Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))),
      Vectors.sparse(3, Seq())
    )
    l1Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.12765957, -0.23404255, -0.63829787),
      Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))),
      Vectors.dense(0.625, 0.07894737, 0.29605263),
      Vectors.sparse(3, Seq())
    )
    l2Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.184549876, -0.3383414, -0.922749378),
      Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))),
      Vectors.dense(0.897906166, 0.113419726, 0.42532397),
      Vectors.sparse(3, Seq())
    )

    val sqlContext = new SQLContext(sc)
    dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(NormalizerSuite.FeatureData))
    normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized_features")
  }

  // collect the results
  def collectResult(result: DataFrame): Array[Vector] = {
    result.select("normalized_features").collect().map {
      case Row(features: Vector) => features
    }
  }

  // assert the vector type
  def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
    assert((lhs, rhs).zipped.forall {
      case (v1: DenseVector, v2: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after normalization.")
  }

  // assert the values
  def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
    assert((lhs, rhs).zipped.forall { (vector1, vector2) =>
      vector1 ~== vector2 absTol 1E-5
    }, "The vector value is not correct after normalization.")
  }

  test("Normalization with default parameter") { // normalization with the default parameter
    // transform() turns one DataFrame into another DataFrame
    normalizer.transform(dataFrame).show()
    val result = collectResult(normalizer.transform(dataFrame))

    assertTypeOfVector(data, result)

    assertValues(result, l2Normalized)
  }

  test("Normalization with setter") { // normalization with a setter
    normalizer.setP(1)
    // transform() turns one DataFrame into another DataFrame
    normalizer.transform(dataFrame).show()
    val result = collectResult(normalizer.transform(dataFrame))

    assertTypeOfVector(data, result)

    assertValues(result, l1Normalized)
  }
}

private object NormalizerSuite {
  case class FeatureData(features: Vector)
}
Example 11
Source File: ElementwiseProductSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext {

  // the product should correctly apply the scaling vector to a dense data set
  test("elementwise (hadamard) product should properly apply vector to dense data set") {
    val denseData = Array(
      Vectors.dense(1.0, 4.0, 1.9, -9.0)
    )
    val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25)
    val transformer = new ElementwiseProduct(scalingVec)
    // transforming the whole RDD and transforming each vector give the same result
    // transform() applies the scaling vector to every vector in the RDD
    val transformedData = transformer.transform(sc.makeRDD(denseData))
    val transformedVecs = transformedData.collect()
    val transformedVec = transformedVecs(0)
    val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25)
    assert(transformedVec ~== expectedVec absTol 1E-5,
      s"Expected transformed vector $expectedVec but found $transformedVec")
  }

  // the elementwise (Hadamard) product should correctly apply the scaling vector to a sparse data set
  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
    val sparseData = Array(
      Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
    )
    val dataRDD = sc.parallelize(sparseData, 3)
    val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
    val transformer = new ElementwiseProduct(scalingVec)
    val data2 = sparseData.map(transformer.transform)
    // transform() applies the scaling vector to every vector in the RDD
    val data2RDD = transformer.transform(dataRDD)

    assert((sparseData, data2, data2RDD.collect()).zipped.forall {
      case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after hadamard product")

    assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
    assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
  }
}
Example 12
Source File: Normalizer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 13
Source File: ElementwiseProductSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 14
Source File: PolynomialExpansionSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.param.ParamsSuite import org.scalatest.exceptions.TestFailedException import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new PolynomialExpansion) } test("Polynomial expansion with default parameter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val twoDegreeExpansion: Array[Vector] = Array( Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)), Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29), Vectors.dense(new Array[Double](9)), Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0), Vectors.sparse(9, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } test("Polynomial expansion with setter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val threeDegreeExpansion: Array[Vector] = Array( Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8), Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)), Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17), Vectors.dense(new Array[Double](19)), Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8, -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0), Vectors.sparse(19, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } }
Example 15
Source File: Normalizer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.size var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.size var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 16
Source File: PolynomialExpansionSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.param.ParamsSuite import org.scalatest.exceptions.TestFailedException import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new PolynomialExpansion) } test("Polynomial expansion with default parameter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val twoDegreeExpansion: Array[Vector] = Array( Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)), Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29), Vectors.dense(new Array[Double](9)), Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0), Vectors.sparse(9, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } test("Polynomial expansion with setter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val threeDegreeExpansion: Array[Vector] = Array( Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8), Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)), Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17), Vectors.dense(new Array[Double](19)), Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8, -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0), Vectors.sparse(19, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } test("read/write") { val t = new PolynomialExpansion() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setDegree(3) testDefaultReadWrite(t) } }
Example 17
Source File: IDFSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class IDFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = { dataSet.map { case data: DenseVector => val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y } Vectors.dense(res) case data: SparseVector => val res = data.indices.zip(data.values).map { case (id, value) => (id, value * model(id)) } Vectors.sparse(data.size, res) } } test("params") { ParamsSuite.checkParams(new IDF) val model = new IDFModel("idf", new OldIDFModel(Vectors.dense(1.0))) ParamsSuite.checkParams(model) } test("compute IDF with default parameter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((numOfData + 1.0) / (x + 1.0)) }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("compute IDF with setter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0 }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .setMinDocFreq(1) .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("IDF read/write") { val t = new IDF() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMinDocFreq(5) testDefaultReadWrite(t) } test("IDFModel read/write") { val instance = new IDFModel("myIDFModel", new OldIDFModel(Vectors.dense(1.0, 2.0))) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) assert(newInstance.idf === instance.idf) } }
Example 18
Source File: LibSVMRelationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import com.google.common.base.Charsets import com.google.common.io.Files import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { var tempDir: File = _ var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin tempDir = Utils.createTempDir() val file = new File(tempDir, "part-00000") Files.write(lines, file, Charsets.US_ASCII) path = tempDir.toURI.toString } override def afterAll(): Unit = { Utils.deleteRecursively(tempDir) super.afterAll() } test("select as sparse vector") { val df = sqlContext.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = sqlContext.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = sqlContext.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } }
Example 19
Source File: ElementwiseProductSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 20
Source File: IDFSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors, Vector} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 21
package com.lendap.spark.lsh

import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.rdd.RDD

class LSH(data: RDD[(Long, SparseVector)] = null, m: Int = 0, numHashFunc: Int = 4, numHashTables: Int = 4) extends Serializable {

  def run(): LSHModel = {

    // create a new model object
    val model = new LSHModel(m, numHashFunc, numHashTables)

    val dataRDD = data.cache()

    // compute hash keys for each vector
    // - hash each vector numHashFunc times
    // - concat each hash value to create a hash key
    // - position hashTable id hash keys and vector id into a new RDD.
    // - creates RDD of ((hashTable#, hash_key), vec_id) tuples.
    model.hashTables = dataRDD
      .map(v => (model.hashFunctions.map(h => (h._1.hash(v._2), h._2 % numHashTables)), v._1))
      .map(x => x._1.map(a => ((a._2, x._2), a._1)))
      .flatMap(a => a).groupByKey()
      .map(x => ((x._1._1, x._2.mkString("")), x._1._2)).cache()

    model
  }

  def cosine(a: SparseVector, b: SparseVector): Double = {
    val intersection = a.indices.intersect(b.indices)
    val magnitudeA = intersection.map(x => Math.pow(a.apply(x), 2)).sum
    val magnitudeB = intersection.map(x => Math.pow(b.apply(x), 2)).sum
    intersection.map(x => a.apply(x) * b.apply(x)).sum / (Math.sqrt(magnitudeA) * Math.sqrt(magnitudeB))
  }
}
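Note that the cosine helper above computes both magnitudes only over the indices the two vectors share. For comparison, a sketch of the more common variant that uses each vector's full magnitude; this is illustrative and not part of the original project.

import org.apache.spark.mllib.linalg.SparseVector

def fullCosine(a: SparseVector, b: SparseVector): Double = {
  val shared = a.indices.intersect(b.indices)
  val dot = shared.map(i => a(i) * b(i)).sum
  val normA = math.sqrt(a.values.map(v => v * v).sum)  // over all of a's non-zeros
  val normB = math.sqrt(b.values.map(v => v * v).sum)  // over all of b's non-zeros
  dot / (normA * normB)
}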
Example 22
Source File: SparseFeaturization.scala From modelmatrix with Apache License 2.0 | 5 votes |
package com.collective.modelmatrix.cli.featurize import com.collective.modelmatrix.ModelMatrix.ModelMatrixCatalogAccess import com.collective.modelmatrix.cli.{Source, _} import com.collective.modelmatrix.transform.Transformer import com.collective.modelmatrix.{Featurization, Labeling, ModelMatrix} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector} import org.apache.spark.sql.Row import org.apache.spark.sql.types._ import org.slf4j.LoggerFactory import scalaz._ case class SparseFeaturization( modelInstanceId: Int, source: Source, sink: Sink, idColumn: String, repartitionSource: Option[Int], cacheSource: Boolean ) extends Script with SourceTransformation with ModelMatrixCatalogAccess with CliSparkContext { private val log = LoggerFactory.getLogger(classOf[ValidateInputData]) private def sparseSchema(idType: DataType) = StructType(Seq( StructField(idColumn, idType), StructField("column_id", IntegerType), StructField("value", DoubleType) )) import com.collective.modelmatrix.cli.ASCIITableFormat._ import com.collective.modelmatrix.cli.ASCIITableFormats._ def run(): Unit = { log.info(s"Run sparse featurization using Model Matrix instance: $modelInstanceId. " + s"Input source: $source. " + s"Featurized sink: $sink. " + s"Id column: $idColumn") implicit val sqlContext = ModelMatrix.hiveContext(sc) val features = blockOn(db.run(modelInstanceFeatures.features(modelInstanceId))) require(features.nonEmpty, s"No features are defined for model instance: $modelInstanceId. " + s"Ensure that this model instance exists") val featurization = new Featurization(features) val df = toDataFrame(source) val idLabeling = Labeling(idColumn, identity[Any]) val idDataType = df.schema.fields .find(_.name == idColumn) .map(_.dataType) .getOrElse(sys.error(s"Can't find id column: $idColumn")) Transformer.extractFeatures(df, features.map(_.feature), idLabeling) match { // Feature extraction failed case -\/(extractionErrors) => Console.out.println(s"Feature extraction failed:") extractionErrors.printASCIITable() // Extracted feature type validation failed case \/-(extracted) if featurization.validateLabeled(extracted).exists(_.isLeft) => val errors = featurization.validateLabeled(extracted).collect { case -\/(err) => err } Console.out.println(s"Input schema errors:") errors.printASCIITable() // Looks good, let's do featurization case \/-(extracted) => val featurized = featurization.featurize(extracted, idLabeling) // Switch from 0-based Vector index to 1-based ColumnId val rows = featurized.flatMap { case (id, sparse: SparseVector) => (sparse.values zip sparse.indices).map { case (value, idx) => Row(id, idx + 1, value) } case (id, dense: DenseVector) => dense.values.zipWithIndex.map { case (value, idx) => Row(id, idx + 1, value) } } // Apply schema and save sink.saveDataFrame(sqlContext.createDataFrame(rows, sparseSchema(idDataType))) Console.out.println(s"Featurized data set was successfully saved to: $sink") } } }
Example 23
Source File: SignRandomProjectionLSH.scala From lexrank-summarizer with MIT License | 5 votes |
package io.github.karlhigley.lexrank

import scala.collection.immutable.BitSet
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
import scala.util.hashing.MurmurHash3

import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.Logging

class SignRandomProjectionLSH(poolSize: Int = 10000) extends Serializable with Logging {
  val pool = SignRandomProjectionLSH.generatePool(poolSize)

  def computeSignature(vector: SparseVector, length: Int): BitSet = {
    val buf = ArrayBuffer.empty[Int]

    val elements = vector.indices.zip(vector.values)
    for (bit <- 1 to length) {
      val components = elements.map(e => {
        val hash = MurmurHash3.productHash((bit, e._1))
        val poolIndex = ((hash % poolSize) + poolSize) % poolSize
        val result = e._2 * pool(poolIndex)
        result
      })

      val dotProduct = components.reduce(_ + _)
      if (dotProduct > 0) {
        buf += bit
      }
    }

    BitSet(buf.toArray: _*)
  }
}

object SignRandomProjectionLSH {
  def signatureSet(length: Int): Set[BitSet] = {
    BitSet(1 to length: _*).subsets.toSet
  }

  def estimateCosine(a: BitSet, b: BitSet, length: Int): Double = {
    val hammingDistance = (a ^ b).size
    math.cos(hammingDistance.toDouble / length.toDouble * math.Pi)
  }

  private def generatePool(size: Int): Array[Double] = {
    val rand = new Random()
    val buf = ArrayBuffer.fill[Double](size)(rand.nextGaussian)
    buf.toArray
  }
}
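A small usage sketch of the class above, estimating the cosine similarity of two SparseVectors from their bit signatures; the vectors and signature length are made up for illustration.

import org.apache.spark.mllib.linalg.SparseVector

val lsh = new SignRandomProjectionLSH()
val a = new SparseVector(5, Array(0, 2, 4), Array(1.0, 2.0, 3.0))
val b = new SparseVector(5, Array(0, 2, 4), Array(1.0, 2.1, 2.9))

val sigA = lsh.computeSignature(a, 16)
val sigB = lsh.computeSignature(b, 16)
val approxCosine = SignRandomProjectionLSH.estimateCosine(sigA, sigB, 16)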
Example 24
Source File: DocumentSegmenter.scala From lexrank-summarizer with MIT License | 5 votes |
package io.github.karlhigley.lexrank

import org.apache.spark.rdd.RDD

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.SparseVector

import chalk.text.analyze.PorterStemmer
import chalk.text.segment.JavaSentenceSegmenter
import chalk.text.tokenize.SimpleEnglishTokenizer

case class Document(id: String, text: String)
case class Sentence(id: Long, docId: String, text: String)
case class SentenceTokens(id: Long, docId: String, tokens: Seq[String])

class DocumentSegmenter extends Serializable {
  def apply(documents: RDD[Document]) = {
    val sentences = extractSentences(documents)
    val tokenized = tokenize(sentences)
    (sentences, tokenized)
  }

  private def extractSentences(documents: RDD[Document]): RDD[Sentence] = {
    documents
      .flatMap(d => segment(d.text).map(t => (d.id, t)))
      .zipWithIndex()
      .map({
        case ((docId, sentenceText), sentenceId) => Sentence(sentenceId, docId, sentenceText)
      })
  }

  private def tokenize(sentences: RDD[Sentence]): RDD[SentenceTokens] = {
    val tokenizer = SimpleEnglishTokenizer()
    val nonWord = "[^a-z]*".r

    sentences.map(s => {
      val tokens = tokenizer(s.text.toLowerCase).toSeq
        .map(nonWord.replaceAllIn(_, ""))
        .filter(_.length > 3)
        .map(stem)

      SentenceTokens(s.id, s.docId, tokens)
    })
  }

  private def segment(text: String): Seq[String] = {
    JavaSentenceSegmenter(text).toSeq
  }

  private def stem(token: String): String = {
    PorterStemmer(token)
  }
}
Example 25
Source File: Featurizer.scala From lexrank-summarizer with MIT License | 5 votes |
package io.github.karlhigley.lexrank

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.{SparseVector, Vector}

case class SentenceFeatures(id: Long, docId: String, features: SparseVector)

class Featurizer(numStopwords: Int = 0) extends Serializable {
  private val hashingTF = new HashingTF()
  private val byIDF = Ordering[Double].on[(Int, Double)](_._2)

  def apply(tokens: RDD[SentenceTokens]): RDD[SentenceFeatures] = {
    val idf = new IDF(minDocFreq = 2)

    val termFrequencies = tokens.map(t => {
      (t.id, t.docId, hashingTF.transform(t.tokens))
    })

    val idfModel = idf.fit(termFrequencies.map({ case (_, _, tf) => tf }))

    val stopwordIndices = identifyStopwords(idfModel.idf.toSparse, numStopwords)

    termFrequencies
      .map({ case (id, docId, tf) =>
        val tfidf = idfModel.transform(tf).toSparse
        val features = removeStopwords(tfidf, stopwordIndices)
        SentenceFeatures(id, docId, features)
      })
      .filter(_.features.indices.size > 0)
  }

  def indexOf(token: String): Int = {
    hashingTF.indexOf(token)
  }

  private def identifyStopwords(idf: SparseVector, numStopwords: Int) = {
    featureTuples(idf).sorted(byIDF).take(numStopwords).map(_._1)
  }

  private def removeStopwords(tf: SparseVector, stopwordIndices: Array[Int]) = {
    val (indices, values) = featureTuples(tf)
      .filter(p => !stopwordIndices.contains(p._1))
      .unzip
    new SparseVector(tf.size, indices.toArray, values.toArray)
  }

  private def featureTuples(featureVector: SparseVector) = {
    featureVector.indices.zip(featureVector.values)
  }
}
Example 26
Source File: SignRandomProjectionLSHSuite.scala From lexrank-summarizer with MIT License | 5 votes |
package io.github.karlhigley.lexrank

import org.scalatest.FunSuite

import org.apache.spark.mllib.linalg.SparseVector

class SignRandomProjectionLSHSuite extends FunSuite with TestSparkContext {
  val lshModel = new SignRandomProjectionLSH

  test("signatures are deterministic") {
    val nonzero = new SparseVector(3, Array(1, 2, 3), Array(1, 1, 1))

    val signatureA = lshModel.computeSignature(nonzero, 2)
    val signatureB = lshModel.computeSignature(nonzero, 2)

    assert(signatureA === signatureB)
  }

  test("zero vectors get signatures") {
    val zero = new SparseVector(3, Array(1, 2, 3), Array(0, 0, 0))
    val signature0 = lshModel.computeSignature(zero, 2)
  }
}
Example 27
Source File: CollisionStrategySuite.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors

import org.scalatest.FunSuite

import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.SparseVector

class CollisionStrategySuite extends FunSuite with TestSparkContext {
  val numPoints = 1000
  val dimensions = 100
  val density = 0.5

  var points: RDD[(Long, SparseVector)] = _

  override def beforeAll() {
    super.beforeAll()
    val localPoints = TestHelpers.generateRandomPoints(numPoints, dimensions, density)
    points = sc.parallelize(localPoints).zipWithIndex.map(_.swap)
  }

  test("SimpleCollisionStrategy produces the correct number of tuples") {
    val ann = new ANN(dimensions, "cosine")
      .setTables(1)
      .setSignatureLength(8)

    val model = ann.train(points)

    val hashTables = model.hashTables
    val collidable = model.collisionStrategy(hashTables)

    assert(collidable.count() == numPoints)
  }

  test("BandingCollisionStrategy produces the correct number of tuples") {
    val numBands = 4

    val ann = new ANN(dimensions, "jaccard")
      .setTables(1)
      .setSignatureLength(8)
      .setBands(numBands)
      .setPrimeModulus(739)

    val model = ann.train(points)

    val hashTables = model.hashTables
    val collidable = model.collisionStrategy(hashTables)

    assert(collidable.count() == numPoints * numBands)
  }
}
Example 28
Source File: ElementwiseProductSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
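The suite above checks that ElementwiseProduct preserves vector types. A quick local sketch of the same transformer on a single sparse vector (the values are made up):

import org.apache.spark.mllib.feature.ElementwiseProduct
import org.apache.spark.mllib.linalg.Vectors

// Scale each coordinate by the matching entry of the scaling vector
val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
val transformer = new ElementwiseProduct(scalingVec)

// Sparse input stays sparse: only the stored indices are rescaled
val sparse = Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
println(transformer.transform(sparse))
// values become approximately 0.0 and -1.5 at indices 1 and 2, matching the assertion above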
Example 29
Source File: IDFSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 30
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.feature.{HashingTF, IDF} import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.{ SparseVector => SV } object DocumentClassification { def main(args: Array[String]) { val sc = new SparkContext("local[2]", "First Spark App") val path = "../data/20news-bydate-train/*" val rdd = sc.wholeTextFiles(path) val text = rdd.map { case (file, text) => text } val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head } val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap val dim = math.pow(2, 18).toInt val hashingTF = new HashingTF(dim) var tokens = text.map(doc => TFIDFExtraction.tokenize(doc)) val tf = hashingTF.transform(tokens) tf.cache val v = tf.first.asInstanceOf[SV] val idf = new IDF().fit(tf) val tfidf = idf.transform(tf) val zipped = newsgroups.zip(tfidf) val train = zipped.map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) } train.cache val model = NaiveBayes.train(train, lambda = 0.1) val testPath = "../data/20news-bydate-test/*" val testRDD = sc.wholeTextFiles(testPath) val testLabels = testRDD.map { case (file, text) => val topic = file.split("/").takeRight(2).head newsgroupsMap(topic) } val testTf = testRDD.map { case (file, text) => hashingTF.transform(TFIDFExtraction.tokenize(text)) } val testTfIdf = idf.transform(testTf) val zippedTest = testLabels.zip(testTfIdf) val test = zippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) } val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() println(accuracy) // Updated Dec 2016 by Rajdeep //0.7928836962294211 val metrics = new MulticlassMetrics(predictionAndLabel) println(metrics.weightedFMeasure) //0.7822644376431702 val rawTokens = rdd.map { case (file, text) => text.split(" ") } val rawTF = rawTokens.map(doc => hashingTF.transform(doc)) val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) } val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1) val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) } val rawZippedTest = testLabels.zip(rawTestTF) val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) } val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label)) val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count() println(rawAccuracy) // 0.7661975570897503 val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel) println(rawMetrics.weightedFMeasure) // older value 0.7628947184990661 // dec 2016 : 0.7653320418573546 sc.stop() } }
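The example above runs TF-IDF plus Naive Bayes over the 20 Newsgroups corpus. A much smaller, self-contained sketch of the same flow on toy in-memory documents (the documents, labels, and feature dimension are made up, and whitespace splitting stands in for TFIDFExtraction.tokenize):

import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.regression.LabeledPoint

val sc = new SparkContext("local[2]", "TfIdfNaiveBayesSketch")

// (label, document) pairs
val docs = sc.parallelize(Seq(
  (0.0, "the spark cluster runs fast"),
  (0.0, "spark jobs and rdd caching"),
  (1.0, "the cat sat on the mat"),
  (1.0, "a cat and a dog")))

val hashingTF = new HashingTF(math.pow(2, 14).toInt)
val tf = docs.map { case (label, text) =>
  (label, hashingTF.transform(text.toLowerCase.split(" ").toSeq))
}
tf.cache()

val idf = new IDF().fit(tf.map(_._2))
val training = tf.map { case (label, vector) => LabeledPoint(label, idf.transform(vector)) }

val model = NaiveBayes.train(training, lambda = 0.1)
println(model.predict(idf.transform(hashingTF.transform(Seq("spark", "rdd")))))

sc.stop()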
Example 31
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.feature.{HashingTF, IDF} import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.{SparseVector => SV} import org.apache.spark.mllib.util.MLUtils //import org.apache.spark.ml.feature.HashingTF //import org.apache.spark.ml.feature.IDF object DocumentClassification { def main(args: Array[String]) { val sc = new SparkContext("local[2]", "First Spark App") val path = "../data/20news-bydate-train/*" val rdd = sc.wholeTextFiles(path) val text = rdd.map { case (file, text) => text } val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head } val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap val dim = math.pow(2, 18).toInt val hashingTF = new HashingTF(dim) var tokens = text.map(doc => TFIDFExtraction.tokenize(doc)) val tf = hashingTF.transform(tokens) tf.cache val v = tf.first.asInstanceOf[SV] val idf = new IDF().fit(tf) val tfidf = idf.transform(tf) val zipped = newsgroups.zip(tfidf) println(zipped.first()) val train = zipped.map { case (topic, vector) => { LabeledPoint(newsgroupsMap(topic), vector) } } //TODO uncomment to generate libsvm format MLUtils.saveAsLibSVMFile(train,"./output/20news-by-date-train-libsvm") train.cache val model = NaiveBayes.train(train, lambda = 0.1) val testPath = "../data/20news-bydate-test/*" val testRDD = sc.wholeTextFiles(testPath) val testLabels = testRDD.map { case (file, text) => val topic = file.split("/").takeRight(2).head newsgroupsMap(topic) } val testTf = testRDD.map { case (file, text) => hashingTF.transform(TFIDFExtraction.tokenize(text)) } val testTfIdf = idf.transform(testTf) val zippedTest = testLabels.zip(testTfIdf) val test = zippedTest.map { case (topic, vector) => { println(topic) println(vector) LabeledPoint(topic, vector) } } //TODO uncomment to generate libsvm format MLUtils.saveAsLibSVMFile(test,"./output/20news-by-date-test-libsvm") val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() println(accuracy) // Updated Dec 2016 by Rajdeep //0.7928836962294211 val metrics = new MulticlassMetrics(predictionAndLabel) println(metrics.accuracy) println(metrics.weightedFalsePositiveRate) println(metrics.weightedPrecision) println(metrics.weightedFMeasure) println(metrics.weightedRecall) //0.7822644376431702 val rawTokens = rdd.map { case (file, text) => text.split(" ") } val rawTF = rawTokens.map(doc => hashingTF.transform(doc)) val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) } val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1) val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) } val rawZippedTest = testLabels.zip(rawTestTF) val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) } val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label)) val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count() println(rawAccuracy) // 0.7661975570897503 val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel) println(rawMetrics.weightedFMeasure) // older value 0.7628947184990661 // dec 2016 : 0.7653320418573546 sc.stop() } }
Example 32
Source File: Normalizer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
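In practice the transform above rescales only the stored values of a sparse vector and reuses its index array. A short sketch normalizing a sparse vector to unit L2 norm (the vector values are made up):

import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.linalg.Vectors

val normalizer = new Normalizer()              // p = 2 by default
val v = Vectors.sparse(4, Array(0, 3), Array(3.0, 4.0))

val unit = normalizer.transform(v)
println(unit)                                  // values become 0.6 and 0.8 at indices 0 and 3
println(Vectors.norm(unit, 2.0))               // ~1.0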
Example 33
Source File: ElementwiseProductSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 34
Source File: IDFSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 35
Source File: BandingCollisionStrategy.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors.collision import scala.util.hashing.MurmurHash3 import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import com.github.karlhigley.spark.neighbors.lsh.{ BitSignature, HashTableEntry, IntSignature } def apply(hashTables: RDD[_ <: HashTableEntry[_]]): RDD[(Product, Point)] = { val bandEntries = hashTables.flatMap(entry => { val elements = entry.sigElements val banded = elements.grouped(elements.size / bands).zipWithIndex banded.map { case (bandSig, bandNum) => { // Arrays are mutable and can't be used in RDD keys // Use a hash value (i.e. an int) as a substitute val bandSigHash = MurmurHash3.arrayHash(bandSig) val key = (entry.table, bandNum, bandSigHash).asInstanceOf[Product] (key, (entry.id, entry.point)) } } }) bandEntries } }
Example 36
Source File: SimpleCollisionStrategy.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors.collision import scala.util.hashing.MurmurHash3 import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import com.github.karlhigley.spark.neighbors.lsh.{ BitSignature, HashTableEntry, IntSignature } def apply(hashTables: RDD[_ <: HashTableEntry[_]]): RDD[(Product, Point)] = { val entries = hashTables.map(entry => { // Arrays are mutable and can't be used in RDD keys // Use a hash value (i.e. an int) as a substitute val key = (entry.table, MurmurHash3.arrayHash(entry.sigElements)).asInstanceOf[Product] (key, (entry.id, entry.point)) }) entries } }
Example 37
Source File: Signature.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors.lsh import scala.collection.immutable.BitSet import org.apache.spark.mllib.linalg.SparseVector private[neighbors] sealed abstract class HashTableEntry[+S <: Signature[_]] { val id: Long val table: Int val signature: S val point: SparseVector def sigElements: Array[Int] } private[neighbors] final case class BitHashTableEntry( id: Long, table: Int, signature: BitSignature, point: SparseVector ) extends HashTableEntry[BitSignature] { def sigElements: Array[Int] = { signature.elements.toArray } } private[neighbors] final case class IntHashTableEntry( id: Long, table: Int, signature: IntSignature, point: SparseVector ) extends HashTableEntry[IntSignature] { def sigElements: Array[Int] = { signature.elements } }
Example 38
Source File: BitSamplingFunction.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors.lsh import java.util.Random import scala.collection.immutable.BitSet import org.apache.spark.mllib.linalg.SparseVector def generate( originalDim: Int, signatureLength: Int, random: Random = new Random ): BitSamplingFunction = { val indices = Array.fill(signatureLength) { random.nextInt(originalDim) } new BitSamplingFunction(indices) } }
Example 39
Source File: MinhashFunction.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors.lsh import java.util.Random import org.apache.spark.mllib.linalg.SparseVector def generate( dimensions: Int, signatureLength: Int, prime: Int, random: Random = new Random ): MinhashFunction = { val perms = new Array[PermutationFunction](signatureLength) var i = 0 while (i < signatureLength) { perms(i) = PermutationFunction.random(dimensions, prime, random) i += 1 } new MinhashFunction(perms) } }
Example 40
Source File: Normalizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 41
Source File: ANNModelSuite.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors import org.scalatest.FunSuite import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.SparseVector import com.github.karlhigley.spark.neighbors.lsh.HashTableEntry class ANNModelSuite extends FunSuite with TestSparkContext { val numPoints = 1000 val dimensions = 100 val density = 0.5 var points: RDD[(Long, SparseVector)] = _ override def beforeAll() { super.beforeAll() val localPoints = TestHelpers.generateRandomPoints(numPoints, dimensions, density) points = sc.parallelize(localPoints).zipWithIndex.map(_.swap) } test("average selectivity is between zero and one") { val ann = new ANN(dimensions, "cosine") .setTables(1) .setSignatureLength(16) val model = ann.train(points) val selectivity = model.avgSelectivity() assert(selectivity > 0.0) assert(selectivity < 1.0) } test("average selectivity increases with more tables") { val ann = new ANN(dimensions, "cosine") .setTables(1) .setSignatureLength(16) val model1 = ann.train(points) ann.setTables(2) val model2 = ann.train(points) assert(model1.avgSelectivity() < model2.avgSelectivity()) } test("average selectivity decreases with signature length") { val ann = new ANN(dimensions, "cosine") .setTables(1) .setSignatureLength(4) val model4 = ann.train(points) ann.setSignatureLength(8) val model8 = ann.train(points) assert(model4.avgSelectivity() > model8.avgSelectivity()) } }
Example 42
Source File: TestHelpers.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors

import scala.util.Random

import org.apache.spark.mllib.linalg.SparseVector

object TestHelpers {
  def generateRandomPoints(quantity: Int, dimensions: Int, density: Double) = {
    val numElements = math.floor(dimensions * density).toInt
    val points = new Array[SparseVector](quantity)
    var i = 0
    while (i < quantity) {
      val indices = generateIndices(numElements, dimensions)
      val values = generateValues(numElements)
      points(i) = new SparseVector(dimensions, indices, values)
      i += 1
    }
    points
  }

  def generateIndices(quantity: Int, dimensions: Int) = {
    // Mark unfilled slots with -1 so that index 0 can be drawn, and return the
    // indices sorted, since SparseVector expects strictly increasing indices.
    val indices = Array.fill(quantity)(-1)
    var i = 0
    while (i < quantity) {
      val possible = Random.nextInt(dimensions)
      if (!indices.contains(possible)) {
        indices(i) = possible
        i += 1
      }
    }
    indices.sorted
  }

  def generateValues(quantity: Int) = {
    val values = new Array[Double](quantity)
    var i = 0
    while (i < quantity) {
      values(i) = Random.nextGaussian()
      i += 1
    }
    values
  }
}
Example 43
Source File: DistanceMeasureSuite.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors import org.scalatest.FunSuite import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.SparseVector import com.github.karlhigley.spark.neighbors.linalg._ class DistanceMeasureSuite extends FunSuite with TestSparkContext { import org.scalactic.Tolerance._ val values = Array(1.0, 1.0, 1.0, 1.0) val v1 = new SparseVector(10, Array(0, 3, 6, 8), values) val v2 = new SparseVector(10, Array(1, 4, 7, 9), values) val v3 = new SparseVector(10, Array(2, 5, 7, 9), values) test("Cosine distance") { assert(CosineDistance.compute(v1, v1) === 0.0) assert(CosineDistance.compute(v1, v2) === 1.0) assert(CosineDistance.compute(v2, v3) === 0.5) } test("Euclidean distance") { assert(EuclideanDistance.compute(v1, v1) === 0.0) assert(EuclideanDistance.compute(v1, v2) === 2.83 +- 0.01) assert(EuclideanDistance.compute(v2, v3) === 2.0) } test("Manhattan distance") { assert(ManhattanDistance.compute(v1, v1) === 0.0) assert(ManhattanDistance.compute(v1, v2) === 8.0) assert(ManhattanDistance.compute(v2, v3) === 4.0) } test("Hamming distance") { assert(HammingDistance.compute(v1, v1) === 0.0) assert(HammingDistance.compute(v1, v2) === 8.0) assert(HammingDistance.compute(v2, v3) === 4.0) } test("Jaccard distance") { assert(JaccardDistance.compute(v1, v1) === 0.0) assert(JaccardDistance.compute(v1, v2) === 1.0) assert(JaccardDistance.compute(v2, v3) === 0.67 +- 0.01) } }
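The distance measures above come from spark-neighbors, but the same cosine distance can be computed directly from two MLlib SparseVectors with a merge over their sorted index arrays. A sketch reusing two of the vectors from the test:

import org.apache.spark.mllib.linalg.{SparseVector, Vectors}

// Dot product of two sparse vectors via a merge over their sorted indices
def sparseDot(a: SparseVector, b: SparseVector): Double = {
  var i = 0; var j = 0; var sum = 0.0
  while (i < a.indices.length && j < b.indices.length) {
    if (a.indices(i) == b.indices(j)) { sum += a.values(i) * b.values(j); i += 1; j += 1 }
    else if (a.indices(i) < b.indices(j)) i += 1
    else j += 1
  }
  sum
}

def cosineDistance(a: SparseVector, b: SparseVector): Double =
  1.0 - sparseDot(a, b) / (Vectors.norm(a, 2.0) * Vectors.norm(b, 2.0))

val ones = Array(1.0, 1.0, 1.0, 1.0)
val v2 = new SparseVector(10, Array(1, 4, 7, 9), ones)
val v3 = new SparseVector(10, Array(2, 5, 7, 9), ones)
println(cosineDistance(v2, v3))   // 0.5, matching the assertion above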
Example 44
Source File: KMeanTest.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector} import scala.util.Random //spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9 //guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15 object ScalableKMeanTest { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}") val sc = new SparkContext(conf) val k = args(0).toInt val dimension = args(1).toInt val recordNum = args(2).toInt val sparsity = args(3).toDouble val iterations = args(4).toInt val means = args(5) val parNumber = args(6).toInt val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => { val ran = new Random() val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray val vec: Vector = new SparseVector(dimension, indexArr, valueArr) vec }).cache() println(args.mkString(", ")) println(data.count() + " records generated") val st = System.nanoTime() val model = if(means == "my") { println("running scalable kmeans") val model = new ScalableKMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } else { println("running mllib kmeans") val model = new KMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } println((System.nanoTime() - st) / 1e9 + " seconds cost") println("final clusters: " + model.clusterCenters.length) println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) sc.stop() } }
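The benchmark above compares a custom ScalableKMeans against MLlib's KMeans on randomly generated sparse data. A smaller sketch of the same data-generation pattern with only the stock MLlib KMeans, shrunk to sizes that run locally (all of the sizes and the local[*] master are made up):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{SparseVector, Vector}
import org.apache.spark.rdd.RDD
import scala.util.Random

val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("sparse-kmeans-sketch"))

val dimension = 1000
val sparsity = 0.05
val data: RDD[Vector] = sc.parallelize(1 to 500, 4).map { _ =>
  val ran = new Random()
  val nnz = (dimension * sparsity).toInt
  val indexArr = ran.shuffle((0 until dimension).toList).take(nnz).sorted.toArray
  val valueArr = Array.fill(nnz)(ran.nextDouble())
  new SparseVector(dimension, indexArr, valueArr): Vector
}.cache()

val model = new KMeans()
  .setK(10)
  .setInitializationMode("random")
  .setMaxIterations(5)
  .run(data)

println(model.clusterCenters.map(_.numNonzeros).mkString(", "))
sc.stop()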
Example 45
Source File: lda-script.scala From practical-data-science-with-hadoop-and-spark with Apache License 2.0 | 5 votes |
import collection.JavaConversions._ import scala.collection.mutable import opennlp.tools.tokenize.SimpleTokenizer import opennlp.tools.stemmer.PorterStemmer import org.apache.spark.rdd._ import org.apache.spark.mllib.clustering.{OnlineLDAOptimizer, DistributedLDAModel, LDA} import org.apache.spark.mllib.linalg.{Vector, SparseVector, Vectors} import org.apache.spark.mllib.feature.IDF // add openNLP jar to the Spark Context sc.addJar("opennlp-tools-1.6.0.jar") // Load documents from text files, 1 element (text string) per file val corpus = sc.wholeTextFiles("ohsumed/C*", 20).map(x => x._2) // read stop words from file val stopwordFile = "stop-words.txt" val st_words = sc.textFile(stopwordFile).collect() .flatMap(_.stripMargin.split("\\s+")).map(_.toLowerCase).toSet val stopwords = sc.broadcast(st_words) val minWordLength = 3 val tokenized: RDD[(Long, Array[String])] = corpus.zipWithIndex().map { case (text,id) => val tokenizer = SimpleTokenizer.INSTANCE val stemmer = new PorterStemmer() val tokens = tokenizer.tokenize(text) val words = tokens.filter(w => (w.length >= minWordLength) && (!stopwords.value.contains(w))) .map(w => stemmer.stem(w)) id -> words }.filter(_._2.length > 0) tokenized.cache() val numDocs = tokenized.count() val wordCounts: RDD[(String, Long)] = tokenized.flatMap { case (_, tokens) => tokens.map(_ -> 1L) }.reduceByKey(_ + _) wordCounts.cache() val fullVocabSize = wordCounts.count() val vSize = 10000 val (vocab: Map[String, Int], selectedTokenCount: Long) = { val sortedWC: Array[(String,Long)] = {wordCounts.sortBy(_._2, ascending=false) .take(vSize)} (sortedWC.map(_._1).zipWithIndex.toMap, sortedWC.map(_._2).sum) } val documents = tokenized.map { case (id, tokens) => // Filter tokens by vocabulary, and create word count vector representation of document. val wc = new mutable.HashMap[Int, Int]() tokens.foreach { term => if (vocab.contains(term)) { val termIndex = vocab(term) wc(termIndex) = wc.getOrElse(termIndex, 0) + 1 } } val indices = wc.keys.toArray.sorted val values = indices.map(i => wc(i).toDouble) val sb = Vectors.sparse(vocab.size, indices, values) (id, sb) } val vocabArray = new Array[String](vocab.size) vocab.foreach { case (term, i) => vocabArray(i) = term } val tf = documents.map { case (id, vec) => vec }.cache() val idfVals = new IDF().fit(tf).idf.toArray val tfidfDocs: RDD[(Long, Vector)] = documents.map { case (id, vec) => val indices = vec.asInstanceOf[SparseVector].indices val counts = new mutable.HashMap[Int, Double]() for (idx <- indices) { counts(idx) = vec(idx) * idfVals(idx) } (id, Vectors.sparse(vocab.size, counts.toSeq)) } val numTopics = 5 val numIterations = 50 val lda = new LDA().setK(numTopics).setMaxIterations(numIterations).setOptimizer("online") val ldaModel = lda.run(tfidfDocs) val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 5) topicIndices.foreach { case (terms, termWeights) => println("TOPIC:") terms.zip(termWeights).foreach { case (term, weight) => println(s"${vocabArray(term.toInt)}\t$weight") } println() }
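The script above builds TF-IDF weighted bag-of-words vectors before handing them to LDA. A compact sketch of just the LDA call on a tiny hand-built sparse corpus (assuming the spark-shell's sc as in the script; the counts and vocabulary size are made up):

import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.{Vector, Vectors}

// (docId, term-count vector) pairs over a 6-term vocabulary
val corpus = sc.parallelize(Seq[(Long, Vector)](
  (0L, Vectors.sparse(6, Seq((0, 3.0), (1, 2.0)))),
  (1L, Vectors.sparse(6, Seq((1, 1.0), (2, 4.0)))),
  (2L, Vectors.sparse(6, Seq((3, 2.0), (4, 2.0), (5, 1.0))))))

val ldaModel = new LDA().setK(2).setMaxIterations(20).setOptimizer("online").run(corpus)

ldaModel.describeTopics(maxTermsPerTopic = 3).foreach { case (terms, weights) =>
  println(terms.zip(weights).mkString(" "))
}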
Example 46
Source File: Normalizer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 47
Source File: ElementwiseProductSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 48
Source File: IDFSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 49
Source File: MllibHelper.scala From twitter-stream-ml with GNU General Public License v3.0 | 5 votes |
package com.giorgioinf.twtml.spark

import java.text.Normalizer

import org.apache.spark.Logging
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint

import scala.math.BigDecimal
import twitter4j.Status

object MllibHelper extends Logging {

  val numNumberFeatures = 4

  var numRetweetBegin = 100
  var numRetweetEnd = 1000
  var numTextFeatures = 1000
  var hashText = new HashingTF(numTextFeatures)
  var numFeatures = numTextFeatures + numNumberFeatures
  var numberFeatureIndices = (numTextFeatures to numFeatures - 1).toArray

  def reset(conf: ConfArguments) {
    numRetweetBegin = conf.numRetweetBegin
    numRetweetEnd = conf.numRetweetEnd
    numTextFeatures = conf.numTextFeatures
    // reassign the shared state so later calls pick up the new configuration
    hashText = new HashingTF(numTextFeatures)
    numFeatures = numTextFeatures + numNumberFeatures
    numberFeatureIndices = (numTextFeatures to numFeatures - 1).toArray

    log.debug(s"retweet range: ($numRetweetBegin - $numRetweetEnd), numTextFeatures: $numTextFeatures")
  }

  def featurizeText(statuses: Status): SparseVector = {
    val text = statuses.getRetweetedStatus
      .getText
      .toLowerCase

    // Separate accents from characters and then remove non-unicode
    // characters
    val noAccentText = Normalizer
      .normalize(text, Normalizer.Form.NFD)
      .replaceAll("\\p{M}", "")

    // hash character bigrams of the cleaned text
    hashText.transform(noAccentText.sliding(2).toSeq)
      .asInstanceOf[SparseVector]
  }

  def featurizeNumbers(statuses: Status): Vector = {
    val user = statuses.getRetweetedStatus.getUser
    val created = statuses.getRetweetedStatus.getCreatedAt
    val timeLeft = (System.currentTimeMillis - created.getTime)

    Vectors.dense(
      user.getFollowersCount * Math.pow(10, -12),
      user.getFavouritesCount * Math.pow(10, -12),
      user.getFriendsCount * Math.pow(10, -12),
      timeLeft * Math.pow(10, -14)
      //retweeted.getURLEntities.length,
      //retweeted.getUserMentionEntities.length
    )
  }

  def featurize(statuses: Status): LabeledPoint = {
    val textFeatures = featurizeText(statuses)
    val numberFeatures = featurizeNumbers(statuses)
    val features = Vectors.sparse(
      numFeatures,
      textFeatures.indices ++ numberFeatureIndices,
      textFeatures.values ++ numberFeatures.toArray
    )

    LabeledPoint(
      statuses.getRetweetedStatus.getRetweetCount.toDouble,
      features
    )
  }

  def retweetInterval(statuses: Status, start: Long, end: Long): Boolean = {
    val n = statuses.getRetweetedStatus.getRetweetCount
    (n >= start && n <= end)
  }

  def filtrate(statuses: Status): Boolean = {
    (
      statuses.isRetweet &&
      //statuses.getLang == "en" &&
      retweetInterval(statuses, numRetweetBegin, numRetweetEnd)
    )
  }
}
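The key move in featurize above is concatenating a hashed sparse text vector with a handful of dense numeric features into one sparse vector. A standalone sketch of that concatenation, without the twitter4j dependency (the text and the numeric counts are made up):

import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.{SparseVector, Vectors}

val numTextFeatures = 1000
val numNumberFeatures = 4
val numFeatures = numTextFeatures + numNumberFeatures
val numberFeatureIndices = (numTextFeatures until numFeatures).toArray

// Character-bigram hashing of the text
val hashText = new HashingTF(numTextFeatures)
val text = "spark streaming with mllib"
val textFeatures = hashText.transform(text.sliding(2).toSeq).asInstanceOf[SparseVector]

// Dense numeric features, rescaled to comparable magnitudes
val numberFeatures = Vectors.dense(
  12345 * math.pow(10, -12),
  678 * math.pow(10, -12),
  910 * math.pow(10, -12),
  3.6e6 * math.pow(10, -14))

// Text features occupy indices [0, numTextFeatures); numeric features take the trailing slots
val features = Vectors.sparse(
  numFeatures,
  textFeatures.indices ++ numberFeatureIndices,
  textFeatures.values ++ numberFeatures.toArray)

println(features.numNonzeros)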
Example 50
Source File: Normalizer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.size var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.size var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 51
Source File: VectorAssemblerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row import org.apache.spark.sql.functions.col class VectorAssemblerSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new VectorAssembler) } test("assemble") { import org.apache.spark.ml.feature.VectorAssembler.assemble assert(assemble(0.0) === Vectors.sparse(1, Array.empty, Array.empty)) assert(assemble(0.0, 1.0) === Vectors.sparse(2, Array(1), Array(1.0))) val dv = Vectors.dense(2.0, 0.0) assert(assemble(0.0, dv, 1.0) === Vectors.sparse(4, Array(1, 3), Array(2.0, 1.0))) val sv = Vectors.sparse(2, Array(0, 1), Array(3.0, 4.0)) assert(assemble(0.0, dv, 1.0, sv) === Vectors.sparse(6, Array(1, 3, 4, 5), Array(2.0, 1.0, 3.0, 4.0))) for (v <- Seq(1, "a", null)) { intercept[SparkException](assemble(v)) intercept[SparkException](assemble(1.0, v)) } } test("assemble should compress vectors") { import org.apache.spark.ml.feature.VectorAssembler.assemble val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0)) assert(v1.isInstanceOf[SparseVector]) val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0))) assert(v2.isInstanceOf[DenseVector]) } test("VectorAssembler") { val df = sqlContext.createDataFrame(Seq( (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L) )).toDF("id", "x", "y", "name", "z", "n") val assembler = new VectorAssembler() .setInputCols(Array("x", "y", "z", "n")) .setOutputCol("features") assembler.transform(df).select("features").collect().foreach { case Row(v: Vector) => assert(v === Vectors.sparse(6, Array(1, 2, 4, 5), Array(1.0, 2.0, 3.0, 10.0))) } } test("ML attributes") { val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", "safari") val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0) val user = new AttributeGroup("user", Array( NominalAttribute.defaultAttr.withName("gender").withValues("male", "female"), NumericAttribute.defaultAttr.withName("salary"))) val row = (1.0, 0.5, 1, Vectors.dense(1.0, 1000.0), Vectors.sparse(2, Array(1), Array(2.0))) val df = sqlContext.createDataFrame(Seq(row)).toDF("browser", "hour", "count", "user", "ad") .select( col("browser").as("browser", browser.toMetadata()), col("hour").as("hour", hour.toMetadata()), col("count"), // "count" is an integer column without ML attribute col("user").as("user", user.toMetadata()), col("ad")) // "ad" is a vector column without ML attribute val assembler = new VectorAssembler() .setInputCols(Array("browser", "hour", "count", "user", "ad")) .setOutputCol("features") val output = assembler.transform(df) val schema = output.schema val features = AttributeGroup.fromStructField(schema("features")) assert(features.size === 7) val browserOut = features.getAttr(0) assert(browserOut === browser.withIndex(0).withName("browser")) val hourOut = features.getAttr(1) assert(hourOut === hour.withIndex(1).withName("hour")) val countOut = features.getAttr(2) assert(countOut === NumericAttribute.defaultAttr.withName("count").withIndex(2)) val userGenderOut = features.getAttr(3) assert(userGenderOut === user.getAttr("gender").withName("user_gender").withIndex(3)) val userSalaryOut = features.getAttr(4) 
assert(userSalaryOut === user.getAttr("salary").withName("user_salary").withIndex(4)) assert(features.getAttr(5) === NumericAttribute.defaultAttr.withIndex(5)) assert(features.getAttr(6) === NumericAttribute.defaultAttr.withIndex(6)) } }