org.apache.spark.mllib.util.MLlibTestSparkContext Scala Examples
The following examples show how to use org.apache.spark.mllib.util.MLlibTestSparkContext.
The originating project, source file, and license are noted above each example.
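Every suite below follows the same pattern: it extends SparkFunSuite and mixes in MLlibTestSparkContext, which sets up a local SparkContext before the suite's tests run (exposed as sc, together with a sqlContext used by the DataFrame-based examples) and tears it down afterwards. The following minimal sketch illustrates that pattern; the package, class name, and test body are hypothetical placeholders, and the sketch assumes it is compiled against Spark's test utilities, where both SparkFunSuite and MLlibTestSparkContext live.

package org.apache.spark.mllib.example  // hypothetical package, for illustration only

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

// MLlibTestSparkContext starts a local SparkContext in beforeAll() and stops it in
// afterAll(), so each test can use `sc` directly without managing its lifecycle.
class ExampleSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("a shared local SparkContext is available as sc") {
    val rdd = sc.parallelize(Seq(1.0, 2.0, 3.0), 2)
    assert(rdd.count() === 3)
    assert(rdd.sum() === 6.0)
  }
}

Because the trait configures a local master, such a suite runs in a single JVM and needs no cluster.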
Example 1
Source File: AreaUnderCurveSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class AreaUnderCurveSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("auc computation") {
    val curve = Seq((0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 0.0))
    val auc = 4.0
    assert(AreaUnderCurve.of(curve) ~== auc absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== auc absTol 1E-5)
  }

  test("auc of an empty curve") {
    val curve = Seq.empty[(Double, Double)]
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }

  test("auc of a curve with a single point") {
    val curve = Seq((1.0, 1.0))
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }
}
Example 2
Source File: ChiSqSelectorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { test("ChiSqSelector transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(6.0))), LabeledPoint(1.0, Vectors.dense(Array(8.0))), LabeledPoint(2.0, Vectors.dense(Array(5.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } test("model load / save") { val model = ChiSqSelectorSuite.createModel() val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString try { model.save(sc, path) val sameModel = ChiSqSelectorModel.load(sc, path) ChiSqSelectorSuite.checkEqual(model, sameModel) } finally { Utils.deleteRecursively(tempDir) } } } object ChiSqSelectorSuite extends SparkFunSuite { def createModel(): ChiSqSelectorModel = { val arr = Array(1, 2, 3, 4) new ChiSqSelectorModel(arr) } def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = { assert(a.selectedFeatures.deep == b.selectedFeatures.deep) } }
Example 3
Source File: Word2VecSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.Utils

class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext {

  // TODO: add more tests

  test("Word2Vec") {
    val sentence = "a b " * 100 + "a c " * 10
    val localDoc = Seq(sentence, sentence)
    val doc = sc.parallelize(localDoc)
      .map(line => line.split(" ").toSeq)
    val model = new Word2Vec().setVectorSize(10).setSeed(42L).fit(doc)
    val syms = model.findSynonyms("a", 2)
    assert(syms.length == 2)
    assert(syms(0)._1 == "b")
    assert(syms(1)._1 == "c")

    // Test that a model built using Word2Vec, i.e. wordVectors and wordIndex,
    // and a Word2VecMap give the same values.
    val word2VecMap = model.getVectors
    val newModel = new Word2VecModel(word2VecMap)
    assert(newModel.getVectors.mapValues(_.toSeq) === word2VecMap.mapValues(_.toSeq))
  }

  test("Word2Vec throws exception when vocabulary is empty") {
    intercept[IllegalArgumentException] {
      val sentence = "a b c"
      val localDoc = Seq(sentence, sentence)
      val doc = sc.parallelize(localDoc)
        .map(line => line.split(" ").toSeq)
      new Word2Vec().setMinCount(10).fit(doc)
    }
  }

  test("Word2VecModel") {
    val num = 2
    val word2VecMap = Map(
      ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)),
      ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)),
      ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)),
      ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f))
    )
    val model = new Word2VecModel(word2VecMap)
    val syms = model.findSynonyms("china", num)
    assert(syms.length == num)
    assert(syms(0)._1 == "taiwan")
    assert(syms(1)._1 == "japan")
  }

  test("model load / save") {
    val word2VecMap = Map(
      ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)),
      ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)),
      ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)),
      ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f))
    )
    val model = new Word2VecModel(word2VecMap)

    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString

    try {
      model.save(sc, path)
      val sameModel = Word2VecModel.load(sc, path)
      assert(sameModel.getVectors.mapValues(_.toSeq) === model.getVectors.mapValues(_.toSeq))
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}
Example 4
Source File: ElementwiseProductSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 5
Source File: IDFSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors, Vector} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 6
Source File: PCASuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val pc = mat.computePrincipalComponents(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() assert(pca_transform.toSet === mat_multiply.toSet) } }
Example 7
Source File: HashingTFSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext { test("hashing tf on a single doc") { val hashingTF = new HashingTF(1000) val doc = "a a b b c d".split(" ") val n = hashingTF.numFeatures val termFreqs = Seq( (hashingTF.indexOf("a"), 2.0), (hashingTF.indexOf("b"), 2.0), (hashingTF.indexOf("c"), 1.0), (hashingTF.indexOf("d"), 1.0)) assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n), "index must be in range [0, #features)") assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing") val expected = Vectors.sparse(n, termFreqs) assert(hashingTF.transform(doc) === expected) } test("hashing tf on an RDD") { val hashingTF = new HashingTF val localDocs: Seq[Seq[String]] = Seq( "a a b b b c d".split(" "), "a b c d a b c".split(" "), "c b a c b a a".split(" ")) val docs = sc.parallelize(localDocs, 2) assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet) } }
Example 8
Source File: BaggedPointSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.tree.EnsembleTestHelper import org.apache.spark.mllib.util.MLlibTestSparkContext class BaggedPointSuite extends SparkFunSuite with MLlibTestSparkContext { test("BaggedPoint RDD: without subsampling") { val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false, 42) baggedRDD.collect().foreach { baggedPoint => assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1) } } test("BaggedPoint RDD: with subsampling with replacement (fraction = 1.0)") { val numSubsamples = 100 val (expectedMean, expectedStddev) = (1.0, 1.0) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling with replacement (fraction = 0.5)") { val numSubsamples = 100 val subsample = 0.5 val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample)) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling without replacement (fraction = 1.0)") { val numSubsamples = 100 val (expectedMean, expectedStddev) = (1.0, 0) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling without replacement (fraction = 0.5)") { val numSubsamples = 100 val subsample = 0.5 val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample * (1 - subsample))) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } }
Example 9
Source File: MatrixFactorizationModelSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.recommendation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils class MatrixFactorizationModelSuite extends SparkFunSuite with MLlibTestSparkContext { val rank = 2 var userFeatures: RDD[(Int, Array[Double])] = _ var prodFeatures: RDD[(Int, Array[Double])] = _ override def beforeAll(): Unit = { super.beforeAll() userFeatures = sc.parallelize(Seq((0, Array(1.0, 2.0)), (1, Array(3.0, 4.0)))) prodFeatures = sc.parallelize(Seq((2, Array(5.0, 6.0)))) } test("constructor") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) assert(model.predict(0, 2) ~== 17.0 relTol 1e-14) intercept[IllegalArgumentException] { new MatrixFactorizationModel(1, userFeatures, prodFeatures) } val userFeatures1 = sc.parallelize(Seq((0, Array(1.0)), (1, Array(3.0)))) intercept[IllegalArgumentException] { new MatrixFactorizationModel(rank, userFeatures1, prodFeatures) } val prodFeatures1 = sc.parallelize(Seq((2, Array(5.0)))) intercept[IllegalArgumentException] { new MatrixFactorizationModel(rank, userFeatures, prodFeatures1) } } test("save/load") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString def collect(features: RDD[(Int, Array[Double])]): Set[(Int, Seq[Double])] = { features.mapValues(_.toSeq).collect().toSet } try { model.save(sc, path) val newModel = MatrixFactorizationModel.load(sc, path) assert(newModel.rank === rank) assert(collect(newModel.userFeatures) === collect(userFeatures)) assert(collect(newModel.productFeatures) === collect(prodFeatures)) } finally { Utils.deleteRecursively(tempDir) } } test("batch predict API recommendProductsForUsers") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val topK = 10 val recommendations = model.recommendProductsForUsers(topK).collectAsMap() assert(recommendations(0)(0).rating ~== 17.0 relTol 1e-14) assert(recommendations(1)(0).rating ~== 39.0 relTol 1e-14) } test("batch predict API recommendUsersForProducts") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val topK = 10 val recommendations = model.recommendUsersForProducts(topK).collectAsMap() assert(recommendations(2)(0).user == 1) assert(recommendations(2)(0).rating ~== 39.0 relTol 1e-14) assert(recommendations(2)(1).user == 0) assert(recommendations(2)(1).rating ~== 17.0 relTol 1e-14) } }
Example 10
Source File: RankingMetricsSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { test("Ranking metrics: map, ndcg") { val predictionAndLabels = sc.parallelize( Seq( (Array[Int](1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Array[Int](1, 2, 3, 4, 5)), (Array[Int](4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Array[Int](1, 2, 3)), (Array[Int](1, 2, 3, 4, 5), Array[Int]()) ), 2) val eps: Double = 1E-5 val metrics = new RankingMetrics(predictionAndLabels) val map = metrics.meanAveragePrecision assert(metrics.precisionAt(1) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(2) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(3) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(4) ~== 0.75/3 absTol eps) assert(metrics.precisionAt(5) ~== 0.8/3 absTol eps) assert(metrics.precisionAt(10) ~== 0.8/3 absTol eps) assert(metrics.precisionAt(15) ~== 8.0/45 absTol eps) assert(map ~== 0.355026 absTol eps) assert(metrics.ndcgAt(3) ~== 1.0/3 absTol eps) assert(metrics.ndcgAt(5) ~== 0.328788 absTol eps) assert(metrics.ndcgAt(10) ~== 0.487913 absTol eps) assert(metrics.ndcgAt(15) ~== metrics.ndcgAt(10) absTol eps) } }
Example 11
Source File: DecisionTreeRegressorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.impl.TreeTests import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree, DecisionTreeSuite => OldDecisionTreeSuite} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame class DecisionTreeRegressorSuite extends SparkFunSuite with MLlibTestSparkContext { import DecisionTreeRegressorSuite.compareAPIs private var categoricalDataPointsRDD: RDD[LabeledPoint] = _ override def beforeAll() { super.beforeAll() categoricalDataPointsRDD = sc.parallelize(OldDecisionTreeSuite.generateCategoricalDataPoints()) } ///////////////////////////////////////////////////////////////////////////// // Tests calling train() ///////////////////////////////////////////////////////////////////////////// test("Regression stump with 3-ary (ordered) categorical features") { val dt = new DecisionTreeRegressor() .setImpurity("variance") .setMaxDepth(2) .setMaxBins(100) .setSeed(1) val categoricalFeatures = Map(0 -> 3, 1-> 3) compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures) } test("Regression stump with binary (ordered) categorical features") { val dt = new DecisionTreeRegressor() .setImpurity("variance") .setMaxDepth(2) .setMaxBins(100) val categoricalFeatures = Map(0 -> 2, 1-> 2) compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures) } test("copied model must have the same parent") { val categoricalFeatures = Map(0 -> 2, 1-> 2) val df = TreeTests.setMetadata(categoricalDataPointsRDD, categoricalFeatures, numClasses = 0) val model = new DecisionTreeRegressor() .setImpurity("variance") .setMaxDepth(2) .setMaxBins(8).fit(df) MLTestingUtils.checkCopy(model) } ///////////////////////////////////////////////////////////////////////////// // Tests of model save/load ///////////////////////////////////////////////////////////////////////////// // TODO: test("model save/load") SPARK-6725 } private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite { def compareAPIs( data: RDD[LabeledPoint], dt: DecisionTreeRegressor, categoricalFeatures: Map[Int, Int]): Unit = { val numFeatures = data.first().features.size val oldStrategy = dt.getOldStrategy(categoricalFeatures) val oldTree = OldDecisionTree.train(data, oldStrategy) val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0) val newTree = dt.fit(newData) // Use parent from newTree since this is not checked anyways. val oldTreeAsNew = DecisionTreeRegressionModel.fromOld( oldTree, newTree.parent.asInstanceOf[DecisionTreeRegressor], categoricalFeatures) TreeTests.checkEqual(oldTreeAsNew, newTree) assert(newTree.numFeatures === numFeatures) } }
Example 12
Source File: RegressionMetricsSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class RegressionMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { test("regression metrics for unbiased (includes intercept term) predictor") { val predictionAndObservations = sc.parallelize( Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)), 2) val metrics = new RegressionMetrics(predictionAndObservations) assert(metrics.explainedVariance ~== 8.85937 absTol 1E-5, "explained variance regression score mismatch") assert(metrics.meanAbsoluteError ~== 0.5 absTol 1E-5, "mean absolute error mismatch") assert(metrics.meanSquaredError ~== 0.375 absTol 1E-5, "mean squared error mismatch") assert(metrics.rootMeanSquaredError ~== 0.61237 absTol 1E-5, "root mean squared error mismatch") assert(metrics.r2 ~== 0.94860 absTol 1E-5, "r2 score mismatch") } test("regression metrics with complete fitting") { val predictionAndObservations = sc.parallelize( Seq((3.0, 3.0), (0.0, 0.0), (2.0, 2.0), (8.0, 8.0)), 2) val metrics = new RegressionMetrics(predictionAndObservations) assert(metrics.explainedVariance ~== 8.6875 absTol 1E-5, "explained variance regression score mismatch") assert(metrics.meanAbsoluteError ~== 0.0 absTol 1E-5, "mean absolute error mismatch") assert(metrics.meanSquaredError ~== 0.0 absTol 1E-5, "mean squared error mismatch") assert(metrics.rootMeanSquaredError ~== 0.0 absTol 1E-5, "root mean squared error mismatch") assert(metrics.r2 ~== 1.0 absTol 1E-5, "r2 score mismatch") } }
Example 13
Source File: MulticlassMetricsSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Matrices import org.apache.spark.mllib.util.MLlibTestSparkContext class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { test("Multiclass evaluation metrics") { val confusionMatrix = Matrices.dense(3, 3, Array(2, 1, 0, 1, 3, 0, 1, 0, 1)) val labels = Array(0.0, 1.0, 2.0) val predictionAndLabels = sc.parallelize( Seq((0.0, 0.0), (0.0, 1.0), (0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)), 2) val metrics = new MulticlassMetrics(predictionAndLabels) val delta = 0.0000001 val fpRate0 = 1.0 / (9 - 4) val fpRate1 = 1.0 / (9 - 4) val fpRate2 = 1.0 / (9 - 1) val precision0 = 2.0 / (2 + 1) val precision1 = 3.0 / (3 + 1) val precision2 = 1.0 / (1 + 1) val recall0 = 2.0 / (2 + 2) val recall1 = 3.0 / (3 + 1) val recall2 = 1.0 / (1 + 0) val f1measure0 = 2 * precision0 * recall0 / (precision0 + recall0) val f1measure1 = 2 * precision1 * recall1 / (precision1 + recall1) val f1measure2 = 2 * precision2 * recall2 / (precision2 + recall2) val f2measure0 = (1 + 2 * 2) * precision0 * recall0 / (2 * 2 * precision0 + recall0) val f2measure1 = (1 + 2 * 2) * precision1 * recall1 / (2 * 2 * precision1 + recall1) val f2measure2 = (1 + 2 * 2) * precision2 * recall2 / (2 * 2 * precision2 + recall2) assert(metrics.confusionMatrix.toArray.sameElements(confusionMatrix.toArray)) assert(math.abs(metrics.falsePositiveRate(0.0) - fpRate0) < delta) assert(math.abs(metrics.falsePositiveRate(1.0) - fpRate1) < delta) assert(math.abs(metrics.falsePositiveRate(2.0) - fpRate2) < delta) assert(math.abs(metrics.precision(0.0) - precision0) < delta) assert(math.abs(metrics.precision(1.0) - precision1) < delta) assert(math.abs(metrics.precision(2.0) - precision2) < delta) assert(math.abs(metrics.recall(0.0) - recall0) < delta) assert(math.abs(metrics.recall(1.0) - recall1) < delta) assert(math.abs(metrics.recall(2.0) - recall2) < delta) assert(math.abs(metrics.fMeasure(0.0) - f1measure0) < delta) assert(math.abs(metrics.fMeasure(1.0) - f1measure1) < delta) assert(math.abs(metrics.fMeasure(2.0) - f1measure2) < delta) assert(math.abs(metrics.fMeasure(0.0, 2.0) - f2measure0) < delta) assert(math.abs(metrics.fMeasure(1.0, 2.0) - f2measure1) < delta) assert(math.abs(metrics.fMeasure(2.0, 2.0) - f2measure2) < delta) assert(math.abs(metrics.recall - (2.0 + 3.0 + 1.0) / ((2 + 3 + 1) + (1 + 1 + 1))) < delta) assert(math.abs(metrics.recall - metrics.precision) < delta) assert(math.abs(metrics.recall - metrics.fMeasure) < delta) assert(math.abs(metrics.recall - metrics.weightedRecall) < delta) assert(math.abs(metrics.weightedFalsePositiveRate - ((4.0 / 9) * fpRate0 + (4.0 / 9) * fpRate1 + (1.0 / 9) * fpRate2)) < delta) assert(math.abs(metrics.weightedPrecision - ((4.0 / 9) * precision0 + (4.0 / 9) * precision1 + (1.0 / 9) * precision2)) < delta) assert(math.abs(metrics.weightedRecall - ((4.0 / 9) * recall0 + (4.0 / 9) * recall1 + (1.0 / 9) * recall2)) < delta) assert(math.abs(metrics.weightedFMeasure - ((4.0 / 9) * f1measure0 + (4.0 / 9) * f1measure1 + (1.0 / 9) * f1measure2)) < delta) assert(math.abs(metrics.weightedFMeasure(2.0) - ((4.0 / 9) * f2measure0 + (4.0 / 9) * f2measure1 + (1.0 / 9) * f2measure2)) < delta) assert(metrics.labels.sameElements(labels)) } }
Example 14
Source File: FPTreeSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.fpm import scala.language.existentials import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class FPTreeSuite extends SparkFunSuite with MLlibTestSparkContext { test("add transaction") { val tree = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("b")) assert(tree.root.children.size == 2) assert(tree.root.children.contains("a")) assert(tree.root.children("a").item.equals("a")) assert(tree.root.children("a").count == 2) assert(tree.root.children.contains("b")) assert(tree.root.children("b").item.equals("b")) assert(tree.root.children("b").count == 1) var child = tree.root.children("a") assert(child.children.size == 1) assert(child.children.contains("b")) assert(child.children("b").item.equals("b")) assert(child.children("b").count == 2) child = child.children("b") assert(child.children.size == 2) assert(child.children.contains("c")) assert(child.children.contains("y")) assert(child.children("c").item.equals("c")) assert(child.children("y").item.equals("y")) assert(child.children("c").count == 1) assert(child.children("y").count == 1) } test("merge tree") { val tree1 = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("b")) val tree2 = new FPTree[String] .add(Seq("a", "b")) .add(Seq("a", "b", "c")) .add(Seq("a", "b", "c", "d")) .add(Seq("a", "x")) .add(Seq("a", "x", "y")) .add(Seq("c", "n")) .add(Seq("c", "m")) val tree3 = tree1.merge(tree2) assert(tree3.root.children.size == 3) assert(tree3.root.children("a").count == 7) assert(tree3.root.children("b").count == 1) assert(tree3.root.children("c").count == 2) val child1 = tree3.root.children("a") assert(child1.children.size == 2) assert(child1.children("b").count == 5) assert(child1.children("x").count == 2) val child2 = child1.children("b") assert(child2.children.size == 2) assert(child2.children("y").count == 1) assert(child2.children("c").count == 3) val child3 = child2.children("c") assert(child3.children.size == 1) assert(child3.children("d").count == 1) val child4 = child1.children("x") assert(child4.children.size == 1) assert(child4.children("y").count == 1) val child5 = tree3.root.children("c") assert(child5.children.size == 2) assert(child5.children("n").count == 1) assert(child5.children("m").count == 1) } test("extract freq itemsets") { val tree = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("a", "b")) .add(Seq("a")) .add(Seq("b")) .add(Seq("b", "n")) val freqItemsets = tree.extract(3L).map { case (items, count) => (items.toSet, count) }.toSet val expected = Set( (Set("a"), 4L), (Set("b"), 5L), (Set("a", "b"), 3L)) assert(freqItemsets === expected) } }
Example 15
Source File: AssociationRulesSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.fpm

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("association rules using String type") {
    val freqItemsets = sc.parallelize(Seq(
      (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L),
      (Set("r"), 3L),
      (Set("x", "z"), 3L), (Set("t", "y"), 3L), (Set("t", "x"), 3L), (Set("s", "x"), 3L),
      (Set("y", "x"), 3L), (Set("y", "z"), 3L), (Set("t", "z"), 3L),
      (Set("y", "x", "z"), 3L), (Set("t", "x", "z"), 3L), (Set("t", "y", "z"), 3L),
      (Set("t", "y", "x"), 3L),
      (Set("t", "y", "x", "z"), 3L)
    ).map {
      case (items, freq) => new FPGrowth.FreqItemset(items.toArray, freq)
    })

    val ar = new AssociationRules()

    // Rules that clear a high confidence threshold.
    val results1 = ar
      .setMinConfidence(0.9)
      .run(freqItemsets)
      .collect()

    // With the confidence threshold lowered to 0, every candidate rule is returned;
    // the assertions below refer to this unrestricted result set.
    val results2 = ar
      .setMinConfidence(0)
      .run(freqItemsets)
      .collect()

    assert(results2.size === 30)
    assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
  }
}
Example 16
Source File: KernelDensitySuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat import org.apache.commons.math3.distribution.NormalDistribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext { test("kernel density single sample") { val rdd = sc.parallelize(Array(5.0)) val evaluationPoints = Array(5.0, 6.0) val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints) val normal = new NormalDistribution(5.0, 3.0) val acceptableErr = 1e-6 assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr) assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr) } test("kernel density multiple samples") { val rdd = sc.parallelize(Array(5.0, 10.0)) val evaluationPoints = Array(5.0, 6.0) val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints) val normal1 = new NormalDistribution(5.0, 3.0) val normal2 = new NormalDistribution(10.0, 3.0) val acceptableErr = 1e-6 assert(math.abs( densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr) assert(math.abs( densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr) } }
Example 17
Source File: MultivariateGaussianSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{ Vectors, Matrices } import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) val mu = Vectors.dense(0.0) val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) } test("SPARK-11302") { val x = Vectors.dense(629, 640, 1.7188, 618.19) val mu = Vectors.dense( 1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697) val sigma = Matrices.dense(4, 4, Array( 166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053, 169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484, 12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373, 164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207)) val dist = new MultivariateGaussian(mu, sigma) // Agrees with R's dmvnorm: 7.154782e-05 assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9) } }
Example 18
Source File: CoordinateMatrixSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.linalg.Vectors class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { val m = 5 val n = 4 var mat: CoordinateMatrix = _ override def beforeAll() { super.beforeAll() val entries = sc.parallelize(Seq( (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0), (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map { case (i, j, value) => MatrixEntry(i, j, value) } mat = new CoordinateMatrix(entries) } test("size") { assert(mat.numRows() === m) assert(mat.numCols() === n) } test("empty entries") { val entries = sc.parallelize(Seq[MatrixEntry](), 1) val emptyMat = new CoordinateMatrix(entries) intercept[RuntimeException] { emptyMat.numCols() } intercept[RuntimeException] { emptyMat.numRows() } } test("toBreeze") { val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(mat.toBreeze() === expected) } test("transpose") { val transposed = mat.transpose() assert(mat.toBreeze().t === transposed.toBreeze()) } test("toIndexedRowMatrix") { val indexedRowMatrix = mat.toIndexedRowMatrix() val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(indexedRowMatrix.toBreeze() === expected) } test("toRowMatrix") { val rowMatrix = mat.toRowMatrix() val rows = rowMatrix.rows.collect().toSet val expected = Set( Vectors.dense(1.0, 2.0, 0.0, 0.0), Vectors.dense(0.0, 3.0, 4.0, 0.0), Vectors.dense(0.0, 0.0, 5.0, 6.0), Vectors.dense(7.0, 0.0, 0.0, 8.0), Vectors.dense(0.0, 9.0, 0.0, 0.0)) assert(rows === expected) } test("toBlockMatrix") { val blockMat = mat.toBlockMatrix(2, 2) assert(blockMat.numRows() === m) assert(blockMat.numCols() === n) assert(blockMat.toBreeze() === mat.toBreeze()) intercept[IllegalArgumentException] { mat.toBlockMatrix(-1, 2) } intercept[IllegalArgumentException] { mat.toBlockMatrix(2, 0) } } }
Example 19
Source File: MLPairRDDFunctionsSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ class MLPairRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { test("topByKey") { val topMap = sc.parallelize(Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7), (5, 1), (3, 5)), 2) .topByKey(5) .collectAsMap() assert(topMap.size === 3) assert(topMap(1) === Array(7, 6, 3, 2, 1)) assert(topMap(3) === Array(7, 5, 2)) assert(topMap(5) === Array(1)) } }
Example 20
Source File: RDDFunctionsSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.rdd.RDDFunctions._ class RDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { test("sliding") { val data = 0 until 6 for (numPartitions <- 1 to 8) { val rdd = sc.parallelize(data, numPartitions) for (windowSize <- 1 to 6) { for (step <- 1 to 3) { val sliding = rdd.sliding(windowSize, step).collect().map(_.toList).toList val expected = data.sliding(windowSize, step) .map(_.toList).toList.filter(l => l.size == windowSize) assert(sliding === expected) } } assert(rdd.sliding(7).collect().isEmpty, "Should return an empty RDD if the window size is greater than the number of items.") } } test("sliding with empty partitions") { val data = Seq(Seq(1, 2, 3), Seq.empty[Int], Seq(4), Seq.empty[Int], Seq(5, 6, 7)) val rdd = sc.parallelize(data, data.length).flatMap(s => s) assert(rdd.partitions.length === data.length) val sliding = rdd.sliding(3).collect().toSeq.map(_.toSeq) val expected = data.flatMap(x => x).sliding(3).toSeq.map(_.toSeq) assert(sliding === expected) } }
Example 21
Source File: QuantileDiscretizerSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.attribute.{Attribute, NominalAttribute} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.{SparkContext, SparkFunSuite} class QuantileDiscretizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import org.apache.spark.ml.feature.QuantileDiscretizerSuite._ test("Test quantile discretizer") { checkDiscretizedData(sc, Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3), 10, Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3), Array("-Infinity, 1.0", "1.0, 2.0", "2.0, 3.0", "3.0, Infinity")) checkDiscretizedData(sc, Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3), 4, Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3), Array("-Infinity, 1.0", "1.0, 2.0", "2.0, 3.0", "3.0, Infinity")) checkDiscretizedData(sc, Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3), 3, Array[Double](0, 1, 2, 2, 2, 2, 2, 2, 2), Array("-Infinity, 2.0", "2.0, 3.0", "3.0, Infinity")) checkDiscretizedData(sc, Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3), 2, Array[Double](0, 1, 1, 1, 1, 1, 1, 1, 1), Array("-Infinity, 2.0", "2.0, Infinity")) } test("Test getting splits") { val splitTestPoints = Array( Array[Double]() -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity), Array(Double.NegativeInfinity) -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity), Array(Double.PositiveInfinity) -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity), Array(Double.NegativeInfinity, Double.PositiveInfinity) -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity), Array(0.0) -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity), Array(1.0) -> Array(Double.NegativeInfinity, 1, Double.PositiveInfinity), Array(0.0, 1.0) -> Array(Double.NegativeInfinity, 0, 1, Double.PositiveInfinity) ) for ((ori, res) <- splitTestPoints) { assert(QuantileDiscretizer.getSplits(ori) === res, "Returned splits are invalid.") } } test("read/write") { val t = new QuantileDiscretizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setNumBuckets(6) testDefaultReadWrite(t) } } private object QuantileDiscretizerSuite extends SparkFunSuite { def checkDiscretizedData( sc: SparkContext, data: Array[Double], numBucket: Int, expectedResult: Array[Double], expectedAttrs: Array[String]): Unit = { val sqlCtx = SQLContext.getOrCreate(sc) import sqlCtx.implicits._ val df = sc.parallelize(data.map(Tuple1.apply)).toDF("input") val discretizer = new QuantileDiscretizer().setInputCol("input").setOutputCol("result") .setNumBuckets(numBucket) val result = discretizer.fit(df).transform(df) val transformedFeatures = result.select("result").collect() .map { case Row(transformedFeature: Double) => transformedFeature } val transformedAttrs = Attribute.fromStructField(result.schema("result")) .asInstanceOf[NominalAttribute].values.get assert(transformedFeatures === expectedResult, "Transformed features do not equal expected features.") assert(transformedAttrs === expectedAttrs, "Transformed attributes do not equal expected attributes.") } }
Example 22
Source File: BinarizerSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { @transient var data: Array[Double] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4) } test("params") { ParamsSuite.checkParams(new Binarizer) } test("Binarize continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = sqlContext.createDataFrame( data.zip(defaultBinarized)).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize continuous features with setter") { val threshold: Double = 0.2 val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = sqlContext.createDataFrame( data.zip(thresholdBinarized)).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("read/write") { val t = new Binarizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setThreshold(0.1) testDefaultReadWrite(t) } }
Example 23
Source File: SQLTransformerSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext class SQLTransformerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new SQLTransformer()) } test("transform numeric data") { val original = sqlContext.createDataFrame( Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2") val sqlTrans = new SQLTransformer().setStatement( "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__") val result = sqlTrans.transform(original) val resultSchema = sqlTrans.transformSchema(original.schema) val expected = sqlContext.createDataFrame( Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0))) .toDF("id", "v1", "v2", "v3", "v4") assert(result.schema.toString == resultSchema.toString) assert(resultSchema == expected.schema) assert(result.collect().toSeq == expected.collect().toSeq) } test("read/write") { val t = new SQLTransformer() .setStatement("select * from __THIS__") testDefaultReadWrite(t) } }
Example 24
Source File: TokenizerSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} @BeanInfo case class TokenizerTestData(rawText: String, wantedTokens: Array[String]) class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new Tokenizer) } test("read/write") { val t = new Tokenizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } } class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import org.apache.spark.ml.feature.RegexTokenizerSuite._ test("params") { ParamsSuite.checkParams(new RegexTokenizer) } test("RegexTokenizer") { val tokenizer0 = new RegexTokenizer() .setGaps(false) .setPattern("\\w+|\\p{Punct}") .setInputCol("rawText") .setOutputCol("tokens") val dataset0 = sqlContext.createDataFrame(Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")), TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct")) )) testRegexTokenizer(tokenizer0, dataset0) val dataset1 = sqlContext.createDataFrame(Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")), TokenizerTestData("Te,st. punct", Array("punct")) )) tokenizer0.setMinTokenLength(3) testRegexTokenizer(tokenizer0, dataset1) val tokenizer2 = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") val dataset2 = sqlContext.createDataFrame(Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")), TokenizerTestData("Te,st. punct", Array("te,st.", "punct")) )) testRegexTokenizer(tokenizer2, dataset2) } test("RegexTokenizer with toLowercase false") { val tokenizer = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") .setToLowercase(false) val dataset = sqlContext.createDataFrame(Seq( TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")), TokenizerTestData("java scala", Array("java", "scala")) )) testRegexTokenizer(tokenizer, dataset) } test("read/write") { val t = new RegexTokenizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMinTokenLength(2) .setGaps(false) .setPattern("hi") .setToLowercase(false) testDefaultReadWrite(t) } } object RegexTokenizerSuite extends SparkFunSuite { def testRegexTokenizer(t: RegexTokenizer, dataset: DataFrame): Unit = { t.transform(dataset) .select("tokens", "wantedTokens") .collect() .foreach { case Row(tokens, wantedTokens) => assert(tokens === wantedTokens) } } }
Example 25
Source File: MinMaxScalerSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SQLContext} class MinMaxScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("MinMaxScaler fit basic case") { val sqlContext = new SQLContext(sc) val data = Array( Vectors.dense(1, 0, Long.MinValue), Vectors.dense(2, 0, 0), Vectors.sparse(3, Array(0, 2), Array(3, Long.MaxValue)), Vectors.sparse(3, Array(0), Array(1.5))) val expected: Array[Vector] = Array( Vectors.dense(-5, 0, -5), Vectors.dense(0, 0, 0), Vectors.sparse(3, Array(0, 2), Array(5, 5)), Vectors.sparse(3, Array(0), Array(-2.5))) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("scaled") .setMin(-5) .setMax(5) val model = scaler.fit(df) model.transform(df).select("expected", "scaled").collect() .foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1.equals(vector2), "Transformed vector is different with expected.") } // copied model must have the same parent. MLTestingUtils.checkCopy(model) } test("MinMaxScaler arguments max must be larger than min") { withClue("arguments max must be larger than min") { intercept[IllegalArgumentException] { val scaler = new MinMaxScaler().setMin(10).setMax(0) scaler.validateParams() } intercept[IllegalArgumentException] { val scaler = new MinMaxScaler().setMin(0).setMax(0) scaler.validateParams() } } } test("MinMaxScaler read/write") { val t = new MinMaxScaler() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMax(1.0) .setMin(-1.0) testDefaultReadWrite(t) } test("MinMaxScalerModel read/write") { val instance = new MinMaxScalerModel( "myMinMaxScalerModel", Vectors.dense(-1.0, 0.0), Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMin(-1.0) .setMax(1.0) val newInstance = testDefaultReadWrite(instance) assert(newInstance.originalMin === instance.originalMin) assert(newInstance.originalMax === instance.originalMax) } }
Example 26
Source File: PolynomialExpansionSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.param.ParamsSuite import org.scalatest.exceptions.TestFailedException import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new PolynomialExpansion) } test("Polynomial expansion with default parameter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val twoDegreeExpansion: Array[Vector] = Array( Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)), Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29), Vectors.dense(new Array[Double](9)), Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0), Vectors.sparse(9, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } test("Polynomial expansion with setter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val threeDegreeExpansion: Array[Vector] = Array( Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8), Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)), Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17), Vectors.dense(new Array[Double](19)), Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8, -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0), Vectors.sparse(19, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } test("read/write") { val t = new PolynomialExpansion() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setDegree(3) testDefaultReadWrite(t) } }
Example 27
Source File: IDFSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class IDFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = { dataSet.map { case data: DenseVector => val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y } Vectors.dense(res) case data: SparseVector => val res = data.indices.zip(data.values).map { case (id, value) => (id, value * model(id)) } Vectors.sparse(data.size, res) } } test("params") { ParamsSuite.checkParams(new IDF) val model = new IDFModel("idf", new OldIDFModel(Vectors.dense(1.0))) ParamsSuite.checkParams(model) } test("compute IDF with default parameter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((numOfData + 1.0) / (x + 1.0)) }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("compute IDF with setter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0 }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .setMinDocFreq(1) .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("IDF read/write") { val t = new IDF() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMinDocFreq(5) testDefaultReadWrite(t) } test("IDFModel read/write") { val instance = new IDFModel("myIDFModel", new OldIDFModel(Vectors.dense(1.0, 2.0))) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) assert(newInstance.idf === instance.idf) } }
Example 28
Source File: NGramSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} @BeanInfo case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String]) class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import org.apache.spark.ml.feature.NGramSuite._ test("default behavior yields bigram features") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") val dataset = sqlContext.createDataFrame(Seq( NGramTestData( Array("Test", "for", "ngram", "."), Array("Test for", "for ngram", "ngram .") ))) testNGram(nGram, dataset) } test("NGramLength=4 yields length 4 n-grams") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(4) val dataset = sqlContext.createDataFrame(Seq( NGramTestData( Array("a", "b", "c", "d", "e"), Array("a b c d", "b c d e") ))) testNGram(nGram, dataset) } test("empty input yields empty output") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(4) val dataset = sqlContext.createDataFrame(Seq( NGramTestData( Array(), Array() ))) testNGram(nGram, dataset) } test("input array < n yields empty output") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(6) val dataset = sqlContext.createDataFrame(Seq( NGramTestData( Array("a", "b", "c", "d", "e"), Array() ))) testNGram(nGram, dataset) } test("read/write") { val t = new NGram() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setN(3) testDefaultReadWrite(t) } } object NGramSuite extends SparkFunSuite { def testNGram(t: NGram, dataset: DataFrame): Unit = { t.transform(dataset) .select("nGrams", "wantedNGrams") .collect() .foreach { case Row(actualNGrams, wantedNGrams) => assert(actualNGrams === wantedNGrams) } } }
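As a companion to the suite above, here is a hedged sketch of NGram in ordinary code (the tokens and column names are invented; a 1.x-style sqlContext is assumed to be in scope):

// Sketch: produce trigrams from a pre-tokenized column.
import org.apache.spark.ml.feature.NGram

val tokenized = sqlContext.createDataFrame(Seq(
  (0, Seq("spark", "ml", "feature", "transformers", "compose"))
)).toDF("id", "tokens")

val trigram = new NGram()
  .setN(3)
  .setInputCol("tokens")
  .setOutputCol("trigrams")

trigram.transform(tokenized).select("trigrams").collect().foreach(println)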
Example 29
Source File: PCASuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.mllib.feature.{PCAModel => OldPCAModel} import org.apache.spark.sql.Row class PCASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new PCA) val mat = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix] val model = new PCAModel("pca", mat) ParamsSuite.checkParams(model) } test("pca") { val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) val dataRDD = sc.parallelize(data, 2) val mat = new RowMatrix(dataRDD) val pc = mat.computePrincipalComponents(3) val expected = mat.multiply(pc).rows val df = sqlContext.createDataFrame(dataRDD.zip(expected)).toDF("features", "expected") val pca = new PCA() .setInputCol("features") .setOutputCol("pca_features") .setK(3) .fit(df) // copied model must have the same parent. MLTestingUtils.checkCopy(pca) pca.transform(df).select("pca_features", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("PCA read/write") { val t = new PCA() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setK(3) testDefaultReadWrite(t) } test("PCAModel read/write") { val instance = new PCAModel("myPCAModel", Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix]) val newInstance = testDefaultReadWrite(instance) assert(newInstance.pc === instance.pc) } }
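Outside a test, the same estimator/model pair is typically driven as in this sketch (the data and the choice of k are arbitrary; sqlContext is assumed to be available):

// Sketch: fit PCA on a small DataFrame and project onto the top 2 principal components.
import org.apache.spark.ml.feature.PCA
import org.apache.spark.mllib.linalg.Vectors

val df = sqlContext.createDataFrame(Seq(
  Tuple1(Vectors.dense(1.0, 0.0, 7.0, 0.0, 5.0)),
  Tuple1(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
  Tuple1(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
)).toDF("features")

val pcaModel = new PCA()
  .setInputCol("features")
  .setOutputCol("pcaFeatures")
  .setK(2)
  .fit(df)

pcaModel.transform(df).select("pcaFeatures").collect().foreach(println)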
Example 30
Source File: HashingTFSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.util.Utils class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new HashingTF) } test("hashingTF") { val df = sqlContext.createDataFrame(Seq( (0, "a a b b c d".split(" ").toSeq) )).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) val output = hashingTF.transform(df) val attrGroup = AttributeGroup.fromStructField(output.schema("features")) require(attrGroup.numAttributes === Some(n)) val features = output.select("features").first().getAs[Vector](0) // Assume perfect hash on "a", "b", "c", and "d". def idx(any: Any): Int = Utils.nonNegativeMod(any.##, n) val expected = Vectors.sparse(n, Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0))) assert(features ~== expected absTol 1e-14) } test("read/write") { val t = new HashingTF() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setNumFeatures(10) testDefaultReadWrite(t) } }
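The hashed term-frequency vector produced here is usually fed into IDF to build TF-IDF features; a sketch of that two-step flow under the same assumptions (hypothetical documents, 1024 hash buckets):

// Sketch: HashingTF followed by IDF on tokenized documents.
import org.apache.spark.ml.feature.{HashingTF, IDF}

val docs = sqlContext.createDataFrame(Seq(
  (0, "spark mllib hashing tf".split(" ").toSeq),
  (1, "spark spark idf".split(" ").toSeq)
)).toDF("id", "words")

val tf = new HashingTF()
  .setInputCol("words")
  .setOutputCol("rawFeatures")
  .setNumFeatures(1024)
val featurized = tf.transform(docs)

val idfModel = new IDF()
  .setInputCol("rawFeatures")
  .setOutputCol("features")
  .fit(featurized)

idfModel.transform(featurized).select("features").collect().foreach(println)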
Example 31
Source File: StopWordsRemoverSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} object StopWordsRemoverSuite extends SparkFunSuite { def testStopWordsRemover(t: StopWordsRemover, dataset: DataFrame): Unit = { t.transform(dataset) .select("filtered", "expected") .collect() .foreach { case Row(tokens, wantedTokens) => assert(tokens === wantedTokens) } } } class StopWordsRemoverSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import StopWordsRemoverSuite._ test("StopWordsRemover default") { val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") val dataSet = sqlContext.createDataFrame(Seq( (Seq("test", "test"), Seq("test", "test")), (Seq("a", "b", "c", "d"), Seq("b", "c", "d")), (Seq("a", "the", "an"), Seq()), (Seq("A", "The", "AN"), Seq()), (Seq(null), Seq(null)), (Seq(), Seq()) )).toDF("raw", "expected") testStopWordsRemover(remover, dataSet) } test("StopWordsRemover case sensitive") { val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") .setCaseSensitive(true) val dataSet = sqlContext.createDataFrame(Seq( (Seq("A"), Seq("A")), (Seq("The", "the"), Seq("The")) )).toDF("raw", "expected") testStopWordsRemover(remover, dataSet) } test("StopWordsRemover with additional words") { val stopWords = StopWords.English ++ Array("python", "scala") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") .setStopWords(stopWords) val dataSet = sqlContext.createDataFrame(Seq( (Seq("python", "scala", "a"), Seq()), (Seq("Python", "Scala", "swift"), Seq("swift")) )).toDF("raw", "expected") testStopWordsRemover(remover, dataSet) } test("read/write") { val t = new StopWordsRemover() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setStopWords(Array("the", "a")) .setCaseSensitive(true) testDefaultReadWrite(t) } test("StopWordsRemover output column already exists") { val outputCol = "expected" val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol(outputCol) val dataSet = sqlContext.createDataFrame(Seq( (Seq("The", "the", "swift"), Seq("swift")) )).toDF("raw", outputCol) val thrown = intercept[IllegalArgumentException] { testStopWordsRemover(remover, dataSet) } assert(thrown.getMessage == s"requirement failed: Column $outputCol already exists.") } }
Example 32
Source File: RandomForestSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.ml.classification.DecisionTreeClassificationModel import org.apache.spark.ml.impl.TreeTests import org.apache.spark.ml.tree.{ContinuousSplit, DecisionTreeModel, LeafNode, Node} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.tree.impurity.GiniCalculator import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.util.collection.OpenHashMap class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { import RandomForestSuite.mapToVec test("computeFeatureImportance, featureImportances") { val leftImp = new GiniCalculator(Array(3.0, 2.0, 1.0)) val left = new LeafNode(0.0, leftImp.calculate(), leftImp) val rightImp = new GiniCalculator(Array(1.0, 2.0, 5.0)) val right = new LeafNode(2.0, rightImp.calculate(), rightImp) val parent = TreeTests.buildParentNode(left, right, new ContinuousSplit(0, 0.5)) val parentImp = parent.impurityStats val left2Imp = new GiniCalculator(Array(1.0, 6.0, 1.0)) val left2 = new LeafNode(0.0, left2Imp.calculate(), left2Imp) val grandParent = TreeTests.buildParentNode(left2, parent, new ContinuousSplit(1, 1.0)) val grandImp = grandParent.impurityStats // Test feature importance computed at different subtrees. def testNode(node: Node, expected: Map[Int, Double]): Unit = { val map = new OpenHashMap[Int, Double]() RandomForest.computeFeatureImportance(node, map) assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01) } // Leaf node testNode(left, Map.empty[Int, Double]) // Internal node with 2 leaf children val feature0importance = parentImp.calculate() * parentImp.count - (leftImp.calculate() * leftImp.count + rightImp.calculate() * rightImp.count) testNode(parent, Map(0 -> feature0importance)) // Full tree val feature1importance = grandImp.calculate() * grandImp.count - (left2Imp.calculate() * left2Imp.count + parentImp.calculate() * parentImp.count) testNode(grandParent, Map(0 -> feature0importance, 1 -> feature1importance)) // Forest consisting of (full tree) + (internal node with 2 leafs) val trees = Array(parent, grandParent).map { root => new DecisionTreeClassificationModel(root, numFeatures = 2, numClasses = 3) .asInstanceOf[DecisionTreeModel] } val importances: Vector = RandomForest.featureImportances(trees, 2) val tree2norm = feature0importance + feature1importance val expected = Vectors.dense((1.0 + feature0importance / tree2norm) / 2.0, (feature1importance / tree2norm) / 2.0) assert(importances ~== expected relTol 0.01) } test("normalizeMapValues") { val map = new OpenHashMap[Int, Double]() map(0) = 1.0 map(2) = 2.0 RandomForest.normalizeMapValues(map) val expected = Map(0 -> 1.0 / 3.0, 2 -> 2.0 / 3.0) assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01) } } private object RandomForestSuite { def mapToVec(map: Map[Int, Double]): Vector = { val size = (map.keys.toSeq :+ 0).max + 1 val (indices, values) = map.toSeq.sortBy(_._1).unzip Vectors.sparse(size, indices.toArray, values.toArray) } }
Example 33
Source File: BinaryClassificationEvaluatorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext class BinaryClassificationEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new BinaryClassificationEvaluator) } test("read/write") { val evaluator = new BinaryClassificationEvaluator() .setRawPredictionCol("myRawPrediction") .setLabelCol("myLabel") .setMetricName("areaUnderPR") testDefaultReadWrite(evaluator) } }
Example 34
Source File: MulticlassClassificationEvaluatorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext class MulticlassClassificationEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new MulticlassClassificationEvaluator) } test("read/write") { val evaluator = new MulticlassClassificationEvaluator() .setPredictionCol("myPrediction") .setLabelCol("myLabel") .setMetricName("recall") testDefaultReadWrite(evaluator) } }
Example 35
Source File: RegressionEvaluatorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext} import org.apache.spark.mllib.util.TestingUtils._ class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new RegressionEvaluator) } test("Regression Evaluator: default params") { val trainer = new LinearRegression val model = trainer.fit(dataset) val predictions = model.transform(dataset) // default = rmse val evaluator = new RegressionEvaluator() assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001) // r2 score evaluator.setMetricName("r2") assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001) // mae evaluator.setMetricName("mae") assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001) } test("read/write") { val evaluator = new RegressionEvaluator() .setPredictionCol("myPrediction") .setLabelCol("myLabel") .setMetricName("r2") testDefaultReadWrite(evaluator) } }
Example 36
Source File: LibSVMRelationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import com.google.common.base.Charsets import com.google.common.io.Files import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { var tempDir: File = _ var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin tempDir = Utils.createTempDir() val file = new File(tempDir, "part-00000") Files.write(lines, file, Charsets.US_ASCII) path = tempDir.toURI.toString } override def afterAll(): Unit = { Utils.deleteRecursively(tempDir) super.afterAll() } test("select as sparse vector") { val df = sqlContext.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = sqlContext.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = sqlContext.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } }
Example 37
Source File: MultilayerPerceptronClassifierSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.classification.LogisticRegressionSuite._ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class MultilayerPerceptronClassifierSuite extends SparkFunSuite with MLlibTestSparkContext { test("XOR function learning as binary classification problem with two outputs.") { val dataFrame = sqlContext.createDataFrame(Seq( (Vectors.dense(0.0, 0.0), 0.0), (Vectors.dense(0.0, 1.0), 1.0), (Vectors.dense(1.0, 0.0), 1.0), (Vectors.dense(1.0, 1.0), 0.0)) ).toDF("features", "label") val layers = Array[Int](2, 5, 2) val trainer = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(1) .setSeed(11L) .setMaxIter(100) val model = trainer.fit(dataFrame) val result = model.transform(dataFrame) val predictionAndLabels = result.select("prediction", "label").collect() predictionAndLabels.foreach { case Row(p: Double, l: Double) => assert(p == l) } } // TODO: implement a more rigorous test test("3 class classification with 2 hidden layers") { val nPoints = 1000 // The following coefficients are taken from OneVsRestSuite.scala // they represent 3-class iris dataset val coefficients = Array( -0.57997, 0.912083, -0.371077, -0.819866, 2.688191, -0.16624, -0.84355, -0.048509, -0.301789, 4.170682) val xMean = Array(5.843, 3.057, 3.758, 1.199) val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) // the input seed is somewhat magic, to make this test pass val rdd = sc.parallelize(generateMultinomialLogisticInput( coefficients, xMean, xVariance, true, nPoints, 1), 2) val dataFrame = sqlContext.createDataFrame(rdd).toDF("label", "features") val numClasses = 3 val numIterations = 100 val layers = Array[Int](4, 5, 4, numClasses) val trainer = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(1) .setSeed(11L) // currently this seed is ignored .setMaxIter(numIterations) val model = trainer.fit(dataFrame) val numFeatures = dataFrame.select("features").first().getAs[Vector](0).size assert(model.numFeatures === numFeatures) val mlpPredictionAndLabels = model.transform(dataFrame).select("prediction", "label") .map { case Row(p: Double, l: Double) => (p, l) } // train multinomial logistic regression val lr = new LogisticRegressionWithLBFGS() .setIntercept(true) .setNumClasses(numClasses) lr.optimizer.setRegParam(0.0) .setNumIterations(numIterations) val lrModel = lr.run(rdd) val lrPredictionAndLabels = lrModel.predict(rdd.map(_.features)).zip(rdd.map(_.label)) // MLP's predictions should not differ a lot from LR's. val lrMetrics = new MulticlassMetrics(lrPredictionAndLabels) val mlpMetrics = new MulticlassMetrics(mlpPredictionAndLabels) assert(mlpMetrics.confusionMatrix ~== lrMetrics.confusionMatrix absTol 100) } }
Example 38
Source File: StopwatchSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util import java.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext { import StopwatchSuite._ private def testStopwatchOnDriver(sw: Stopwatch): Unit = { assert(sw.name === "sw") assert(sw.elapsed() === 0L) assert(!sw.isRunning) intercept[AssertionError] { sw.stop() } val duration = checkStopwatch(sw) val elapsed = sw.elapsed() assert(elapsed === duration) val duration2 = checkStopwatch(sw) val elapsed2 = sw.elapsed() assert(elapsed2 === duration + duration2) assert(sw.toString === s"sw: ${elapsed2}ms") sw.start() assert(sw.isRunning) intercept[AssertionError] { sw.start() } } test("LocalStopwatch") { val sw = new LocalStopwatch("sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on driver") { val sw = new DistributedStopwatch(sc, "sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on executors") { val sw = new DistributedStopwatch(sc, "sw") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.accumulator(0L) rdd.foreach { i => acc += checkStopwatch(sw) } assert(!sw.isRunning) val elapsed = sw.elapsed() assert(elapsed === acc.value) } test("MultiStopwatch") { val sw = new MultiStopwatch(sc) .addLocal("local") .addDistributed("spark") assert(sw("local").name === "local") assert(sw("spark").name === "spark") intercept[NoSuchElementException] { sw("some") } assert(sw.toString === "{\n local: 0ms,\n spark: 0ms\n}") val localDuration = checkStopwatch(sw("local")) val sparkDuration = checkStopwatch(sw("spark")) val localElapsed = sw("local").elapsed() val sparkElapsed = sw("spark").elapsed() assert(localElapsed === localDuration) assert(sparkElapsed === sparkDuration) assert(sw.toString === s"{\n local: ${localElapsed}ms,\n spark: ${sparkElapsed}ms\n}") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.accumulator(0L) rdd.foreach { i => sw("local").start() val duration = checkStopwatch(sw("spark")) sw("local").stop() acc += duration } val localElapsed2 = sw("local").elapsed() assert(localElapsed2 === localElapsed) val sparkElapsed2 = sw("spark").elapsed() assert(sparkElapsed2 === sparkElapsed + acc.value) } } private object StopwatchSuite extends SparkFunSuite { def checkStopwatch(sw: Stopwatch): Long = { val ubStart = now sw.start() val lbStart = now Thread.sleep(new Random().nextInt(10)) val lb = now - lbStart sw.stop() val ub = now - ubStart val duration = sw.elapsed() assert(duration >= lb && duration <= ub) duration } private def now: Long = System.currentTimeMillis() }
Example 39
Source File: WeightedLeastSquaresSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.Instance import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.rdd.RDD class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext { private var instances: RDD[Instance] = _ override def beforeAll(): Unit = { super.beforeAll() instances = sc.parallelize(Seq( Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), Instance(19.0, 2.0, Vectors.dense(1.0, 7.0)), Instance(23.0, 3.0, Vectors.dense(2.0, 11.0)), Instance(29.0, 4.0, Vectors.dense(3.0, 13.0)) ), 2) } test("WLS against glmnet") { val expected = Seq( Vectors.dense(0.0, -3.727117, 3.009982), Vectors.dense(0.0, -3.727117, 3.009982), Vectors.dense(0.0, -3.307532, 2.924206), Vectors.dense(0.0, -2.914790, 2.840627), Vectors.dense(0.0, -1.526575, 2.558158), Vectors.dense(0.0, 0.06984238, 2.20488344), Vectors.dense(18.0799727, 6.0799832, -0.5999941), Vectors.dense(18.0799727, 6.0799832, -0.5999941), Vectors.dense(13.5356178, 3.2714044, 0.3770744), Vectors.dense(14.064629, 3.565802, 0.269593), Vectors.dense(10.1238013, 0.9708569, 1.1475466), Vectors.dense(13.1860638, 2.1761382, 0.6213134)) var idx = 0 for (fitIntercept <- Seq(false, true); regParam <- Seq(0.0, 0.1, 1.0); standardizeFeatures <- Seq(false, true)) { val wls = new WeightedLeastSquares( fitIntercept, regParam, standardizeFeatures, standardizeLabel = true) .fit(instances) val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) assert(actual ~== expected(idx) absTol 1e-4) idx += 1 } } }
Example 40
Source File: ANNSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ANNSuite extends SparkFunSuite with MLlibTestSparkContext { // TODO: test for weights comparison with Weka MLP test("ANN with Sigmoid learns XOR function with LBFGS optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array(0.0, 1.0, 1.0, 0.0) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights() val trainer = new FeedForwardTrainer(topology, 2, 1) trainer.setWeights(initialWeights) trainer.LBFGSOptimizer.setNumIterations(20) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input)(0), label(0)) }.collect() predictionAndLabels.foreach { case (p, l) => assert(math.round(p) === l) } } test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array( Array(1.0, 0.0), Array(0.0, 1.0), Array(0.0, 1.0), Array(1.0, 0.0) ) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights() val trainer = new FeedForwardTrainer(topology, 2, 2) trainer.SGDOptimizer.setNumIterations(2000) trainer.setWeights(initialWeights) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input), label) }.collect() predictionAndLabels.foreach { case (p, l) => assert(p ~== l absTol 0.5) } } }
Example 41
Source File: MulticlassClassificationEvaluatorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext class MulticlassClassificationEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new MulticlassClassificationEvaluator) } test("read/write") { val evaluator = new MulticlassClassificationEvaluator() .setPredictionCol("myPrediction") .setLabelCol("myLabel") .setMetricName("accuracy") testDefaultReadWrite(evaluator) } test("should support all NumericType labels and not support other types") { MLTestingUtils.checkNumericTypes(new MulticlassClassificationEvaluator, spark) } }
Example 42
Source File: RegressionEvaluatorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext} import org.apache.spark.mllib.util.TestingUtils._ class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new RegressionEvaluator) } test("Regression Evaluator: default params") { val trainer = new LinearRegression val model = trainer.fit(dataset) val predictions = model.transform(dataset) // default = rmse val evaluator = new RegressionEvaluator() assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01) // r2 score evaluator.setMetricName("r2") assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01) // mae evaluator.setMetricName("mae") assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01) } test("read/write") { val evaluator = new RegressionEvaluator() .setPredictionCol("myPrediction") .setLabelCol("myLabel") .setMetricName("r2") testDefaultReadWrite(evaluator) } test("should support all NumericType labels and not support other types") { MLTestingUtils.checkNumericTypes(new RegressionEvaluator, spark) } }
Example 43
Source File: RWrapperUtilsSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.{RFormula, RFormulaModel} import org.apache.spark.mllib.util.MLlibTestSparkContext class RWrapperUtilsSuite extends SparkFunSuite with MLlibTestSparkContext { test("avoid libsvm data column name conflicting") { val rFormula = new RFormula().setFormula("label ~ features") val data = spark.read.format("libsvm").load("../data/mllib/sample_libsvm_data.txt") // if not checking column name, then IllegalArgumentException intercept[IllegalArgumentException] { rFormula.fit(data) } // after checking, model build is ok RWrapperUtils.checkDataColumns(rFormula, data) assert(rFormula.getLabelCol == "label") assert(rFormula.getFeaturesCol.startsWith("features_")) val model = rFormula.fit(data) assert(model.isInstanceOf[RFormulaModel]) assert(model.getLabelCol == "label") assert(model.getFeaturesCol.startsWith("features_")) } }
Example 44
Source File: PredictorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext { import PredictorSuite._ test("should support all NumericType labels and not support other types") { val df = spark.createDataFrame(Seq( (0, Vectors.dense(0, 2, 3)), (1, Vectors.dense(0, 3, 9)), (0, Vectors.dense(0, 2, 6)) )).toDF("label", "features") val types = Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0)) val predictor = new MockPredictor() types.foreach { t => predictor.fit(df.select(col("label").cast(t), col("features"))) } intercept[IllegalArgumentException] { predictor.fit(df.select(col("label").cast(StringType), col("features"))) } } } object PredictorSuite { class MockPredictor(override val uid: String) extends Predictor[Vector, MockPredictor, MockPredictionModel] { def this() = this(Identifiable.randomUID("mockpredictor")) override def train(dataset: Dataset[_]): MockPredictionModel = { require(dataset.schema("label").dataType == DoubleType) new MockPredictionModel(uid) } override def copy(extra: ParamMap): MockPredictor = throw new NotImplementedError() } class MockPredictionModel(override val uid: String) extends PredictionModel[Vector, MockPredictionModel] { def this() = this(Identifiable.randomUID("mockpredictormodel")) override def predict(features: Vector): Double = throw new NotImplementedError() override def copy(extra: ParamMap): MockPredictionModel = throw new NotImplementedError() } }
Example 45
Source File: LibSVMRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
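In application code the same data source is reached through the DataFrameReader; a minimal sketch (the path and feature count below are placeholders, and a SparkSession named spark is assumed):

// Sketch: read LIBSVM-formatted data into a DataFrame with "label" and "features" columns.
val df = spark.read
  .format("libsvm")
  .option("numFeatures", "6")
  .load("/path/to/data.libsvm")

df.select("label", "features").show(3, truncate = false)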
Example 46
Source File: StopwatchSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util import java.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext { import StopwatchSuite._ private def testStopwatchOnDriver(sw: Stopwatch): Unit = { assert(sw.name === "sw") assert(sw.elapsed() === 0L) assert(!sw.isRunning) intercept[AssertionError] { sw.stop() } val duration = checkStopwatch(sw) val elapsed = sw.elapsed() assert(elapsed === duration) val duration2 = checkStopwatch(sw) val elapsed2 = sw.elapsed() assert(elapsed2 === duration + duration2) assert(sw.toString === s"sw: ${elapsed2}ms") sw.start() assert(sw.isRunning) intercept[AssertionError] { sw.start() } } test("LocalStopwatch") { val sw = new LocalStopwatch("sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on driver") { val sw = new DistributedStopwatch(sc, "sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on executors") { val sw = new DistributedStopwatch(sc, "sw") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => acc.add(checkStopwatch(sw)) } assert(!sw.isRunning) val elapsed = sw.elapsed() assert(elapsed === acc.value) } test("MultiStopwatch") { val sw = new MultiStopwatch(sc) .addLocal("local") .addDistributed("spark") assert(sw("local").name === "local") assert(sw("spark").name === "spark") intercept[NoSuchElementException] { sw("some") } assert(sw.toString === "{\n local: 0ms,\n spark: 0ms\n}") val localDuration = checkStopwatch(sw("local")) val sparkDuration = checkStopwatch(sw("spark")) val localElapsed = sw("local").elapsed() val sparkElapsed = sw("spark").elapsed() assert(localElapsed === localDuration) assert(sparkElapsed === sparkDuration) assert(sw.toString === s"{\n local: ${localElapsed}ms,\n spark: ${sparkElapsed}ms\n}") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => sw("local").start() val duration = checkStopwatch(sw("spark")) sw("local").stop() acc.add(duration) } val localElapsed2 = sw("local").elapsed() assert(localElapsed2 === localElapsed) val sparkElapsed2 = sw("spark").elapsed() assert(sparkElapsed2 === sparkElapsed + acc.value) } } private object StopwatchSuite extends SparkFunSuite { def checkStopwatch(sw: Stopwatch): Long = { val ubStart = now sw.start() val lbStart = now Thread.sleep(new Random().nextInt(10)) val lb = now - lbStart sw.stop() val ub = now - ubStart val duration = sw.elapsed() assert(duration >= lb && duration <= ub) duration } private def now: Long = System.currentTimeMillis() }
Example 47
Source File: ANNSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext class ANNSuite extends SparkFunSuite with MLlibTestSparkContext { // TODO: test for weights comparison with Weka MLP test("ANN with Sigmoid learns XOR function with LBFGS optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array(0.0, 1.0, 1.0, 0.0) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights val trainer = new FeedForwardTrainer(topology, 2, 1) trainer.setWeights(initialWeights) trainer.LBFGSOptimizer.setNumIterations(20) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input)(0), label(0)) }.collect() predictionAndLabels.foreach { case (p, l) => assert(math.round(p) === l) } } test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array( Array(1.0, 0.0), Array(0.0, 1.0), Array(0.0, 1.0), Array(1.0, 0.0) ) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights val trainer = new FeedForwardTrainer(topology, 2, 2) // TODO: add a test for SGD trainer.LBFGSOptimizer.setConvergenceTol(1e-4).setNumIterations(20) trainer.setWeights(initialWeights).setStackSize(1) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input), label) }.collect() predictionAndLabels.foreach { case (p, l) => assert(p ~== l absTol 0.5) } } }
Example 48
Source File: GradientSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class GradientSuite extends SparkFunSuite with MLlibTestSparkContext { test("Gradient computation against numerical differentiation") { val input = new BDM[Double](3, 1, Array(1.0, 1.0, 1.0)) // output must contain zeros and one 1 for SoftMax val target = new BDM[Double](2, 1, Array(0.0, 1.0)) val topology = FeedForwardTopology.multiLayerPerceptron(Array(3, 4, 2), softmaxOnTop = false) val layersWithErrors = Seq( new SigmoidLayerWithSquaredError(), new SoftmaxLayerWithCrossEntropyLoss() ) // check all layers that provide loss computation // 1) compute loss and gradient given the model and initial weights // 2) modify weights with small number epsilon (per dimension i) // 3) compute new loss // 4) ((newLoss - loss) / epsilon) should be close to the i-th component of the gradient for (layerWithError <- layersWithErrors) { topology.layers(topology.layers.length - 1) = layerWithError val model = topology.model(seed = 12L) val weights = model.weights.toArray val numWeights = weights.size val gradient = Vectors.dense(Array.fill[Double](numWeights)(0.0)) val loss = model.computeGradient(input, target, gradient, 1) val eps = 1e-4 var i = 0 val tol = 1e-4 while (i < numWeights) { val originalValue = weights(i) weights(i) += eps val newModel = topology.model(Vectors.dense(weights)) val newLoss = computeLoss(input, target, newModel) val derivativeEstimate = (newLoss - loss) / eps assert(math.abs(gradient(i) - derivativeEstimate) < tol, "Layer failed gradient check: " + layerWithError.getClass) weights(i) = originalValue i += 1 } } } private def computeLoss(input: BDM[Double], target: BDM[Double], model: TopologyModel): Double = { val outputs = model.forward(input) model.layerModels.last match { case layerWithLoss: LossFunction => layerWithLoss.loss(outputs.last, target, new BDM[Double](target.rows, target.cols)) case _ => throw new UnsupportedOperationException("Top layer is required to have loss." + " Failed layer:" + model.layerModels.last.getClass) } } }
Example 49
Source File: ChiSqSelectorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { test("ChiSqSelector transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = Seq(LabeledPoint(0.0, Vectors.dense(Array(8.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSeq assert(filteredData === preFilteredData) } test("ChiSqSelector by fpr transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2) val preFilteredData = Seq(LabeledPoint(0.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(2.0, Vectors.dense(Array(9.0)))) val model: ChiSqSelectorModel = new ChiSqSelector().setSelectorType("fpr") .setFpr(0.1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSeq assert(filteredData === preFilteredData) } test("model load / save") { val model = ChiSqSelectorSuite.createModel() val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString try { model.save(sc, path) val sameModel = ChiSqSelectorModel.load(sc, path) ChiSqSelectorSuite.checkEqual(model, sameModel) } finally { Utils.deleteRecursively(tempDir) } } } object ChiSqSelectorSuite extends SparkFunSuite { def createModel(): ChiSqSelectorModel = { val arr = Array(1, 2, 3, 4) new ChiSqSelectorModel(arr) } def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = { assert(a.selectedFeatures.deep == b.selectedFeatures.deep) } }
Example 50
Source File: ElementwiseProductSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 51
Source File: IDFSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 52
Source File: PCASuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } }
Example 53
Source File: HashingTFSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext { test("hashing tf on a single doc") { val hashingTF = new HashingTF(1000) val doc = "a a b b c d".split(" ") val n = hashingTF.numFeatures val termFreqs = Seq( (hashingTF.indexOf("a"), 2.0), (hashingTF.indexOf("b"), 2.0), (hashingTF.indexOf("c"), 1.0), (hashingTF.indexOf("d"), 1.0)) assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n), "index must be in range [0, #features)") assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing") val expected = Vectors.sparse(n, termFreqs) assert(hashingTF.transform(doc) === expected) } test("hashing tf on an RDD") { val hashingTF = new HashingTF val localDocs: Seq[Seq[String]] = Seq( "a a b b b c d".split(" "), "a b c d a b c".split(" "), "c b a c b a a".split(" ")) val docs = sc.parallelize(localDocs, 2) assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet) } test("applying binary term freqs") { val hashingTF = new HashingTF(100).setBinary(true) val doc = "a a b c c c".split(" ") val n = hashingTF.numFeatures val expected = Vectors.sparse(n, Seq( (hashingTF.indexOf("a"), 1.0), (hashingTF.indexOf("b"), 1.0), (hashingTF.indexOf("c"), 1.0))) assert(hashingTF.transform(doc) ~== expected absTol 1e-14) } }
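In the RDD-based API this class is the first half of a TF-IDF pipeline; a short sketch under that assumption (the documents are invented and sc is the test SparkContext):

// Sketch: RDD-based TF-IDF with mllib HashingTF and IDF.
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

val documents: RDD[Seq[String]] = sc.parallelize(Seq(
  "a a b b c d".split(" ").toSeq,
  "a b c".split(" ").toSeq))

val hashingTF = new HashingTF(1000)
val tf: RDD[Vector] = hashingTF.transform(documents)
tf.cache()
val tfidf: RDD[Vector] = new IDF().fit(tf).transform(tf)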
Example 54
Source File: MatrixFactorizationModelSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.recommendation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils class MatrixFactorizationModelSuite extends SparkFunSuite with MLlibTestSparkContext { val rank = 2 var userFeatures: RDD[(Int, Array[Double])] = _ var prodFeatures: RDD[(Int, Array[Double])] = _ override def beforeAll(): Unit = { super.beforeAll() userFeatures = sc.parallelize(Seq((0, Array(1.0, 2.0)), (1, Array(3.0, 4.0)))) prodFeatures = sc.parallelize(Seq((2, Array(5.0, 6.0)))) } test("constructor") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) assert(model.predict(0, 2) ~== 17.0 relTol 1e-14) intercept[IllegalArgumentException] { new MatrixFactorizationModel(1, userFeatures, prodFeatures) } val userFeatures1 = sc.parallelize(Seq((0, Array(1.0)), (1, Array(3.0)))) intercept[IllegalArgumentException] { new MatrixFactorizationModel(rank, userFeatures1, prodFeatures) } val prodFeatures1 = sc.parallelize(Seq((2, Array(5.0)))) intercept[IllegalArgumentException] { new MatrixFactorizationModel(rank, userFeatures, prodFeatures1) } } test("save/load") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString def collect(features: RDD[(Int, Array[Double])]): Set[(Int, Seq[Double])] = { features.mapValues(_.toSeq).collect().toSet } try { model.save(sc, path) val newModel = MatrixFactorizationModel.load(sc, path) assert(newModel.rank === rank) assert(collect(newModel.userFeatures) === collect(userFeatures)) assert(collect(newModel.productFeatures) === collect(prodFeatures)) } finally { Utils.deleteRecursively(tempDir) } } test("batch predict API recommendProductsForUsers") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val topK = 10 val recommendations = model.recommendProductsForUsers(topK).collectAsMap() assert(recommendations(0)(0).rating ~== 17.0 relTol 1e-14) assert(recommendations(1)(0).rating ~== 39.0 relTol 1e-14) } test("batch predict API recommendUsersForProducts") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val topK = 10 val recommendations = model.recommendUsersForProducts(topK).collectAsMap() assert(recommendations(2)(0).user == 1) assert(recommendations(2)(0).rating ~== 39.0 relTol 1e-14) assert(recommendations(2)(1).user == 0) assert(recommendations(2)(1).rating ~== 17.0 relTol 1e-14) } }
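In practice a MatrixFactorizationModel comes out of ALS rather than being built by hand; a hedged sketch with invented ratings and arbitrary training parameters:

// Sketch: train ALS and query the resulting MatrixFactorizationModel.
import org.apache.spark.mllib.recommendation.{ALS, Rating}

val ratings = sc.parallelize(Seq(
  Rating(0, 2, 3.0), Rating(0, 3, 1.0),
  Rating(1, 2, 4.0), Rating(1, 3, 2.0)))

val model = ALS.train(ratings, 2, 5, 0.01)
val score = model.predict(0, 2)
val topTwoForUser0 = model.recommendProducts(0, 2)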
Example 55
Source File: RankingMetricsSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { test("Ranking metrics: MAP, NDCG") { val predictionAndLabels = sc.parallelize( Seq( (Array(1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Array(1, 2, 3, 4, 5)), (Array(4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Array(1, 2, 3)), (Array(1, 2, 3, 4, 5), Array.empty[Int]) ), 2) val eps = 1.0E-5 val metrics = new RankingMetrics(predictionAndLabels) val map = metrics.meanAveragePrecision assert(metrics.precisionAt(1) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(2) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(3) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(4) ~== 0.75/3 absTol eps) assert(metrics.precisionAt(5) ~== 0.8/3 absTol eps) assert(metrics.precisionAt(10) ~== 0.8/3 absTol eps) assert(metrics.precisionAt(15) ~== 8.0/45 absTol eps) assert(map ~== 0.355026 absTol eps) assert(metrics.ndcgAt(3) ~== 1.0/3 absTol eps) assert(metrics.ndcgAt(5) ~== 0.328788 absTol eps) assert(metrics.ndcgAt(10) ~== 0.487913 absTol eps) assert(metrics.ndcgAt(15) ~== metrics.ndcgAt(10) absTol eps) } test("MAP, NDCG with few predictions (SPARK-14886)") { val predictionAndLabels = sc.parallelize( Seq( (Array(1, 6, 2), Array(1, 2, 3, 4, 5)), (Array.empty[Int], Array(1, 2, 3)) ), 2) val eps = 1.0E-5 val metrics = new RankingMetrics(predictionAndLabels) assert(metrics.precisionAt(1) ~== 0.5 absTol eps) assert(metrics.precisionAt(2) ~== 0.25 absTol eps) assert(metrics.ndcgAt(1) ~== 0.5 absTol eps) assert(metrics.ndcgAt(2) ~== 0.30657 absTol eps) } }
Example 56
Source File: AreaUnderCurveSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class AreaUnderCurveSuite extends SparkFunSuite with MLlibTestSparkContext { test("auc computation") { val curve = Seq((0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 0.0)) val auc = 4.0 assert(AreaUnderCurve.of(curve) ~== auc absTol 1E-5) val rddCurve = sc.parallelize(curve, 2) assert(AreaUnderCurve.of(rddCurve) ~== auc absTol 1E-5) } test("auc of an empty curve") { val curve = Seq.empty[(Double, Double)] assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5) val rddCurve = sc.parallelize(curve, 2) assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5) } test("auc of a curve with a single point") { val curve = Seq((1.0, 1.0)) assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5) val rddCurve = sc.parallelize(curve, 2) assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5) } }
Example 57
Source File: FPTreeSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.fpm import scala.language.existentials import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class FPTreeSuite extends SparkFunSuite with MLlibTestSparkContext { test("add transaction") { val tree = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("b")) assert(tree.root.children.size == 2) assert(tree.root.children.contains("a")) assert(tree.root.children("a").item.equals("a")) assert(tree.root.children("a").count == 2) assert(tree.root.children.contains("b")) assert(tree.root.children("b").item.equals("b")) assert(tree.root.children("b").count == 1) var child = tree.root.children("a") assert(child.children.size == 1) assert(child.children.contains("b")) assert(child.children("b").item.equals("b")) assert(child.children("b").count == 2) child = child.children("b") assert(child.children.size == 2) assert(child.children.contains("c")) assert(child.children.contains("y")) assert(child.children("c").item.equals("c")) assert(child.children("y").item.equals("y")) assert(child.children("c").count == 1) assert(child.children("y").count == 1) } test("merge tree") { val tree1 = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("b")) val tree2 = new FPTree[String] .add(Seq("a", "b")) .add(Seq("a", "b", "c")) .add(Seq("a", "b", "c", "d")) .add(Seq("a", "x")) .add(Seq("a", "x", "y")) .add(Seq("c", "n")) .add(Seq("c", "m")) val tree3 = tree1.merge(tree2) assert(tree3.root.children.size == 3) assert(tree3.root.children("a").count == 7) assert(tree3.root.children("b").count == 1) assert(tree3.root.children("c").count == 2) val child1 = tree3.root.children("a") assert(child1.children.size == 2) assert(child1.children("b").count == 5) assert(child1.children("x").count == 2) val child2 = child1.children("b") assert(child2.children.size == 2) assert(child2.children("y").count == 1) assert(child2.children("c").count == 3) val child3 = child2.children("c") assert(child3.children.size == 1) assert(child3.children("d").count == 1) val child4 = child1.children("x") assert(child4.children.size == 1) assert(child4.children("y").count == 1) val child5 = tree3.root.children("c") assert(child5.children.size == 2) assert(child5.children("n").count == 1) assert(child5.children("m").count == 1) } test("extract freq itemsets") { val tree = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("a", "b")) .add(Seq("a")) .add(Seq("b")) .add(Seq("b", "n")) val freqItemsets = tree.extract(3L).map { case (items, count) => (items.toSet, count) }.toSet val expected = Set( (Set("a"), 4L), (Set("b"), 5L), (Set("a", "b"), 3L)) assert(freqItemsets === expected) } }
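FPTree is an internal data structure of the FP-Growth implementation; user code goes through FPGrowth instead. A minimal sketch, assuming sc and a few illustrative transactions:

import org.apache.spark.mllib.fpm.FPGrowth

val transactions = sc.parallelize(Seq(
  Array("a", "b", "c"),
  Array("a", "b", "y"),
  Array("b")))

val model = new FPGrowth()
  .setMinSupport(0.5)
  .setNumPartitions(2)
  .run(transactions)

model.freqItemsets.collect().foreach { itemset =>
  println(itemset.items.mkString("[", ",", "]") + " -> " + itemset.freq)
}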
Example 58
Source File: AssociationRulesSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.fpm import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext { test("association rules using String type") { val freqItemsets = sc.parallelize(Seq( (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L), (Set("r"), 3L), (Set("x", "z"), 3L), (Set("t", "y"), 3L), (Set("t", "x"), 3L), (Set("s", "x"), 3L), (Set("y", "x"), 3L), (Set("y", "z"), 3L), (Set("t", "z"), 3L), (Set("y", "x", "z"), 3L), (Set("t", "x", "z"), 3L), (Set("t", "y", "z"), 3L), (Set("t", "y", "x"), 3L), (Set("t", "y", "x", "z"), 3L) ).map { case (items, freq) => new FPGrowth.FreqItemset(items.toArray, freq) }) val ar = new AssociationRules() val results1 = ar .setMinConfidence(0.9) .run(freqItemsets) .collect() assert(results1.size === 23) assert(results1.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23) val results2 = ar .setMinConfidence(0) .run(freqItemsets) .collect() assert(results2.size === 30) assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23) } }
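The same Rule objects are more commonly obtained from a trained FPGrowthModel via generateAssociationRules, without constructing AssociationRules by hand. A short sketch under the same assumptions (sc, toy transactions):

import org.apache.spark.mllib.fpm.FPGrowth

val transactions = sc.parallelize(Seq(
  Array("x", "z"), Array("x", "z"), Array("x", "y", "z")))

val model = new FPGrowth().setMinSupport(0.5).run(transactions)

// Rules with confidence >= 0.8, e.g. x => z and z => x here
model.generateAssociationRules(0.8).collect().foreach { rule =>
  println(rule.antecedent.mkString(",") + " => " +
    rule.consequent.mkString(",") + " @ " + rule.confidence)
}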
Example 59
Source File: KernelDensitySuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat import org.apache.commons.math3.distribution.NormalDistribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext { test("kernel density single sample") { val rdd = sc.parallelize(Array(5.0)) val evaluationPoints = Array(5.0, 6.0) val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints) val normal = new NormalDistribution(5.0, 3.0) val acceptableErr = 1e-6 assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr) assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr) } test("kernel density multiple samples") { val rdd = sc.parallelize(Array(5.0, 10.0)) val evaluationPoints = Array(5.0, 6.0) val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints) val normal1 = new NormalDistribution(5.0, 3.0) val normal2 = new NormalDistribution(10.0, 3.0) val acceptableErr = 1e-6 assert(math.abs( densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr) assert(math.abs( densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr) } }
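A standalone sketch of the same estimator on generated data (sample size, bandwidth and seed are illustrative); the estimate at each point is the average of Gaussian kernels centred on the samples, which is exactly what the suite checks against NormalDistribution:

import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.stat.KernelDensity

// 1000 standard-normal samples
val sample = RandomRDDs.normalRDD(sc, 1000L, numPartitions = 2, seed = 42L)

val densities = new KernelDensity()
  .setSample(sample)
  .setBandwidth(0.5)
  .estimate(Array(-1.0, 0.0, 1.0))

// Roughly follows the standard-normal density, smoothed by the bandwidth
densities.foreach(println)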
Example 60
Source File: MultivariateGaussianSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Matrices, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) val mu = Vectors.dense(0.0) val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) } test("SPARK-11302") { val x = Vectors.dense(629, 640, 1.7188, 618.19) val mu = Vectors.dense( 1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697) val sigma = Matrices.dense(4, 4, Array( 166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053, 169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484, 12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373, 164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207)) val dist = new MultivariateGaussian(mu, sigma) // Agrees with R's dmvnorm: 7.154782e-05 assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9) } }
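For reference, a standalone sketch of the same distribution class; logpdf is the numerically safer call when densities underflow, and the values below are illustrative:

import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.stat.distribution.MultivariateGaussian

val mu = Vectors.dense(0.0, 0.0)
val sigma = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) // identity covariance
val gaussian = new MultivariateGaussian(mu, sigma)

println(gaussian.pdf(Vectors.dense(0.0, 0.0)))      // 1 / (2 * Pi) ~= 0.15915, as in the suite
println(gaussian.logpdf(Vectors.dense(10.0, 10.0))) // log-density, avoids underflow far from the mean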
Example 61
Source File: CoordinateMatrixSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { val m = 5 val n = 4 var mat: CoordinateMatrix = _ override def beforeAll() { super.beforeAll() val entries = sc.parallelize(Seq( (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0), (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map { case (i, j, value) => MatrixEntry(i, j, value) } mat = new CoordinateMatrix(entries) } test("size") { assert(mat.numRows() === m) assert(mat.numCols() === n) } test("empty entries") { val entries = sc.parallelize(Seq[MatrixEntry](), 1) val emptyMat = new CoordinateMatrix(entries) intercept[RuntimeException] { emptyMat.numCols() } intercept[RuntimeException] { emptyMat.numRows() } } test("toBreeze") { val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(mat.toBreeze() === expected) } test("transpose") { val transposed = mat.transpose() assert(mat.toBreeze().t === transposed.toBreeze()) } test("toIndexedRowMatrix") { val indexedRowMatrix = mat.toIndexedRowMatrix() val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(indexedRowMatrix.toBreeze() === expected) } test("toRowMatrix") { val rowMatrix = mat.toRowMatrix() val rows = rowMatrix.rows.collect().toSet val expected = Set( Vectors.dense(1.0, 2.0, 0.0, 0.0), Vectors.dense(0.0, 3.0, 4.0, 0.0), Vectors.dense(0.0, 0.0, 5.0, 6.0), Vectors.dense(7.0, 0.0, 0.0, 8.0), Vectors.dense(0.0, 9.0, 0.0, 0.0)) assert(rows === expected) } test("toBlockMatrix") { val blockMat = mat.toBlockMatrix(2, 2) assert(blockMat.numRows() === m) assert(blockMat.numCols() === n) assert(blockMat.toBreeze() === mat.toBreeze()) intercept[IllegalArgumentException] { mat.toBlockMatrix(-1, 2) } intercept[IllegalArgumentException] { mat.toBlockMatrix(2, 0) } } }
Example 62
Source File: MLPairRDDFunctionsSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ import org.apache.spark.mllib.util.MLlibTestSparkContext class MLPairRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { test("topByKey") { val topMap = sc.parallelize(Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7), (5, 1), (3, 5)), 2) .topByKey(5) .collectAsMap() assert(topMap.size === 3) assert(topMap(1) === Array(7, 6, 3, 2, 1)) assert(topMap(3) === Array(7, 5, 2)) assert(topMap(5) === Array(1)) } }
Example 63
Source File: RDDFunctionsSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.rdd.RDDFunctions._ import org.apache.spark.mllib.util.MLlibTestSparkContext class RDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { test("sliding") { val data = 0 until 6 for (numPartitions <- 1 to 8) { val rdd = sc.parallelize(data, numPartitions) for (windowSize <- 1 to 6) { for (step <- 1 to 3) { val sliding = rdd.sliding(windowSize, step).collect().map(_.toList).toList val expected = data.sliding(windowSize, step) .map(_.toList).toList.filter(l => l.size == windowSize) assert(sliding === expected) } } assert(rdd.sliding(7).collect().isEmpty, "Should return an empty RDD if the window size is greater than the number of items.") } } test("sliding with empty partitions") { val data = Seq(Seq(1, 2, 3), Seq.empty[Int], Seq(4), Seq.empty[Int], Seq(5, 6, 7)) val rdd = sc.parallelize(data, data.length).flatMap(s => s) assert(rdd.partitions.length === data.length) val sliding = rdd.sliding(3).collect().toSeq.map(_.toSeq) val expected = data.flatMap(x => x).sliding(3).toSeq.map(_.toSeq) assert(sliding === expected) } }
Example 64
Source File: OneHotEncoderSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext { def stringIndexed(): DataFrame = { val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) val df = sqlContext.createDataFrame(data).toDF("id", "label") val indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex") .fit(df) indexer.transform(df) } test("params") { ParamsSuite.checkParams(new OneHotEncoder) } test("OneHotEncoder dropLast = false") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") .setDropLast(false) val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1), vec(2)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0), (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0)) assert(output === expected) } test("OneHotEncoder dropLast = true") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0), (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0)) assert(output === expected) } test("input column with ML attribute") { val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large") val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("size") .select(col("size").as("size", attr.toMetadata())) val encoder = new OneHotEncoder() .setInputCol("size") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("size_is_small").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("size_is_medium").withIndex(1)) } test("input column without ML attribute") { val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("index") val encoder = new OneHotEncoder() .setInputCol("index") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("index_is_0").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("index_is_1").withIndex(1)) } }
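Stripped of the test scaffolding, the typical pipeline is StringIndexer followed by OneHotEncoder. A minimal sketch, assuming the same sqlContext and made-up categories:

import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}

val df = sqlContext.createDataFrame(Seq(
  (0, "a"), (1, "b"), (2, "c"), (3, "a"))).toDF("id", "category")

val indexed = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")
  .fit(df)
  .transform(df)

val encoded = new OneHotEncoder()
  .setInputCol("categoryIndex")
  .setOutputCol("categoryVec")
  .setDropLast(false)
  .transform(indexed)

encoded.select("id", "categoryVec").show()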
Example 65
Source File: Word2VecSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.mllib.feature.{Word2VecModel => OldWord2VecModel} class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new Word2Vec) val model = new Word2VecModel("w2v", new OldWord2VecModel(Map("a" -> Array(0.0f)))) ParamsSuite.checkParams(model) } test("Word2Vec") { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val sentence = "a b " * 100 + "a c " * 10 val numOfWords = sentence.split(" ").size val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" ")) val codes = Map( "a" -> Array(-0.2811822295188904, -0.6356269121170044, -0.3020961284637451), "b" -> Array(1.0309048891067505, -1.29472815990448, 0.22276712954044342), "c" -> Array(-0.08456747233867645, 0.5137411952018738, 0.11731560528278351) ) val expected = doc.map { sentence => Vectors.dense(sentence.map(codes.apply).reduce((word1, word2) => word1.zip(word2).map { case (v1, v2) => v1 + v2 } ).map(_ / numOfWords)) } val docDF = doc.zip(expected).toDF("text", "expected") val model = new Word2Vec() .setVectorSize(3) .setInputCol("text") .setOutputCol("result") .setSeed(42L) .fit(docDF) model.transform(docDF).select("result", "expected").collect().foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1 ~== vector2 absTol 1E-5, "Transformed vector is different with expected.") } } }
Example 66
Source File: VectorAssemblerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row import org.apache.spark.sql.functions.col class VectorAssemblerSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new VectorAssembler) } test("assemble") { import org.apache.spark.ml.feature.VectorAssembler.assemble assert(assemble(0.0) === Vectors.sparse(1, Array.empty, Array.empty)) assert(assemble(0.0, 1.0) === Vectors.sparse(2, Array(1), Array(1.0))) val dv = Vectors.dense(2.0, 0.0) assert(assemble(0.0, dv, 1.0) === Vectors.sparse(4, Array(1, 3), Array(2.0, 1.0))) val sv = Vectors.sparse(2, Array(0, 1), Array(3.0, 4.0)) assert(assemble(0.0, dv, 1.0, sv) === Vectors.sparse(6, Array(1, 3, 4, 5), Array(2.0, 1.0, 3.0, 4.0))) for (v <- Seq(1, "a", null)) { intercept[SparkException](assemble(v)) intercept[SparkException](assemble(1.0, v)) } } test("assemble should compress vectors") { import org.apache.spark.ml.feature.VectorAssembler.assemble val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0)) assert(v1.isInstanceOf[SparseVector]) val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0))) assert(v2.isInstanceOf[DenseVector]) } test("VectorAssembler") { val df = sqlContext.createDataFrame(Seq( (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L) )).toDF("id", "x", "y", "name", "z", "n") val assembler = new VectorAssembler() .setInputCols(Array("x", "y", "z", "n")) .setOutputCol("features") assembler.transform(df).select("features").collect().foreach { case Row(v: Vector) => assert(v === Vectors.sparse(6, Array(1, 2, 4, 5), Array(1.0, 2.0, 3.0, 10.0))) } } test("ML attributes") { val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", "safari") val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0) val user = new AttributeGroup("user", Array( NominalAttribute.defaultAttr.withName("gender").withValues("male", "female"), NumericAttribute.defaultAttr.withName("salary"))) val row = (1.0, 0.5, 1, Vectors.dense(1.0, 1000.0), Vectors.sparse(2, Array(1), Array(2.0))) val df = sqlContext.createDataFrame(Seq(row)).toDF("browser", "hour", "count", "user", "ad") .select( col("browser").as("browser", browser.toMetadata()), col("hour").as("hour", hour.toMetadata()), col("count"), // "count" is an integer column without ML attribute col("user").as("user", user.toMetadata()), col("ad")) // "ad" is a vector column without ML attribute val assembler = new VectorAssembler() .setInputCols(Array("browser", "hour", "count", "user", "ad")) .setOutputCol("features") val output = assembler.transform(df) val schema = output.schema val features = AttributeGroup.fromStructField(schema("features")) assert(features.size === 7) val browserOut = features.getAttr(0) assert(browserOut === browser.withIndex(0).withName("browser")) val hourOut = features.getAttr(1) assert(hourOut === hour.withIndex(1).withName("hour")) val countOut = features.getAttr(2) assert(countOut === NumericAttribute.defaultAttr.withName("count").withIndex(2)) val userGenderOut = features.getAttr(3) assert(userGenderOut === user.getAttr("gender").withName("user_gender").withIndex(3)) val userSalaryOut = features.getAttr(4) 
assert(userSalaryOut === user.getAttr("salary").withName("user_salary").withIndex(4)) assert(features.getAttr(5) === NumericAttribute.defaultAttr.withIndex(5)) assert(features.getAttr(6) === NumericAttribute.defaultAttr.withIndex(6)) } }
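Outside the suite, VectorAssembler is a one-step concatenation of numeric and vector columns into a single features column. A short sketch, assuming sqlContext and illustrative column names:

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.mllib.linalg.Vectors

val df = sqlContext.createDataFrame(Seq(
  (1.0, 0.5, Vectors.dense(2.0, 3.0)))).toDF("hour", "clicked", "userFeatures")

val assembled = new VectorAssembler()
  .setInputCols(Array("hour", "clicked", "userFeatures"))
  .setOutputCol("features")
  .transform(df)

assembled.select("features").show()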
Example 67
Source File: BinarizerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext { @transient var data: Array[Double] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4) } test("params") { ParamsSuite.checkParams(new Binarizer) } test("Binarize continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = sqlContext.createDataFrame( data.zip(defaultBinarized)).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize continuous features with setter") { val threshold: Double = 0.2 val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = sqlContext.createDataFrame( data.zip(thresholdBinarized)).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } }
Example 68
Source File: TokenizerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} @BeanInfo case class TokenizerTestData(rawText: String, wantedTokens: Array[String]) class TokenizerSuite extends SparkFunSuite { test("params") { ParamsSuite.checkParams(new Tokenizer) } } class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext { import org.apache.spark.ml.feature.RegexTokenizerSuite._ test("params") { ParamsSuite.checkParams(new RegexTokenizer) } test("RegexTokenizer") { val tokenizer0 = new RegexTokenizer() .setGaps(false) .setPattern("\\w+|\\p{Punct}") .setInputCol("rawText") .setOutputCol("tokens") val dataset0 = sqlContext.createDataFrame(Seq( TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")), TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct")) )) testRegexTokenizer(tokenizer0, dataset0) val dataset1 = sqlContext.createDataFrame(Seq( TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization")), TokenizerTestData("Te,st. punct", Array("punct")) )) tokenizer0.setMinTokenLength(3) testRegexTokenizer(tokenizer0, dataset1) val tokenizer2 = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") val dataset2 = sqlContext.createDataFrame(Seq( TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization.")), TokenizerTestData("Te,st. punct", Array("Te,st.", "punct")) )) testRegexTokenizer(tokenizer2, dataset2) } } object RegexTokenizerSuite extends SparkFunSuite { def testRegexTokenizer(t: RegexTokenizer, dataset: DataFrame): Unit = { t.transform(dataset) .select("tokens", "wantedTokens") .collect() .foreach { case Row(tokens, wantedTokens) => assert(tokens === wantedTokens) } } }
Example 69
Source File: PolynomialExpansionSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.param.ParamsSuite import org.scalatest.exceptions.TestFailedException import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new PolynomialExpansion) } test("Polynomial expansion with default parameter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val twoDegreeExpansion: Array[Vector] = Array( Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)), Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29), Vectors.dense(new Array[Double](9)), Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0), Vectors.sparse(9, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } test("Polynomial expansion with setter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val threeDegreeExpansion: Array[Vector] = Array( Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8), Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)), Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17), Vectors.dense(new Array[Double](19)), Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8, -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0), Vectors.sparse(19, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } }
Example 70
Source File: IDFSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = { dataSet.map { case data: DenseVector => val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y } Vectors.dense(res) case data: SparseVector => val res = data.indices.zip(data.values).map { case (id, value) => (id, value * model(id)) } Vectors.sparse(data.size, res) } } test("params") { ParamsSuite.checkParams(new IDF) val model = new IDFModel("idf", new OldIDFModel(Vectors.dense(1.0))) ParamsSuite.checkParams(model) } test("compute IDF with default parameter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((numOfData + 1.0) / (x + 1.0)) }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("compute IDF with setter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0 }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .setMinDocFreq(1) .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } }
Example 71
Source File: NormalizerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext { @transient var data: Array[Vector] = _ @transient var dataFrame: DataFrame = _ @transient var normalizer: Normalizer = _ @transient var l1Normalized: Array[Vector] = _ @transient var l2Normalized: Array[Vector] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))), Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))), Vectors.sparse(3, Seq()) ) l1Normalized = Array( Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.12765957, -0.23404255, -0.63829787), Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))), Vectors.dense(0.625, 0.07894737, 0.29605263), Vectors.sparse(3, Seq()) ) l2Normalized = Array( Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.184549876, -0.3383414, -0.922749378), Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))), Vectors.dense(0.897906166, 0.113419726, 0.42532397), Vectors.sparse(3, Seq()) ) val sqlContext = new SQLContext(sc) dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(NormalizerSuite.FeatureData)) normalizer = new Normalizer() .setInputCol("features") .setOutputCol("normalized_features") } def collectResult(result: DataFrame): Array[Vector] = { result.select("normalized_features").collect().map { case Row(features: Vector) => features } } def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = { assert((lhs, rhs).zipped.forall { case (v1: DenseVector, v2: DenseVector) => true case (v1: SparseVector, v2: SparseVector) => true case _ => false }, "The vector type should be preserved after normalization.") } def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = { assert((lhs, rhs).zipped.forall { (vector1, vector2) => vector1 ~== vector2 absTol 1E-5 }, "The vector value is not correct after normalization.") } test("Normalization with default parameter") { val result = collectResult(normalizer.transform(dataFrame)) assertTypeOfVector(data, result) assertValues(result, l2Normalized) } test("Normalization with setter") { normalizer.setP(1) val result = collectResult(normalizer.transform(dataFrame)) assertTypeOfVector(data, result) assertValues(result, l1Normalized) } } private object NormalizerSuite { case class FeatureData(features: Vector) }
Example 72
Source File: HashingTFSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.util.Utils class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new HashingTF) } test("hashingTF") { val df = sqlContext.createDataFrame(Seq( (0, "a a b b c d".split(" ").toSeq) )).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) val output = hashingTF.transform(df) val attrGroup = AttributeGroup.fromStructField(output.schema("features")) require(attrGroup.numAttributes === Some(n)) val features = output.select("features").first().getAs[Vector](0) // Assume perfect hash on "a", "b", "c", and "d". def idx(any: Any): Int = Utils.nonNegativeMod(any.##, n) val expected = Vectors.sparse(n, Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0))) assert(features ~== expected absTol 1e-14) } }
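In practice HashingTF is usually chained with IDF (compare Example 70) to produce TF-IDF features. A rough sketch, assuming sqlContext and two made-up documents:

import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}

val docs = sqlContext.createDataFrame(Seq(
  (0, "spark spark hadoop"),
  (1, "hadoop mapreduce"))).toDF("id", "text")

val words = new Tokenizer().setInputCol("text").setOutputCol("words").transform(docs)

val tf = new HashingTF()
  .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(1000)
  .transform(words)

val tfidf = new IDF().setInputCol("rawFeatures").setOutputCol("features").fit(tf).transform(tf)

tfidf.select("id", "features").show()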
Example 73
Source File: StringIndexerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{Attribute, NominalAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class StringIndexerSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new StringIndexer) val model = new StringIndexerModel("indexer", Array("a", "b")) ParamsSuite.checkParams(model) } test("StringIndexer") { val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) val df = sqlContext.createDataFrame(data).toDF("id", "label") val indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex") .fit(df) val transformed = indexer.transform(df) val attr = Attribute.fromStructField(transformed.schema("labelIndex")) .asInstanceOf[NominalAttribute] assert(attr.values.get === Array("a", "c", "b")) val output = transformed.select("id", "labelIndex").map { r => (r.getInt(0), r.getDouble(1)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)) assert(output === expected) } test("StringIndexer with a numeric input column") { val data = sc.parallelize(Seq((0, 100), (1, 200), (2, 300), (3, 100), (4, 100), (5, 300)), 2) val df = sqlContext.createDataFrame(data).toDF("id", "label") val indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex") .fit(df) val transformed = indexer.transform(df) val attr = Attribute.fromStructField(transformed.schema("labelIndex")) .asInstanceOf[NominalAttribute] assert(attr.values.get === Array("100", "300", "200")) val output = transformed.select("id", "labelIndex").map { r => (r.getInt(0), r.getDouble(1)) }.collect().toSet // 100 -> 0, 200 -> 2, 300 -> 1 val expected = Set((0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)) assert(output === expected) } test("StringIndexerModel should keep silent if the input column does not exist.") { val indexerModel = new StringIndexerModel("indexer", Array("a", "b", "c")) .setInputCol("label") .setOutputCol("labelIndex") val df = sqlContext.range(0L, 10L) assert(indexerModel.transform(df).eq(df)) } }
Example 74
Source File: RegressionEvaluatorSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext} import org.apache.spark.mllib.util.TestingUtils._ class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new RegressionEvaluator) } test("Regression Evaluator: default params") { val trainer = new LinearRegression val model = trainer.fit(dataset) val predictions = model.transform(dataset) // default = rmse val evaluator = new RegressionEvaluator() assert(evaluator.evaluate(predictions) ~== -0.1019382 absTol 0.001) // r2 score evaluator.setMetricName("r2") assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001) // mae evaluator.setMetricName("mae") assert(evaluator.evaluate(predictions) ~== -0.08036075 absTol 0.001) } }
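The dataset this suite fits LinearRegression on is generated elsewhere in the original test file and is elided in the excerpt above. The evaluator itself only needs a DataFrame with prediction and label columns, so a self-contained sketch (assuming sqlContext, with made-up values) looks like the following; as the assertions above show, this Spark version negates error metrics such as rmse and mae so that a larger value is always better:

import org.apache.spark.ml.evaluation.RegressionEvaluator

// Column names match the evaluator's defaults: "prediction" and "label"
val predictions = sqlContext.createDataFrame(Seq(
  (2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0))).toDF("prediction", "label")

val evaluator = new RegressionEvaluator().setMetricName("r2")
println(evaluator.evaluate(predictions))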
Example 75
Source File: RandomForestRegressorSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.impl.TreeTests import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => OldRandomForest} import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame def compareAPIs( data: RDD[LabeledPoint], rf: RandomForestRegressor, categoricalFeatures: Map[Int, Int]): Unit = { val oldStrategy = rf.getOldStrategy(categoricalFeatures, numClasses = 0, OldAlgo.Regression, rf.getOldImpurity) val oldModel = OldRandomForest.trainRegressor( data, oldStrategy, rf.getNumTrees, rf.getFeatureSubsetStrategy, rf.getSeed.toInt) val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0) val newModel = rf.fit(newData) // Use parent from newTree since this is not checked anyways. val oldModelAsNew = RandomForestRegressionModel.fromOld( oldModel, newModel.parent.asInstanceOf[RandomForestRegressor], categoricalFeatures) TreeTests.checkEqual(oldModelAsNew, newModel) } }
Example 76
Source File: DecisionTreeRegressorSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.impl.TreeTests import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree, DecisionTreeSuite => OldDecisionTreeSuite} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame class DecisionTreeRegressorSuite extends SparkFunSuite with MLlibTestSparkContext { import DecisionTreeRegressorSuite.compareAPIs private var categoricalDataPointsRDD: RDD[LabeledPoint] = _ override def beforeAll() { super.beforeAll() categoricalDataPointsRDD = sc.parallelize(OldDecisionTreeSuite.generateCategoricalDataPoints()) } ///////////////////////////////////////////////////////////////////////////// // Tests calling train() ///////////////////////////////////////////////////////////////////////////// test("Regression stump with 3-ary (ordered) categorical features") { val dt = new DecisionTreeRegressor() .setImpurity("variance") .setMaxDepth(2) .setMaxBins(100) val categoricalFeatures = Map(0 -> 3, 1-> 3) compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures) } test("Regression stump with binary (ordered) categorical features") { val dt = new DecisionTreeRegressor() .setImpurity("variance") .setMaxDepth(2) .setMaxBins(100) val categoricalFeatures = Map(0 -> 2, 1-> 2) compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures) } ///////////////////////////////////////////////////////////////////////////// // Tests of model save/load ///////////////////////////////////////////////////////////////////////////// // TODO: test("model save/load") SPARK-6725 } private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite { def compareAPIs( data: RDD[LabeledPoint], dt: DecisionTreeRegressor, categoricalFeatures: Map[Int, Int]): Unit = { val oldStrategy = dt.getOldStrategy(categoricalFeatures) val oldTree = OldDecisionTree.train(data, oldStrategy) val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0) val newTree = dt.fit(newData) // Use parent from newTree since this is not checked anyways. val oldTreeAsNew = DecisionTreeRegressionModel.fromOld( oldTree, newTree.parent.asInstanceOf[DecisionTreeRegressor], categoricalFeatures) TreeTests.checkEqual(oldTreeAsNew, newTree) } }
Example 77
Source File: ChiSqSelectorSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { test("ChiSqSelector transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(6.0))), LabeledPoint(1.0, Vectors.dense(Array(8.0))), LabeledPoint(2.0, Vectors.dense(Array(5.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } }
Example 78
Source File: Word2VecSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.util.Utils class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { // TODO: add more tests test("Word2Vec") { val sentence = "a b " * 100 + "a c " * 10 val localDoc = Seq(sentence, sentence) val doc = sc.parallelize(localDoc) .map(line => line.split(" ").toSeq) val model = new Word2Vec().setVectorSize(10).setSeed(42L).fit(doc) val syms = model.findSynonyms("a", 2) assert(syms.length == 2) assert(syms(0)._1 == "b") assert(syms(1)._1 == "c") } test("Word2VecModel") { val num = 2 val word2VecMap = Map( ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)), ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)), ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)), ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f)) ) val model = new Word2VecModel(word2VecMap) val syms = model.findSynonyms("china", num) assert(syms.length == num) assert(syms(0)._1 == "taiwan") assert(syms(1)._1 == "japan") } test("model load / save") { val word2VecMap = Map( ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)), ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)), ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)), ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f)) ) val model = new Word2VecModel(word2VecMap) val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString try { model.save(sc, path) val sameModel = Word2VecModel.load(sc, path) assert(sameModel.getVectors.mapValues(_.toSeq) === model.getVectors.mapValues(_.toSeq)) } finally { Utils.deleteRecursively(tempDir) } } }
Example 79
Source File: ElementwiseProductSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 80
Source File: IDFSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors, Vector} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 81
Source File: PCASuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val pc = mat.computePrincipalComponents(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() assert(pca_transform.toSet === mat_multiply.toSet) } }
Example 82
Source File: HashingTFSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext { test("hashing tf on a single doc") { val hashingTF = new HashingTF(1000) val doc = "a a b b c d".split(" ") val n = hashingTF.numFeatures val termFreqs = Seq( (hashingTF.indexOf("a"), 2.0), (hashingTF.indexOf("b"), 2.0), (hashingTF.indexOf("c"), 1.0), (hashingTF.indexOf("d"), 1.0)) assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n), "index must be in range [0, #features)") assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing") val expected = Vectors.sparse(n, termFreqs) assert(hashingTF.transform(doc) === expected) } test("hashing tf on an RDD") { val hashingTF = new HashingTF val localDocs: Seq[Seq[String]] = Seq( "a a b b b c d".split(" "), "a b c d a b c".split(" "), "c b a c b a a".split(" ")) val docs = sc.parallelize(localDocs, 2) assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet) } }
Example 83
Source File: ImpuritySuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.tree.impurity.{EntropyAggregator, GiniAggregator} import org.apache.spark.mllib.util.MLlibTestSparkContext class ImpuritySuite extends SparkFunSuite with MLlibTestSparkContext { test("Gini impurity does not support negative labels") { val gini = new GiniAggregator(2) intercept[IllegalArgumentException] { gini.update(Array(0.0, 1.0, 2.0), 0, -1, 0.0) } } test("Entropy does not support negative labels") { val entropy = new EntropyAggregator(2) intercept[IllegalArgumentException] { entropy.update(Array(0.0, 1.0, 2.0), 0, -1, 0.0) } } }
Example 84
Source File: BaggedPointSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.tree.EnsembleTestHelper import org.apache.spark.mllib.util.MLlibTestSparkContext class BaggedPointSuite extends SparkFunSuite with MLlibTestSparkContext { test("BaggedPoint RDD: without subsampling") { val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false, 42) baggedRDD.collect().foreach { baggedPoint => assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1) } } test("BaggedPoint RDD: with subsampling with replacement (fraction = 1.0)") { val numSubsamples = 100 val (expectedMean, expectedStddev) = (1.0, 1.0) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling with replacement (fraction = 0.5)") { val numSubsamples = 100 val subsample = 0.5 val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample)) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling without replacement (fraction = 1.0)") { val numSubsamples = 100 val (expectedMean, expectedStddev) = (1.0, 0) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling without replacement (fraction = 0.5)") { val numSubsamples = 100 val subsample = 0.5 val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample * (1 - subsample))) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } }
Example 85
Source File: MatrixFactorizationModelSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.recommendation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils class MatrixFactorizationModelSuite extends SparkFunSuite with MLlibTestSparkContext { val rank = 2 var userFeatures: RDD[(Int, Array[Double])] = _ var prodFeatures: RDD[(Int, Array[Double])] = _ override def beforeAll(): Unit = { super.beforeAll() userFeatures = sc.parallelize(Seq((0, Array(1.0, 2.0)), (1, Array(3.0, 4.0)))) prodFeatures = sc.parallelize(Seq((2, Array(5.0, 6.0)))) } test("constructor") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) assert(model.predict(0, 2) ~== 17.0 relTol 1e-14) intercept[IllegalArgumentException] { new MatrixFactorizationModel(1, userFeatures, prodFeatures) } val userFeatures1 = sc.parallelize(Seq((0, Array(1.0)), (1, Array(3.0)))) intercept[IllegalArgumentException] { new MatrixFactorizationModel(rank, userFeatures1, prodFeatures) } val prodFeatures1 = sc.parallelize(Seq((2, Array(5.0)))) intercept[IllegalArgumentException] { new MatrixFactorizationModel(rank, userFeatures, prodFeatures1) } } test("save/load") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString def collect(features: RDD[(Int, Array[Double])]): Set[(Int, Seq[Double])] = { features.mapValues(_.toSeq).collect().toSet } try { model.save(sc, path) val newModel = MatrixFactorizationModel.load(sc, path) assert(newModel.rank === rank) assert(collect(newModel.userFeatures) === collect(userFeatures)) assert(collect(newModel.productFeatures) === collect(prodFeatures)) } finally { Utils.deleteRecursively(tempDir) } } test("batch predict API recommendProductsForUsers") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val topK = 10 val recommendations = model.recommendProductsForUsers(topK).collectAsMap() assert(recommendations(0)(0).rating ~== 17.0 relTol 1e-14) assert(recommendations(1)(0).rating ~== 39.0 relTol 1e-14) } test("batch predict API recommendUsersForProducts") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val topK = 10 val recommendations = model.recommendUsersForProducts(topK).collectAsMap() assert(recommendations(2)(0).user == 1) assert(recommendations(2)(0).rating ~== 39.0 relTol 1e-14) assert(recommendations(2)(1).user == 0) assert(recommendations(2)(1).rating ~== 17.0 relTol 1e-14) } }
Example 86
Source File: RankingMetricsSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { test("Ranking metrics: map, ndcg") { val predictionAndLabels = sc.parallelize( Seq( (Array[Int](1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Array[Int](1, 2, 3, 4, 5)), (Array[Int](4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Array[Int](1, 2, 3)), (Array[Int](1, 2, 3, 4, 5), Array[Int]()) ), 2) val eps: Double = 1E-5 val metrics = new RankingMetrics(predictionAndLabels) val map = metrics.meanAveragePrecision assert(metrics.precisionAt(1) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(2) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(3) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(4) ~== 0.75/3 absTol eps) assert(metrics.precisionAt(5) ~== 0.8/3 absTol eps) assert(metrics.precisionAt(10) ~== 0.8/3 absTol eps) assert(metrics.precisionAt(15) ~== 8.0/45 absTol eps) assert(map ~== 0.355026 absTol eps) assert(metrics.ndcgAt(3) ~== 1.0/3 absTol eps) assert(metrics.ndcgAt(5) ~== 0.328788 absTol eps) assert(metrics.ndcgAt(10) ~== 0.487913 absTol eps) assert(metrics.ndcgAt(15) ~== metrics.ndcgAt(10) absTol eps) } }
Example 87
Source File: AreaUnderCurveSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class AreaUnderCurveSuite extends SparkFunSuite with MLlibTestSparkContext { test("auc computation") { val curve = Seq((0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 0.0)) val auc = 4.0 assert(AreaUnderCurve.of(curve) ~== auc absTol 1E-5) val rddCurve = sc.parallelize(curve, 2) assert(AreaUnderCurve.of(rddCurve) ~== auc absTol 1E-5) } test("auc of an empty curve") { val curve = Seq.empty[(Double, Double)] assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5) val rddCurve = sc.parallelize(curve, 2) assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5) } test("auc of a curve with a single point") { val curve = Seq((1.0, 1.0)) assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5) val rddCurve = sc.parallelize(curve, 2) assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5) } }
Example 88
Source File: RegressionMetricsSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class RegressionMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { test("regression metrics") { val predictionAndObservations = sc.parallelize( Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)), 2) val metrics = new RegressionMetrics(predictionAndObservations) assert(metrics.explainedVariance ~== 0.95717 absTol 1E-5, "explained variance regression score mismatch") assert(metrics.meanAbsoluteError ~== 0.5 absTol 1E-5, "mean absolute error mismatch") assert(metrics.meanSquaredError ~== 0.375 absTol 1E-5, "mean squared error mismatch") assert(metrics.rootMeanSquaredError ~== 0.61237 absTol 1E-5, "root mean squared error mismatch") assert(metrics.r2 ~== 0.94861 absTol 1E-5, "r2 score mismatch") } test("regression metrics with complete fitting") { val predictionAndObservations = sc.parallelize( Seq((3.0, 3.0), (0.0, 0.0), (2.0, 2.0), (8.0, 8.0)), 2) val metrics = new RegressionMetrics(predictionAndObservations) assert(metrics.explainedVariance ~== 1.0 absTol 1E-5, "explained variance regression score mismatch") assert(metrics.meanAbsoluteError ~== 0.0 absTol 1E-5, "mean absolute error mismatch") assert(metrics.meanSquaredError ~== 0.0 absTol 1E-5, "mean squared error mismatch") assert(metrics.rootMeanSquaredError ~== 0.0 absTol 1E-5, "root mean squared error mismatch") assert(metrics.r2 ~== 1.0 absTol 1E-5, "r2 score mismatch") } }
Example 89
Source File: MulticlassMetricsSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Matrices import org.apache.spark.mllib.util.MLlibTestSparkContext class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { test("Multiclass evaluation metrics") { val confusionMatrix = Matrices.dense(3, 3, Array(2, 1, 0, 1, 3, 0, 1, 0, 1)) val labels = Array(0.0, 1.0, 2.0) val predictionAndLabels = sc.parallelize( Seq((0.0, 0.0), (0.0, 1.0), (0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)), 2) val metrics = new MulticlassMetrics(predictionAndLabels) val delta = 0.0000001 val fpRate0 = 1.0 / (9 - 4) val fpRate1 = 1.0 / (9 - 4) val fpRate2 = 1.0 / (9 - 1) val precision0 = 2.0 / (2 + 1) val precision1 = 3.0 / (3 + 1) val precision2 = 1.0 / (1 + 1) val recall0 = 2.0 / (2 + 2) val recall1 = 3.0 / (3 + 1) val recall2 = 1.0 / (1 + 0) val f1measure0 = 2 * precision0 * recall0 / (precision0 + recall0) val f1measure1 = 2 * precision1 * recall1 / (precision1 + recall1) val f1measure2 = 2 * precision2 * recall2 / (precision2 + recall2) val f2measure0 = (1 + 2 * 2) * precision0 * recall0 / (2 * 2 * precision0 + recall0) val f2measure1 = (1 + 2 * 2) * precision1 * recall1 / (2 * 2 * precision1 + recall1) val f2measure2 = (1 + 2 * 2) * precision2 * recall2 / (2 * 2 * precision2 + recall2) assert(metrics.confusionMatrix.toArray.sameElements(confusionMatrix.toArray)) assert(math.abs(metrics.falsePositiveRate(0.0) - fpRate0) < delta) assert(math.abs(metrics.falsePositiveRate(1.0) - fpRate1) < delta) assert(math.abs(metrics.falsePositiveRate(2.0) - fpRate2) < delta) assert(math.abs(metrics.precision(0.0) - precision0) < delta) assert(math.abs(metrics.precision(1.0) - precision1) < delta) assert(math.abs(metrics.precision(2.0) - precision2) < delta) assert(math.abs(metrics.recall(0.0) - recall0) < delta) assert(math.abs(metrics.recall(1.0) - recall1) < delta) assert(math.abs(metrics.recall(2.0) - recall2) < delta) assert(math.abs(metrics.fMeasure(0.0) - f1measure0) < delta) assert(math.abs(metrics.fMeasure(1.0) - f1measure1) < delta) assert(math.abs(metrics.fMeasure(2.0) - f1measure2) < delta) assert(math.abs(metrics.fMeasure(0.0, 2.0) - f2measure0) < delta) assert(math.abs(metrics.fMeasure(1.0, 2.0) - f2measure1) < delta) assert(math.abs(metrics.fMeasure(2.0, 2.0) - f2measure2) < delta) assert(math.abs(metrics.recall - (2.0 + 3.0 + 1.0) / ((2 + 3 + 1) + (1 + 1 + 1))) < delta) assert(math.abs(metrics.recall - metrics.precision) < delta) assert(math.abs(metrics.recall - metrics.fMeasure) < delta) assert(math.abs(metrics.recall - metrics.weightedRecall) < delta) assert(math.abs(metrics.weightedFalsePositiveRate - ((4.0 / 9) * fpRate0 + (4.0 / 9) * fpRate1 + (1.0 / 9) * fpRate2)) < delta) assert(math.abs(metrics.weightedPrecision - ((4.0 / 9) * precision0 + (4.0 / 9) * precision1 + (1.0 / 9) * precision2)) < delta) assert(math.abs(metrics.weightedRecall - ((4.0 / 9) * recall0 + (4.0 / 9) * recall1 + (1.0 / 9) * recall2)) < delta) assert(math.abs(metrics.weightedFMeasure - ((4.0 / 9) * f1measure0 + (4.0 / 9) * f1measure1 + (1.0 / 9) * f1measure2)) < delta) assert(math.abs(metrics.weightedFMeasure(2.0) - ((4.0 / 9) * f2measure0 + (4.0 / 9) * f2measure1 + (1.0 / 9) * f2measure2)) < delta) assert(metrics.labels.sameElements(labels)) } }
Example 90
Source File: FPTreeSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.fpm import scala.language.existentials import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class FPTreeSuite extends SparkFunSuite with MLlibTestSparkContext { test("add transaction") { val tree = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("b")) assert(tree.root.children.size == 2) assert(tree.root.children.contains("a")) assert(tree.root.children("a").item.equals("a")) assert(tree.root.children("a").count == 2) assert(tree.root.children.contains("b")) assert(tree.root.children("b").item.equals("b")) assert(tree.root.children("b").count == 1) var child = tree.root.children("a") assert(child.children.size == 1) assert(child.children.contains("b")) assert(child.children("b").item.equals("b")) assert(child.children("b").count == 2) child = child.children("b") assert(child.children.size == 2) assert(child.children.contains("c")) assert(child.children.contains("y")) assert(child.children("c").item.equals("c")) assert(child.children("y").item.equals("y")) assert(child.children("c").count == 1) assert(child.children("y").count == 1) } test("merge tree") { val tree1 = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("b")) val tree2 = new FPTree[String] .add(Seq("a", "b")) .add(Seq("a", "b", "c")) .add(Seq("a", "b", "c", "d")) .add(Seq("a", "x")) .add(Seq("a", "x", "y")) .add(Seq("c", "n")) .add(Seq("c", "m")) val tree3 = tree1.merge(tree2) assert(tree3.root.children.size == 3) assert(tree3.root.children("a").count == 7) assert(tree3.root.children("b").count == 1) assert(tree3.root.children("c").count == 2) val child1 = tree3.root.children("a") assert(child1.children.size == 2) assert(child1.children("b").count == 5) assert(child1.children("x").count == 2) val child2 = child1.children("b") assert(child2.children.size == 2) assert(child2.children("y").count == 1) assert(child2.children("c").count == 3) val child3 = child2.children("c") assert(child3.children.size == 1) assert(child3.children("d").count == 1) val child4 = child1.children("x") assert(child4.children.size == 1) assert(child4.children("y").count == 1) val child5 = tree3.root.children("c") assert(child5.children.size == 2) assert(child5.children("n").count == 1) assert(child5.children("m").count == 1) } test("extract freq itemsets") { val tree = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("a", "b")) .add(Seq("a")) .add(Seq("b")) .add(Seq("b", "n")) val freqItemsets = tree.extract(3L).map { case (items, count) => (items.toSet, count) }.toSet val expected = Set( (Set("a"), 4L), (Set("b"), 5L), (Set("a", "b"), 3L)) assert(freqItemsets === expected) } }
Example 91
Source File: KernelDensitySuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat import org.apache.commons.math3.distribution.NormalDistribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext { test("kernel density single sample") { val rdd = sc.parallelize(Array(5.0)) val evaluationPoints = Array(5.0, 6.0) val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints) val normal = new NormalDistribution(5.0, 3.0) val acceptableErr = 1e-6 assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr) assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr) } test("kernel density multiple samples") { val rdd = sc.parallelize(Array(5.0, 10.0)) val evaluationPoints = Array(5.0, 6.0) val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints) val normal1 = new NormalDistribution(5.0, 3.0) val normal2 = new NormalDistribution(10.0, 3.0) val acceptableErr = 1e-6 assert(math.abs( densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr) assert(math.abs( densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr) } }
Example 92
Source File: MultivariateGaussianSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{ Vectors, Matrices } import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) val mu = Vectors.dense(0.0) val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) } test("SPARK-11302") { val x = Vectors.dense(629, 640, 1.7188, 618.19) val mu = Vectors.dense( 1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697) val sigma = Matrices.dense(4, 4, Array( 166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053, 169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484, 12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373, 164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207)) val dist = new MultivariateGaussian(mu, sigma) // Agrees with R's dmvnorm: 7.154782e-05 assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9) } }
Example 93
Source File: CoordinateMatrixSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.linalg.Vectors class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { val m = 5 val n = 4 var mat: CoordinateMatrix = _ override def beforeAll() { super.beforeAll() val entries = sc.parallelize(Seq( (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0), (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map { case (i, j, value) => MatrixEntry(i, j, value) } mat = new CoordinateMatrix(entries) } test("size") { assert(mat.numRows() === m) assert(mat.numCols() === n) } test("empty entries") { val entries = sc.parallelize(Seq[MatrixEntry](), 1) val emptyMat = new CoordinateMatrix(entries) intercept[RuntimeException] { emptyMat.numCols() } intercept[RuntimeException] { emptyMat.numRows() } } test("toBreeze") { val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(mat.toBreeze() === expected) } test("transpose") { val transposed = mat.transpose() assert(mat.toBreeze().t === transposed.toBreeze()) } test("toIndexedRowMatrix") { val indexedRowMatrix = mat.toIndexedRowMatrix() val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(indexedRowMatrix.toBreeze() === expected) } test("toRowMatrix") { val rowMatrix = mat.toRowMatrix() val rows = rowMatrix.rows.collect().toSet val expected = Set( Vectors.dense(1.0, 2.0, 0.0, 0.0), Vectors.dense(0.0, 3.0, 4.0, 0.0), Vectors.dense(0.0, 0.0, 5.0, 6.0), Vectors.dense(7.0, 0.0, 0.0, 8.0), Vectors.dense(0.0, 9.0, 0.0, 0.0)) assert(rows === expected) } test("toBlockMatrix") { val blockMat = mat.toBlockMatrix(2, 2) assert(blockMat.numRows() === m) assert(blockMat.numCols() === n) assert(blockMat.toBreeze() === mat.toBreeze()) intercept[IllegalArgumentException] { mat.toBlockMatrix(-1, 2) } intercept[IllegalArgumentException] { mat.toBlockMatrix(2, 0) } } }
Example 94
Source File: MLPairRDDFunctionsSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ class MLPairRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { test("topByKey") { val topMap = sc.parallelize(Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7), (5, 1), (3, 5)), 2) .topByKey(5) .collectAsMap() assert(topMap.size === 3) assert(topMap(1) === Array(7, 6, 3, 2, 1)) assert(topMap(3) === Array(7, 5, 2)) assert(topMap(5) === Array(1)) } }
Example 95
Source File: RDDFunctionsSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.rdd.RDDFunctions._ class RDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { test("sliding") { val data = 0 until 6 for (numPartitions <- 1 to 8) { val rdd = sc.parallelize(data, numPartitions) for (windowSize <- 1 to 6) { val sliding = rdd.sliding(windowSize).collect().map(_.toList).toList val expected = data.sliding(windowSize).map(_.toList).toList assert(sliding === expected) } assert(rdd.sliding(7).collect().isEmpty, "Should return an empty RDD if the window size is greater than the number of items.") } } test("sliding with empty partitions") { val data = Seq(Seq(1, 2, 3), Seq.empty[Int], Seq(4), Seq.empty[Int], Seq(5, 6, 7)) val rdd = sc.parallelize(data, data.length).flatMap(s => s) assert(rdd.partitions.size === data.length) val sliding = rdd.sliding(3).collect().toSeq.map(_.toSeq) val expected = data.flatMap(x => x).sliding(3).toSeq.map(_.toSeq) assert(sliding === expected) } }
Example 96
Source File: VectorSlicerSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Row, SQLContext} class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") {//参数 val slicer = new VectorSlicer ParamsSuite.checkParams(slicer) //指数 assert(slicer.getIndices.length === 0) //名称 assert(slicer.getNames.length === 0) withClue("VectorSlicer should not have any features selected by default") { intercept[IllegalArgumentException] { slicer.validateParams() } } } test("feature validity checks") {//特征有效性检查 import VectorSlicer._ //如果给定的特征索引是有效的,返回true assert(validIndices(Array(0, 1, 8, 2))) assert(validIndices(Array.empty[Int])) assert(!validIndices(Array(-1))) assert(!validIndices(Array(1, 2, 1))) //如果给定的特征名称有效,返回true assert(validNames(Array("a", "b"))) assert(validNames(Array.empty[String])) assert(!validNames(Array("", "b"))) assert(!validNames(Array("a", "b", "a"))) } test("Test vector slicer") {//测试向量机 val sqlContext = new SQLContext(sc) val data = Array( Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3), Vectors.sparse(5, Seq()) ) // Expected after selecting indices 1, 4 //预计在选择指数1,4 val expected = Array( Vectors.sparse(2, Seq((0, 2.3))), Vectors.dense(2.3, 1.0), Vectors.dense(0.0, 0.0), Vectors.dense(-1.1, 3.3), Vectors.sparse(2, Seq()) ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName) val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]]) val resultAttrs = Array("f1", "f4").map(defaultAttr.withName) val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]]) val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) } val df = sqlContext.createDataFrame(rdd, StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField()))) //VectorSlicer是一个转换器输入特征向量,输出原始特征向量子集. val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") def validateResults(df: DataFrame): Unit = { df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 === vec2) } val resultMetadata = AttributeGroup.fromStructField(df.schema("result")) val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected")) assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) => assert(a === b) } } vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty) validateResults(vectorSlicer.transform(df))//transform主要是用来把 一个 DataFrame 转换成另一个 DataFrame vectorSlicer.setIndices(Array(1)).setNames(Array("f4")) //transform主要是用来把 一个 DataFrame 转换成另一个 DataFrame validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4")) //transform主要是用来把 一个 DataFrame 转换成另一个 DataFrame validateResults(vectorSlicer.transform(df)) } }
Example 97
Source File: DCTSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} @BeanInfo case class DCTTestData(vec: Vector, wantedVec: Vector) class DCTSuite extends SparkFunSuite with MLlibTestSparkContext { //正向离散余弦变换jtransforms比赛结果 test("forward transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = false testDCT(data, inverse) } //逆离散余弦变换jtransforms比赛结果 test("inverse transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = true testDCT(data, inverse) } private def testDCT(data: Vector, inverse: Boolean): Unit = { val expectedResultBuffer = data.toArray.clone() if (inverse) { (new DoubleDCT_1D(data.size)).inverse(expectedResultBuffer, true) } else { (new DoubleDCT_1D(data.size)).forward(expectedResultBuffer, true) } val expectedResult = Vectors.dense(expectedResultBuffer) val dataset = sqlContext.createDataFrame(Seq( DCTTestData(data, expectedResult) )) val transformer = new DCT() .setInputCol("vec") .setOutputCol("resultVec") .setInverse(inverse) //transform()方法将DataFrame转化为另外一个DataFrame的算法 transformer.transform(dataset) .select("resultVec", "wantedVec") .collect() .foreach { case Row(resultVec: Vector, wantedVec: Vector) => assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) } } }
Example 98
Source File: StopWordsRemoverSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

object StopWordsRemoverSuite extends SparkFunSuite {
  def testStopWordsRemover(t: StopWordsRemover, dataset: DataFrame): Unit = {
    // transform() turns one DataFrame into another DataFrame
    t.transform(dataset)
      .select("filtered", "expected")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
}

    dataSet.show()
    testStopWordsRemover(remover, dataSet)
  }
}
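The listing above is cut off: the test class that actually builds `remover` and `dataSet` and then calls the helper is missing, leaving only its last two statements. A minimal sketch of what such a test could look like, assuming the default English stop words and the column names the helper selects ("filtered", "expected"); the fixture rows are illustrative, not the original data:

class StopWordsRemoverSuite extends SparkFunSuite with MLlibTestSparkContext {
  import StopWordsRemoverSuite._

  test("StopWordsRemover default") {
    // hypothetical fixture: the "raw" column is filtered into "filtered",
    // "expected" holds the tokens that should survive stop-word removal
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")
    val dataSet = sqlContext.createDataFrame(Seq(
      (Seq("test", "test"), Seq("test", "test")),
      (Seq("a", "the", "an"), Seq.empty[String])
    )).toDF("raw", "expected")
    dataSet.show()
    testStopWordsRemover(remover, dataSet)
  }
}

The helper then compares the produced "filtered" column against the hand-written "expected" column row by row.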
Example 99
Source File: TokenizerSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

    dataset1.show()
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization.")),
      TokenizerTestData("Te,st. punct", Array("Te,st.", "punct"))
    ))
    testRegexTokenizer(tokenizer2, dataset2)
  }
}

object RegexTokenizerSuite extends SparkFunSuite {
  def testRegexTokenizer(t: RegexTokenizer, dataset: DataFrame): Unit = {
    // transform() turns one DataFrame into another DataFrame
    t.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
}
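The class under test is truncated here as well: the listing jumps from the `TokenizerTestData` case class straight into the middle of a test, so `tokenizer0`, `dataset1`, and the enclosing `RegexTokenizerSuite` class never appear. A sketch of the missing opening; the regex pattern and the fixture rows are assumptions chosen so that the helper's comparison would pass, not the original values:

class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext {
  import RegexTokenizerSuite._

  test("RegexTokenizer") {
    // hypothetical setup: treat runs of word characters and single punctuation
    // marks as tokens
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset1 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
    ))
    testRegexTokenizer(tokenizer0, dataset1)
    // the surviving lines of the original test (dataset1.show(),
    // setMinTokenLength(3), the tokenizer2 block) would follow here
  }
}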
Example 100
Source File: MinMaxScalerSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.MLTestingUtils
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SQLContext}

        println(vector1 + "|||" + vector2)
        assert(vector1.equals(vector2), "Transformed vector is different with expected.")
      }
    }

    // copied model must have the same parent
    MLTestingUtils.checkCopy(model)
  }

  // MinMaxScaler linearly rescales each feature vector into a user-specified [min, max] range
  test("MinMaxScaler arguments max must be larger than min") {
    withClue("arguments max must be larger than min") {
      intercept[IllegalArgumentException] {
        val scaler = new MinMaxScaler().setMin(10).setMax(0)
        scaler.validateParams()
      }
      intercept[IllegalArgumentException] {
        val scaler = new MinMaxScaler().setMin(0).setMax(0)
        scaler.validateParams()
      }
    }
  }
}
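The fit-and-compare test that precedes the surviving fragment (the println/assert loop and the `MLTestingUtils.checkCopy(model)` call) is cut off. A minimal sketch of such a test, assuming the scaler writes to a "scaled" column and rescales into [-5, 5]; the fixture vectors and their hand-computed expectations are illustrative only:

  test("MinMaxScaler fit basic case") {
    val data = Array(
      Vectors.dense(1.0, 0.0, -1.0),
      Vectors.dense(2.0, 0.0, 0.0),
      Vectors.dense(3.0, 0.0, 1.0))
    // hand-scaled into [-5, 5]; a constant column maps to the middle of the range
    val expected = Array(
      Vectors.dense(-5.0, 0.0, -5.0),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(5.0, 0.0, 5.0))
    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")
    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setMin(-5)
      .setMax(5)
    val model = scaler.fit(df)
    model.transform(df).select("expected", "scaled").collect()
      .foreach { case Row(vector1: Vector, vector2: Vector) =>
        println(vector1 + "|||" + vector2)
        assert(vector1.equals(vector2), "Transformed vector is different with expected.")
      }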
Example 101
Source File: PolynomialExpansionSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.param.ParamsSuite import org.scalatest.exceptions.TestFailedException import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") {//参数 ParamsSuite.checkParams(new PolynomialExpansion) } test("Polynomial expansion with default parameter") {//带有默认参数的多项式展开 val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val twoDegreeExpansion: Array[Vector] = Array( Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)), Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29), Vectors.dense(new Array[Double](9)), Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0), Vectors.sparse(9, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") //transform()方法将DataFrame转化为另外一个DataFrame的算法 polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } //多项式展开设置 test("Polynomial expansion with setter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val threeDegreeExpansion: Array[Vector] = Array( Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8), Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)), Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17), Vectors.dense(new Array[Double](19)), Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8, -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0), Vectors.sparse(19, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) //transform()方法将DataFrame转化为另外一个DataFrame的算法 polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } }
Example 102
Source File: IDFSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

      assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }

  test("compute IDF with setter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    )
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0
    })
    val expected = scaleDataWithIDF(data, idf)
    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")
    // fit() produces a Transformer (the IDF model) from the DataFrame
    val idfModel = new IDF()
      .setInputCol("features")
      .setOutputCol("idfValue")
      .setMinDocFreq(1)
      .fit(df)
    // transform() turns one DataFrame into another DataFrame
    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }
}
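The listing starts after the suite's `scaleDataWithIDF` helper and its default-parameter test; only the tail of an assertion survives before the setter test. The helper is presumably an element-wise product of each document vector with the IDF vector — a sketch under that assumption:

  // element-wise product of each document vector with the IDF vector
  def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = {
    dataSet.map {
      case data: DenseVector =>
        Vectors.dense(data.toArray.zip(model.toArray).map { case (x, y) => x * y })
      case data: SparseVector =>
        Vectors.sparse(data.size,
          data.indices.zip(data.values).map { case (i, v) => (i, v * model(i)) }.toSeq)
    }
  }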
Example 103
Source File: NormalizerSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext { @transient var data: Array[Vector] = _ @transient var dataFrame: DataFrame = _ @transient var normalizer: Normalizer = _ @transient var l1Normalized: Array[Vector] = _ @transient var l2Normalized: Array[Vector] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))), Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))), Vectors.sparse(3, Seq()) ) l1Normalized = Array( Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.12765957, -0.23404255, -0.63829787), Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))), Vectors.dense(0.625, 0.07894737, 0.29605263), Vectors.sparse(3, Seq()) ) l2Normalized = Array( Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.184549876, -0.3383414, -0.922749378), Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))), Vectors.dense(0.897906166, 0.113419726, 0.42532397), Vectors.sparse(3, Seq()) ) val sqlContext = new SQLContext(sc) dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(NormalizerSuite.FeatureData)) normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized_features") } //收集的结果 def collectResult(result: DataFrame): Array[Vector] = { result.select("normalized_features").collect().map { case Row(features: Vector) => features } } //向量的断言类型 def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = { assert((lhs, rhs).zipped.forall { case (v1: DenseVector, v2: DenseVector) => true case (v1: SparseVector, v2: SparseVector) => true case _ => false }, "The vector type should be preserved after normalization.") } //断言值 def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = { assert((lhs, rhs).zipped.forall { (vector1, vector2) => vector1 ~== vector2 absTol 1E-5 }, "The vector value is not correct after normalization.") } test("Normalization with default parameter") {//默认参数的正常化 //transform()方法将DataFrame转化为另外一个DataFrame的算法 normalizer.transform(dataFrame).show() val result = collectResult(normalizer.transform(dataFrame)) assertTypeOfVector(data, result) assertValues(result, l2Normalized) } test("Normalization with setter") {//规范化设置 normalizer.setP(1) //transform()方法将DataFrame转化为另外一个DataFrame的算法 normalizer.transform(dataFrame).show() val result = collectResult(normalizer.transform(dataFrame)) assertTypeOfVector(data, result) assertValues(result, l1Normalized) } } private object NormalizerSuite { case class FeatureData(features: Vector) }
Example 104
Source File: RegressionEvaluatorSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {
    val trainer = new LinearRegression
    // fit() produces a Transformer (the model) from the DataFrame
    val model = trainer.fit(dataset)
    // transform() turns one DataFrame into another DataFrame carrying the predictions
    val predictions = model.transform(dataset)
    predictions.collect()

    // default = rmse; the root mean squared error reflects how spread out the residuals are
    val evaluator = new RegressionEvaluator()
    println("==MetricName=" + evaluator.getMetricName +
      "=LabelCol=" + evaluator.getLabelCol + "=PredictionCol=" + evaluator.getPredictionCol)
    // prints ==MetricName=rmse=LabelCol=label=PredictionCol=prediction
    assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001)

    // r2, also called the coefficient of determination, measures how well the model fits the data
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)

    // mae is the mean of the absolute differences between predictions and labels
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001)
  }
}
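The suite references a `dataset` field whose construction was dropped from the listing; the `LinearDataGenerator` import suggests it is built from synthetic linear data in `beforeAll`. A sketch of such a setup; the generator arguments (intercept, weights, feature means, feature variances, number of points, seed, noise) are assumed values:

  import org.apache.spark.sql.DataFrame

  @transient var dataset: DataFrame = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    // roughly y = 6.3 + 4.7 * x1 + 7.2 * x2 + noise over 100 points (assumed parameters)
    dataset = sqlContext.createDataFrame(
      sc.parallelize(LinearDataGenerator.generateLinearInput(
        6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2))
  }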
Example 105
Source File: ANNSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ANNSuite extends SparkFunSuite with MLlibTestSparkContext { // TODO: test for weights comparison with Weka MLP //人工神经网络与乙状结肠学习LBFGS优化器异或函数 test("ANN with Sigmoid learns XOR function with LBFGS optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array(0.0, 1.0, 1.0, 0.0) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights() val trainer = new FeedForwardTrainer(topology, 2, 1) //initialWeights初始取值,默认是0向量 trainer.setWeights(initialWeights) trainer.LBFGSOptimizer.setNumIterations(20) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input)(0), label(0)) }.collect() predictionAndLabels.foreach { case (p, l) => assert(math.round(p) === l) } } //人工神经网络与学习两输出和批量SoftMax GD优化器异或函数 test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array( Array(1.0, 0.0), Array(0.0, 1.0), Array(0.0, 1.0), Array(1.0, 0.0) ) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights() val trainer = new FeedForwardTrainer(topology, 2, 2) //(SGD随机梯度下降) trainer.SGDOptimizer.setNumIterations(2000) //initialWeights初始取值,默认是0向量 trainer.setWeights(initialWeights) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input), label) }.collect() predictionAndLabels.foreach { case (p, l) => assert(p ~== l absTol 0.5) } } }
Example 106
Source File: ChiSqSelectorSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext

// Feature extraction and transformation: chi-squared selection (ChiSqSelector)
// on sparse and dense vectors
class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("ChiSqSelector transform test (sparse & dense vector)") {
    // labeled discrete data; a LabeledPoint is a local vector (dense or sparse)
    // associated with a label
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2)
    // the expected data after filtering down to the single selected feature
    val preFilteredData =
      Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(6.0))),
        LabeledPoint(1.0, Vectors.dense(Array(8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(5.0))))
    // fit() trains a ChiSqSelectorModel on the labeled data
    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
    // transform() keeps only the selected features of each vector
    val filteredData = labeledDiscreteData.map { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    }.collect().toSet
    assert(filteredData == preFilteredData)
  }
}
Example 107
Source File: ElementwiseProductSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { //产品应适用于数据集在一个密集的矢量 test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) //批理变换和每个变换,得到相同的结果 //transform()方法将DataFrame转化为另外一个DataFrame的算法 val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } //元素(Hadamard)产品应正确运用向量的稀疏数据集 test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) //transform()方法将DataFrame转化为另外一个DataFrame的算法 val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 108
Source File: PCASuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.util.MLlibTestSparkContext

class PCASuite extends SparkFunSuite with MLlibTestSparkContext {

  private val data = Array(
    Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
    Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
    Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
  )

  private lazy val dataRDD = sc.parallelize(data, 2)

  test("Correct computing use a PCA wrapper") {
    val k = dataRDD.count().toInt
    // fit() trains a PCAModel on the input RDD
    val pca = new PCA(k).fit(dataRDD)
    // build a distributed RowMatrix from the same data
    val mat = new RowMatrix(dataRDD)
    // compute the principal components, reducing the dimensionality to k
    val pc = mat.computePrincipalComponents(k)
    // transform() projects every vector onto the principal components
    val pca_transform = pca.transform(dataRDD).collect()
    // multiplying the RowMatrix by the principal components should give the same projection
    val mat_multiply = mat.multiply(pc).rows.collect()
    assert(pca_transform.toSet === mat_multiply.toSet)
  }
}
Example 109
Source File: HashingTFSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext //把字符转换特征哈希值,返回词的频率 class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext { test("hashing tf on a single doc") {//散列在一个单一的文件 val hashingTF = new HashingTF(1000) val doc = "a a b b c d".split(" ") val n = hashingTF.numFeatures //词的频率 val termFreqs = Seq( (hashingTF.indexOf("a"), 2.0), (hashingTF.indexOf("b"), 2.0), (hashingTF.indexOf("c"), 1.0), (hashingTF.indexOf("d"), 1.0)) //termFreqs: Seq[(Int, Double)] = List((97,2.0), (98,2.0), (99,1.0), (100,1.0)) assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n), "index must be in range [0, #features)")//索引必须在范围内 assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing")//期待完美的哈希 val expected = Vectors.sparse(n, termFreqs) //transform 把每个输入文档映射到一个Vector对象 //transform()方法将DataFrame转化为另外一个DataFrame的算法 assert(hashingTF.transform(doc) === expected) } test("hashing tf on an RDD") {//散列TF在RDD val hashingTF = new HashingTF val localDocs: Seq[Seq[String]] = Seq( "a a b b b c d".split(" "), "a b c d a b c".split(" "), "c b a c b a a".split(" ")) val docs = sc.parallelize(localDocs, 2) //transform()方法将DataFrame转化为另外一个DataFrame的算法 assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet) } }
Example 110
Source File: ImpuritySuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.tree.impurity.{EntropyAggregator, GiniAggregator}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ImpuritySuite extends SparkFunSuite with MLlibTestSparkContext {

  test("Gini impurity does not support negative labels") {
    val gini = new GiniAggregator(2)
    intercept[IllegalArgumentException] {
      gini.update(Array(0.0, 1.0, 2.0), 0, -1, 0.0)
    }
  }

  test("Entropy does not support negative labels") {
    val entropy = new EntropyAggregator(2)
    intercept[IllegalArgumentException] {
      entropy.update(Array(0.0, 1.0, 2.0), 0, -1, 0.0)
    }
  }
}
Example 111
Source File: BaggedPointSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.tree.EnsembleTestHelper import org.apache.spark.mllib.util.MLlibTestSparkContext class BaggedPointSuite extends SparkFunSuite with MLlibTestSparkContext { test("BaggedPoint RDD: without subsampling") { val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false, 42) baggedRDD.collect().foreach { baggedPoint => assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1) } } test("BaggedPoint RDD: with subsampling with replacement (fraction = 1.0)") { val numSubsamples = 100 val (expectedMean, expectedStddev) = (1.0, 1.0) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, //epsilon代收敛的阀值 expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling with replacement (fraction = 0.5)") { val numSubsamples = 100 val subsample = 0.5 //math.abs返回数的绝对值 val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample)) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling without replacement (fraction = 1.0)") { val numSubsamples = 100 val (expectedMean, expectedStddev) = (1.0, 0) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling without replacement (fraction = 0.5)") { val numSubsamples = 100 val subsample = 0.5 //math.abs返回数的绝对值 val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample * (1 - subsample))) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } }
Example 112
Source File: MatrixFactorizationModelSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.recommendation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils sqlContext.createDataFrame(prodFeatures).show() } test("constructor") {//构造函数 val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) //预测得分,用户ID,产品ID println("========"+model.predict(0, 2)) //17.0 assert(model.predict(0, 2) ~== 17.0 relTol 1e-14) intercept[IllegalArgumentException] { new MatrixFactorizationModel(1, userFeatures, prodFeatures) } //userFeatures 用户特征 val userFeatures1 = sc.parallelize(Seq((0, Array(1.0)), (1, Array(3.0)))) intercept[IllegalArgumentException] { new MatrixFactorizationModel(rank, userFeatures1, prodFeatures) } //prodFeatures 产品特征 val prodFeatures1 = sc.parallelize(Seq((2, Array(5.0)))) intercept[IllegalArgumentException] { new MatrixFactorizationModel(rank, userFeatures, prodFeatures1) } } test("save/load") {//保存/加载 val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString def collect(features: RDD[(Int, Array[Double])]): Set[(Int, Seq[Double])] = { features.mapValues(_.toSeq).collect().toSet } try { model.save(sc, path) val newModel = MatrixFactorizationModel.load(sc, path) assert(newModel.rank === rank) //用户特征 assert(collect(newModel.userFeatures) === collect(userFeatures)) //产品特征 assert(collect(newModel.productFeatures) === collect(prodFeatures)) } finally { Utils.deleteRecursively(tempDir) } } test("batch predict API recommendProductsForUsers") {//批量预测API recommendproductsforusers val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val topK = 10 //为用户推荐个数为num的商品 val recommendations = model.recommendProductsForUsers(topK).collectAsMap() assert(recommendations(0)(0).rating ~== 17.0 relTol 1e-14) assert(recommendations(1)(0).rating ~== 39.0 relTol 1e-14) } test("batch predict API recommendUsersForProducts") { //userFeatures用户因子,prodFeatures商品因子,rank因子个数,因子个数一般越多越好,普通取值10到200 val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val topK = 10 //为用户推荐个数为num的商品 val recommendations = model.recommendUsersForProducts(topK).collectAsMap() assert(recommendations(2)(0).user == 1) assert(recommendations(2)(0).rating ~== 39.0 relTol 1e-14) assert(recommendations(2)(1).user == 0) assert(recommendations(2)(1).rating ~== 17.0 relTol 1e-14) } }
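This spark1.52 variant is truncated: it opens in the middle of a `beforeAll` (the stray `sqlContext.createDataFrame(prodFeatures).show()` call), and the `rank`, `userFeatures`, and `prodFeatures` fields used throughout the tests never appear. They presumably match the iolap version in Example 85 above; a sketch of that setup, labeled as an assumption borrowed from there:

  val rank = 2
  var userFeatures: RDD[(Int, Array[Double])] = _
  var prodFeatures: RDD[(Int, Array[Double])] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    userFeatures = sc.parallelize(Seq((0, Array(1.0, 2.0)), (1, Array(3.0, 4.0))))
    prodFeatures = sc.parallelize(Seq((2, Array(5.0, 6.0))))
    // the truncated listing resumes here with sqlContext.createDataFrame(prodFeatures).show()
  }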
Example 113
Source File: AreaUnderCurveSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class AreaUnderCurveSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("auc computation") {
    val curve = Seq((0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 0.0))
    val auc = 4.0
    assert(AreaUnderCurve.of(curve) ~== auc absTol 1E-5)
    // the tolerance 1E-5 means 1 x 10^-5, i.e. 0.00001
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== auc absTol 1E-5)
  }

  test("auc of an empty curve") {
    val curve = Seq.empty[(Double, Double)]
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }

  test("auc of a curve with a single point") {
    val curve = Seq((1.0, 1.0))
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }
}
Example 114
Source File: FPTreeSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.fpm import scala.language.existentials import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class FPTreeSuite extends SparkFunSuite with MLlibTestSparkContext { test("add transaction") {//增加转换 val tree = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("b")) assert(tree.root.children.size == 2) assert(tree.root.children.contains("a")) assert(tree.root.children("a").item.equals("a")) assert(tree.root.children("a").count == 2) assert(tree.root.children.contains("b")) assert(tree.root.children("b").item.equals("b")) assert(tree.root.children("b").count == 1) var child = tree.root.children("a") assert(child.children.size == 1) assert(child.children.contains("b")) assert(child.children("b").item.equals("b")) assert(child.children("b").count == 2) child = child.children("b") assert(child.children.size == 2) assert(child.children.contains("c")) assert(child.children.contains("y")) assert(child.children("c").item.equals("c")) assert(child.children("y").item.equals("y")) assert(child.children("c").count == 1) assert(child.children("y").count == 1) } test("merge tree") {//合并树 val tree1 = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("b")) val tree2 = new FPTree[String] .add(Seq("a", "b")) .add(Seq("a", "b", "c")) .add(Seq("a", "b", "c", "d")) .add(Seq("a", "x")) .add(Seq("a", "x", "y")) .add(Seq("c", "n")) .add(Seq("c", "m")) val tree3 = tree1.merge(tree2) assert(tree3.root.children.size == 3) assert(tree3.root.children("a").count == 7) assert(tree3.root.children("b").count == 1) assert(tree3.root.children("c").count == 2) val child1 = tree3.root.children("a") assert(child1.children.size == 2) assert(child1.children("b").count == 5) assert(child1.children("x").count == 2) val child2 = child1.children("b") assert(child2.children.size == 2) assert(child2.children("y").count == 1) assert(child2.children("c").count == 3) val child3 = child2.children("c") assert(child3.children.size == 1) assert(child3.children("d").count == 1) val child4 = child1.children("x") assert(child4.children.size == 1) assert(child4.children("y").count == 1) val child5 = tree3.root.children("c") assert(child5.children.size == 2) assert(child5.children("n").count == 1) assert(child5.children("m").count == 1) } test("extract freq itemsets") {//频繁项集的提取物 val tree = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("a", "b")) .add(Seq("a")) .add(Seq("b")) .add(Seq("b", "n")) val freqItemsets = tree.extract(3L).map { case (items, count) => (items.toSet, count) }.toSet val expected = Set( (Set("a"), 4L), (Set("b"), 5L), (Set("a", "b"), 3L)) assert(freqItemsets === expected) } }
Example 115
Source File: AssociationRulesSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.fpm

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

// frequent pattern mining: association rules
class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("association rules using String type") {
    // frequent itemsets with their absolute support counts
    val freqItemsets = sc.parallelize(Seq(
      (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L),
      (Set("r"), 3L),
      (Set("x", "z"), 3L), (Set("t", "y"), 3L), (Set("t", "x"), 3L), (Set("s", "x"), 3L),
      (Set("y", "x"), 3L), (Set("y", "z"), 3L), (Set("t", "z"), 3L),
      (Set("y", "x", "z"), 3L), (Set("t", "x", "z"), 3L), (Set("t", "y", "z"), 3L),
      (Set("t", "y", "x"), 3L),
      (Set("t", "y", "x", "z"), 3L)
    ).map {
      case (items, freq) => new FPGrowth.FreqItemset(items.toArray, freq)
    })

    val ar = new AssociationRules()
    val results1 = ar
      .setMinConfidence(0.9)
      .run(freqItemsets)
      .collect()

    assert(results2.size === 30)
    // math.abs returns the absolute value
    assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
  }
}
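The listing skips the assertions on `results1` and the construction of `results2`, which is why `results2` appears out of nowhere in the surviving checks. A plausible reconstruction of the missing middle, assuming `results2` is simply the same generator re-run with a lower minimum confidence; the counts come from the surviving assertions, the rest is an assumption:

    // at confidence >= 0.9 only the rules with confidence 1.0 survive
    assert(results1.size === 23)
    assert(results1.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)

    // with no confidence threshold every candidate rule is kept
    val results2 = ar
      .setMinConfidence(0)
      .run(freqItemsets)
      .collect()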
Example 116
Source File: KernelDensitySuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat import org.apache.commons.math3.distribution.NormalDistribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext { test("kernel density single sample") {//核密度单样本 val rdd = sc.parallelize(Array(5.0)) val evaluationPoints = Array(5.0, 6.0) val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints) val normal = new NormalDistribution(5.0, 3.0) val acceptableErr = 1e-6 //math.abs返回数的绝对值 assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr) assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr) } test("kernel density multiple samples") {//核密度多样本 val rdd = sc.parallelize(Array(5.0, 10.0)) val evaluationPoints = Array(5.0, 6.0) val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints) val normal1 = new NormalDistribution(5.0, 3.0) val normal2 = new NormalDistribution(10.0, 3.0) val acceptableErr = 1e-6 //math.abs返回数的绝对值 assert(math.abs( densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr) assert(math.abs( densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr) } }
Example 117
Source File: MultivariateGaussianSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{ Vectors, Matrices }
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("univariate") { // univariate case
    val x1 = Vectors.dense(0.0)
    val x2 = Vectors.dense(1.5)
    val mu = Vectors.dense(0.0)
    // dense 1 x 1 covariance matrix
    val sigma1 = Matrices.dense(1, 1, Array(1.0))
    // multivariate Gaussian distribution
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)

    val sigma2 = Matrices.dense(1, 1, Array(4.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
  }

  test("multivariate") { // multivariate case
    val x1 = Vectors.dense(0.0, 0.0) // create a dense vector
    val x2 = Vectors.dense(1.0, 1.0) // create a dense vector
    val mu = Vectors.dense(0.0, 0.0) // create a dense vector
    val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)

    val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
  }

  test("multivariate degenerate") { // degenerate (singular covariance) case
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)
    val mu = Vectors.dense(0.0, 0.0)
    val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
    val dist = new MultivariateGaussian(mu, sigma)
    assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
    assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)
  }

  test("SPARK-11302") {
    val x = Vectors.dense(629, 640, 1.7188, 618.19)
    val mu = Vectors.dense(
      1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697)
    val sigma = Matrices.dense(4, 4, Array(
      166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053,
      169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484,
      12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373,
      164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207))
    val dist = new MultivariateGaussian(mu, sigma)
    // Agrees with R's dmvnorm: 7.154782e-05
    assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9)
  }
}
Example 118
Source File: CoordinateMatrixSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.linalg.Vectors

class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {

  // The original example was truncated; this minimal setup (a 5 x 4 matrix with a few
  // non-zero entries) is a sketch that makes the remaining assertions runnable.
  val m = 5
  val n = 4
  var mat: CoordinateMatrix = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val entries = sc.parallelize(Seq(
      (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0),
      (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map { case (i, j, value) =>
      MatrixEntry(i, j, value)
    }
    mat = new CoordinateMatrix(entries)
  }

  test("toBlockMatrix") {
    val blockMat = mat.toBlockMatrix(2, 2)
    assert(blockMat.numRows() === m)
    assert(blockMat.numCols() === n)
    assert(blockMat.toBreeze() === mat.toBreeze())

    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(-1, 2)
    }
    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(2, 0)
    }
  }
}
Example 119
Source File: MLPairRDDFunctionsSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.rdd.MLPairRDDFunctions._

class MLPairRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("topByKey") {
    val topMap = sc.parallelize(
      Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7), (5, 1), (3, 5)), 2)
      .topByKey(5)
      // collect the results as a map keyed by k
      .collectAsMap()

    assert(topMap.size === 3)
    assert(topMap(1) === Array(7, 6, 3, 2, 1))
    assert(topMap(3) === Array(7, 5, 2))
    assert(topMap(5) === Array(1))
  }
}
Example 120
Source File: RDDFunctionsSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.rdd.RDDFunctions._

class RDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("sliding") {
    val data = 0 until 6
    for (numPartitions <- 1 to 8) {
      val rdd = sc.parallelize(data, numPartitions)
      for (windowSize <- 1 to 6) {
        val sliding = rdd.sliding(windowSize).collect().map(_.toList).toList
        val expected = data.sliding(windowSize).map(_.toList).toList
        assert(sliding === expected)
      }
      assert(rdd.sliding(7).collect().isEmpty,
        "Should return an empty RDD if the window size is greater than the number of items.")
    }
  }

  test("sliding with empty partitions") {
    val data = Seq(Seq(1, 2, 3), Seq.empty[Int], Seq(4), Seq.empty[Int], Seq(5, 6, 7))
    // flattened: Array(1, 2, 3, 4, 5, 6, 7)
    val rdd = sc.parallelize(data, data.length).flatMap(s => s)
    // data.length = 5
    assert(rdd.partitions.size === data.length)
    // sliding window of size 3 over the data
    val sliding = rdd.sliding(3).collect().toSeq.map(_.toSeq)
    // expected: Seq[Seq[Int]] = Stream(List(1, 2, 3), ?)
    val expected = data.flatMap(x => x).sliding(3).toSeq.map(_.toSeq)
    assert(sliding === expected)
  }
}
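A common use of RDDFunctions.sliding outside of tests is computing windowed statistics such as a moving average. A minimal sketch (illustrative data, reusing the `sc` provided by MLlibTestSparkContext):

import org.apache.spark.mllib.rdd.RDDFunctions._

val values = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0, 5.0), 2)

// Each window is an Array of 3 consecutive values, in order across partitions.
val movingAvg = values.sliding(3).map(window => window.sum / window.length)
// movingAvg.collect() gives Array(2.0, 3.0, 4.0)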
Example 121
Source File: BaggedPointSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.tree.EnsembleTestHelper import org.apache.spark.mllib.util.MLlibTestSparkContext class BaggedPointSuite extends SparkFunSuite with MLlibTestSparkContext { test("BaggedPoint RDD: without subsampling") { val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false, 42) baggedRDD.collect().foreach { baggedPoint => assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1) } } test("BaggedPoint RDD: with subsampling with replacement (fraction = 1.0)") { val numSubsamples = 100 val (expectedMean, expectedStddev) = (1.0, 1.0) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling with replacement (fraction = 0.5)") { val numSubsamples = 100 val subsample = 0.5 val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample)) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling without replacement (fraction = 1.0)") { val numSubsamples = 100 val (expectedMean, expectedStddev) = (1.0, 0) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling without replacement (fraction = 0.5)") { val numSubsamples = 100 val subsample = 0.5 val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample * (1 - subsample))) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } }
Example 122
Source File: GradientBoostedTreesSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.mllib.tree.{GradientBoostedTreesSuite => OldGBTSuite} import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy} import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.impurity.Variance import org.apache.spark.mllib.tree.loss.{AbsoluteError, LogLoss, SquaredError} import org.apache.spark.mllib.util.MLlibTestSparkContext class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext with Logging { import testImplicits._ test("runWithValidation stops early and performs better on a validation dataset") { // Set numIterations large enough so that it stops early. val numIterations = 20 val trainRdd = sc.parallelize(OldGBTSuite.trainData, 2).map(_.asML) val validateRdd = sc.parallelize(OldGBTSuite.validateData, 2).map(_.asML) val trainDF = trainRdd.toDF() val validateDF = validateRdd.toDF() val algos = Array(Regression, Regression, Classification) val losses = Array(SquaredError, AbsoluteError, LogLoss) algos.zip(losses).foreach { case (algo, loss) => val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2, categoricalFeaturesInfo = Map.empty) val boostingStrategy = new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0) val (validateTrees, validateTreeWeights) = GradientBoostedTrees .runWithValidation(trainRdd, validateRdd, boostingStrategy, 42L, "all") val numTrees = validateTrees.length assert(numTrees !== numIterations) // Test that it performs better on the validation dataset. val (trees, treeWeights) = GradientBoostedTrees.run(trainRdd, boostingStrategy, 42L, "all") val (errorWithoutValidation, errorWithValidation) = { if (algo == Classification) { val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features)) (GradientBoostedTrees.computeError(remappedRdd, trees, treeWeights, loss), GradientBoostedTrees.computeError(remappedRdd, validateTrees, validateTreeWeights, loss)) } else { (GradientBoostedTrees.computeError(validateRdd, trees, treeWeights, loss), GradientBoostedTrees.computeError(validateRdd, validateTrees, validateTreeWeights, loss)) } } assert(errorWithValidation <= errorWithoutValidation) // Test that results from evaluateEachIteration comply with runWithValidation. // Note that convergenceTol is set to 0.0 val evaluationArray = GradientBoostedTrees .evaluateEachIteration(validateRdd, trees, treeWeights, loss, algo) assert(evaluationArray.length === numIterations) assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1)) var i = 1 while (i < numTrees) { assert(evaluationArray(i) <= evaluationArray(i - 1)) i += 1 } } } }
Example 123
Source File: TopByKeyAggregatorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.recommendation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Dataset class TopByKeyAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext { private def getTopK(k: Int): Dataset[(Int, Array[(Int, Float)])] = { val sqlContext = spark.sqlContext import sqlContext.implicits._ val topKAggregator = new TopByKeyAggregator[Int, Int, Float](k, Ordering.by(_._2)) Seq( (0, 3, 54f), (0, 4, 44f), (0, 5, 42f), (0, 6, 28f), (1, 3, 39f), (2, 3, 51f), (2, 5, 45f), (2, 6, 18f) ).toDS().groupByKey(_._1).agg(topKAggregator.toColumn) } test("topByKey with k < #items") { val topK = getTopK(2) assert(topK.count() === 3) val expected = Map( 0 -> Array((3, 54f), (4, 44f)), 1 -> Array((3, 39f)), 2 -> Array((3, 51f), (5, 45f)) ) checkTopK(topK, expected) } test("topByKey with k > #items") { val topK = getTopK(5) assert(topK.count() === 3) val expected = Map( 0 -> Array((3, 54f), (4, 44f), (5, 42f), (6, 28f)), 1 -> Array((3, 39f)), 2 -> Array((3, 51f), (5, 45f), (6, 18f)) ) checkTopK(topK, expected) } private def checkTopK( topK: Dataset[(Int, Array[(Int, Float)])], expected: Map[Int, Array[(Int, Float)]]): Unit = { topK.collect().foreach { case (id, recs) => assert(recs === expected(id)) } } }
Example 124
Source File: BinaryClassificationEvaluatorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext class BinaryClassificationEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new BinaryClassificationEvaluator) } test("read/write") { val evaluator = new BinaryClassificationEvaluator() .setRawPredictionCol("myRawPrediction") .setLabelCol("myLabel") .setMetricName("areaUnderPR") testDefaultReadWrite(evaluator) } test("should accept both vector and double raw prediction col") { val evaluator = new BinaryClassificationEvaluator() .setMetricName("areaUnderPR") val vectorDF = Seq( (0d, Vectors.dense(12, 2.5)), (1d, Vectors.dense(1, 3)), (0d, Vectors.dense(10, 2)) ).toDF("label", "rawPrediction") assert(evaluator.evaluate(vectorDF) === 1.0) val doubleDF = Seq( (0d, 0d), (1d, 1d), (0d, 0d) ).toDF("label", "rawPrediction") assert(evaluator.evaluate(doubleDF) === 1.0) val stringDF = Seq( (0d, "0d"), (1d, "1d"), (0d, "0d") ).toDF("label", "rawPrediction") val thrown = intercept[IllegalArgumentException] { evaluator.evaluate(stringDF) } assert(thrown.getMessage.replace("\n", "") contains "Column rawPrediction must be of type " + "equal to one of the following types: [DoubleType, ") assert(thrown.getMessage.replace("\n", "") contains "but was actually of type StringType.") } test("should support all NumericType labels and not support other types") { val evaluator = new BinaryClassificationEvaluator().setRawPredictionCol("prediction") MLTestingUtils.checkNumericTypes(evaluator, spark) } }
Example 125
Source File: MulticlassClassificationEvaluatorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext class MulticlassClassificationEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new MulticlassClassificationEvaluator) } test("read/write") { val evaluator = new MulticlassClassificationEvaluator() .setPredictionCol("myPrediction") .setLabelCol("myLabel") .setMetricName("accuracy") testDefaultReadWrite(evaluator) } test("should support all NumericType labels and not support other types") { MLTestingUtils.checkNumericTypes(new MulticlassClassificationEvaluator, spark) } }
Example 126
Source File: ClusteringEvaluatorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Dataset class ClusteringEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ @transient var irisDataset: Dataset[_] = _ override def beforeAll(): Unit = { super.beforeAll() irisDataset = spark.read.format("libsvm").load("../data/mllib/iris_libsvm.txt") } test("params") { ParamsSuite.checkParams(new ClusteringEvaluator) } test("read/write") { val evaluator = new ClusteringEvaluator() .setPredictionCol("myPrediction") .setFeaturesCol("myLabel") testDefaultReadWrite(evaluator) } test("squared euclidean Silhouette") { val evaluator = new ClusteringEvaluator() .setFeaturesCol("features") .setPredictionCol("label") assert(evaluator.evaluate(irisDataset) ~== 0.6564679231 relTol 1e-5) } test("number of clusters must be greater than one") { val singleClusterDataset = irisDataset.where($"label" === 0.0) val evaluator = new ClusteringEvaluator() .setFeaturesCol("features") .setPredictionCol("label") val e = intercept[AssertionError]{ evaluator.evaluate(singleClusterDataset) } assert(e.getMessage.contains("Number of clusters must be greater than one")) } }
Example 127
Source File: RegressionEvaluatorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext} import org.apache.spark.mllib.util.TestingUtils._ class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new RegressionEvaluator) } test("Regression Evaluator: default params") { val trainer = new LinearRegression val model = trainer.fit(dataset) val predictions = model.transform(dataset) // default = rmse val evaluator = new RegressionEvaluator() assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01) // r2 score evaluator.setMetricName("r2") assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01) // mae evaluator.setMetricName("mae") assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01) } test("read/write") { val evaluator = new RegressionEvaluator() .setPredictionCol("myPrediction") .setLabelCol("myLabel") .setMetricName("r2") testDefaultReadWrite(evaluator) } test("should support all NumericType labels and not support other types") { MLTestingUtils.checkNumericTypes(new RegressionEvaluator, spark) } }
Example 128
Source File: RWrapperUtilsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.{RFormula, RFormulaModel} import org.apache.spark.mllib.util.MLlibTestSparkContext class RWrapperUtilsSuite extends SparkFunSuite with MLlibTestSparkContext { test("avoid libsvm data column name conflicting") { val rFormula = new RFormula().setFormula("label ~ features") val data = spark.read.format("libsvm").load("../data/mllib/sample_libsvm_data.txt") // if not checking column name, then IllegalArgumentException intercept[IllegalArgumentException] { rFormula.fit(data) } // after checking, model build is ok RWrapperUtils.checkDataColumns(rFormula, data) assert(rFormula.getLabelCol == "label") assert(rFormula.getFeaturesCol.startsWith("features_")) val model = rFormula.fit(data) assert(model.isInstanceOf[RFormulaModel]) assert(model.getLabelCol == "label") assert(model.getFeaturesCol.startsWith("features_")) } }
Example 129
Source File: PredictorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasWeightCol import org.apache.spark.ml.util._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext { import PredictorSuite._ test("should support all NumericType labels and weights, and not support other types") { val df = spark.createDataFrame(Seq( (0, 1, Vectors.dense(0, 2, 3)), (1, 2, Vectors.dense(0, 3, 9)), (0, 3, Vectors.dense(0, 2, 6)) )).toDF("label", "weight", "features") val types = Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0)) val predictor = new MockPredictor().setWeightCol("weight") types.foreach { t => predictor.fit(df.select(col("label").cast(t), col("weight").cast(t), col("features"))) } intercept[IllegalArgumentException] { predictor.fit(df.select(col("label").cast(StringType), col("weight"), col("features"))) } intercept[IllegalArgumentException] { predictor.fit(df.select(col("label"), col("weight").cast(StringType), col("features"))) } } } object PredictorSuite { class MockPredictor(override val uid: String) extends Predictor[Vector, MockPredictor, MockPredictionModel] with HasWeightCol { def this() = this(Identifiable.randomUID("mockpredictor")) def setWeightCol(value: String): this.type = set(weightCol, value) override def train(dataset: Dataset[_]): MockPredictionModel = { require(dataset.schema("label").dataType == DoubleType) require(dataset.schema("weight").dataType == DoubleType) new MockPredictionModel(uid) } override def copy(extra: ParamMap): MockPredictor = throw new NotImplementedError() } class MockPredictionModel(override val uid: String) extends PredictionModel[Vector, MockPredictionModel] { def this() = this(Identifiable.randomUID("mockpredictormodel")) override def predict(features: Vector): Double = throw new NotImplementedError() override def copy(extra: ParamMap): MockPredictionModel = throw new NotImplementedError() } }
Example 130
Source File: CorrelationSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.stat import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.{Matrices, Matrix, Vectors} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Logging { val xData = Array(1.0, 0.0, -2.0) val yData = Array(4.0, 5.0, 3.0) val zeros = new Array[Double](3) val data = Seq( Vectors.dense(1.0, 0.0, 0.0, -2.0), Vectors.dense(4.0, 5.0, 0.0, 3.0), Vectors.dense(6.0, 7.0, 0.0, 8.0), Vectors.dense(9.0, 0.0, 0.0, 1.0) ) private def X = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") private def extract(df: DataFrame): BDM[Double] = { val Array(Row(mat: Matrix)) = df.collect() mat.asBreeze.toDenseMatrix } test("corr(X) default, pearson") { val defaultMat = Correlation.corr(X, "features") val pearsonMat = Correlation.corr(X, "features", "pearson") // scalastyle:off val expected = Matrices.fromBreeze(BDM( (1.00000000, 0.05564149, Double.NaN, 0.4004714), (0.05564149, 1.00000000, Double.NaN, 0.9135959), (Double.NaN, Double.NaN, 1.00000000, Double.NaN), (0.40047142, 0.91359586, Double.NaN, 1.0000000))) // scalastyle:on assert(Matrices.fromBreeze(extract(defaultMat)) ~== expected absTol 1e-4) assert(Matrices.fromBreeze(extract(pearsonMat)) ~== expected absTol 1e-4) } test("corr(X) spearman") { val spearmanMat = Correlation.corr(X, "features", "spearman") // scalastyle:off val expected = Matrices.fromBreeze(BDM( (1.0000000, 0.1054093, Double.NaN, 0.4000000), (0.1054093, 1.0000000, Double.NaN, 0.9486833), (Double.NaN, Double.NaN, 1.00000000, Double.NaN), (0.4000000, 0.9486833, Double.NaN, 1.0000000))) // scalastyle:on assert(Matrices.fromBreeze(extract(spearmanMat)) ~== expected absTol 1e-4) } }
Example 131
Source File: ChiSquareTestSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.stat import java.util.Random import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.stat.test.ChiSqTest import org.apache.spark.mllib.util.MLlibTestSparkContext class ChiSquareTestSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("test DataFrame of labeled points") { // labels: 1.0 (2 / 6), 0.0 (4 / 6) // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6) // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6) val data = Seq( LabeledPoint(0.0, Vectors.dense(0.5, 10.0)), LabeledPoint(0.0, Vectors.dense(1.5, 20.0)), LabeledPoint(1.0, Vectors.dense(1.5, 30.0)), LabeledPoint(0.0, Vectors.dense(3.5, 30.0)), LabeledPoint(0.0, Vectors.dense(3.5, 40.0)), LabeledPoint(1.0, Vectors.dense(3.5, 40.0))) for (numParts <- List(2, 4, 6, 8)) { val df = spark.createDataFrame(sc.parallelize(data, numParts)) val chi = ChiSquareTest.test(df, "features", "label") val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = chi.select("pValues", "degreesOfFreedom", "statistics") .as[(Vector, Array[Int], Vector)].head() assert(pValues ~== Vectors.dense(0.6873, 0.6823) relTol 1e-4) assert(degreesOfFreedom === Array(2, 3)) assert(statistics ~== Vectors.dense(0.75, 1.5) relTol 1e-4) } } test("large number of features (SPARK-3087)") { // Test that the right number of results is returned val numCols = 1001 val sparseData = Array( LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))), LabeledPoint(0.1, Vectors.sparse(numCols, Seq((200, 1.0))))) val df = spark.createDataFrame(sparseData) val chi = ChiSquareTest.test(df, "features", "label") val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = chi.select("pValues", "degreesOfFreedom", "statistics") .as[(Vector, Array[Int], Vector)].head() assert(pValues.size === numCols) assert(degreesOfFreedom.length === numCols) assert(statistics.size === numCols) assert(pValues(1000) !== null) // SPARK-3087 } test("fail on continuous features or labels") { val tooManyCategories: Int = 100000 assert(tooManyCategories > ChiSqTest.maxCategories, "This unit test requires that " + "tooManyCategories be large enough to cause ChiSqTest to throw an exception.") val random = new Random(11L) val continuousLabel = Seq.fill(tooManyCategories)( LabeledPoint(random.nextDouble(), Vectors.dense(random.nextInt(2)))) withClue("ChiSquare should throw an exception when given a continuous-valued label") { intercept[SparkException] { val df = spark.createDataFrame(continuousLabel) ChiSquareTest.test(df, "features", "label") } } val continuousFeature = Seq.fill(tooManyCategories)( LabeledPoint(random.nextInt(2), Vectors.dense(random.nextDouble()))) withClue("ChiSquare should throw an exception when given continuous-valued features") { intercept[SparkException] { val df = spark.createDataFrame(continuousFeature) ChiSquareTest.test(df, "features", "label") } } } }
Example 132
Source File: StopwatchSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util import java.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext { import StopwatchSuite._ private def testStopwatchOnDriver(sw: Stopwatch): Unit = { assert(sw.name === "sw") assert(sw.elapsed() === 0L) assert(!sw.isRunning) intercept[AssertionError] { sw.stop() } val duration = checkStopwatch(sw) val elapsed = sw.elapsed() assert(elapsed === duration) val duration2 = checkStopwatch(sw) val elapsed2 = sw.elapsed() assert(elapsed2 === duration + duration2) assert(sw.toString === s"sw: ${elapsed2}ms") sw.start() assert(sw.isRunning) intercept[AssertionError] { sw.start() } } test("LocalStopwatch") { val sw = new LocalStopwatch("sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on driver") { val sw = new DistributedStopwatch(sc, "sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on executors") { val sw = new DistributedStopwatch(sc, "sw") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => acc.add(checkStopwatch(sw)) } assert(!sw.isRunning) val elapsed = sw.elapsed() assert(elapsed === acc.value) } test("MultiStopwatch") { val sw = new MultiStopwatch(sc) .addLocal("local") .addDistributed("spark") assert(sw("local").name === "local") assert(sw("spark").name === "spark") intercept[NoSuchElementException] { sw("some") } assert(sw.toString === "{\n local: 0ms,\n spark: 0ms\n}") val localDuration = checkStopwatch(sw("local")) val sparkDuration = checkStopwatch(sw("spark")) val localElapsed = sw("local").elapsed() val sparkElapsed = sw("spark").elapsed() assert(localElapsed === localDuration) assert(sparkElapsed === sparkDuration) assert(sw.toString === s"{\n local: ${localElapsed}ms,\n spark: ${sparkElapsed}ms\n}") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => sw("local").start() val duration = checkStopwatch(sw("spark")) sw("local").stop() acc.add(duration) } val localElapsed2 = sw("local").elapsed() assert(localElapsed2 === localElapsed) val sparkElapsed2 = sw("spark").elapsed() assert(sparkElapsed2 === sparkElapsed + acc.value) } } private object StopwatchSuite extends SparkFunSuite { private def now: Long = System.currentTimeMillis() }
Example 133
Source File: RDDLossFunctionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim.loss import org.apache.spark.SparkFunSuite import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} import org.apache.spark.ml.optim.aggregator.DifferentiableLossAggregatorSuite.TestAggregator import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD class RDDLossFunctionSuite extends SparkFunSuite with MLlibTestSparkContext { @transient var instances: RDD[Instance] = _ override def beforeAll(): Unit = { super.beforeAll() instances = sc.parallelize(Seq( Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)), Instance(1.0, 0.5, Vectors.dense(1.5, 1.0)), Instance(2.0, 0.3, Vectors.dense(4.0, 0.5)) )) } test("regularization") { val coefficients = Vectors.dense(0.5, -0.1) val regLossFun = new L2Regularization(0.1, (_: Int) => true, None) val getAgg = (bvec: Broadcast[Vector]) => new TestAggregator(2)(bvec.value) val lossNoReg = new RDDLossFunction(instances, getAgg, None) val lossWithReg = new RDDLossFunction(instances, getAgg, Some(regLossFun)) val (loss1, grad1) = lossNoReg.calculate(coefficients.asBreeze.toDenseVector) val (regLoss, regGrad) = regLossFun.calculate(coefficients) val (loss2, grad2) = lossWithReg.calculate(coefficients.asBreeze.toDenseVector) BLAS.axpy(1.0, Vectors.fromBreeze(grad1), regGrad) assert(regGrad ~== Vectors.fromBreeze(grad2) relTol 1e-5) assert(loss1 + regLoss === loss2) } test("empty RDD") { val rdd = sc.parallelize(Seq.empty[Instance]) val coefficients = Vectors.dense(0.5, -0.1) val getAgg = (bv: Broadcast[Vector]) => new TestAggregator(2)(bv.value) val lossFun = new RDDLossFunction(rdd, getAgg, None) withClue("cannot calculate cost for empty dataset") { intercept[IllegalArgumentException]{ lossFun.calculate(coefficients.asBreeze.toDenseVector) } } } test("versus aggregating on an iterable") { val coefficients = Vectors.dense(0.5, -0.1) val getAgg = (bv: Broadcast[Vector]) => new TestAggregator(2)(bv.value) val lossFun = new RDDLossFunction(instances, getAgg, None) val (loss, grad) = lossFun.calculate(coefficients.asBreeze.toDenseVector) // just map the aggregator over the instances array val agg = new TestAggregator(2)(coefficients) instances.collect().foreach(agg.add) assert(loss === agg.loss) assert(Vectors.fromBreeze(grad) === agg.gradient) } }
Example 134
Source File: ANNSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext class ANNSuite extends SparkFunSuite with MLlibTestSparkContext { // TODO: test for weights comparison with Weka MLP test("ANN with Sigmoid learns XOR function with LBFGS optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array(0.0, 1.0, 1.0, 0.0) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights val trainer = new FeedForwardTrainer(topology, 2, 1) trainer.setWeights(initialWeights) trainer.LBFGSOptimizer.setNumIterations(20) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input)(0), label(0)) }.collect() predictionAndLabels.foreach { case (p, l) => assert(math.round(p) === l) } } test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array( Array(1.0, 0.0), Array(0.0, 1.0), Array(0.0, 1.0), Array(1.0, 0.0) ) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights val trainer = new FeedForwardTrainer(topology, 2, 2) // TODO: add a test for SGD trainer.LBFGSOptimizer.setConvergenceTol(1e-4).setNumIterations(20) trainer.setWeights(initialWeights).setStackSize(1) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input), label) }.collect() predictionAndLabels.foreach { case (p, l) => assert(p ~== l absTol 0.5) } } }
Example 135
Source File: GradientSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class GradientSuite extends SparkFunSuite with MLlibTestSparkContext { test("Gradient computation against numerical differentiation") { val input = new BDM[Double](3, 1, Array(1.0, 1.0, 1.0)) // output must contain zeros and one 1 for SoftMax val target = new BDM[Double](2, 1, Array(0.0, 1.0)) val topology = FeedForwardTopology.multiLayerPerceptron(Array(3, 4, 2), softmaxOnTop = false) val layersWithErrors = Seq( new SigmoidLayerWithSquaredError(), new SoftmaxLayerWithCrossEntropyLoss() ) // check all layers that provide loss computation // 1) compute loss and gradient given the model and initial weights // 2) modify weights with small number epsilon (per dimension i) // 3) compute new loss // 4) ((newLoss - loss) / epsilon) should be close to the i-th component of the gradient for (layerWithError <- layersWithErrors) { topology.layers(topology.layers.length - 1) = layerWithError val model = topology.model(seed = 12L) val weights = model.weights.toArray val numWeights = weights.size val gradient = Vectors.dense(Array.fill[Double](numWeights)(0.0)) val loss = model.computeGradient(input, target, gradient, 1) val eps = 1e-4 var i = 0 val tol = 1e-4 while (i < numWeights) { val originalValue = weights(i) weights(i) += eps val newModel = topology.model(Vectors.dense(weights)) val newLoss = computeLoss(input, target, newModel) val derivativeEstimate = (newLoss - loss) / eps assert(math.abs(gradient(i) - derivativeEstimate) < tol, "Layer failed gradient check: " + layerWithError.getClass) weights(i) = originalValue i += 1 } } } private def computeLoss(input: BDM[Double], target: BDM[Double], model: TopologyModel): Double = { val outputs = model.forward(input, true) model.layerModels.last match { case layerWithLoss: LossFunction => layerWithLoss.loss(outputs.last, target, new BDM[Double](target.rows, target.cols)) case _ => throw new UnsupportedOperationException("Top layer is required to have loss." + " Failed layer:" + model.layerModels.last.getClass) } } }
Example 136
Source File: ImageSchemaSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.image import java.nio.file.Paths import java.util.Arrays import org.apache.spark.SparkFunSuite import org.apache.spark.ml.image.ImageSchema._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row import org.apache.spark.sql.types._ class ImageSchemaSuite extends SparkFunSuite with MLlibTestSparkContext { // Single column of images named "image" private lazy val imagePath = "../data/mllib/images" test("Smoke test: create basic ImageSchema dataframe") { val origin = "path" val width = 1 val height = 1 val nChannels = 3 val data = Array[Byte](0, 0, 0) val mode = ocvTypes("CV_8UC3") // Internal Row corresponds to image StructType val rows = Seq(Row(Row(origin, height, width, nChannels, mode, data)), Row(Row(null, height, width, nChannels, mode, data))) val rdd = sc.makeRDD(rows) val df = spark.createDataFrame(rdd, ImageSchema.imageSchema) assert(df.count === 2, "incorrect image count") assert(df.schema("image").dataType == columnSchema, "data do not fit ImageSchema") } test("readImages count test") { var df = readImages(imagePath) assert(df.count === 1) df = readImages(imagePath, null, true, -1, false, 1.0, 0) assert(df.count === 10) df = readImages(imagePath, null, true, -1, true, 1.0, 0) val countTotal = df.count assert(countTotal === 8) df = readImages(imagePath, null, true, -1, true, 0.5, 0) // Random number about half of the size of the original dataset val count50 = df.count assert(count50 > 0 && count50 < countTotal) } test("readImages partition test") { val df = readImages(imagePath, null, true, 3, true, 1.0, 0) assert(df.rdd.getNumPartitions === 3) } // Images with the different number of channels test("readImages pixel values test") { val images = readImages(imagePath + "/multi-channel/").collect images.foreach { rrow => val row = rrow.getAs[Row](0) val filename = Paths.get(getOrigin(row)).getFileName().toString() if (firstBytes20.contains(filename)) { val mode = getMode(row) val bytes20 = getData(row).slice(0, 20) val (expectedMode, expectedBytes) = firstBytes20(filename) assert(ocvTypes(expectedMode) === mode, "mode of the image is not read correctly") assert(Arrays.equals(expectedBytes, bytes20), "incorrect numeric value for flattened image") } } } // number of channels and first 20 bytes of OpenCV representation // - default representation for 3-channel RGB images is BGR row-wise: // (B00, G00, R00, B10, G10, R10, ...) // - default representation for 4-channel RGB images is BGRA row-wise: // (B00, G00, R00, A00, B10, G10, R10, A00, ...) private val firstBytes20 = Map( "grayscale.jpg" -> (("CV_8UC1", Array[Byte](-2, -33, -61, -60, -59, -59, -64, -59, -66, -67, -73, -73, -62, -57, -60, -63, -53, -49, -55, -69))), "chr30.4.184.jpg" -> (("CV_8UC3", Array[Byte](-9, -3, -1, -43, -32, -28, -75, -60, -57, -78, -59, -56, -74, -59, -57, -71, -58, -56, -73, -64))), "BGRA.png" -> (("CV_8UC4", Array[Byte](-128, -128, -8, -1, -128, -128, -8, -1, -128, -128, -8, -1, 127, 127, -9, -1, 127, 127, -9, -1))), "BGRA_alpha_60.png" -> (("CV_8UC4", Array[Byte](-128, -128, -8, 60, -128, -128, -8, 60, -128, -128, -8, 60, 127, 127, -9, 60, 127, 127, -9, 60))) ) }
Example 137
Source File: ElementwiseProductSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 138
Source File: IDFSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 139
Source File: PCASuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } test("memory cost computation") { assert(PCAUtil.memoryCost(10, 100) < Int.MaxValue) // check overflowing assert(PCAUtil.memoryCost(40000, 60000) > Int.MaxValue) } }
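Beyond the wrapper check above, the typical use of the mllib PCA API is to project feature vectors onto the top k principal components before training a model. A minimal sketch (illustrative data, not from the original suite; `sc` is the SparkContext provided by MLlibTestSparkContext):

import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

val points = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0, 0.0, 3.0, 0.0, 7.0)),
  LabeledPoint(1.0, Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
  LabeledPoint(1.0, Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))))

// Fit a PCA model on the features alone, keeping the top 2 components,
// then replace each point's features with its projection.
val pcaModel = new PCA(2).fit(points.map(_.features))
val projected = points.map(p => p.copy(features = pcaModel.transform(p.features)))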
Example 140
Source File: HashingTFSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext { test("hashing tf on a single doc") { val hashingTF = new HashingTF(1000) val doc = "a a b b c d".split(" ") val n = hashingTF.numFeatures val termFreqs = Seq( (hashingTF.indexOf("a"), 2.0), (hashingTF.indexOf("b"), 2.0), (hashingTF.indexOf("c"), 1.0), (hashingTF.indexOf("d"), 1.0)) assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n), "index must be in range [0, #features)") assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing") val expected = Vectors.sparse(n, termFreqs) assert(hashingTF.transform(doc) === expected) } test("hashing tf on an RDD") { val hashingTF = new HashingTF val localDocs: Seq[Seq[String]] = Seq( "a a b b b c d".split(" "), "a b c d a b c".split(" "), "c b a c b a a".split(" ")) val docs = sc.parallelize(localDocs, 2) assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet) } test("applying binary term freqs") { val hashingTF = new HashingTF(100).setBinary(true) val doc = "a a b c c c".split(" ") val n = hashingTF.numFeatures val expected = Vectors.sparse(n, Seq( (hashingTF.indexOf("a"), 1.0), (hashingTF.indexOf("b"), 1.0), (hashingTF.indexOf("c"), 1.0))) assert(hashingTF.transform(doc) ~== expected absTol 1e-14) } }
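HashingTF is normally paired with the IDF estimator from the earlier example to produce TF-IDF vectors. A minimal sketch of that two-step pipeline (illustrative documents, not from the original suite; tf is cached because it is traversed twice, once by fit and once by transform):

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// `sc` is the SparkContext provided by MLlibTestSparkContext.
val documents: RDD[Seq[String]] = sc.parallelize(Seq(
  "a a b b c".split(" ").toSeq,
  "a b c d".split(" ").toSeq,
  "c d d e".split(" ").toSeq))

// Hash each document into a fixed-size term-frequency vector ...
val hashingTF = new HashingTF(1000)
val tf: RDD[Vector] = hashingTF.transform(documents)
tf.cache()

// ... then rescale term frequencies by inverse document frequency.
val idfModel = new IDF(minDocFreq = 1).fit(tf)
val tfidf: RDD[Vector] = idfModel.transform(tf)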
Example 141
Source File: MatrixFactorizationModelSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.recommendation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils class MatrixFactorizationModelSuite extends SparkFunSuite with MLlibTestSparkContext { val rank = 2 var userFeatures: RDD[(Int, Array[Double])] = _ var prodFeatures: RDD[(Int, Array[Double])] = _ override def beforeAll(): Unit = { super.beforeAll() userFeatures = sc.parallelize(Seq((0, Array(1.0, 2.0)), (1, Array(3.0, 4.0)))) prodFeatures = sc.parallelize(Seq((2, Array(5.0, 6.0)))) } test("constructor") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) assert(model.predict(0, 2) ~== 17.0 relTol 1e-14) intercept[IllegalArgumentException] { new MatrixFactorizationModel(1, userFeatures, prodFeatures) } val userFeatures1 = sc.parallelize(Seq((0, Array(1.0)), (1, Array(3.0)))) intercept[IllegalArgumentException] { new MatrixFactorizationModel(rank, userFeatures1, prodFeatures) } val prodFeatures1 = sc.parallelize(Seq((2, Array(5.0)))) intercept[IllegalArgumentException] { new MatrixFactorizationModel(rank, userFeatures, prodFeatures1) } } test("save/load") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString def collect(features: RDD[(Int, Array[Double])]): Set[(Int, Seq[Double])] = { features.mapValues(_.toSeq).collect().toSet } try { model.save(sc, path) val newModel = MatrixFactorizationModel.load(sc, path) assert(newModel.rank === rank) assert(collect(newModel.userFeatures) === collect(userFeatures)) assert(collect(newModel.productFeatures) === collect(prodFeatures)) } finally { Utils.deleteRecursively(tempDir) } } test("batch predict API recommendProductsForUsers") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val topK = 10 val recommendations = model.recommendProductsForUsers(topK).collectAsMap() assert(recommendations(0)(0).rating ~== 17.0 relTol 1e-14) assert(recommendations(1)(0).rating ~== 39.0 relTol 1e-14) } test("batch predict API recommendUsersForProducts") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val topK = 10 val recommendations = model.recommendUsersForProducts(topK).collectAsMap() assert(recommendations(2)(0).user == 1) assert(recommendations(2)(0).rating ~== 39.0 relTol 1e-14) assert(recommendations(2)(1).user == 0) assert(recommendations(2)(1).rating ~== 17.0 relTol 1e-14) } }
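A MatrixFactorizationModel like the one constructed by hand above is normally produced by ALS training. A minimal sketch (illustrative ratings and hyperparameters, reusing the `sc` from MLlibTestSparkContext):

import org.apache.spark.mllib.recommendation.{ALS, Rating}

val ratings = sc.parallelize(Seq(
  Rating(0, 2, 3.0), Rating(0, 3, 1.0),
  Rating(1, 2, 5.0), Rating(1, 3, 2.0)))

// Train a small factorization: rank 2, 10 iterations, regularization 0.01.
val alsModel = ALS.train(ratings, 2, 10, 0.01)

// The resulting MatrixFactorizationModel exposes the same prediction and
// recommendation methods exercised in the suite above.
val prediction = alsModel.predict(0, 2)
val topTwoForUser0 = alsModel.recommendProducts(0, 2)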
Example 142
Source File: RankingMetricsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { test("Ranking metrics: MAP, NDCG") { val predictionAndLabels = sc.parallelize( Seq( (Array(1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Array(1, 2, 3, 4, 5)), (Array(4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Array(1, 2, 3)), (Array(1, 2, 3, 4, 5), Array.empty[Int]) ), 2) val eps = 1.0E-5 val metrics = new RankingMetrics(predictionAndLabels) val map = metrics.meanAveragePrecision assert(metrics.precisionAt(1) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(2) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(3) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(4) ~== 0.75/3 absTol eps) assert(metrics.precisionAt(5) ~== 0.8/3 absTol eps) assert(metrics.precisionAt(10) ~== 0.8/3 absTol eps) assert(metrics.precisionAt(15) ~== 8.0/45 absTol eps) assert(map ~== 0.355026 absTol eps) assert(metrics.ndcgAt(3) ~== 1.0/3 absTol eps) assert(metrics.ndcgAt(5) ~== 0.328788 absTol eps) assert(metrics.ndcgAt(10) ~== 0.487913 absTol eps) assert(metrics.ndcgAt(15) ~== metrics.ndcgAt(10) absTol eps) } test("MAP, NDCG with few predictions (SPARK-14886)") { val predictionAndLabels = sc.parallelize( Seq( (Array(1, 6, 2), Array(1, 2, 3, 4, 5)), (Array.empty[Int], Array(1, 2, 3)) ), 2) val eps = 1.0E-5 val metrics = new RankingMetrics(predictionAndLabels) assert(metrics.precisionAt(1) ~== 0.5 absTol eps) assert(metrics.precisionAt(2) ~== 0.25 absTol eps) assert(metrics.ndcgAt(1) ~== 0.5 absTol eps) assert(metrics.ndcgAt(2) ~== 0.30657 absTol eps) } }
Example 143
Source File: AreaUnderCurveSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class AreaUnderCurveSuite extends SparkFunSuite with MLlibTestSparkContext { test("auc computation") { val curve = Seq((0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 0.0)) val auc = 4.0 assert(AreaUnderCurve.of(curve) ~== auc absTol 1E-5) val rddCurve = sc.parallelize(curve, 2) assert(AreaUnderCurve.of(rddCurve) ~== auc absTol 1E-5) } test("auc of an empty curve") { val curve = Seq.empty[(Double, Double)] assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5) val rddCurve = sc.parallelize(curve, 2) assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5) } test("auc of a curve with a single point") { val curve = Seq((1.0, 1.0)) assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5) val rddCurve = sc.parallelize(curve, 2) assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5) } }
Example 144
Source File: FPTreeSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.fpm import scala.language.existentials import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class FPTreeSuite extends SparkFunSuite with MLlibTestSparkContext { test("add transaction") { val tree = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("b")) assert(tree.root.children.size == 2) assert(tree.root.children.contains("a")) assert(tree.root.children("a").item.equals("a")) assert(tree.root.children("a").count == 2) assert(tree.root.children.contains("b")) assert(tree.root.children("b").item.equals("b")) assert(tree.root.children("b").count == 1) var child = tree.root.children("a") assert(child.children.size == 1) assert(child.children.contains("b")) assert(child.children("b").item.equals("b")) assert(child.children("b").count == 2) child = child.children("b") assert(child.children.size == 2) assert(child.children.contains("c")) assert(child.children.contains("y")) assert(child.children("c").item.equals("c")) assert(child.children("y").item.equals("y")) assert(child.children("c").count == 1) assert(child.children("y").count == 1) } test("merge tree") { val tree1 = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("b")) val tree2 = new FPTree[String] .add(Seq("a", "b")) .add(Seq("a", "b", "c")) .add(Seq("a", "b", "c", "d")) .add(Seq("a", "x")) .add(Seq("a", "x", "y")) .add(Seq("c", "n")) .add(Seq("c", "m")) val tree3 = tree1.merge(tree2) assert(tree3.root.children.size == 3) assert(tree3.root.children("a").count == 7) assert(tree3.root.children("b").count == 1) assert(tree3.root.children("c").count == 2) val child1 = tree3.root.children("a") assert(child1.children.size == 2) assert(child1.children("b").count == 5) assert(child1.children("x").count == 2) val child2 = child1.children("b") assert(child2.children.size == 2) assert(child2.children("y").count == 1) assert(child2.children("c").count == 3) val child3 = child2.children("c") assert(child3.children.size == 1) assert(child3.children("d").count == 1) val child4 = child1.children("x") assert(child4.children.size == 1) assert(child4.children("y").count == 1) val child5 = tree3.root.children("c") assert(child5.children.size == 2) assert(child5.children("n").count == 1) assert(child5.children("m").count == 1) } test("extract freq itemsets") { val tree = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("a", "b")) .add(Seq("a")) .add(Seq("b")) .add(Seq("b", "n")) val freqItemsets = tree.extract(3L).map { case (items, count) => (items.toSet, count) }.toSet val expected = Set( (Set("a"), 4L), (Set("b"), 5L), (Set("a", "b"), 3L)) assert(freqItemsets === expected) } }
Example 145
Source File: AssociationRulesSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.fpm

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("association rules using String type") {
    val freqItemsets = sc.parallelize(Seq(
      (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L),
      (Set("r"), 3L),
      (Set("x", "z"), 3L), (Set("t", "y"), 3L), (Set("t", "x"), 3L), (Set("s", "x"), 3L),
      (Set("y", "x"), 3L), (Set("y", "z"), 3L), (Set("t", "z"), 3L),
      (Set("y", "x", "z"), 3L), (Set("t", "x", "z"), 3L), (Set("t", "y", "z"), 3L),
      (Set("t", "y", "x"), 3L),
      (Set("t", "y", "x", "z"), 3L)
    ).map {
      case (items, freq) => new FPGrowth.FreqItemset(items.toArray, freq)
    })

    val ar = new AssociationRules()

    // the only confidences possible here are 1.0, 0.75 and 0.6,
    // so a 0.9 threshold keeps exactly the 23 rules with confidence 1.0
    val results1 = ar
      .setMinConfidence(0.9)
      .run(freqItemsets)
      .collect()

    assert(results1.size === 23)
    assert(results1.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)

    // with no confidence threshold every generated rule is kept
    // (this second query was missing from the truncated example; minConfidence 0 is assumed)
    val results2 = ar
      .setMinConfidence(0)
      .run(freqItemsets)
      .collect()

    assert(results2.size === 30)
    assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
  }
}
Example 146
Source File: KernelDensitySuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat import org.apache.commons.math3.distribution.NormalDistribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext { test("kernel density single sample") { val rdd = sc.parallelize(Array(5.0)) val evaluationPoints = Array(5.0, 6.0) val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints) val normal = new NormalDistribution(5.0, 3.0) val acceptableErr = 1e-6 assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr) assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr) } test("kernel density multiple samples") { val rdd = sc.parallelize(Array(5.0, 10.0)) val evaluationPoints = Array(5.0, 6.0) val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints) val normal1 = new NormalDistribution(5.0, 3.0) val normal2 = new NormalDistribution(10.0, 3.0) val acceptableErr = 1e-6 assert(math.abs( densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr) assert(math.abs( densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr) } }
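The expected values fall straight out of the kernel density estimator's definition: with bandwidth h and samples x_1, ..., x_n, the estimate at a point x is (1/n) * sum_i N(x; mean = x_i, sd = h). For a single sample that is just one Gaussian density, and for two samples it is the average of two Gaussian densities, which is exactly what the assertions compare against.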
Example 147
Source File: MultivariateGaussianSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Matrices, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) val mu = Vectors.dense(0.0) val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) } test("SPARK-11302") { val x = Vectors.dense(629, 640, 1.7188, 618.19) val mu = Vectors.dense( 1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697) val sigma = Matrices.dense(4, 4, Array( 166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053, 169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484, 12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373, 164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207)) val dist = new MultivariateGaussian(mu, sigma) // Agrees with R's dmvnorm: 7.154782e-05 assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9) } }
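The univariate expectations can be checked by hand: for mu = 0 and sigma^2 = 1 the density at x = 0 is 1 / sqrt(2 * pi) ≈ 0.39894 and at x = 1.5 it is exp(-1.125) / sqrt(2 * pi) ≈ 0.12952; doubling the standard deviation halves the peak to ≈ 0.19947, matching the values asserted above.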
Example 148
Source File: CoordinateMatrixSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { val m = 5 val n = 4 var mat: CoordinateMatrix = _ override def beforeAll() { super.beforeAll() val entries = sc.parallelize(Seq( (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0), (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map { case (i, j, value) => MatrixEntry(i, j, value) } mat = new CoordinateMatrix(entries) } test("size") { assert(mat.numRows() === m) assert(mat.numCols() === n) } test("empty entries") { val entries = sc.parallelize(Seq[MatrixEntry](), 1) val emptyMat = new CoordinateMatrix(entries) intercept[RuntimeException] { emptyMat.numCols() } intercept[RuntimeException] { emptyMat.numRows() } } test("toBreeze") { val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(mat.toBreeze() === expected) } test("transpose") { val transposed = mat.transpose() assert(mat.toBreeze().t === transposed.toBreeze()) } test("toIndexedRowMatrix") { val indexedRowMatrix = mat.toIndexedRowMatrix() val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(indexedRowMatrix.toBreeze() === expected) } test("toRowMatrix") { val rowMatrix = mat.toRowMatrix() val rows = rowMatrix.rows.collect().toSet val expected = Set( Vectors.dense(1.0, 2.0, 0.0, 0.0), Vectors.dense(0.0, 3.0, 4.0, 0.0), Vectors.dense(0.0, 0.0, 5.0, 6.0), Vectors.dense(7.0, 0.0, 0.0, 8.0), Vectors.dense(0.0, 9.0, 0.0, 0.0)) assert(rows === expected) } test("toBlockMatrix") { val blockMat = mat.toBlockMatrix(2, 2) assert(blockMat.numRows() === m) assert(blockMat.numCols() === n) assert(blockMat.toBreeze() === mat.toBreeze()) intercept[IllegalArgumentException] { mat.toBlockMatrix(-1, 2) } intercept[IllegalArgumentException] { mat.toBlockMatrix(2, 0) } } }
Example 149
Source File: MLPairRDDFunctionsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ import org.apache.spark.mllib.util.MLlibTestSparkContext class MLPairRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { test("topByKey") { val topMap = sc.parallelize(Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7), (5, 1), (3, 5)), 2) .topByKey(5) .collectAsMap() assert(topMap.size === 3) assert(topMap(1) === Array(7, 6, 3, 2, 1)) assert(topMap(3) === Array(7, 5, 2)) assert(topMap(5) === Array(1)) } }
Example 150
Source File: RDDFunctionsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.rdd.RDDFunctions._ import org.apache.spark.mllib.util.MLlibTestSparkContext class RDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { test("sliding") { val data = 0 until 6 for (numPartitions <- 1 to 8) { val rdd = sc.parallelize(data, numPartitions) for (windowSize <- 1 to 6) { for (step <- 1 to 3) { val sliding = rdd.sliding(windowSize, step).collect().map(_.toList).toList val expected = data.sliding(windowSize, step) .map(_.toList).toList.filter(l => l.size == windowSize) assert(sliding === expected) } } assert(rdd.sliding(7).collect().isEmpty, "Should return an empty RDD if the window size is greater than the number of items.") } } test("sliding with empty partitions") { val data = Seq(Seq(1, 2, 3), Seq.empty[Int], Seq(4), Seq.empty[Int], Seq(5, 6, 7)) val rdd = sc.parallelize(data, data.length).flatMap(s => s) assert(rdd.partitions.length === data.length) val sliding = rdd.sliding(3).collect().toSeq.map(_.toSeq) val expected = data.flatMap(x => x).sliding(3).toSeq.map(_.toSeq) assert(sliding === expected) } }
Example 151
Source File: OneHotEncoderSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { def stringIndexed(): DataFrame = { val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) val df = sqlContext.createDataFrame(data).toDF("id", "label") val indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex") .fit(df) indexer.transform(df) } test("params") { ParamsSuite.checkParams(new OneHotEncoder) } test("OneHotEncoder dropLast = false") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") .setDropLast(false) val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1), vec(2)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0), (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0)) assert(output === expected) } test("OneHotEncoder dropLast = true") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0), (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0)) assert(output === expected) } test("input column with ML attribute") { val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large") val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("size") .select(col("size").as("size", attr.toMetadata())) val encoder = new OneHotEncoder() .setInputCol("size") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("small").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("medium").withIndex(1)) } test("input column without ML attribute") { val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("index") val encoder = new OneHotEncoder() .setInputCol("index") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("0").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("1").withIndex(1)) } test("read/write") { val t = new OneHotEncoder() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setDropLast(false) testDefaultReadWrite(t) } }
Example 152
Source File: ChiSqSelectorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{Row, SQLContext} class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("Test Chi-Square selector") { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ val data = Seq( LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))) ) val preFilteredData = Seq( Vectors.dense(0.0), Vectors.dense(6.0), Vectors.dense(8.0), Vectors.dense(5.0) ) val df = sc.parallelize(data.zip(preFilteredData)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") val model = new ChiSqSelector() .setNumTopFeatures(1) .setFeaturesCol("data") .setLabelCol("label") .setOutputCol("filtered") model.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } } test("ChiSqSelector read/write") { val t = new ChiSqSelector() .setFeaturesCol("myFeaturesCol") .setLabelCol("myLabelCol") .setOutputCol("myOutputCol") .setNumTopFeatures(2) testDefaultReadWrite(t) } test("ChiSqSelectorModel read/write") { val oldModel = new feature.ChiSqSelectorModel(Array(1, 3)) val instance = new ChiSqSelectorModel("myChiSqSelectorModel", oldModel) val newInstance = testDefaultReadWrite(instance) assert(newInstance.selectedFeatures === instance.selectedFeatures) } }
Example 153
Source File: DCTSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} @BeanInfo case class DCTTestData(vec: Vector, wantedVec: Vector) class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("forward transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = false testDCT(data, inverse) } test("inverse transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = true testDCT(data, inverse) } test("read/write") { val t = new DCT() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setInverse(true) testDefaultReadWrite(t) } private def testDCT(data: Vector, inverse: Boolean): Unit = { val expectedResultBuffer = data.toArray.clone() if (inverse) { (new DoubleDCT_1D(data.size)).inverse(expectedResultBuffer, true) } else { (new DoubleDCT_1D(data.size)).forward(expectedResultBuffer, true) } val expectedResult = Vectors.dense(expectedResultBuffer) val dataset = sqlContext.createDataFrame(Seq( DCTTestData(data, expectedResult) )) val transformer = new DCT() .setInputCol("vec") .setOutputCol("resultVec") .setInverse(inverse) transformer.transform(dataset) .select("resultVec", "wantedVec") .collect() .foreach { case Row(resultVec: Vector, wantedVec: Vector) => assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) } } }
Example 154
Source File: VectorSlicerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{StructField, StructType} class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { val slicer = new VectorSlicer().setInputCol("feature") ParamsSuite.checkParams(slicer) assert(slicer.getIndices.length === 0) assert(slicer.getNames.length === 0) withClue("VectorSlicer should not have any features selected by default") { intercept[IllegalArgumentException] { slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true)))) } } } test("feature validity checks") { import VectorSlicer._ assert(validIndices(Array(0, 1, 8, 2))) assert(validIndices(Array.empty[Int])) assert(!validIndices(Array(-1))) assert(!validIndices(Array(1, 2, 1))) assert(validNames(Array("a", "b"))) assert(validNames(Array.empty[String])) assert(!validNames(Array("", "b"))) assert(!validNames(Array("a", "b", "a"))) } test("Test vector slicer") { val data = Array( Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3), Vectors.sparse(5, Seq()) ) // Expected after selecting indices 1, 4 val expected = Array( Vectors.sparse(2, Seq((0, 2.3))), Vectors.dense(2.3, 1.0), Vectors.dense(0.0, 0.0), Vectors.dense(-1.1, 3.3), Vectors.sparse(2, Seq()) ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName) val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]]) val resultAttrs = Array("f1", "f4").map(defaultAttr.withName) val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]]) val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) } val df = spark.createDataFrame(rdd, StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField()))) val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") def validateResults(df: DataFrame): Unit = { df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 === vec2) } val resultMetadata = AttributeGroup.fromStructField(df.schema("result")) val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected")) assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) => assert(a === b) } } vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array(1)).setNames(Array("f4")) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4")) validateResults(vectorSlicer.transform(df)) } test("read/write") { val t = new VectorSlicer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setIndices(Array(1, 3)) .setNames(Array("a", "d")) testDefaultReadWrite(t) } }
Example 155
Source File: MaxAbsScalerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("MaxAbsScaler fit basic case") { val data = Array( Vectors.dense(1, 0, 100), Vectors.dense(2, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-2, -100)), Vectors.sparse(3, Array(0), Array(-1.5))) val expected: Array[Vector] = Array( Vectors.dense(0.5, 0, 1), Vectors.dense(1, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-1, -1)), Vectors.sparse(3, Array(0), Array(-0.75))) val df = data.zip(expected).toSeq.toDF("features", "expected") val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaled") val model = scaler.fit(df) model.transform(df).select("expected", "scaled").collect() .foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1.equals(vector2), s"MaxAbsScaler ut error: $vector2 should be $vector1") } // copied model must have the same parent. MLTestingUtils.checkCopy(model) } test("MaxAbsScaler read/write") { val t = new MaxAbsScaler() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } test("MaxAbsScalerModel read/write") { val instance = new MaxAbsScalerModel( "myMaxAbsScalerModel", Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) assert(newInstance.maxAbs === instance.maxAbs) } }
Example 156
Source File: ChiSqSelectorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("Test Chi-Square selector") { import testImplicits._ val data = Seq( LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))) ) val preFilteredData = Seq( Vectors.dense(8.0), Vectors.dense(0.0), Vectors.dense(0.0), Vectors.dense(8.0) ) val df = sc.parallelize(data.zip(preFilteredData)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") val selector = new ChiSqSelector() .setSelectorType("kbest") .setNumTopFeatures(1) .setFeaturesCol("data") .setLabelCol("label") .setOutputCol("filtered") selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } selector.setSelectorType("percentile").setPercentile(0.34).fit(df).transform(df) .select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } val preFilteredData2 = Seq( Vectors.dense(8.0, 7.0), Vectors.dense(0.0, 9.0), Vectors.dense(0.0, 9.0), Vectors.dense(8.0, 9.0) ) val df2 = sc.parallelize(data.zip(preFilteredData2)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") selector.setSelectorType("fpr").setAlpha(0.2).fit(df2).transform(df2) .select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } } test("ChiSqSelector read/write") { val t = new ChiSqSelector() .setFeaturesCol("myFeaturesCol") .setLabelCol("myLabelCol") .setOutputCol("myOutputCol") .setNumTopFeatures(2) testDefaultReadWrite(t) } test("ChiSqSelectorModel read/write") { val oldModel = new feature.ChiSqSelectorModel(Array(1, 3)) val instance = new ChiSqSelectorModel("myChiSqSelectorModel", oldModel) val newInstance = testDefaultReadWrite(instance) assert(newInstance.selectedFeatures === instance.selectedFeatures) } test("should support all NumericType labels and not support other types") { val css = new ChiSqSelector() MLTestingUtils.checkNumericTypes[ChiSqSelectorModel, ChiSqSelector]( css, spark) { (expected, actual) => assert(expected.selectedFeatures === actual.selectedFeatures) } } }
Example 157
Source File: DCTSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row @BeanInfo case class DCTTestData(vec: Vector, wantedVec: Vector) class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("forward transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = false testDCT(data, inverse) } test("inverse transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = true testDCT(data, inverse) } test("read/write") { val t = new DCT() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setInverse(true) testDefaultReadWrite(t) } private def testDCT(data: Vector, inverse: Boolean): Unit = { val expectedResultBuffer = data.toArray.clone() if (inverse) { new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true) } else { new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true) } val expectedResult = Vectors.dense(expectedResultBuffer) val dataset = Seq(DCTTestData(data, expectedResult)).toDF() val transformer = new DCT() .setInputCol("vec") .setOutputCol("resultVec") .setInverse(inverse) transformer.transform(dataset) .select("resultVec", "wantedVec") .collect() .foreach { case Row(resultVec: Vector, wantedVec: Vector) => assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) } } }
Example 158
Source File: ElementwiseProductSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("read/write") { val ep = new ElementwiseProduct() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setScalingVec(Vectors.dense(0.1, 0.2)) testDefaultReadWrite(ep) } }
Example 159
Source File: BinarizerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ @transient var data: Array[Double] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4) } test("params") { ParamsSuite.checkParams(new Binarizer) } test("Binarize continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(defaultBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize continuous features with setter") { val threshold: Double = 0.2 val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(thresholdBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with setter") { val threshold: Double = 0.2 val defaultBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("read/write") { val t = new Binarizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setThreshold(0.1) testDefaultReadWrite(t) } }
Example 160
Source File: SQLTransformerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.types.{LongType, StructField, StructType} class SQLTransformerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new SQLTransformer()) } test("transform numeric data") { val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2") val sqlTrans = new SQLTransformer().setStatement( "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__") val result = sqlTrans.transform(original) val resultSchema = sqlTrans.transformSchema(original.schema) val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0)) .toDF("id", "v1", "v2", "v3", "v4") assert(result.schema.toString == resultSchema.toString) assert(resultSchema == expected.schema) assert(result.collect().toSeq == expected.collect().toSeq) assert(original.sparkSession.catalog.listTables().count() == 0) } test("read/write") { val t = new SQLTransformer() .setStatement("select * from __THIS__") testDefaultReadWrite(t) } test("transformSchema") { val df = spark.range(10) val outputSchema = new SQLTransformer() .setStatement("SELECT id + 1 AS id1 FROM __THIS__") .transformSchema(df.schema) val expected = StructType(Seq(StructField("id1", LongType, nullable = false))) assert(outputSchema === expected) } }
Example 161
Source File: TokenizerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} @BeanInfo case class TokenizerTestData(rawText: String, wantedTokens: Array[String]) class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new Tokenizer) } test("read/write") { val t = new Tokenizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } } class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import org.apache.spark.ml.feature.RegexTokenizerSuite._ import testImplicits._ test("params") { ParamsSuite.checkParams(new RegexTokenizer) } test("RegexTokenizer") { val tokenizer0 = new RegexTokenizer() .setGaps(false) .setPattern("\\w+|\\p{Punct}") .setInputCol("rawText") .setOutputCol("tokens") val dataset0 = Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")), TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct")) ).toDF() testRegexTokenizer(tokenizer0, dataset0) val dataset1 = Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")), TokenizerTestData("Te,st. punct", Array("punct")) ).toDF() tokenizer0.setMinTokenLength(3) testRegexTokenizer(tokenizer0, dataset1) val tokenizer2 = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") val dataset2 = Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")), TokenizerTestData("Te,st. punct", Array("te,st.", "punct")) ).toDF() testRegexTokenizer(tokenizer2, dataset2) } test("RegexTokenizer with toLowercase false") { val tokenizer = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") .setToLowercase(false) val dataset = Seq( TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")), TokenizerTestData("java scala", Array("java", "scala")) ).toDF() testRegexTokenizer(tokenizer, dataset) } test("read/write") { val t = new RegexTokenizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMinTokenLength(2) .setGaps(false) .setPattern("hi") .setToLowercase(false) testDefaultReadWrite(t) } } object RegexTokenizerSuite extends SparkFunSuite { def testRegexTokenizer(t: RegexTokenizer, dataset: Dataset[_]): Unit = { t.transform(dataset) .select("tokens", "wantedTokens") .collect() .foreach { case Row(tokens, wantedTokens) => assert(tokens === wantedTokens) } } }
Example 162
Source File: NGramSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} @BeanInfo case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String]) class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import org.apache.spark.ml.feature.NGramSuite._ import testImplicits._ test("default behavior yields bigram features") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") val dataset = Seq(NGramTestData( Array("Test", "for", "ngram", "."), Array("Test for", "for ngram", "ngram .") )).toDF() testNGram(nGram, dataset) } test("NGramLength=4 yields length 4 n-grams") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(4) val dataset = Seq(NGramTestData( Array("a", "b", "c", "d", "e"), Array("a b c d", "b c d e") )).toDF() testNGram(nGram, dataset) } test("empty input yields empty output") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(4) val dataset = Seq(NGramTestData(Array(), Array())).toDF() testNGram(nGram, dataset) } test("input array < n yields empty output") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(6) val dataset = Seq(NGramTestData( Array("a", "b", "c", "d", "e"), Array() )).toDF() testNGram(nGram, dataset) } test("read/write") { val t = new NGram() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setN(3) testDefaultReadWrite(t) } } object NGramSuite extends SparkFunSuite { def testNGram(t: NGram, dataset: Dataset[_]): Unit = { t.transform(dataset) .select("nGrams", "wantedNGrams") .collect() .foreach { case Row(actualNGrams, wantedNGrams) => assert(actualNGrams === wantedNGrams) } } }
Example 163
Source File: PCASuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class PCASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new PCA) val mat = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix] val explainedVariance = Vectors.dense(0.5, 0.5).asInstanceOf[DenseVector] val model = new PCAModel("pca", mat, explainedVariance) ParamsSuite.checkParams(model) } test("pca") { val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) val dataRDD = sc.parallelize(data, 2) val mat = new RowMatrix(dataRDD.map(OldVectors.fromML)) val pc = mat.computePrincipalComponents(3) val expected = mat.multiply(pc).rows.map(_.asML) val df = dataRDD.zip(expected).toDF("features", "expected") val pca = new PCA() .setInputCol("features") .setOutputCol("pca_features") .setK(3) .fit(df) // copied model must have the same parent. MLTestingUtils.checkCopy(pca) pca.transform(df).select("pca_features", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("PCA read/write") { val t = new PCA() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setK(3) testDefaultReadWrite(t) } test("PCAModel read/write") { val instance = new PCAModel("myPCAModel", Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix], Vectors.dense(0.5, 0.5).asInstanceOf[DenseVector]) val newInstance = testDefaultReadWrite(instance) assert(newInstance.pc === instance.pc) } }
Example 164
Source File: HashingTFSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature.{HashingTF => MLlibHashingTF} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new HashingTF) } test("hashingTF") { val df = Seq((0, "a a b b c d".split(" ").toSeq)).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) val output = hashingTF.transform(df) val attrGroup = AttributeGroup.fromStructField(output.schema("features")) require(attrGroup.numAttributes === Some(n)) val features = output.select("features").first().getAs[Vector](0) // Assume perfect hash on "a", "b", "c", and "d". def idx: Any => Int = murmur3FeatureIdx(n) val expected = Vectors.sparse(n, Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0))) assert(features ~== expected absTol 1e-14) } test("applying binary term freqs") { val df = Seq((0, "a a b c c c".split(" ").toSeq)).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) .setBinary(true) val output = hashingTF.transform(df) val features = output.select("features").first().getAs[Vector](0) def idx: Any => Int = murmur3FeatureIdx(n) // Assume perfect hash on input features val expected = Vectors.sparse(n, Seq((idx("a"), 1.0), (idx("b"), 1.0), (idx("c"), 1.0))) assert(features ~== expected absTol 1e-14) } test("read/write") { val t = new HashingTF() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setNumFeatures(10) testDefaultReadWrite(t) } private def murmur3FeatureIdx(numFeatures: Int)(term: Any): Int = { Utils.nonNegativeMod(MLlibHashingTF.murmur3Hash(term), numFeatures) } }
Example 165
Source File: BaggedPointSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.tree.EnsembleTestHelper import org.apache.spark.mllib.util.MLlibTestSparkContext class BaggedPointSuite extends SparkFunSuite with MLlibTestSparkContext { test("BaggedPoint RDD: without subsampling") { val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false, 42) baggedRDD.collect().foreach { baggedPoint => assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1) } } test("BaggedPoint RDD: with subsampling with replacement (fraction = 1.0)") { val numSubsamples = 100 val (expectedMean, expectedStddev) = (1.0, 1.0) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling with replacement (fraction = 0.5)") { val numSubsamples = 100 val subsample = 0.5 val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample)) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling without replacement (fraction = 1.0)") { val numSubsamples = 100 val (expectedMean, expectedStddev) = (1.0, 0) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } test("BaggedPoint RDD: with subsampling without replacement (fraction = 0.5)") { val numSubsamples = 100 val subsample = 0.5 val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample * (1 - subsample))) val seeds = Array(123, 5354, 230, 349867, 23987) val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000) val rdd = sc.parallelize(arr) seeds.foreach { seed => val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false, seed) val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect() EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean, expectedStddev, epsilon = 0.01) } } }
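The expected means and standard deviations in these tests follow from the sampling schemes BaggedPoint uses: with replacement, each point's weight in a subsample is approximately Poisson(fraction), so both the mean and the variance equal the fraction (standard deviation sqrt(fraction)); without replacement, the weight is Bernoulli(fraction), giving standard deviation sqrt(fraction * (1 - fraction)), which is why the fraction = 1.0 case expects a standard deviation of exactly 0.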
Example 166
Source File: GradientBoostedTreesSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.mllib.tree.{GradientBoostedTreesSuite => OldGBTSuite} import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy} import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.impurity.Variance import org.apache.spark.mllib.tree.loss.{AbsoluteError, LogLoss, SquaredError} import org.apache.spark.mllib.util.MLlibTestSparkContext class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext with Logging { import testImplicits._ test("runWithValidation stops early and performs better on a validation dataset") { // Set numIterations large enough so that it stops early. val numIterations = 20 val trainRdd = sc.parallelize(OldGBTSuite.trainData, 2).map(_.asML) val validateRdd = sc.parallelize(OldGBTSuite.validateData, 2).map(_.asML) val trainDF = trainRdd.toDF() val validateDF = validateRdd.toDF() val algos = Array(Regression, Regression, Classification) val losses = Array(SquaredError, AbsoluteError, LogLoss) algos.zip(losses).foreach { case (algo, loss) => val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2, categoricalFeaturesInfo = Map.empty) val boostingStrategy = new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0) val (validateTrees, validateTreeWeights) = GradientBoostedTrees .runWithValidation(trainRdd, validateRdd, boostingStrategy, 42L) val numTrees = validateTrees.length assert(numTrees !== numIterations) // Test that it performs better on the validation dataset. val (trees, treeWeights) = GradientBoostedTrees.run(trainRdd, boostingStrategy, 42L) val (errorWithoutValidation, errorWithValidation) = { if (algo == Classification) { val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features)) (GradientBoostedTrees.computeError(remappedRdd, trees, treeWeights, loss), GradientBoostedTrees.computeError(remappedRdd, validateTrees, validateTreeWeights, loss)) } else { (GradientBoostedTrees.computeError(validateRdd, trees, treeWeights, loss), GradientBoostedTrees.computeError(validateRdd, validateTrees, validateTreeWeights, loss)) } } assert(errorWithValidation <= errorWithoutValidation) // Test that results from evaluateEachIteration comply with runWithValidation. // Note that convergenceTol is set to 0.0 val evaluationArray = GradientBoostedTrees .evaluateEachIteration(validateRdd, trees, treeWeights, loss, algo) assert(evaluationArray.length === numIterations) assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1)) var i = 1 while (i < numTrees) { assert(evaluationArray(i) <= evaluationArray(i - 1)) i += 1 } } } }
Example 167
Source File: BinaryClassificationEvaluatorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext class BinaryClassificationEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new BinaryClassificationEvaluator) } test("read/write") { val evaluator = new BinaryClassificationEvaluator() .setRawPredictionCol("myRawPrediction") .setLabelCol("myLabel") .setMetricName("areaUnderPR") testDefaultReadWrite(evaluator) } test("should accept both vector and double raw prediction col") { val evaluator = new BinaryClassificationEvaluator() .setMetricName("areaUnderPR") val vectorDF = Seq( (0d, Vectors.dense(12, 2.5)), (1d, Vectors.dense(1, 3)), (0d, Vectors.dense(10, 2)) ).toDF("label", "rawPrediction") assert(evaluator.evaluate(vectorDF) === 1.0) val doubleDF = Seq( (0d, 0d), (1d, 1d), (0d, 0d) ).toDF("label", "rawPrediction") assert(evaluator.evaluate(doubleDF) === 1.0) val stringDF = Seq( (0d, "0d"), (1d, "1d"), (0d, "0d") ).toDF("label", "rawPrediction") val thrown = intercept[IllegalArgumentException] { evaluator.evaluate(stringDF) } assert(thrown.getMessage.replace("\n", "") contains "Column rawPrediction must be of type " + "equal to one of the following types: [DoubleType, ") assert(thrown.getMessage.replace("\n", "") contains "but was actually of type StringType.") } test("should support all NumericType labels and not support other types") { val evaluator = new BinaryClassificationEvaluator().setRawPredictionCol("prediction") MLTestingUtils.checkNumericTypes(evaluator, spark) } }
Example 168
Source File: MulticlassClassificationEvaluatorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext class MulticlassClassificationEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new MulticlassClassificationEvaluator) } test("read/write") { val evaluator = new MulticlassClassificationEvaluator() .setPredictionCol("myPrediction") .setLabelCol("myLabel") .setMetricName("accuracy") testDefaultReadWrite(evaluator) } test("should support all NumericType labels and not support other types") { MLTestingUtils.checkNumericTypes(new MulticlassClassificationEvaluator, spark) } }
Example 169
Source File: RegressionEvaluatorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext} import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.DataFrame class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ @transient var dataset: DataFrame = _ /* Reconstructed: the listing omits the dataset setup; the generator arguments below follow the upstream Spark test. */ override def beforeAll(): Unit = { super.beforeAll() dataset = sc.parallelize(LinearDataGenerator.generateLinearInput( 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2).map(_.asML).toDF() } test("params") { ParamsSuite.checkParams(new RegressionEvaluator) } test("Regression Evaluator: default params") { val trainer = new LinearRegression val model = trainer.fit(dataset) val predictions = model.transform(dataset) // default = rmse val evaluator = new RegressionEvaluator() assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01) // r2 score evaluator.setMetricName("r2") assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01) // mae evaluator.setMetricName("mae") assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01) } test("read/write") { val evaluator = new RegressionEvaluator() .setPredictionCol("myPrediction") .setLabelCol("myLabel") .setMetricName("r2") testDefaultReadWrite(evaluator) } test("should support all NumericType labels and not support other types") { MLTestingUtils.checkNumericTypes(new RegressionEvaluator, spark) } }
Example 170
Source File: RWrapperUtilsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.{RFormula, RFormulaModel} import org.apache.spark.mllib.util.MLlibTestSparkContext class RWrapperUtilsSuite extends SparkFunSuite with MLlibTestSparkContext { test("avoid libsvm data column name conflicting") { val rFormula = new RFormula().setFormula("label ~ features") val data = spark.read.format("libsvm").load("../data/mllib/sample_libsvm_data.txt") // if not checking column name, then IllegalArgumentException intercept[IllegalArgumentException] { rFormula.fit(data) } // after checking, model build is ok RWrapperUtils.checkDataColumns(rFormula, data) assert(rFormula.getLabelCol == "label") assert(rFormula.getFeaturesCol.startsWith("features_")) val model = rFormula.fit(data) assert(model.isInstanceOf[RFormulaModel]) assert(model.getLabelCol == "label") assert(model.getFeaturesCol.startsWith("features_")) } }
Example 171
Source File: LibSVMRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 172
Source File: StopwatchSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util import java.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext { import StopwatchSuite._ private def testStopwatchOnDriver(sw: Stopwatch): Unit = { assert(sw.name === "sw") assert(sw.elapsed() === 0L) assert(!sw.isRunning) intercept[AssertionError] { sw.stop() } val duration = checkStopwatch(sw) val elapsed = sw.elapsed() assert(elapsed === duration) val duration2 = checkStopwatch(sw) val elapsed2 = sw.elapsed() assert(elapsed2 === duration + duration2) assert(sw.toString === s"sw: ${elapsed2}ms") sw.start() assert(sw.isRunning) intercept[AssertionError] { sw.start() } } test("LocalStopwatch") { val sw = new LocalStopwatch("sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on driver") { val sw = new DistributedStopwatch(sc, "sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on executors") { val sw = new DistributedStopwatch(sc, "sw") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => acc.add(checkStopwatch(sw)) } assert(!sw.isRunning) val elapsed = sw.elapsed() assert(elapsed === acc.value) } test("MultiStopwatch") { val sw = new MultiStopwatch(sc) .addLocal("local") .addDistributed("spark") assert(sw("local").name === "local") assert(sw("spark").name === "spark") intercept[NoSuchElementException] { sw("some") } assert(sw.toString === "{\n local: 0ms,\n spark: 0ms\n}") val localDuration = checkStopwatch(sw("local")) val sparkDuration = checkStopwatch(sw("spark")) val localElapsed = sw("local").elapsed() val sparkElapsed = sw("spark").elapsed() assert(localElapsed === localDuration) assert(sparkElapsed === sparkDuration) assert(sw.toString === s"{\n local: ${localElapsed}ms,\n spark: ${sparkElapsed}ms\n}") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => sw("local").start() val duration = checkStopwatch(sw("spark")) sw("local").stop() acc.add(duration) } val localElapsed2 = sw("local").elapsed() assert(localElapsed2 === localElapsed) val sparkElapsed2 = sw("spark").elapsed() assert(sparkElapsed2 === sparkElapsed + acc.value) } } private object StopwatchSuite extends SparkFunSuite { /* Reconstructed helper: the listing omits checkStopwatch, which times the stopwatch around a short random sleep and returns the measured duration (following the upstream Spark test). */ def checkStopwatch(sw: Stopwatch): Long = { val ubStart = now sw.start() val lbStart = now Thread.sleep(new Random().nextInt(10)) val lb = now - lbStart val ub = now - ubStart sw.stop() val duration = sw.elapsed() assert(duration >= lb && duration <= ub + 1) duration } private def now: Long = System.currentTimeMillis() }
Example 173
Source File: ANNSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext class ANNSuite extends SparkFunSuite with MLlibTestSparkContext { // TODO: test for weights comparison with Weka MLP test("ANN with Sigmoid learns XOR function with LBFGS optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array(0.0, 1.0, 1.0, 0.0) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights val trainer = new FeedForwardTrainer(topology, 2, 1) trainer.setWeights(initialWeights) trainer.LBFGSOptimizer.setNumIterations(20) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input)(0), label(0)) }.collect() predictionAndLabels.foreach { case (p, l) => assert(math.round(p) === l) } } test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array( Array(1.0, 0.0), Array(0.0, 1.0), Array(0.0, 1.0), Array(1.0, 0.0) ) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights val trainer = new FeedForwardTrainer(topology, 2, 2) // TODO: add a test for SGD trainer.LBFGSOptimizer.setConvergenceTol(1e-4).setNumIterations(20) trainer.setWeights(initialWeights).setStackSize(1) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input), label) }.collect() predictionAndLabels.foreach { case (p, l) => assert(p ~== l absTol 0.5) } } }
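The ANNSuite above exercises the private org.apache.spark.ml.ann API directly. The same XOR experiment can also be run through the public MultilayerPerceptronClassifier; the sketch below is an illustration written for this page (layer sizes, seed, and iteration count are arbitrary choices), not part of the original suite:

import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object XorMlpSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("XorMlpSketch").getOrCreate()
    import spark.implicits._

    // XOR truth table as (features, label) rows.
    val xor = Seq(
      (Vectors.dense(0.0, 0.0), 0.0),
      (Vectors.dense(0.0, 1.0), 1.0),
      (Vectors.dense(1.0, 0.0), 1.0),
      (Vectors.dense(1.0, 1.0), 0.0)
    ).toDF("features", "label")

    // 2 inputs, one hidden layer of 5 units, 2 output classes.
    val mlp = new MultilayerPerceptronClassifier()
      .setLayers(Array(2, 5, 2))
      .setBlockSize(1)
      .setSeed(1234L)
      .setMaxIter(100)

    val model = mlp.fit(xor)
    model.transform(xor).select("features", "prediction").show()

    spark.stop()
  }
}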
Example 174
Source File: GradientSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class GradientSuite extends SparkFunSuite with MLlibTestSparkContext { test("Gradient computation against numerical differentiation") { val input = new BDM[Double](3, 1, Array(1.0, 1.0, 1.0)) // output must contain zeros and one 1 for SoftMax val target = new BDM[Double](2, 1, Array(0.0, 1.0)) val topology = FeedForwardTopology.multiLayerPerceptron(Array(3, 4, 2), softmaxOnTop = false) val layersWithErrors = Seq( new SigmoidLayerWithSquaredError(), new SoftmaxLayerWithCrossEntropyLoss() ) // check all layers that provide loss computation // 1) compute loss and gradient given the model and initial weights // 2) modify weights with small number epsilon (per dimension i) // 3) compute new loss // 4) ((newLoss - loss) / epsilon) should be close to the i-th component of the gradient for (layerWithError <- layersWithErrors) { topology.layers(topology.layers.length - 1) = layerWithError val model = topology.model(seed = 12L) val weights = model.weights.toArray val numWeights = weights.size val gradient = Vectors.dense(Array.fill[Double](numWeights)(0.0)) val loss = model.computeGradient(input, target, gradient, 1) val eps = 1e-4 var i = 0 val tol = 1e-4 while (i < numWeights) { val originalValue = weights(i) weights(i) += eps val newModel = topology.model(Vectors.dense(weights)) val newLoss = computeLoss(input, target, newModel) val derivativeEstimate = (newLoss - loss) / eps assert(math.abs(gradient(i) - derivativeEstimate) < tol, "Layer failed gradient check: " + layerWithError.getClass) weights(i) = originalValue i += 1 } } } private def computeLoss(input: BDM[Double], target: BDM[Double], model: TopologyModel): Double = { val outputs = model.forward(input) model.layerModels.last match { case layerWithLoss: LossFunction => layerWithLoss.loss(outputs.last, target, new BDM[Double](target.rows, target.cols)) case _ => throw new UnsupportedOperationException("Top layer is required to have loss." + " Failed layer:" + model.layerModels.last.getClass) } } }
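The comments in GradientSuite spell out the standard finite-difference recipe: perturb one weight by a small epsilon, recompute the loss, and compare (newLoss - loss) / eps with the analytic gradient component. A self-contained sketch of that recipe for a plain Scala function, independent of the ANN classes above:

object FiniteDifferenceCheckSketch {
  // Compare an analytic gradient with a forward-difference estimate, coordinate by coordinate.
  def check(f: Array[Double] => Double,
            analyticGrad: Array[Double] => Array[Double],
            x: Array[Double],
            eps: Double = 1e-4,
            tol: Double = 1e-3): Boolean = {
    val g = analyticGrad(x)
    val baseLoss = f(x)
    x.indices.forall { i =>
      val xPlus = x.clone()
      xPlus(i) += eps
      val numeric = (f(xPlus) - baseLoss) / eps
      math.abs(numeric - g(i)) < tol
    }
  }

  def main(args: Array[String]): Unit = {
    // Example: f(x) = x0^2 + 3*x1 has gradient (2*x0, 3).
    val f = (x: Array[Double]) => x(0) * x(0) + 3.0 * x(1)
    val grad = (x: Array[Double]) => Array(2.0 * x(0), 3.0)
    println(check(f, grad, Array(1.5, -2.0))) // prints true
  }
}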
Example 175
Source File: RandomForestRegressorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.tree.impl.TreeTests import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => OldRandomForest} import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame def compareAPIs( data: RDD[LabeledPoint], rf: RandomForestRegressor, categoricalFeatures: Map[Int, Int]): Unit = { val numFeatures = data.first().features.size val oldStrategy = rf.getOldStrategy(categoricalFeatures, numClasses = 0, OldAlgo.Regression, rf.getOldImpurity) val oldModel = OldRandomForest.trainRegressor(data.map(OldLabeledPoint.fromML), oldStrategy, rf.getNumTrees, rf.getFeatureSubsetStrategy, rf.getSeed.toInt) val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0) val newModel = rf.fit(newData) // Use parent from newTree since this is not checked anyways. val oldModelAsNew = RandomForestRegressionModel.fromOld( oldModel, newModel.parent.asInstanceOf[RandomForestRegressor], categoricalFeatures) TreeTests.checkEqual(oldModelAsNew, newModel) assert(newModel.numFeatures === numFeatures) } }
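The RandomForestRegressorSuite excerpt keeps only the compareAPIs helper that cross-checks the new ml implementation against the old mllib one. For a plain usage picture, here is a small illustrative sketch (not part of the suite; data and parameter values are made up) of fitting RandomForestRegressor on a DataFrame:

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.sql.SparkSession

object RandomForestRegressionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("RFRegressionSketch").getOrCreate()
    import spark.implicits._

    // Tiny synthetic data: label is roughly twice the single feature.
    val train = (0 until 100).map(i => (i * 2.0, Vectors.dense(i.toDouble))).toDF("label", "features")

    val rf = new RandomForestRegressor()
      .setNumTrees(20)
      .setMaxDepth(5)
      .setSeed(42L)

    val model = rf.fit(train)
    model.transform(train).select("features", "label", "prediction").show(5)

    spark.stop()
  }
}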
Example 176
Source File: ChiSqSelectorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { test("ChiSqSelector transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(8.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } test("ChiSqSelector by FPR transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2) val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(2.0, Vectors.dense(Array(9.0)))) val model = new ChiSqSelector().setSelectorType("fpr").setAlpha(0.1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } test("model load / save") { val model = ChiSqSelectorSuite.createModel() val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString try { model.save(sc, path) val sameModel = ChiSqSelectorModel.load(sc, path) ChiSqSelectorSuite.checkEqual(model, sameModel) } finally { Utils.deleteRecursively(tempDir) } } } object ChiSqSelectorSuite extends SparkFunSuite { def createModel(): ChiSqSelectorModel = { val arr = Array(1, 2, 3, 4) new ChiSqSelectorModel(arr) } def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = { assert(a.selectedFeatures.deep == b.selectedFeatures.deep) } }
Example 177
Source File: ElementwiseProductSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 178
Source File: IDFSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
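In application code the IDF model is normally fit on term-frequency vectors produced by HashingTF rather than on hand-built vectors. A short illustrative sketch of that two-step flow (written for this page, not part of the suite):

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object TfIdfSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("TfIdfSketch").getOrCreate()
    val sc = spark.sparkContext

    val docs: RDD[Seq[String]] = sc.parallelize(Seq(
      "spark mllib tf idf".split(" ").toSeq,
      "spark spark streaming".split(" ").toSeq))

    val tf = new HashingTF(1 << 10).transform(docs) // term-frequency vectors
    tf.cache()                                      // IDF.fit makes an extra pass over the data
    val idfModel = new IDF(minDocFreq = 1).fit(tf)
    val tfidf = idfModel.transform(tf)
    tfidf.collect().foreach(println)

    spark.stop()
  }
}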
Example 179
Source File: PCASuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } }
Example 180
Source File: HashingTFSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext { test("hashing tf on a single doc") { val hashingTF = new HashingTF(1000) val doc = "a a b b c d".split(" ") val n = hashingTF.numFeatures val termFreqs = Seq( (hashingTF.indexOf("a"), 2.0), (hashingTF.indexOf("b"), 2.0), (hashingTF.indexOf("c"), 1.0), (hashingTF.indexOf("d"), 1.0)) assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n), "index must be in range [0, #features)") assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing") val expected = Vectors.sparse(n, termFreqs) assert(hashingTF.transform(doc) === expected) } test("hashing tf on an RDD") { val hashingTF = new HashingTF val localDocs: Seq[Seq[String]] = Seq( "a a b b b c d".split(" "), "a b c d a b c".split(" "), "c b a c b a a".split(" ")) val docs = sc.parallelize(localDocs, 2) assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet) } test("applying binary term freqs") { val hashingTF = new HashingTF(100).setBinary(true) val doc = "a a b c c c".split(" ") val n = hashingTF.numFeatures val expected = Vectors.sparse(n, Seq( (hashingTF.indexOf("a"), 1.0), (hashingTF.indexOf("b"), 1.0), (hashingTF.indexOf("c"), 1.0))) assert(hashingTF.transform(doc) ~== expected absTol 1e-14) } }
Example 181
Source File: MatrixFactorizationModelSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.recommendation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils class MatrixFactorizationModelSuite extends SparkFunSuite with MLlibTestSparkContext { val rank = 2 var userFeatures: RDD[(Int, Array[Double])] = _ var prodFeatures: RDD[(Int, Array[Double])] = _ override def beforeAll(): Unit = { super.beforeAll() userFeatures = sc.parallelize(Seq((0, Array(1.0, 2.0)), (1, Array(3.0, 4.0)))) prodFeatures = sc.parallelize(Seq((2, Array(5.0, 6.0)))) } test("constructor") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) assert(model.predict(0, 2) ~== 17.0 relTol 1e-14) intercept[IllegalArgumentException] { new MatrixFactorizationModel(1, userFeatures, prodFeatures) } val userFeatures1 = sc.parallelize(Seq((0, Array(1.0)), (1, Array(3.0)))) intercept[IllegalArgumentException] { new MatrixFactorizationModel(rank, userFeatures1, prodFeatures) } val prodFeatures1 = sc.parallelize(Seq((2, Array(5.0)))) intercept[IllegalArgumentException] { new MatrixFactorizationModel(rank, userFeatures, prodFeatures1) } } test("save/load") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString def collect(features: RDD[(Int, Array[Double])]): Set[(Int, Seq[Double])] = { features.mapValues(_.toSeq).collect().toSet } try { model.save(sc, path) val newModel = MatrixFactorizationModel.load(sc, path) assert(newModel.rank === rank) assert(collect(newModel.userFeatures) === collect(userFeatures)) assert(collect(newModel.productFeatures) === collect(prodFeatures)) } finally { Utils.deleteRecursively(tempDir) } } test("batch predict API recommendProductsForUsers") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val topK = 10 val recommendations = model.recommendProductsForUsers(topK).collectAsMap() assert(recommendations(0)(0).rating ~== 17.0 relTol 1e-14) assert(recommendations(1)(0).rating ~== 39.0 relTol 1e-14) } test("batch predict API recommendUsersForProducts") { val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures) val topK = 10 val recommendations = model.recommendUsersForProducts(topK).collectAsMap() assert(recommendations(2)(0).user == 1) assert(recommendations(2)(0).rating ~== 39.0 relTol 1e-14) assert(recommendations(2)(1).user == 0) assert(recommendations(2)(1).rating ~== 17.0 relTol 1e-14) } }
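Outside of tests, a MatrixFactorizationModel usually comes from ALS training rather than from hand-built factor RDDs. A brief sketch of that path (toy ratings and parameters chosen only for illustration):

import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.apache.spark.sql.SparkSession

object AlsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("AlsSketch").getOrCreate()
    val sc = spark.sparkContext

    // (user, product, rating) triples for a toy problem.
    val ratings = sc.parallelize(Seq(
      Rating(0, 0, 4.0), Rating(0, 1, 1.0),
      Rating(1, 0, 1.0), Rating(1, 1, 5.0)))

    val model = ALS.train(ratings, rank = 2, iterations = 10, lambda = 0.01)
    println(model.predict(0, 1))                   // predicted rating for user 0, product 1
    model.recommendProducts(0, 2).foreach(println) // top-2 products for user 0

    spark.stop()
  }
}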
Example 182
Source File: MultilabelMetricsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD class MultilabelMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { test("Multilabel evaluation metrics") { val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize( Seq((Array(0.0, 1.0), Array(0.0, 2.0)), (Array(0.0, 2.0), Array(0.0, 1.0)), (Array.empty[Double], Array(0.0)), (Array(2.0), Array(2.0)), (Array(2.0, 0.0), Array(2.0, 0.0)), (Array(0.0, 1.0, 2.0), Array(0.0, 1.0)), (Array(1.0), Array(1.0, 2.0))), 2) val metrics = new MultilabelMetrics(scoreAndLabels) val delta = 0.00001 val precision0 = 4.0 / (4 + 0) val precision1 = 2.0 / (2 + 1) val precision2 = 2.0 / (2 + 2) val recall0 = 4.0 / (4 + 1) val recall1 = 2.0 / (2 + 1) val recall2 = 2.0 / (2 + 2) val f1measure0 = 2 * precision0 * recall0 / (precision0 + recall0) val f1measure1 = 2 * precision1 * recall1 / (precision1 + recall1) val f1measure2 = 2 * precision2 * recall2 / (precision2 + recall2) val sumTp = 4 + 2 + 2 assert(sumTp == (1 + 1 + 0 + 1 + 2 + 2 + 1)) val microPrecisionClass = sumTp.toDouble / (4 + 0 + 2 + 1 + 2 + 2) val microRecallClass = sumTp.toDouble / (4 + 1 + 2 + 1 + 2 + 2) val microF1MeasureClass = 2.0 * sumTp.toDouble / (2 * sumTp.toDouble + (1 + 1 + 2) + (0 + 1 + 2)) val macroPrecisionDoc = 1.0 / 7 * (1.0 / 2 + 1.0 / 2 + 0 + 1.0 / 1 + 2.0 / 2 + 2.0 / 3 + 1.0 / 1.0) val macroRecallDoc = 1.0 / 7 * (1.0 / 2 + 1.0 / 2 + 0 / 1 + 1.0 / 1 + 2.0 / 2 + 2.0 / 2 + 1.0 / 2) val macroF1MeasureDoc = (1.0 / 7) * 2 * ( 1.0 / (2 + 2) + 1.0 / (2 + 2) + 0 + 1.0 / (1 + 1) + 2.0 / (2 + 2) + 2.0 / (3 + 2) + 1.0 / (1 + 2) ) val hammingLoss = (1.0 / (7 * 3)) * (2 + 2 + 1 + 0 + 0 + 1 + 1) val strictAccuracy = 2.0 / 7 val accuracy = 1.0 / 7 * (1.0 / 3 + 1.0 /3 + 0 + 1.0 / 1 + 2.0 / 2 + 2.0 / 3 + 1.0 / 2) assert(math.abs(metrics.precision(0.0) - precision0) < delta) assert(math.abs(metrics.precision(1.0) - precision1) < delta) assert(math.abs(metrics.precision(2.0) - precision2) < delta) assert(math.abs(metrics.recall(0.0) - recall0) < delta) assert(math.abs(metrics.recall(1.0) - recall1) < delta) assert(math.abs(metrics.recall(2.0) - recall2) < delta) assert(math.abs(metrics.f1Measure(0.0) - f1measure0) < delta) assert(math.abs(metrics.f1Measure(1.0) - f1measure1) < delta) assert(math.abs(metrics.f1Measure(2.0) - f1measure2) < delta) assert(math.abs(metrics.microPrecision - microPrecisionClass) < delta) assert(math.abs(metrics.microRecall - microRecallClass) < delta) assert(math.abs(metrics.microF1Measure - microF1MeasureClass) < delta) assert(math.abs(metrics.precision - macroPrecisionDoc) < delta) assert(math.abs(metrics.recall - macroRecallDoc) < delta) assert(math.abs(metrics.f1Measure - macroF1MeasureDoc) < delta) assert(math.abs(metrics.hammingLoss - hammingLoss) < delta) assert(math.abs(metrics.subsetAccuracy - strictAccuracy) < delta) assert(math.abs(metrics.accuracy - accuracy) < delta) assert(metrics.labels.sameElements(Array(0.0, 1.0, 2.0))) } }
Example 183
Source File: RankingMetricsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { test("Ranking metrics: MAP, NDCG") { val predictionAndLabels = sc.parallelize( Seq( (Array(1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Array(1, 2, 3, 4, 5)), (Array(4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Array(1, 2, 3)), (Array(1, 2, 3, 4, 5), Array.empty[Int]) ), 2) val eps = 1.0E-5 val metrics = new RankingMetrics(predictionAndLabels) val map = metrics.meanAveragePrecision assert(metrics.precisionAt(1) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(2) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(3) ~== 1.0/3 absTol eps) assert(metrics.precisionAt(4) ~== 0.75/3 absTol eps) assert(metrics.precisionAt(5) ~== 0.8/3 absTol eps) assert(metrics.precisionAt(10) ~== 0.8/3 absTol eps) assert(metrics.precisionAt(15) ~== 8.0/45 absTol eps) assert(map ~== 0.355026 absTol eps) assert(metrics.ndcgAt(3) ~== 1.0/3 absTol eps) assert(metrics.ndcgAt(5) ~== 0.328788 absTol eps) assert(metrics.ndcgAt(10) ~== 0.487913 absTol eps) assert(metrics.ndcgAt(15) ~== metrics.ndcgAt(10) absTol eps) } test("MAP, NDCG with few predictions (SPARK-14886)") { val predictionAndLabels = sc.parallelize( Seq( (Array(1, 6, 2), Array(1, 2, 3, 4, 5)), (Array.empty[Int], Array(1, 2, 3)) ), 2) val eps = 1.0E-5 val metrics = new RankingMetrics(predictionAndLabels) assert(metrics.precisionAt(1) ~== 0.5 absTol eps) assert(metrics.precisionAt(2) ~== 0.25 absTol eps) assert(metrics.ndcgAt(1) ~== 0.5 absTol eps) assert(metrics.ndcgAt(2) ~== 0.30657 absTol eps) } }
Example 184
Source File: AreaUnderCurveSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class AreaUnderCurveSuite extends SparkFunSuite with MLlibTestSparkContext { test("auc computation") { val curve = Seq((0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 0.0)) val auc = 4.0 assert(AreaUnderCurve.of(curve) ~== auc absTol 1E-5) val rddCurve = sc.parallelize(curve, 2) assert(AreaUnderCurve.of(rddCurve) ~== auc absTol 1E-5) } test("auc of an empty curve") { val curve = Seq.empty[(Double, Double)] assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5) val rddCurve = sc.parallelize(curve, 2) assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5) } test("auc of a curve with a single point") { val curve = Seq((1.0, 1.0)) assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5) val rddCurve = sc.parallelize(curve, 2) assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5) } }
Example 185
Source File: RegressionMetricsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class RegressionMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { val obs = List[Double](77, 85, 62, 55, 63, 88, 57, 81, 51) val eps = 1E-5 test("regression metrics for unbiased (includes intercept term) predictor") { val preds = obs val predictionAndObservations = sc.parallelize(preds.zip(obs), 2) val metrics = new RegressionMetrics(predictionAndObservations) assert(metrics.explainedVariance ~== 174.83951 absTol eps, "explained variance regression score mismatch") assert(metrics.meanAbsoluteError ~== 0.0 absTol eps, "mean absolute error mismatch") assert(metrics.meanSquaredError ~== 0.0 absTol eps, "mean squared error mismatch") assert(metrics.rootMeanSquaredError ~== 0.0 absTol eps, "root mean squared error mismatch") assert(metrics.r2 ~== 1.0 absTol eps, "r2 score mismatch") } }
Example 186
Source File: MulticlassMetricsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Matrices import org.apache.spark.mllib.util.MLlibTestSparkContext class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { test("Multiclass evaluation metrics") { val confusionMatrix = Matrices.dense(3, 3, Array(2, 1, 0, 1, 3, 0, 1, 0, 1)) val labels = Array(0.0, 1.0, 2.0) val predictionAndLabels = sc.parallelize( Seq((0.0, 0.0), (0.0, 1.0), (0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)), 2) val metrics = new MulticlassMetrics(predictionAndLabels) val delta = 0.0000001 val tpRate0 = 2.0 / (2 + 2) val tpRate1 = 3.0 / (3 + 1) val tpRate2 = 1.0 / (1 + 0) val fpRate0 = 1.0 / (9 - 4) val fpRate1 = 1.0 / (9 - 4) val fpRate2 = 1.0 / (9 - 1) val precision0 = 2.0 / (2 + 1) val precision1 = 3.0 / (3 + 1) val precision2 = 1.0 / (1 + 1) val recall0 = 2.0 / (2 + 2) val recall1 = 3.0 / (3 + 1) val recall2 = 1.0 / (1 + 0) val f1measure0 = 2 * precision0 * recall0 / (precision0 + recall0) val f1measure1 = 2 * precision1 * recall1 / (precision1 + recall1) val f1measure2 = 2 * precision2 * recall2 / (precision2 + recall2) val f2measure0 = (1 + 2 * 2) * precision0 * recall0 / (2 * 2 * precision0 + recall0) val f2measure1 = (1 + 2 * 2) * precision1 * recall1 / (2 * 2 * precision1 + recall1) val f2measure2 = (1 + 2 * 2) * precision2 * recall2 / (2 * 2 * precision2 + recall2) assert(metrics.confusionMatrix.toArray.sameElements(confusionMatrix.toArray)) assert(math.abs(metrics.truePositiveRate(0.0) - tpRate0) < delta) assert(math.abs(metrics.truePositiveRate(1.0) - tpRate1) < delta) assert(math.abs(metrics.truePositiveRate(2.0) - tpRate2) < delta) assert(math.abs(metrics.falsePositiveRate(0.0) - fpRate0) < delta) assert(math.abs(metrics.falsePositiveRate(1.0) - fpRate1) < delta) assert(math.abs(metrics.falsePositiveRate(2.0) - fpRate2) < delta) assert(math.abs(metrics.precision(0.0) - precision0) < delta) assert(math.abs(metrics.precision(1.0) - precision1) < delta) assert(math.abs(metrics.precision(2.0) - precision2) < delta) assert(math.abs(metrics.recall(0.0) - recall0) < delta) assert(math.abs(metrics.recall(1.0) - recall1) < delta) assert(math.abs(metrics.recall(2.0) - recall2) < delta) assert(math.abs(metrics.fMeasure(0.0) - f1measure0) < delta) assert(math.abs(metrics.fMeasure(1.0) - f1measure1) < delta) assert(math.abs(metrics.fMeasure(2.0) - f1measure2) < delta) assert(math.abs(metrics.fMeasure(0.0, 2.0) - f2measure0) < delta) assert(math.abs(metrics.fMeasure(1.0, 2.0) - f2measure1) < delta) assert(math.abs(metrics.fMeasure(2.0, 2.0) - f2measure2) < delta) assert(math.abs(metrics.accuracy - (2.0 + 3.0 + 1.0) / ((2 + 3 + 1) + (1 + 1 + 1))) < delta) assert(math.abs(metrics.accuracy - metrics.precision) < delta) assert(math.abs(metrics.accuracy - metrics.recall) < delta) assert(math.abs(metrics.accuracy - metrics.fMeasure) < delta) assert(math.abs(metrics.accuracy - metrics.weightedRecall) < delta) assert(math.abs(metrics.weightedTruePositiveRate - ((4.0 / 9) * tpRate0 + (4.0 / 9) * tpRate1 + (1.0 / 9) * tpRate2)) < delta) assert(math.abs(metrics.weightedFalsePositiveRate - ((4.0 / 9) * fpRate0 + (4.0 / 9) * fpRate1 + (1.0 / 9) * fpRate2)) < delta) assert(math.abs(metrics.weightedPrecision - ((4.0 / 9) * precision0 + (4.0 / 9) * precision1 + (1.0 / 9) * precision2)) < delta) assert(math.abs(metrics.weightedRecall - ((4.0 / 9) * recall0 + (4.0 / 9) * recall1 + (1.0 / 9) * recall2)) < delta) assert(math.abs(metrics.weightedFMeasure - ((4.0 / 9) * f1measure0 + (4.0 / 9) * f1measure1 + (1.0 / 9) * f1measure2)) < delta) assert(math.abs(metrics.weightedFMeasure(2.0) - ((4.0 / 9) * f2measure0 + (4.0 / 9) * f2measure1 + (1.0 / 9) * f2measure2)) < delta) assert(metrics.labels.sameElements(labels)) } }
Example 187
Source File: FPTreeSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.fpm import scala.language.existentials import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class FPTreeSuite extends SparkFunSuite with MLlibTestSparkContext { test("add transaction") { val tree = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("b")) assert(tree.root.children.size == 2) assert(tree.root.children.contains("a")) assert(tree.root.children("a").item.equals("a")) assert(tree.root.children("a").count == 2) assert(tree.root.children.contains("b")) assert(tree.root.children("b").item.equals("b")) assert(tree.root.children("b").count == 1) var child = tree.root.children("a") assert(child.children.size == 1) assert(child.children.contains("b")) assert(child.children("b").item.equals("b")) assert(child.children("b").count == 2) child = child.children("b") assert(child.children.size == 2) assert(child.children.contains("c")) assert(child.children.contains("y")) assert(child.children("c").item.equals("c")) assert(child.children("y").item.equals("y")) assert(child.children("c").count == 1) assert(child.children("y").count == 1) } test("merge tree") { val tree1 = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("b")) val tree2 = new FPTree[String] .add(Seq("a", "b")) .add(Seq("a", "b", "c")) .add(Seq("a", "b", "c", "d")) .add(Seq("a", "x")) .add(Seq("a", "x", "y")) .add(Seq("c", "n")) .add(Seq("c", "m")) val tree3 = tree1.merge(tree2) assert(tree3.root.children.size == 3) assert(tree3.root.children("a").count == 7) assert(tree3.root.children("b").count == 1) assert(tree3.root.children("c").count == 2) val child1 = tree3.root.children("a") assert(child1.children.size == 2) assert(child1.children("b").count == 5) assert(child1.children("x").count == 2) val child2 = child1.children("b") assert(child2.children.size == 2) assert(child2.children("y").count == 1) assert(child2.children("c").count == 3) val child3 = child2.children("c") assert(child3.children.size == 1) assert(child3.children("d").count == 1) val child4 = child1.children("x") assert(child4.children.size == 1) assert(child4.children("y").count == 1) val child5 = tree3.root.children("c") assert(child5.children.size == 2) assert(child5.children("n").count == 1) assert(child5.children("m").count == 1) } test("extract freq itemsets") { val tree = new FPTree[String] .add(Seq("a", "b", "c")) .add(Seq("a", "b", "y")) .add(Seq("a", "b")) .add(Seq("a")) .add(Seq("b")) .add(Seq("b", "n")) val freqItemsets = tree.extract(3L).map { case (items, count) => (items.toSet, count) }.toSet val expected = Set( (Set("a"), 4L), (Set("b"), 5L), (Set("a", "b"), 3L)) assert(freqItemsets === expected) } }
Example 188
Source File: AssociationRulesSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.fpm import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext { test("association rules using String type") { val freqItemsets = sc.parallelize(Seq( (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L), (Set("r"), 3L), (Set("x", "z"), 3L), (Set("t", "y"), 3L), (Set("t", "x"), 3L), (Set("s", "x"), 3L), (Set("y", "x"), 3L), (Set("y", "z"), 3L), (Set("t", "z"), 3L), (Set("y", "x", "z"), 3L), (Set("t", "x", "z"), 3L), (Set("t", "y", "z"), 3L), (Set("t", "y", "x"), 3L), (Set("t", "y", "x", "z"), 3L) ).map { case (items, freq) => new FPGrowth.FreqItemset(items.toArray, freq) }) val ar = new AssociationRules() val results1 = ar .setMinConfidence(0.9) .run(freqItemsets) .collect() assert(results1.size === 23) assert(results1.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23) val results2 = ar .setMinConfidence(0) .run(freqItemsets) .collect() assert(results2.size === 30) assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23) } }
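In practice the frequent itemsets fed to AssociationRules come out of FPGrowth, which can also derive the rules directly. An illustrative sketch (transactions and thresholds invented for this page):

import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.sql.SparkSession

object FpGrowthRulesSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("FpGrowthSketch").getOrCreate()
    val sc = spark.sparkContext

    val transactions = sc.parallelize(Seq(
      Array("a", "b", "c"),
      Array("a", "b"),
      Array("a", "c"),
      Array("b", "c")))

    val model = new FPGrowth().setMinSupport(0.5).setNumPartitions(2).run(transactions)
    model.freqItemsets.collect().foreach { is =>
      println(is.items.mkString("[", ",", "]") + " -> " + is.freq)
    }

    // Derive rules with confidence >= 0.8 from the frequent itemsets.
    model.generateAssociationRules(0.8).collect().foreach { rule =>
      println(rule.antecedent.mkString(",") + " => " + rule.consequent.mkString(",") + " @ " + rule.confidence)
    }

    spark.stop()
  }
}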
Example 189
Source File: KernelDensitySuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat import org.apache.commons.math3.distribution.NormalDistribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext { test("kernel density single sample") { val rdd = sc.parallelize(Array(5.0)) val evaluationPoints = Array(5.0, 6.0) val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints) val normal = new NormalDistribution(5.0, 3.0) val acceptableErr = 1e-6 assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr) assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr) } test("kernel density multiple samples") { val rdd = sc.parallelize(Array(5.0, 10.0)) val evaluationPoints = Array(5.0, 6.0) val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints) val normal1 = new NormalDistribution(5.0, 3.0) val normal2 = new NormalDistribution(10.0, 3.0) val acceptableErr = 1e-6 assert(math.abs( densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr) assert(math.abs( densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr) } }
Example 190
Source File: MultivariateGaussianSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Matrices, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) val mu = Vectors.dense(0.0) val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) } test("SPARK-11302") { val x = Vectors.dense(629, 640, 1.7188, 618.19) val mu = Vectors.dense( 1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697) val sigma = Matrices.dense(4, 4, Array( 166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053, 169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484, 12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373, 164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207)) val dist = new MultivariateGaussian(mu, sigma) // Agrees with R's dmvnorm: 7.154782e-05 assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9) } }
Example 191
Source File: CoordinateMatrixSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { val m = 5 val n = 4 var mat: CoordinateMatrix = _ override def beforeAll() { super.beforeAll() val entries = sc.parallelize(Seq( (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0), (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map { case (i, j, value) => MatrixEntry(i, j, value) } mat = new CoordinateMatrix(entries) } test("size") { assert(mat.numRows() === m) assert(mat.numCols() === n) } test("empty entries") { val entries = sc.parallelize(Seq[MatrixEntry](), 1) val emptyMat = new CoordinateMatrix(entries) intercept[RuntimeException] { emptyMat.numCols() } intercept[RuntimeException] { emptyMat.numRows() } } test("toBreeze") { val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(mat.toBreeze() === expected) } test("transpose") { val transposed = mat.transpose() assert(mat.toBreeze().t === transposed.toBreeze()) } test("toIndexedRowMatrix") { val indexedRowMatrix = mat.toIndexedRowMatrix() val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(indexedRowMatrix.toBreeze() === expected) } test("toRowMatrix") { val rowMatrix = mat.toRowMatrix() val rows = rowMatrix.rows.collect().toSet val expected = Set( Vectors.dense(1.0, 2.0, 0.0, 0.0), Vectors.dense(0.0, 3.0, 4.0, 0.0), Vectors.dense(0.0, 0.0, 5.0, 6.0), Vectors.dense(7.0, 0.0, 0.0, 8.0), Vectors.dense(0.0, 9.0, 0.0, 0.0)) assert(rows === expected) } test("toBlockMatrix") { val blockMat = mat.toBlockMatrix(2, 2) assert(blockMat.numRows() === m) assert(blockMat.numCols() === n) assert(blockMat.toBreeze() === mat.toBreeze()) intercept[IllegalArgumentException] { mat.toBlockMatrix(-1, 2) } intercept[IllegalArgumentException] { mat.toBlockMatrix(2, 0) } } }
Example 192
Source File: MLPairRDDFunctionsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ import org.apache.spark.mllib.util.MLlibTestSparkContext class MLPairRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { test("topByKey") { val topMap = sc.parallelize(Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7), (5, 1), (3, 5)), 2) .topByKey(5) .collectAsMap() assert(topMap.size === 3) assert(topMap(1) === Array(7, 6, 3, 2, 1)) assert(topMap(3) === Array(7, 5, 2)) assert(topMap(5) === Array(1)) } }
Example 193
Source File: RDDFunctionsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.rdd.RDDFunctions._ import org.apache.spark.mllib.util.MLlibTestSparkContext class RDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { test("sliding") { val data = 0 until 6 for (numPartitions <- 1 to 8) { val rdd = sc.parallelize(data, numPartitions) for (windowSize <- 1 to 6) { for (step <- 1 to 3) { val sliding = rdd.sliding(windowSize, step).collect().map(_.toList).toList val expected = data.sliding(windowSize, step) .map(_.toList).toList.filter(l => l.size == windowSize) assert(sliding === expected) } } assert(rdd.sliding(7).collect().isEmpty, "Should return an empty RDD if the window size is greater than the number of items.") } } test("sliding with empty partitions") { val data = Seq(Seq(1, 2, 3), Seq.empty[Int], Seq(4), Seq.empty[Int], Seq(5, 6, 7)) val rdd = sc.parallelize(data, data.length).flatMap(s => s) assert(rdd.partitions.length === data.length) val sliding = rdd.sliding(3).collect().toSeq.map(_.toSeq) val expected = data.flatMap(x => x).sliding(3).toSeq.map(_.toSeq) assert(sliding === expected) } }
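A common use of sliding outside tests is a windowed aggregate such as a moving average; a tiny illustrative sketch:

import org.apache.spark.mllib.rdd.RDDFunctions._
import org.apache.spark.sql.SparkSession

object MovingAverageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("MovingAverageSketch").getOrCreate()
    val sc = spark.sparkContext

    val values = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0, 5.0), 2)
    // Average over windows of 3 consecutive elements: 2.0, 3.0, 4.0
    val movingAvg = values.sliding(3).map(w => w.sum / w.length)
    movingAvg.collect().foreach(println)

    spark.stop()
  }
}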
Example 194
Source File: SpLinopMatrixSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp import org.scalatest.FunSuite import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.lp.VectorSpace._ import org.apache.spark.mllib.optimization.lp.fs.dvector.dmatrix._ class SpLinopMatrixSuite extends FunSuite with MLlibTestSparkContext { test("SpLinopMatrix.apply is implemented properly") { val matrix: DMatrix = sc.parallelize(Array( Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)), 2) val vector: DVector = sc.parallelize(Array(2.0, 3.0), 2).glom.map(new DenseVector(_)) val expectApply: DMatrix = sc.parallelize(Array( Vectors.dense(2.0 * 1.0, 2.0 * 2.0, 2.0 * 3.0), Vectors.dense(3.0 * 4.0, 3.0 * 5.0, 3.0 * 6.0)), 2) assert((new SpLinopMatrix(vector))(matrix).collect().deep == expectApply.collect().deep, // or sameElements "SpLinopMatrix.apply should return the correct result.") } }
Example 195
Source File: InitializeSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp import org.scalatest.FunSuite import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.mllib.optimization.lp.VectorSpace._ import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, _} import org.apache.spark.mllib.optimization.tfocs.VectorSpace.{DMatrix, DVector} class InitializeSuite extends FunSuite with MLlibTestSparkContext { val numPartitions = 2 val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0) val BArray = Array( Array(12.0, 16.0, 30.0, 1.0, 0.0), Array(24.0, 16.0, 12.0, 0.0, 1.0), Array(-1.0, 0.0, 0.0, 0.0, 0.0), Array(0.0, -1.0, 0.0, 0.0, 0.0), Array(0.0, 0.0, -1.0, 0.0, 0.0), Array(0.0, 0.0, 0.0, 1.0, 0.0), Array(0.0, 0.0, 0.0, 0.0, 1.0)) val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0) lazy val c: DVector = sc.parallelize(cArray, numPartitions).glom.map(new DenseVector(_)) lazy val rows: DMatrix = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_)) lazy val b: DenseVector = new DenseVector(bArray) val cBrz = new BDV[Double](cArray) val BBrz = new BDM[Double](7, 5, BArray.flatMap(x => x), offset = 0, majorStride = 5, isTranspose = true) val bBrz = new BDV[Double](bArray) // (BT * B) ^(-1) val BTBInv = inv(BBrz.t * BBrz) // xTilda = B * BTBInv * b val xTilda: BDV[Double] = BBrz * (BTBInv * bBrz) // lambdaTilda = BTBInv * (B^T * c) val lambdaTilda: BDV[Double] = BTBInv * (BBrz.t * cBrz) // sTilda = c - B * lambdaTilda val sTilda = cBrz - BBrz * lambdaTilda val deltax = Math.max(1.5 * max(xTilda), 0) val deltas = Math.max(1.5 * max(sTilda), 0) val xHat = xTilda :+ deltax val sHat = sTilda :+ deltas val deltaxHat: Double = 0.5 * (xHat.t * sHat) / sum(sHat) val deltasHat: Double = 0.5 * (xHat.t * sHat) / sum(xHat) // x = xHat + deltaxHat * e val expectedx: BDV[Double] = xHat :+ deltaxHat // val expectedLambda = lambdaTilda val expecteds: BDV[Double] = sHat :+ deltasHat test("Initialize.init is implemented properly") { val result = Initialize.init(c, rows, b) //println(LP.solve(c, rows, b, 1e-4, 1).collect()) assert(Vectors.dense(expectedx.toArray) ~= Vectors.dense(result._1.flatMap(_.toArray).collect()) relTol 1e-6, "Initialize.init x0 is not computed correctly.") assert(Vectors.dense(lambdaTilda.toArray) ~= Vectors.dense(result._2.toArray) relTol 1e-6, "Initialize.init lambda0 is not computed correctly.") assert(Vectors.dense(expecteds.toArray) ~= Vectors.dense(result._3.flatMap(_.toArray).collect()) relTol 1e-6, "Initialize.init s0 should return the correct answer.") } }
Example 196
Source File: LPSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp import org.scalatest.FunSuite import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.mllib.optimization.lp.VectorSpace._ import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace class LPSuite extends FunSuite with MLlibTestSparkContext { val numPartitions = 2 val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0) val BArray = Array( Array(12.0, 16.0, 30.0, 1.0, 0.0), Array(24.0, 16.0, 12.0, 0.0, 1.0), Array(-1.0, 0.0, 0.0, 0.0, 0.0), Array(0.0, -1.0, 0.0, 0.0, 0.0), Array(0.0, 0.0, -1.0, 0.0, 0.0), Array(0.0, 0.0, 0.0, 1.0, 0.0), Array(0.0, 0.0, 0.0, 0.0, 1.0)) val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0) lazy val c = sc.parallelize(cArray, numPartitions).glom.map(new DenseVector(_)) lazy val rows = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_)) lazy val b = new DenseVector(bArray) test("LP solve is implemented properly") { val (v, x) = LP.solve(c, rows, b, sc=sc) // solution obtained from scipy.optimize.linprog and octave glgk lpsolver with fun_val = 12.083 val expectedSol = Vectors.dense( Array(1.66666667, 5.83333333, 40.0, 0.0, 0.0, 13.33333333, 9.16666667)) val xx = Vectors.dense(x.flatMap(_.toArray).collect()) println(s"$xx") println("optimal min value: " + v) assert(xx ~== expectedSol absTol 1e-6, "LP.solve x should return the correct answer.") } }
Example 197
Source File: VectorSpaceSuite.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.scalatest.FunSuite import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.mllib.optimization.tfocs.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.tfocs.vs.dvectordouble.DVectorDoubleSpace import org.apache.spark.mllib.optimization.tfocs.vs.vector.DenseVectorSpace class VectorSpaceSuite extends FunSuite with MLlibTestSparkContext { test("DenseVectorSpace.combine is implemented properly") { val alpha = 1.1 val a = new DenseVector(Array(2.0, 3.0)) val beta = 4.0 val b = new DenseVector(Array(5.0, 6.0)) val expectedCombination = Vectors.dense(1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0) assert(DenseVectorSpace.combine(alpha, a, beta, b) == expectedCombination, "DenseVectorSpace.combine should return the correct result.") } test("DenseVectorSpace.dot is implemented properly") { val a = new DenseVector(Array(2.0, 3.0)) val b = new DenseVector(Array(5.0, 6.0)) val expectedDot = 2.0 * 5.0 + 3.0 * 6.0 assert(DenseVectorSpace.dot(a, b) == expectedDot, "DenseVectorSpace.dot should return the correct result.") } test("DVectorSpace.combine is implemented properly") { val alpha = 1.1 val a = sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2) val beta = 4.0 val b = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2) val combination = DVectorSpace.combine(alpha, a, beta, b) val expectedCombination = Vectors.dense(1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0, 1.1 * 4.0 + 4.0 * 7.0) assert(Vectors.dense(combination.collectElements) == expectedCombination, "DVectorSpace.combine should return the correct result.") } test("DVectorSpace.dot is implemented properly") { val a = sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2) val b = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2) val expectedDot = 2.0 * 5.0 + 3.0 * 6.0 + 4.0 * 7.0 assert(DVectorSpace.dot(a, b) == expectedDot, "DVectorSpace.dot should return the correct result.") } test("DVectorDoubleSpace.combine is implemented properly") { val alpha = 1.1 val a = (sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2), 9.9) val beta = 4.0 val b = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2), 11.11) val combination = DVectorDoubleSpace.combine(alpha, a, beta, b) val expectedCombination = (Vectors.dense(1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0, 1.1 * 4.0 + 4.0 * 7.0), 1.1 * 9.9 + 4.0 * 11.11) assert(Vectors.dense(combination._1.collectElements) == expectedCombination._1, "DVectorVectorSpace.combine should return the correct result.") assert(combination._2 == expectedCombination._2, "DVectorVectorSpace.combine should return the correct result.") } test("DVectorDoubleSpace.dot is implemented properly") { val a = (sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2), 9.9) val b = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2), 11.11) val expectedDot = 2.0 * 5.0 + 3.0 * 6.0 + 4.0 * 7.0 + 9.9 * 11.11 assert(DVectorDoubleSpace.dot(a, b) == expectedDot, "DVectorVectorSpace.dot should return the correct result.") } }
Example 198
Source File: LinearOperatorSuite.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.scalatest.FunSuite import org.apache.spark.SparkException import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector.LinopMatrix import org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector.LinopMatrixAdjoint import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvectordouble.{ LinopMatrix => LinopMatrixVector } import org.apache.spark.mllib.optimization.tfocs.fs.dvectordouble.vector.{ LinopMatrixAdjoint => LinopMatrixVectorAdjoint } import org.apache.spark.mllib.util.MLlibTestSparkContext class LinearOperatorSuite extends FunSuite with MLlibTestSparkContext { lazy val matrix = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)), 2) lazy val vector = new DenseVector(Array(2.2, 3.3, 4.4)) test("LinopMatrix multiplies properly") { val f = new LinopMatrix(matrix) val x = new DenseVector(Array(7.0, 8.0, 9.0)) val result = f(x) val expectedResult = Vectors.dense(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9) assert(Vectors.dense(result.collectElements) == expectedResult, "should return the correct product") } test("LinopMatrixAdjoint multiplies properly") { val f = new LinopMatrixAdjoint(matrix) val y = sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2) val result = f(y) val expectedResult = Vectors.dense(1 * 5 + 4 * 6, 2 * 5 + 5 * 6, 3 * 5 + 6 * 6) assert(result == expectedResult, "should return the correct product") } test("LinopMatrixAdjoint checks for mismatched partition vectors") { val f = new LinopMatrixAdjoint(matrix) val y = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2) intercept[SparkException] { f(y) } } test("LinopMatrixVector multiplies properly") { val f = new LinopMatrixVector(matrix, vector) val x = new DenseVector(Array(7.0, 8.0, 9.0)) val result = f(x) val expectedResult = (new DenseVector(Array(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9)), 7.0 * 2.2 + 8.0 * 3.3 + 9.0 * 4.4) assert(Vectors.dense(result._1.collectElements) == expectedResult._1, "should return the correct product") assert(result._2 == expectedResult._2, "should return the correct product") } test("LinopMatrixVectorAdjoint multiplies properly") { var f = new LinopMatrixVectorAdjoint(matrix, vector) val y = (sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2), 8.8) val result = f(y) val expectedResult = Vectors.dense(1 * 5 + 4 * 6 + 2.2, 2 * 5 + 5 * 6 + 3.3, 3 * 5 + 6 * 6 + 4.4) assert(result == expectedResult, "should return the correct product") } test("LinopMatrixVectorAdjoint checks for mismatched partition vectors") { val f = new LinopMatrixVectorAdjoint(matrix, vector) val y = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2), 8.8) intercept[SparkException] { f(y) } } }
Example 199
Source File: LocalTrainingPlanSuite.scala From oraf with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.ml.tree.OptimizedLearningNode import org.apache.spark.mllib.tree.configuration.{DefaultTimePredictionStrategy, TimePredictionStrategy} import org.apache.spark.mllib.util.MLlibTestSparkContext class LocalTrainingPlanSuite extends SparkFunSuite with MLlibTestSparkContext { val timePredictonStrategy: TimePredictionStrategy = new DefaultTimePredictionStrategy test("memory restriction") { val plan = new LocalTrainingPlan(10, timePredictonStrategy, Int.MaxValue) plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1), treeIndex = 1, rows = 2, impurity = 1.0)) plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1), treeIndex = 1, rows = 2, impurity = 1.0)) plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1), treeIndex = 1, rows = 2, impurity = 1.0)) plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1), treeIndex = 1, rows = 9, impurity = 1.0)) assert(plan.bins.length == 2) assert(plan.bins.head.tasks.length == 3) assert(plan.bins(1).tasks.length == 1) } test("count restriction") { val plan = new LocalTrainingPlan(10, timePredictonStrategy, 2) plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1), treeIndex = 1, rows = 2, impurity = 1.0)) plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1), treeIndex = 1, rows = 2, impurity = 1.0)) plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1), treeIndex = 1, rows = 2, impurity = 1.0)) assert(plan.bins.length == 2) assert(plan.bins.head.tasks.length == 2) assert(plan.bins(1).tasks.length == 1) } test("task implicit ordering by memory usage descending") { val l = List(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1), treeIndex = 1, rows = 1, impurity = 1.0), new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1), treeIndex = 2, rows = 5, impurity = 1.0), new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1), treeIndex = 3, rows = 3, impurity = 1.0) ) val sorted = l.sorted assert(sorted.head.treeIndex == 2) } }
Example 200
Source File: LocalTreeIntegrationSuite.scala From oraf with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.ml.Estimator import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.DecisionTreeRegressor import org.apache.spark.mllib.tree.DecisionTreeSuite import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext} import org.apache.spark.sql.DataFrame private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = { val distribTree = setParams(new DecisionTreeRegressor(), testParams) val localTree = setParams(new LocalDecisionTreeRegressor(), testParams) val localModel = localTree.fit(train) val model = distribTree.fit(train) OptimizedTreeTests.checkEqual(model, localModel) } test("Local & distributed training produce the same tree on a toy dataset") { val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, OptimizedTreeTests.allParamSettings) } test("Local & distributed training produce the same tree on a slightly larger toy dataset") { val data = sc.parallelize(Range(0, 16).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, medDepthTreeSettings) } test("Local & distributed training produce the same tree on a larger toy dataset") { val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, medDepthTreeSettings) } test("Local & distributed training produce same tree on a dataset of categorical features") { val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances()) // Create a map of categorical feature index to arity; each feature has arity nclasses val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3) // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its // categorical features val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2) testEquivalence(df, OptimizedTreeTests.allParamSettings) } test("Local & distributed training produce the same tree on a dataset of continuous features") { val sqlContext = spark.sqlContext import sqlContext.implicits._ // Use maxDepth = 5 and default params val params = medDepthTreeSettings val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext, nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2) .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray))) .toDF().cache() testEquivalence(data, params) } test("Local & distributed training produce the same tree on a dataset of constant features") { // Generate constant, continuous data val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1)))) val df = spark.createDataFrame(data) testEquivalence(df, OptimizedTreeTests.allParamSettings) } }
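The LocalTreeIntegrationSuite excerpt starts mid-class: the listing drops the class declaration and the setParams / medDepthTreeSettings helpers that testEquivalence relies on. A hedged reconstruction of that scaffolding, written as an assumption about the original oraf source rather than a verified copy:

// Hypothetical reconstruction of the scaffolding omitted from the excerpt above.
class LocalTreeIntegrationSuite extends SparkFunSuite with MLlibTestSparkContext {

  // Assumed: the shared test settings plus a moderately deep tree.
  private val medDepthTreeSettings: Map[String, Any] =
    OptimizedTreeTests.allParamSettings ++ Map[String, Any]("maxDepth" -> 4)

  // Assumed: apply a map of (param name -> value) pairs to an estimator.
  private def setParams[M](estimator: Estimator[M], params: Map[String, Any]): Estimator[M] = {
    params.foreach { case (p, v) =>
      estimator.set(estimator.getParam(p), v)
    }
    estimator
  }

  // ... the testEquivalence helper and the tests shown above go here ...
}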