org.apache.spark.mllib.util.MLlibTestSparkContext Scala Examples

The following examples show how to use org.apache.spark.mllib.util.MLlibTestSparkContext. You can go to the original project or source file by following the links above each example.
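All of these suites follow the same pattern: the test class extends SparkFunSuite and mixes in MLlibTestSparkContext, which starts a shared local SparkContext before the suite runs, exposes it as sc (the spark.ml suites below also use the sqlContext it provides), and stops it afterwards. A minimal sketch of that pattern, with a hypothetical suite name and package, looks like this:

package org.apache.spark.mllib.example

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

// Minimal sketch (hypothetical suite and package): the mixed-in
// MLlibTestSparkContext trait supplies the shared local SparkContext `sc`.
class ExampleUsageSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("sc is available inside tests") {
    val rdd = sc.parallelize(Seq(1.0, 2.0, 3.0), 2)
    assert(rdd.count() === 3L)
    assert(rdd.sum() === 6.0)
  }
}

Suites that need their own fixtures, such as MatrixFactorizationModelSuite in Example 9, override beforeAll() and call super.beforeAll() first so that sc is initialized before the fixtures are built.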
Example 1
Source File: AreaUnderCurveSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class AreaUnderCurveSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("auc computation") {
    val curve = Seq((0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 0.0))
    val auc = 4.0
    assert(AreaUnderCurve.of(curve) ~== auc absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== auc absTol 1E-5)
  }

  test("auc of an empty curve") {
    val curve = Seq.empty[(Double, Double)]
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }

  test("auc of a curve with a single point") {
    val curve = Seq((1.0, 1.0))
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }
} 
Example 2
Source File: ChiSqSelectorSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.util.Utils

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {

  

  test("ChiSqSelector transform test (sparse & dense vector)") {
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2)
    val preFilteredData =
      Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(6.0))),
        LabeledPoint(1.0, Vectors.dense(Array(8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(5.0))))
    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
    val filteredData = labeledDiscreteData.map { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    }.collect().toSet
    assert(filteredData == preFilteredData)
  }

  test("model load / save") {
    val model = ChiSqSelectorSuite.createModel()
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString
    try {
      model.save(sc, path)
      val sameModel = ChiSqSelectorModel.load(sc, path)
      ChiSqSelectorSuite.checkEqual(model, sameModel)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}

object ChiSqSelectorSuite extends SparkFunSuite {

  def createModel(): ChiSqSelectorModel = {
    val arr = Array(1, 2, 3, 4)
    new ChiSqSelectorModel(arr)
  }

  def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = {
    assert(a.selectedFeatures.deep == b.selectedFeatures.deep)
  }
} 
Example 3
Source File: Word2VecSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.Utils

class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext {

  // TODO: add more tests

  test("Word2Vec") {
    val sentence = "a b " * 100 + "a c " * 10
    val localDoc = Seq(sentence, sentence)
    val doc = sc.parallelize(localDoc)
      .map(line => line.split(" ").toSeq)
    val model = new Word2Vec().setVectorSize(10).setSeed(42L).fit(doc)
    val syms = model.findSynonyms("a", 2)
    assert(syms.length == 2)
    assert(syms(0)._1 == "b")
    assert(syms(1)._1 == "c")

    // Test that the model built by Word2Vec (i.e. its wordVectors and wordIndex)
    // and a Word2VecModel constructed from the same vector map give the same values.
    val word2VecMap = model.getVectors
    val newModel = new Word2VecModel(word2VecMap)
    assert(newModel.getVectors.mapValues(_.toSeq) === word2VecMap.mapValues(_.toSeq))
  }

  test("Word2Vec throws exception when vocabulary is empty") {
    intercept[IllegalArgumentException] {
      val sentence = "a b c"
      val localDoc = Seq(sentence, sentence)
      val doc = sc.parallelize(localDoc)
        .map(line => line.split(" ").toSeq)
      new Word2Vec().setMinCount(10).fit(doc)
    }
  }

  test("Word2VecModel") {
    val num = 2
    val word2VecMap = Map(
      ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)),
      ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)),
      ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)),
      ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f))
    )
    val model = new Word2VecModel(word2VecMap)
    val syms = model.findSynonyms("china", num)
    assert(syms.length == num)
    assert(syms(0)._1 == "taiwan")
    assert(syms(1)._1 == "japan")
  }

  test("model load / save") {

    val word2VecMap = Map(
      ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)),
      ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)),
      ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)),
      ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f))
    )
    val model = new Word2VecModel(word2VecMap)

    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString

    try {
      model.save(sc, path)
      val sameModel = Word2VecModel.load(sc, path)
      assert(sameModel.getVectors.mapValues(_.toSeq) === model.getVectors.mapValues(_.toSeq))
    } finally {
      Utils.deleteRecursively(tempDir)
    }

  }
} 
Example 4
Source File: ElementwiseProductSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("elementwise (hadamard) product should properly apply vector to dense data set") {
    val denseData = Array(
      Vectors.dense(1.0, 4.0, 1.9, -9.0)
    )
    val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25)
    val transformer = new ElementwiseProduct(scalingVec)
    val transformedData = transformer.transform(sc.makeRDD(denseData))
    val transformedVecs = transformedData.collect()
    val transformedVec = transformedVecs(0)
    val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25)
    assert(transformedVec ~== expectedVec absTol 1E-5,
      s"Expected transformed vector $expectedVec but found $transformedVec")
  }

  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
    val sparseData = Array(
      Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
    )
    val dataRDD = sc.parallelize(sparseData, 3)
    val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
    val transformer = new ElementwiseProduct(scalingVec)
    val data2 = sparseData.map(transformer.transform)
    val data2RDD = transformer.transform(dataRDD)

    assert((sparseData, data2, data2RDD.collect()).zipped.forall {
      case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after hadamard product")

    assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
    assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
  }
} 
Example 5
Source File: IDFSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors, Vector}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("idf") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    )
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF
    val model = idf.fit(termFrequencies)
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((m + 1.0) / (x + 1.0))
    })
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    }
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    assertHelper(tfidf)
    // Transforms local vectors
    val localTfidf = localTermFrequencies.map(model.transform(_)).toArray
    assertHelper(localTfidf)
  }

  test("idf minimum document frequency filtering") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    )
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF(minDocFreq = 1)
    val model = idf.fit(termFrequencies)
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) {
        math.log((m + 1.0) / (x + 1.0))
      } else {
        0
      }
    })
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    }
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    assertHelper(tfidf)
    // Transforms local vectors
    val localTfidf = localTermFrequencies.map(model.transform(_)).toArray
    assertHelper(localTfidf)
  }

} 
Example 6
Source File: PCASuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.util.MLlibTestSparkContext

class PCASuite extends SparkFunSuite with MLlibTestSparkContext {

  private val data = Array(
    Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
    Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
    Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
  )

  private lazy val dataRDD = sc.parallelize(data, 2)

  test("Correct computing use a PCA wrapper") {
    val k = dataRDD.count().toInt
    val pca = new PCA(k).fit(dataRDD)

    val mat = new RowMatrix(dataRDD)
    val pc = mat.computePrincipalComponents(k)

    val pca_transform = pca.transform(dataRDD).collect()
    val mat_multiply = mat.multiply(pc).rows.collect()

    assert(pca_transform.toSet === mat_multiply.toSet)
  }
} 
Example 7
Source File: HashingTFSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext

class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("hashing tf on a single doc") {
    val hashingTF = new HashingTF(1000)
    val doc = "a a b b c d".split(" ")
    val n = hashingTF.numFeatures
    val termFreqs = Seq(
      (hashingTF.indexOf("a"), 2.0),
      (hashingTF.indexOf("b"), 2.0),
      (hashingTF.indexOf("c"), 1.0),
      (hashingTF.indexOf("d"), 1.0))
    assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n),
      "index must be in range [0, #features)")
    assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing")
    val expected = Vectors.sparse(n, termFreqs)
    assert(hashingTF.transform(doc) === expected)
  }

  test("hashing tf on an RDD") {
    val hashingTF = new HashingTF
    val localDocs: Seq[Seq[String]] = Seq(
      "a a b b b c d".split(" "),
      "a b c d a b c".split(" "),
      "c b a c b a a".split(" "))
    val docs = sc.parallelize(localDocs, 2)
    assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet)
  }
} 
Example 8
Source File: BaggedPointSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.tree.EnsembleTestHelper
import org.apache.spark.mllib.util.MLlibTestSparkContext


class BaggedPointSuite extends SparkFunSuite with MLlibTestSparkContext  {

  test("BaggedPoint RDD: without subsampling") {
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false, 42)
    baggedRDD.collect().foreach { baggedPoint =>
      assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1)
    }
  }

  test("BaggedPoint RDD: with subsampling with replacement (fraction = 1.0)") {
    val numSubsamples = 100
    val (expectedMean, expectedStddev) = (1.0, 1.0)

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling with replacement (fraction = 0.5)") {
    val numSubsamples = 100
    val subsample = 0.5
    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample))

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling without replacement (fraction = 1.0)") {
    val numSubsamples = 100
    val (expectedMean, expectedStddev) = (1.0, 0)

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling without replacement (fraction = 0.5)") {
    val numSubsamples = 100
    val subsample = 0.5
    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample * (1 - subsample)))

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }
} 
Example 9
Source File: MatrixFactorizationModelSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.recommendation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils

class MatrixFactorizationModelSuite extends SparkFunSuite with MLlibTestSparkContext {

  val rank = 2
  var userFeatures: RDD[(Int, Array[Double])] = _
  var prodFeatures: RDD[(Int, Array[Double])] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    userFeatures = sc.parallelize(Seq((0, Array(1.0, 2.0)), (1, Array(3.0, 4.0))))
    prodFeatures = sc.parallelize(Seq((2, Array(5.0, 6.0))))
  }

  test("constructor") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    assert(model.predict(0, 2) ~== 17.0 relTol 1e-14)

    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(1, userFeatures, prodFeatures)
    }

    val userFeatures1 = sc.parallelize(Seq((0, Array(1.0)), (1, Array(3.0))))
    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(rank, userFeatures1, prodFeatures)
    }

    val prodFeatures1 = sc.parallelize(Seq((2, Array(5.0))))
    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(rank, userFeatures, prodFeatures1)
    }
  }

  test("save/load") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString
    def collect(features: RDD[(Int, Array[Double])]): Set[(Int, Seq[Double])] = {
      features.mapValues(_.toSeq).collect().toSet
    }
    try {
      model.save(sc, path)
      val newModel = MatrixFactorizationModel.load(sc, path)
      assert(newModel.rank === rank)
      assert(collect(newModel.userFeatures) === collect(userFeatures))
      assert(collect(newModel.productFeatures) === collect(prodFeatures))
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }

  test("batch predict API recommendProductsForUsers") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val topK = 10
    val recommendations = model.recommendProductsForUsers(topK).collectAsMap()

    assert(recommendations(0)(0).rating ~== 17.0 relTol 1e-14)
    assert(recommendations(1)(0).rating ~== 39.0 relTol 1e-14)
  }

  test("batch predict API recommendUsersForProducts") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val topK = 10
    val recommendations = model.recommendUsersForProducts(topK).collectAsMap()

    assert(recommendations(2)(0).user == 1)
    assert(recommendations(2)(0).rating ~== 39.0 relTol 1e-14)
    assert(recommendations(2)(1).user == 0)
    assert(recommendations(2)(1).rating ~== 17.0 relTol 1e-14)
  }
} 
Example 10
Source File: RankingMetricsSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext

class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("Ranking metrics: map, ndcg") {
    val predictionAndLabels = sc.parallelize(
      Seq(
        (Array[Int](1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Array[Int](1, 2, 3, 4, 5)),
        (Array[Int](4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Array[Int](1, 2, 3)),
        (Array[Int](1, 2, 3, 4, 5), Array[Int]())
      ), 2)
    val eps: Double = 1E-5

    val metrics = new RankingMetrics(predictionAndLabels)
    val map = metrics.meanAveragePrecision

    assert(metrics.precisionAt(1) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(2) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(3) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(4) ~== 0.75/3 absTol eps)
    assert(metrics.precisionAt(5) ~== 0.8/3 absTol eps)
    assert(metrics.precisionAt(10) ~== 0.8/3 absTol eps)
    assert(metrics.precisionAt(15) ~== 8.0/45 absTol eps)

    assert(map ~== 0.355026 absTol eps)

    assert(metrics.ndcgAt(3) ~== 1.0/3 absTol eps)
    assert(metrics.ndcgAt(5) ~== 0.328788 absTol eps)
    assert(metrics.ndcgAt(10) ~== 0.487913 absTol eps)
    assert(metrics.ndcgAt(15) ~== metrics.ndcgAt(10) absTol eps)

  }
} 
Example 11
Source File: DecisionTreeRegressorSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.ml.regression

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.impl.TreeTests
import org.apache.spark.ml.util.MLTestingUtils
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree,
  DecisionTreeSuite => OldDecisionTreeSuite}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame


class DecisionTreeRegressorSuite extends SparkFunSuite with MLlibTestSparkContext {

  import DecisionTreeRegressorSuite.compareAPIs

  private var categoricalDataPointsRDD: RDD[LabeledPoint] = _

  override def beforeAll() {
    super.beforeAll()
    categoricalDataPointsRDD =
      sc.parallelize(OldDecisionTreeSuite.generateCategoricalDataPoints())
  }

  /////////////////////////////////////////////////////////////////////////////
  // Tests calling train()
  /////////////////////////////////////////////////////////////////////////////

  test("Regression stump with 3-ary (ordered) categorical features") {
    val dt = new DecisionTreeRegressor()
      .setImpurity("variance")
      .setMaxDepth(2)
      .setMaxBins(100)
      .setSeed(1)
    val categoricalFeatures = Map(0 -> 3, 1-> 3)
    compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures)
  }

  test("Regression stump with binary (ordered) categorical features") {
    val dt = new DecisionTreeRegressor()
      .setImpurity("variance")
      .setMaxDepth(2)
      .setMaxBins(100)
    val categoricalFeatures = Map(0 -> 2, 1-> 2)
    compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures)
  }

  test("copied model must have the same parent") {
    val categoricalFeatures = Map(0 -> 2, 1-> 2)
    val df = TreeTests.setMetadata(categoricalDataPointsRDD, categoricalFeatures, numClasses = 0)
    val model = new DecisionTreeRegressor()
      .setImpurity("variance")
      .setMaxDepth(2)
      .setMaxBins(8).fit(df)
    MLTestingUtils.checkCopy(model)
  }

  /////////////////////////////////////////////////////////////////////////////
  // Tests of model save/load
  /////////////////////////////////////////////////////////////////////////////

  // TODO: test("model save/load")   SPARK-6725
}

private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite {

  
  def compareAPIs(
      data: RDD[LabeledPoint],
      dt: DecisionTreeRegressor,
      categoricalFeatures: Map[Int, Int]): Unit = {
    val numFeatures = data.first().features.size
    val oldStrategy = dt.getOldStrategy(categoricalFeatures)
    val oldTree = OldDecisionTree.train(data, oldStrategy)
    val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
    val newTree = dt.fit(newData)
    // Use parent from newTree since this is not checked anyways.
    val oldTreeAsNew = DecisionTreeRegressionModel.fromOld(
      oldTree, newTree.parent.asInstanceOf[DecisionTreeRegressor], categoricalFeatures)
    TreeTests.checkEqual(oldTreeAsNew, newTree)
    assert(newTree.numFeatures === numFeatures)
  }
} 
Example 12
Source File: RegressionMetricsSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class RegressionMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("regression metrics for unbiased (includes intercept term) predictor") {
    
    val predictionAndObservations = sc.parallelize(
      Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)), 2)
    val metrics = new RegressionMetrics(predictionAndObservations)
    assert(metrics.explainedVariance ~== 8.85937 absTol 1E-5,
      "explained variance regression score mismatch")
    assert(metrics.meanAbsoluteError ~== 0.5 absTol 1E-5, "mean absolute error mismatch")
    assert(metrics.meanSquaredError ~== 0.375 absTol 1E-5, "mean squared error mismatch")
    assert(metrics.rootMeanSquaredError ~== 0.61237 absTol 1E-5,
      "root mean squared error mismatch")
    assert(metrics.r2 ~== 0.94860 absTol 1E-5, "r2 score mismatch")
  }

  test("regression metrics with complete fitting") {
    val predictionAndObservations = sc.parallelize(
      Seq((3.0, 3.0), (0.0, 0.0), (2.0, 2.0), (8.0, 8.0)), 2)
    val metrics = new RegressionMetrics(predictionAndObservations)
    assert(metrics.explainedVariance ~== 8.6875 absTol 1E-5,
      "explained variance regression score mismatch")
    assert(metrics.meanAbsoluteError ~== 0.0 absTol 1E-5, "mean absolute error mismatch")
    assert(metrics.meanSquaredError ~== 0.0 absTol 1E-5, "mean squared error mismatch")
    assert(metrics.rootMeanSquaredError ~== 0.0 absTol 1E-5,
      "root mean squared error mismatch")
    assert(metrics.r2 ~== 1.0 absTol 1E-5, "r2 score mismatch")
  }
} 
Example 13
Source File: MulticlassMetricsSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.util.MLlibTestSparkContext

class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("Multiclass evaluation metrics") {
    
    val confusionMatrix = Matrices.dense(3, 3, Array(2, 1, 0, 1, 3, 0, 1, 0, 1))
    val labels = Array(0.0, 1.0, 2.0)
    val predictionAndLabels = sc.parallelize(
      Seq((0.0, 0.0), (0.0, 1.0), (0.0, 0.0), (1.0, 0.0), (1.0, 1.0),
        (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)), 2)
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val delta = 0.0000001
    val fpRate0 = 1.0 / (9 - 4)
    val fpRate1 = 1.0 / (9 - 4)
    val fpRate2 = 1.0 / (9 - 1)
    val precision0 = 2.0 / (2 + 1)
    val precision1 = 3.0 / (3 + 1)
    val precision2 = 1.0 / (1 + 1)
    val recall0 = 2.0 / (2 + 2)
    val recall1 = 3.0 / (3 + 1)
    val recall2 = 1.0 / (1 + 0)
    val f1measure0 = 2 * precision0 * recall0 / (precision0 + recall0)
    val f1measure1 = 2 * precision1 * recall1 / (precision1 + recall1)
    val f1measure2 = 2 * precision2 * recall2 / (precision2 + recall2)
    val f2measure0 = (1 + 2 * 2) * precision0 * recall0 / (2 * 2 * precision0 + recall0)
    val f2measure1 = (1 + 2 * 2) * precision1 * recall1 / (2 * 2 * precision1 + recall1)
    val f2measure2 = (1 + 2 * 2) * precision2 * recall2 / (2 * 2 * precision2 + recall2)

    assert(metrics.confusionMatrix.toArray.sameElements(confusionMatrix.toArray))
    assert(math.abs(metrics.falsePositiveRate(0.0) - fpRate0) < delta)
    assert(math.abs(metrics.falsePositiveRate(1.0) - fpRate1) < delta)
    assert(math.abs(metrics.falsePositiveRate(2.0) - fpRate2) < delta)
    assert(math.abs(metrics.precision(0.0) - precision0) < delta)
    assert(math.abs(metrics.precision(1.0) - precision1) < delta)
    assert(math.abs(metrics.precision(2.0) - precision2) < delta)
    assert(math.abs(metrics.recall(0.0) - recall0) < delta)
    assert(math.abs(metrics.recall(1.0) - recall1) < delta)
    assert(math.abs(metrics.recall(2.0) - recall2) < delta)
    assert(math.abs(metrics.fMeasure(0.0) - f1measure0) < delta)
    assert(math.abs(metrics.fMeasure(1.0) - f1measure1) < delta)
    assert(math.abs(metrics.fMeasure(2.0) - f1measure2) < delta)
    assert(math.abs(metrics.fMeasure(0.0, 2.0) - f2measure0) < delta)
    assert(math.abs(metrics.fMeasure(1.0, 2.0) - f2measure1) < delta)
    assert(math.abs(metrics.fMeasure(2.0, 2.0) - f2measure2) < delta)

    assert(math.abs(metrics.recall -
      (2.0 + 3.0 + 1.0) / ((2 + 3 + 1) + (1 + 1 + 1))) < delta)
    assert(math.abs(metrics.recall - metrics.precision) < delta)
    assert(math.abs(metrics.recall - metrics.fMeasure) < delta)
    assert(math.abs(metrics.recall - metrics.weightedRecall) < delta)
    assert(math.abs(metrics.weightedFalsePositiveRate -
      ((4.0 / 9) * fpRate0 + (4.0 / 9) * fpRate1 + (1.0 / 9) * fpRate2)) < delta)
    assert(math.abs(metrics.weightedPrecision -
      ((4.0 / 9) * precision0 + (4.0 / 9) * precision1 + (1.0 / 9) * precision2)) < delta)
    assert(math.abs(metrics.weightedRecall -
      ((4.0 / 9) * recall0 + (4.0 / 9) * recall1 + (1.0 / 9) * recall2)) < delta)
    assert(math.abs(metrics.weightedFMeasure -
      ((4.0 / 9) * f1measure0 + (4.0 / 9) * f1measure1 + (1.0 / 9) * f1measure2)) < delta)
    assert(math.abs(metrics.weightedFMeasure(2.0) -
      ((4.0 / 9) * f2measure0 + (4.0 / 9) * f2measure1 + (1.0 / 9) * f2measure2)) < delta)
    assert(metrics.labels.sameElements(labels))
  }
} 
Example 14
Source File: FPTreeSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.fpm

import scala.language.existentials

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class FPTreeSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("add transaction") {
    val tree = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("b"))

    assert(tree.root.children.size == 2)
    assert(tree.root.children.contains("a"))
    assert(tree.root.children("a").item.equals("a"))
    assert(tree.root.children("a").count == 2)
    assert(tree.root.children.contains("b"))
    assert(tree.root.children("b").item.equals("b"))
    assert(tree.root.children("b").count == 1)
    var child = tree.root.children("a")
    assert(child.children.size == 1)
    assert(child.children.contains("b"))
    assert(child.children("b").item.equals("b"))
    assert(child.children("b").count == 2)
    child = child.children("b")
    assert(child.children.size == 2)
    assert(child.children.contains("c"))
    assert(child.children.contains("y"))
    assert(child.children("c").item.equals("c"))
    assert(child.children("y").item.equals("y"))
    assert(child.children("c").count == 1)
    assert(child.children("y").count == 1)
  }

  test("merge tree") {
    val tree1 = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("b"))

    val tree2 = new FPTree[String]
      .add(Seq("a", "b"))
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "c", "d"))
      .add(Seq("a", "x"))
      .add(Seq("a", "x", "y"))
      .add(Seq("c", "n"))
      .add(Seq("c", "m"))

    val tree3 = tree1.merge(tree2)

    assert(tree3.root.children.size == 3)
    assert(tree3.root.children("a").count == 7)
    assert(tree3.root.children("b").count == 1)
    assert(tree3.root.children("c").count == 2)
    val child1 = tree3.root.children("a")
    assert(child1.children.size == 2)
    assert(child1.children("b").count == 5)
    assert(child1.children("x").count == 2)
    val child2 = child1.children("b")
    assert(child2.children.size == 2)
    assert(child2.children("y").count == 1)
    assert(child2.children("c").count == 3)
    val child3 = child2.children("c")
    assert(child3.children.size == 1)
    assert(child3.children("d").count == 1)
    val child4 = child1.children("x")
    assert(child4.children.size == 1)
    assert(child4.children("y").count == 1)
    val child5 = tree3.root.children("c")
    assert(child5.children.size == 2)
    assert(child5.children("n").count == 1)
    assert(child5.children("m").count == 1)
  }

  test("extract freq itemsets") {
    val tree = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("a", "b"))
      .add(Seq("a"))
      .add(Seq("b"))
      .add(Seq("b", "n"))

    val freqItemsets = tree.extract(3L).map { case (items, count) =>
      (items.toSet, count)
    }.toSet
    val expected = Set(
      (Set("a"), 4L),
      (Set("b"), 5L),
      (Set("a", "b"), 3L))
    assert(freqItemsets === expected)
  }
} 
Example 15
Source File: AssociationRulesSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.fpm

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("association rules using String type") {
    val freqItemsets = sc.parallelize(Seq(
      (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L),
      (Set("r"), 3L),
      (Set("x", "z"), 3L), (Set("t", "y"), 3L), (Set("t", "x"), 3L), (Set("s", "x"), 3L),
      (Set("y", "x"), 3L), (Set("y", "z"), 3L), (Set("t", "z"), 3L),
      (Set("y", "x", "z"), 3L), (Set("t", "x", "z"), 3L), (Set("t", "y", "z"), 3L),
      (Set("t", "y", "x"), 3L),
      (Set("t", "y", "x", "z"), 3L)
    ).map {
      case (items, freq) => new FPGrowth.FreqItemset(items.toArray, freq)
    })

    val ar = new AssociationRules()

    val results1 = ar
      .setMinConfidence(0.9)
      .run(freqItemsets)
      .collect()

    // With a 0.9 confidence threshold, only the rules with confidence 1.0 remain.
    assert(results1.size === 23)
    assert(results1.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)

    // Lowering the threshold to 0.0 returns every generated rule.
    val results2 = ar
      .setMinConfidence(0.0)
      .run(freqItemsets)
      .collect()

    assert(results2.size === 30)
    assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
  }
} 
Example 16
Source File: KernelDensitySuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
} 
Example 17
Source File: MultivariateGaussianSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.stat.distribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{ Vectors, Matrices }
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("univariate") {
    val x1 = Vectors.dense(0.0)
    val x2 = Vectors.dense(1.5)

    val mu = Vectors.dense(0.0)
    val sigma1 = Matrices.dense(1, 1, Array(1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)

    val sigma2 = Matrices.dense(1, 1, Array(4.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
  }

  test("multivariate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)

    val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
  }

  test("multivariate degenerate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
    val dist = new MultivariateGaussian(mu, sigma)
    assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
    assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)
  }

  test("SPARK-11302") {
    val x = Vectors.dense(629, 640, 1.7188, 618.19)
    val mu = Vectors.dense(
      1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697)
    val sigma = Matrices.dense(4, 4, Array(
      166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053,
      169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484,
      12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373,
      164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207))
    val dist = new MultivariateGaussian(mu, sigma)
    // Agrees with R's dmvnorm: 7.154782e-05
    assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9)
  }

} 
Example 18
Source File: CoordinateMatrixSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.linalg.distributed

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.linalg.Vectors

class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {

  val m = 5
  val n = 4
  var mat: CoordinateMatrix = _

  override def beforeAll() {
    super.beforeAll()
    val entries = sc.parallelize(Seq(
      (0, 0, 1.0),
      (0, 1, 2.0),
      (1, 1, 3.0),
      (1, 2, 4.0),
      (2, 2, 5.0),
      (2, 3, 6.0),
      (3, 0, 7.0),
      (3, 3, 8.0),
      (4, 1, 9.0)), 3).map { case (i, j, value) =>
      MatrixEntry(i, j, value)
    }
    mat = new CoordinateMatrix(entries)
  }

  test("size") {
    assert(mat.numRows() === m)
    assert(mat.numCols() === n)
  }

  test("empty entries") {
    val entries = sc.parallelize(Seq[MatrixEntry](), 1)
    val emptyMat = new CoordinateMatrix(entries)
    intercept[RuntimeException] {
      emptyMat.numCols()
    }
    intercept[RuntimeException] {
      emptyMat.numRows()
    }
  }

  test("toBreeze") {
    val expected = BDM(
      (1.0, 2.0, 0.0, 0.0),
      (0.0, 3.0, 4.0, 0.0),
      (0.0, 0.0, 5.0, 6.0),
      (7.0, 0.0, 0.0, 8.0),
      (0.0, 9.0, 0.0, 0.0))
    assert(mat.toBreeze() === expected)
  }

  test("transpose") {
    val transposed = mat.transpose()
    assert(mat.toBreeze().t === transposed.toBreeze())
  }

  test("toIndexedRowMatrix") {
    val indexedRowMatrix = mat.toIndexedRowMatrix()
    val expected = BDM(
      (1.0, 2.0, 0.0, 0.0),
      (0.0, 3.0, 4.0, 0.0),
      (0.0, 0.0, 5.0, 6.0),
      (7.0, 0.0, 0.0, 8.0),
      (0.0, 9.0, 0.0, 0.0))
    assert(indexedRowMatrix.toBreeze() === expected)
  }

  test("toRowMatrix") {
    val rowMatrix = mat.toRowMatrix()
    val rows = rowMatrix.rows.collect().toSet
    val expected = Set(
      Vectors.dense(1.0, 2.0, 0.0, 0.0),
      Vectors.dense(0.0, 3.0, 4.0, 0.0),
      Vectors.dense(0.0, 0.0, 5.0, 6.0),
      Vectors.dense(7.0, 0.0, 0.0, 8.0),
      Vectors.dense(0.0, 9.0, 0.0, 0.0))
    assert(rows === expected)
  }

  test("toBlockMatrix") {
    val blockMat = mat.toBlockMatrix(2, 2)
    assert(blockMat.numRows() === m)
    assert(blockMat.numCols() === n)
    assert(blockMat.toBreeze() === mat.toBreeze())

    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(-1, 2)
    }
    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(2, 0)
    }
  }
} 
Example 19
Source File: MLPairRDDFunctionsSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.rdd.MLPairRDDFunctions._

class MLPairRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("topByKey") {
    val topMap = sc.parallelize(Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7), (5,
      1), (3, 5)), 2)
      .topByKey(5)
      .collectAsMap()

    assert(topMap.size === 3)
    assert(topMap(1) === Array(7, 6, 3, 2, 1))
    assert(topMap(3) === Array(7, 5, 2))
    assert(topMap(5) === Array(1))
  }
} 
Example 20
Source File: RDDFunctionsSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.rdd.RDDFunctions._

class RDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("sliding") {
    val data = 0 until 6
    for (numPartitions <- 1 to 8) {
      val rdd = sc.parallelize(data, numPartitions)
      for (windowSize <- 1 to 6) {
        for (step <- 1 to 3) {
          val sliding = rdd.sliding(windowSize, step).collect().map(_.toList).toList
          val expected = data.sliding(windowSize, step)
            .map(_.toList).toList.filter(l => l.size == windowSize)
          assert(sliding === expected)
        }
      }
      assert(rdd.sliding(7).collect().isEmpty,
        "Should return an empty RDD if the window size is greater than the number of items.")
    }
  }

  test("sliding with empty partitions") {
    val data = Seq(Seq(1, 2, 3), Seq.empty[Int], Seq(4), Seq.empty[Int], Seq(5, 6, 7))
    val rdd = sc.parallelize(data, data.length).flatMap(s => s)
    assert(rdd.partitions.length === data.length)
    val sliding = rdd.sliding(3).collect().toSeq.map(_.toSeq)
    val expected = data.flatMap(x => x).sliding(3).toSeq.map(_.toSeq)
    assert(sliding === expected)
  }
} 
Example 21
Source File: QuantileDiscretizerSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.ml.attribute.{Attribute, NominalAttribute}
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkContext, SparkFunSuite}

class QuantileDiscretizerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.QuantileDiscretizerSuite._

  test("Test quantile discretizer") {
    checkDiscretizedData(sc,
      Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3),
      10,
      Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3),
      Array("-Infinity, 1.0", "1.0, 2.0", "2.0, 3.0", "3.0, Infinity"))

    checkDiscretizedData(sc,
      Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3),
      4,
      Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3),
      Array("-Infinity, 1.0", "1.0, 2.0", "2.0, 3.0", "3.0, Infinity"))

    checkDiscretizedData(sc,
      Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3),
      3,
      Array[Double](0, 1, 2, 2, 2, 2, 2, 2, 2),
      Array("-Infinity, 2.0", "2.0, 3.0", "3.0, Infinity"))

    checkDiscretizedData(sc,
      Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3),
      2,
      Array[Double](0, 1, 1, 1, 1, 1, 1, 1, 1),
      Array("-Infinity, 2.0", "2.0, Infinity"))

  }

  test("Test getting splits") {
    val splitTestPoints = Array(
      Array[Double]() -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity),
      Array(Double.NegativeInfinity) -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity),
      Array(Double.PositiveInfinity) -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity),
      Array(Double.NegativeInfinity, Double.PositiveInfinity)
        -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity),
      Array(0.0) -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity),
      Array(1.0) -> Array(Double.NegativeInfinity, 1, Double.PositiveInfinity),
      Array(0.0, 1.0) -> Array(Double.NegativeInfinity, 0, 1, Double.PositiveInfinity)
    )
    for ((ori, res) <- splitTestPoints) {
      assert(QuantileDiscretizer.getSplits(ori) === res, "Returned splits are invalid.")
    }
  }

  test("read/write") {
    val t = new QuantileDiscretizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setNumBuckets(6)
    testDefaultReadWrite(t)
  }
}

private object QuantileDiscretizerSuite extends SparkFunSuite {

  def checkDiscretizedData(
      sc: SparkContext,
      data: Array[Double],
      numBucket: Int,
      expectedResult: Array[Double],
      expectedAttrs: Array[String]): Unit = {
    val sqlCtx = SQLContext.getOrCreate(sc)
    import sqlCtx.implicits._

    val df = sc.parallelize(data.map(Tuple1.apply)).toDF("input")
    val discretizer = new QuantileDiscretizer().setInputCol("input").setOutputCol("result")
      .setNumBuckets(numBucket)
    val result = discretizer.fit(df).transform(df)

    val transformedFeatures = result.select("result").collect()
      .map { case Row(transformedFeature: Double) => transformedFeature }
    val transformedAttrs = Attribute.fromStructField(result.schema("result"))
      .asInstanceOf[NominalAttribute].values.get

    assert(transformedFeatures === expectedResult,
      "Transformed features do not equal expected features.")
    assert(transformedAttrs === expectedAttrs,
      "Transformed attributes do not equal expected attributes.")
  }
} 
Example 22
Source File: BinarizerSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  @transient var data: Array[Double] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4)
  }

  test("params") {
    ParamsSuite.checkParams(new Binarizer)
  }

  test("Binarize continuous features with default parameter") {
    val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0)
    val dataFrame: DataFrame = sqlContext.createDataFrame(
      data.zip(defaultBinarized)).toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Double, y: Double) =>
        assert(x === y, "The feature value is not correct after binarization.")
    }
  }

  test("Binarize continuous features with setter") {
    val threshold: Double = 0.2
    val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0)
    val dataFrame: DataFrame = sqlContext.createDataFrame(
        data.zip(thresholdBinarized)).toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(threshold)

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Double, y: Double) =>
        assert(x === y, "The feature value is not correct after binarization.")
    }
  }

  test("read/write") {
    val t = new Binarizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setThreshold(0.1)
    testDefaultReadWrite(t)
  }
} 
Example 23
Source File: SQLTransformerSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext

class SQLTransformerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new SQLTransformer())
  }

  test("transform numeric data") {
    val original = sqlContext.createDataFrame(
      Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")
    val sqlTrans = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    val result = sqlTrans.transform(original)
    val resultSchema = sqlTrans.transformSchema(original.schema)
    val expected = sqlContext.createDataFrame(
      Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0)))
      .toDF("id", "v1", "v2", "v3", "v4")
    assert(result.schema.toString == resultSchema.toString)
    assert(resultSchema == expected.schema)
    assert(result.collect().toSeq == expected.collect().toSeq)
  }

  test("read/write") {
    val t = new SQLTransformer()
      .setStatement("select * from __THIS__")
    testDefaultReadWrite(t)
  }
} 
Example 24
Source File: TokenizerSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new Tokenizer)
  }

  test("read/write") {
    val t = new Tokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }
}

class RegexTokenizerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.RegexTokenizerSuite._

  test("params") {
    ParamsSuite.checkParams(new RegexTokenizer)
  }

  test("RegexTokenizer") {
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset0 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))
    ))
    testRegexTokenizer(tokenizer0, dataset0)

    val dataset1 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ))
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")),
      TokenizerTestData("Te,st.  punct", Array("te,st.", "punct"))
    ))
    testRegexTokenizer(tokenizer2, dataset2)
  }

  test("RegexTokenizer with toLowercase false") {
    val tokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setToLowercase(false)
    val dataset = sqlContext.createDataFrame(Seq(
      TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")),
      TokenizerTestData("java scala", Array("java", "scala"))
    ))
    testRegexTokenizer(tokenizer, dataset)
  }

  test("read/write") {
    val t = new RegexTokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMinTokenLength(2)
      .setGaps(false)
      .setPattern("hi")
      .setToLowercase(false)
    testDefaultReadWrite(t)
  }
}

object RegexTokenizerSuite extends SparkFunSuite {

  def testRegexTokenizer(t: RegexTokenizer, dataset: DataFrame): Unit = {
    t.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
} 
Example 25
Source File: MinMaxScalerSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SQLContext}

class MinMaxScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("MinMaxScaler fit basic case") {
    val sqlContext = new SQLContext(sc)

    val data = Array(
      Vectors.dense(1, 0, Long.MinValue),
      Vectors.dense(2, 0, 0),
      Vectors.sparse(3, Array(0, 2), Array(3, Long.MaxValue)),
      Vectors.sparse(3, Array(0), Array(1.5)))

    val expected: Array[Vector] = Array(
      Vectors.dense(-5, 0, -5),
      Vectors.dense(0, 0, 0),
      Vectors.sparse(3, Array(0, 2), Array(5, 5)),
      Vectors.sparse(3, Array(0), Array(-2.5)))

    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")
    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setMin(-5)
      .setMax(5)

    val model = scaler.fit(df)
    model.transform(df).select("expected", "scaled").collect()
      .foreach { case Row(vector1: Vector, vector2: Vector) =>
        assert(vector1.equals(vector2), "Transformed vector is different with expected.")
    }

    // copied model must have the same parent.
    MLTestingUtils.checkCopy(model)
  }

  test("MinMaxScaler arguments max must be larger than min") {
    withClue("arguments max must be larger than min") {
      intercept[IllegalArgumentException] {
        val scaler = new MinMaxScaler().setMin(10).setMax(0)
        scaler.validateParams()
      }
      intercept[IllegalArgumentException] {
        val scaler = new MinMaxScaler().setMin(0).setMax(0)
        scaler.validateParams()
      }
    }
  }

  test("MinMaxScaler read/write") {
    val t = new MinMaxScaler()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMax(1.0)
      .setMin(-1.0)
    testDefaultReadWrite(t)
  }

  test("MinMaxScalerModel read/write") {
    val instance = new MinMaxScalerModel(
        "myMinMaxScalerModel", Vectors.dense(-1.0, 0.0), Vectors.dense(1.0, 10.0))
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMin(-1.0)
      .setMax(1.0)
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.originalMin === instance.originalMin)
    assert(newInstance.originalMax === instance.originalMax)
  }
} 
Example 26
Source File: PolynomialExpansionSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.ml.param.ParamsSuite
import org.scalatest.exceptions.TestFailedException

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class PolynomialExpansionSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new PolynomialExpansion)
  }

  test("Polynomial expansion with default parameter") {
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())
    )

    val twoDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)),
      Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29),
      Vectors.dense(new Array[Double](9)),
      Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0),
      Vectors.sparse(9, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")

    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
    }
  }

  test("Polynomial expansion with setter") {
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())
    )

    val threeDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8),
        Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)),
      Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17),
      Vectors.dense(new Array[Double](19)),
      Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8,
        -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0),
      Vectors.sparse(19, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)

    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
    }
  }

  test("read/write") {
    val t = new PolynomialExpansion()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setDegree(3)
    testDefaultReadWrite(t)
  }
} 
Example 27
Source File: IDFSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class IDFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = {
    dataSet.map {
      case data: DenseVector =>
        val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y }
        Vectors.dense(res)
      case data: SparseVector =>
        val res = data.indices.zip(data.values).map { case (id, value) =>
          (id, value * model(id))
        }
        Vectors.sparse(data.size, res)
    }
  }

  test("params") {
    ParamsSuite.checkParams(new IDF)
    val model = new IDFModel("idf", new OldIDFModel(Vectors.dense(1.0)))
    ParamsSuite.checkParams(model)
  }

  test("compute IDF with default parameter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    )
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((numOfData + 1.0) / (x + 1.0))
    })
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")

    val idfModel = new IDF()
      .setInputCol("features")
      .setOutputCol("idfValue")
      .fit(df)

    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }

  test("compute IDF with setter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    )
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0
    })
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")

    val idfModel = new IDF()
      .setInputCol("features")
      .setOutputCol("idfValue")
      .setMinDocFreq(1)
      .fit(df)

    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }

  test("IDF read/write") {
    val t = new IDF()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMinDocFreq(5)
    testDefaultReadWrite(t)
  }

  test("IDFModel read/write") {
    val instance = new IDFModel("myIDFModel", new OldIDFModel(Vectors.dense(1.0, 2.0)))
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.idf === instance.idf)
  }
} 
Example 28
Source File: NGramSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])

class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
  import org.apache.spark.ml.feature.NGramSuite._

  test("default behavior yields bigram features") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
    val dataset = sqlContext.createDataFrame(Seq(
      NGramTestData(
        Array("Test", "for", "ngram", "."),
        Array("Test for", "for ngram", "ngram .")
    )))
    testNGram(nGram, dataset)
  }

  test("NGramLength=4 yields length 4 n-grams") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = sqlContext.createDataFrame(Seq(
      NGramTestData(
        Array("a", "b", "c", "d", "e"),
        Array("a b c d", "b c d e")
      )))
    testNGram(nGram, dataset)
  }

  test("empty input yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = sqlContext.createDataFrame(Seq(
      NGramTestData(
        Array(),
        Array()
      )))
    testNGram(nGram, dataset)
  }

  test("input array < n yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(6)
    val dataset = sqlContext.createDataFrame(Seq(
      NGramTestData(
        Array("a", "b", "c", "d", "e"),
        Array()
      )))
    testNGram(nGram, dataset)
  }

  test("read/write") {
    val t = new NGram()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setN(3)
    testDefaultReadWrite(t)
  }
}

object NGramSuite extends SparkFunSuite {

  def testNGram(t: NGram, dataset: DataFrame): Unit = {
    t.transform(dataset)
      .select("nGrams", "wantedNGrams")
      .collect()
      .foreach { case Row(actualNGrams, wantedNGrams) =>
        assert(actualNGrams === wantedNGrams)
      }
  }
} 
Example 29
Source File: PCASuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.mllib.feature.{PCAModel => OldPCAModel}
import org.apache.spark.sql.Row

class PCASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new PCA)
    val mat = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix]
    val model = new PCAModel("pca", mat)
    ParamsSuite.checkParams(model)
  }

  test("pca") {
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )

    val dataRDD = sc.parallelize(data, 2)

    val mat = new RowMatrix(dataRDD)
    val pc = mat.computePrincipalComponents(3)
    val expected = mat.multiply(pc).rows

    val df = sqlContext.createDataFrame(dataRDD.zip(expected)).toDF("features", "expected")

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pca_features")
      .setK(3)
      .fit(df)

    // copied model must have the same parent.
    MLTestingUtils.checkCopy(pca)

    pca.transform(df).select("pca_features", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }

  test("PCA read/write") {
    val t = new PCA()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setK(3)
    testDefaultReadWrite(t)
  }

  test("PCAModel read/write") {
    val instance = new PCAModel("myPCAModel",
      Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix])
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.pc === instance.pc)
  }
} 
Example 30
Source File: HashingTFSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.Utils

class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new HashingTF)
  }

  test("hashingTF") {
    val df = sqlContext.createDataFrame(Seq(
      (0, "a a b b c d".split(" ").toSeq)
    )).toDF("id", "words")
    val n = 100
    val hashingTF = new HashingTF()
      .setInputCol("words")
      .setOutputCol("features")
      .setNumFeatures(n)
    val output = hashingTF.transform(df)
    val attrGroup = AttributeGroup.fromStructField(output.schema("features"))
    require(attrGroup.numAttributes === Some(n))
    val features = output.select("features").first().getAs[Vector](0)
    // Assume perfect hash on "a", "b", "c", and "d".
    def idx(any: Any): Int = Utils.nonNegativeMod(any.##, n)
    val expected = Vectors.sparse(n,
      Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0)))
    assert(features ~== expected absTol 1e-14)
  }

  test("read/write") {
    val t = new HashingTF()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setNumFeatures(10)
    testDefaultReadWrite(t)
  }
} 
Example 31
Source File: StopWordsRemoverSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

object StopWordsRemoverSuite extends SparkFunSuite {
  def testStopWordsRemover(t: StopWordsRemover, dataset: DataFrame): Unit = {
    t.transform(dataset)
      .select("filtered", "expected")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
    }
  }
}

class StopWordsRemoverSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import StopWordsRemoverSuite._

  test("StopWordsRemover default") {
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")
    val dataSet = sqlContext.createDataFrame(Seq(
      (Seq("test", "test"), Seq("test", "test")),
      (Seq("a", "b", "c", "d"), Seq("b", "c", "d")),
      (Seq("a", "the", "an"), Seq()),
      (Seq("A", "The", "AN"), Seq()),
      (Seq(null), Seq(null)),
      (Seq(), Seq())
    )).toDF("raw", "expected")

    testStopWordsRemover(remover, dataSet)
  }

  test("StopWordsRemover case sensitive") {
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")
      .setCaseSensitive(true)
    val dataSet = sqlContext.createDataFrame(Seq(
      (Seq("A"), Seq("A")),
      (Seq("The", "the"), Seq("The"))
    )).toDF("raw", "expected")

    testStopWordsRemover(remover, dataSet)
  }

  test("StopWordsRemover with additional words") {
    val stopWords = StopWords.English ++ Array("python", "scala")
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")
      .setStopWords(stopWords)
    val dataSet = sqlContext.createDataFrame(Seq(
      (Seq("python", "scala", "a"), Seq()),
      (Seq("Python", "Scala", "swift"), Seq("swift"))
    )).toDF("raw", "expected")

    testStopWordsRemover(remover, dataSet)
  }

  test("read/write") {
    val t = new StopWordsRemover()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setStopWords(Array("the", "a"))
      .setCaseSensitive(true)
    testDefaultReadWrite(t)
  }

  test("StopWordsRemover output column already exists") {
    val outputCol = "expected"
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol(outputCol)
    val dataSet = sqlContext.createDataFrame(Seq(
      (Seq("The", "the", "swift"), Seq("swift"))
    )).toDF("raw", outputCol)

    val thrown = intercept[IllegalArgumentException] {
      testStopWordsRemover(remover, dataSet)
    }
    assert(thrown.getMessage == s"requirement failed: Column $outputCol already exists.")
  }
} 
Example 32
Source File: RandomForestSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.impl.TreeTests
import org.apache.spark.ml.tree.{ContinuousSplit, DecisionTreeModel, LeafNode, Node}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.tree.impurity.GiniCalculator
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.collection.OpenHashMap


// NOTE: the original listing elides this suite's scaladoc, class declaration, and the
// opening of this test; the header below is a minimal reconstruction consistent with
// the code that follows (the original test name is not shown in the listing).
class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext {

  import RandomForestSuite.mapToVec

  test("feature importance on a toy forest") {
    val leftImp = new GiniCalculator(Array(3.0, 2.0, 1.0))
    val left = new LeafNode(0.0, leftImp.calculate(), leftImp)

    val rightImp = new GiniCalculator(Array(1.0, 2.0, 5.0))
    val right = new LeafNode(2.0, rightImp.calculate(), rightImp)

    val parent = TreeTests.buildParentNode(left, right, new ContinuousSplit(0, 0.5))
    val parentImp = parent.impurityStats

    val left2Imp = new GiniCalculator(Array(1.0, 6.0, 1.0))
    val left2 = new LeafNode(0.0, left2Imp.calculate(), left2Imp)

    val grandParent = TreeTests.buildParentNode(left2, parent, new ContinuousSplit(1, 1.0))
    val grandImp = grandParent.impurityStats

    // Test feature importance computed at different subtrees.
    def testNode(node: Node, expected: Map[Int, Double]): Unit = {
      val map = new OpenHashMap[Int, Double]()
      RandomForest.computeFeatureImportance(node, map)
      assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01)
    }

    // Leaf node
    testNode(left, Map.empty[Int, Double])

    // Internal node with 2 leaf children
    val feature0importance = parentImp.calculate() * parentImp.count -
      (leftImp.calculate() * leftImp.count + rightImp.calculate() * rightImp.count)
    testNode(parent, Map(0 -> feature0importance))

    // Full tree
    val feature1importance = grandImp.calculate() * grandImp.count -
      (left2Imp.calculate() * left2Imp.count + parentImp.calculate() * parentImp.count)
    testNode(grandParent, Map(0 -> feature0importance, 1 -> feature1importance))

    // Forest consisting of (full tree) + (internal node with 2 leafs)
    val trees = Array(parent, grandParent).map { root =>
      new DecisionTreeClassificationModel(root, numFeatures = 2, numClasses = 3)
        .asInstanceOf[DecisionTreeModel]
    }
    val importances: Vector = RandomForest.featureImportances(trees, 2)
    val tree2norm = feature0importance + feature1importance
    val expected = Vectors.dense((1.0 + feature0importance / tree2norm) / 2.0,
      (feature1importance / tree2norm) / 2.0)
    assert(importances ~== expected relTol 0.01)
  }

  test("normalizeMapValues") {
    val map = new OpenHashMap[Int, Double]()
    map(0) = 1.0
    map(2) = 2.0
    RandomForest.normalizeMapValues(map)
    val expected = Map(0 -> 1.0 / 3.0, 2 -> 2.0 / 3.0)
    assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01)
  }

}

private object RandomForestSuite {

  def mapToVec(map: Map[Int, Double]): Vector = {
    val size = (map.keys.toSeq :+ 0).max + 1
    val (indices, values) = map.toSeq.sortBy(_._1).unzip
    Vectors.sparse(size, indices.toArray, values.toArray)
  }
} 
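A note on the arithmetic above: the importance credited to a split's feature at one internal node is the count-weighted impurity decrease, parentImpurity * parentCount - (leftImpurity * leftCount + rightImpurity * rightCount), which is exactly what feature0importance and feature1importance compute. Each tree's importance vector is then normalized to sum to one (normalizeMapValues) before being averaged across the trees of the forest, which is why the expected forest-level vector is ((1.0 + feature0importance / tree2norm) / 2.0, (feature1importance / tree2norm) / 2.0).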
Example 33
Source File: BinaryClassificationEvaluatorSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext

class BinaryClassificationEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new BinaryClassificationEvaluator)
  }

  test("read/write") {
    val evaluator = new BinaryClassificationEvaluator()
      .setRawPredictionCol("myRawPrediction")
      .setLabelCol("myLabel")
      .setMetricName("areaUnderPR")
    testDefaultReadWrite(evaluator)
  }
} 
Example 34
Source File: MulticlassClassificationEvaluatorSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext

class MulticlassClassificationEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new MulticlassClassificationEvaluator)
  }

  test("read/write") {
    val evaluator = new MulticlassClassificationEvaluator()
      .setPredictionCol("myPrediction")
      .setLabelCol("myLabel")
      .setMetricName("recall")
    testDefaultReadWrite(evaluator)
  }
} 
Example 35
Source File: RegressionEvaluatorSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {
    
    // NOTE: the construction of `dataset` (a DataFrame of "label"/"features" rows built
    // from LinearDataGenerator output) is elided in the original listing; a sketch is
    // given after this example.
    val trainer = new LinearRegression
    val model = trainer.fit(dataset)
    val predictions = model.transform(dataset)

    // default = rmse
    val evaluator = new RegressionEvaluator()
    assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001)

    // r2 score
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)

    // mae
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001)
  }

  test("read/write") {
    val evaluator = new RegressionEvaluator()
      .setPredictionCol("myPrediction")
      .setLabelCol("myLabel")
      .setMetricName("r2")
    testDefaultReadWrite(evaluator)
  }
} 
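The elided `dataset` above is generated with LinearDataGenerator, which is imported at the top of the example. A plausible sketch, with illustrative parameter values (intercept, weights, feature means, feature variances, number of points, seed, noise level):

    // Sketch only: generate ~100 LabeledPoints from a linear model plus noise and
    // wrap them in a DataFrame; createDataFrame on RDD[LabeledPoint] yields the
    // "label" and "features" columns expected by LinearRegression above.
    val dataset = sqlContext.createDataFrame(
      sc.parallelize(LinearDataGenerator.generateLinearInput(
        6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2))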
Example 36
Source File: LibSVMRelationSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.source.libsvm

import java.io.File

import com.google.common.base.Charsets
import com.google.common.io.Files

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.util.Utils

class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  var tempDir: File = _
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    tempDir = Utils.createTempDir()
    val file = new File(tempDir, "part-00000")
    Files.write(lines, file, Charsets.US_ASCII)
    path = tempDir.toURI.toString
  }

  override def afterAll(): Unit = {
    Utils.deleteRecursively(tempDir)
    super.afterAll()
  }

  test("select as sparse vector") {
    val df = sqlContext.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = sqlContext.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = sqlContext.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }
} 
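For reference, each non-empty line written in beforeAll is in LIBSVM format, `<label> <index>:<value> ...`, where indices are 1-based; that is why `1 1:1.0 3:2.0 5:3.0` comes back as the sparse vector with 0-based indices 0, 2, and 4 in the assertions above.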
Example 37
Source File: MultilayerPerceptronClassifierSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.classification

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.LogisticRegressionSuite._
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class MultilayerPerceptronClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("XOR function learning as binary classification problem with two outputs.") {
    val dataFrame = sqlContext.createDataFrame(Seq(
        (Vectors.dense(0.0, 0.0), 0.0),
        (Vectors.dense(0.0, 1.0), 1.0),
        (Vectors.dense(1.0, 0.0), 1.0),
        (Vectors.dense(1.0, 1.0), 0.0))
    ).toDF("features", "label")
    val layers = Array[Int](2, 5, 2)
    val trainer = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(1)
      .setSeed(11L)
      .setMaxIter(100)
    val model = trainer.fit(dataFrame)
    val result = model.transform(dataFrame)
    val predictionAndLabels = result.select("prediction", "label").collect()
    predictionAndLabels.foreach { case Row(p: Double, l: Double) =>
      assert(p == l)
    }
  }

  // TODO: implement a more rigorous test
  test("3 class classification with 2 hidden layers") {
    val nPoints = 1000

    // The following coefficients are taken from OneVsRestSuite.scala
    // they represent 3-class iris dataset
    val coefficients = Array(
      -0.57997, 0.912083, -0.371077, -0.819866, 2.688191,
      -0.16624, -0.84355, -0.048509, -0.301789, 4.170682)

    val xMean = Array(5.843, 3.057, 3.758, 1.199)
    val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
    // the input seed is somewhat magic, to make this test pass
    val rdd = sc.parallelize(generateMultinomialLogisticInput(
      coefficients, xMean, xVariance, true, nPoints, 1), 2)
    val dataFrame = sqlContext.createDataFrame(rdd).toDF("label", "features")
    val numClasses = 3
    val numIterations = 100
    val layers = Array[Int](4, 5, 4, numClasses)
    val trainer = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(1)
      .setSeed(11L) // currently this seed is ignored
      .setMaxIter(numIterations)
    val model = trainer.fit(dataFrame)
    val numFeatures = dataFrame.select("features").first().getAs[Vector](0).size
    assert(model.numFeatures === numFeatures)
    val mlpPredictionAndLabels = model.transform(dataFrame).select("prediction", "label")
      .map { case Row(p: Double, l: Double) => (p, l) }
    // train multinomial logistic regression
    val lr = new LogisticRegressionWithLBFGS()
      .setIntercept(true)
      .setNumClasses(numClasses)
    lr.optimizer.setRegParam(0.0)
      .setNumIterations(numIterations)
    val lrModel = lr.run(rdd)
    val lrPredictionAndLabels = lrModel.predict(rdd.map(_.features)).zip(rdd.map(_.label))
    // MLP's predictions should not differ a lot from LR's.
    val lrMetrics = new MulticlassMetrics(lrPredictionAndLabels)
    val mlpMetrics = new MulticlassMetrics(mlpPredictionAndLabels)
    assert(mlpMetrics.confusionMatrix ~== lrMetrics.confusionMatrix absTol 100)
  }
} 
Example 38
Source File: StopwatchSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.util

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    assert(sw.name === "sw")
    assert(sw.elapsed() === 0L)
    assert(!sw.isRunning)
    intercept[AssertionError] {
      sw.stop()
    }
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    sw.start()
    assert(sw.isRunning)
    intercept[AssertionError] {
      sw.start()
    }
  }

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.accumulator(0L)
    rdd.foreach { i =>
      acc += checkStopwatch(sw)
    }
    assert(!sw.isRunning)
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)
  }

  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
      .addLocal("local")
      .addDistributed("spark")
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
      sw("some")
    }
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.accumulator(0L)
    rdd.foreach { i =>
      sw("local").start()
      val duration = checkStopwatch(sw("spark"))
      sw("local").stop()
      acc += duration
    }
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)
  }
}

private object StopwatchSuite extends SparkFunSuite {

  
  private def now: Long = System.currentTimeMillis()
} 
Example 39
Source File: WeightedLeastSquaresSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.optim

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.Instance
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD

class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext {

  private var instances: RDD[Instance] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    

    // NOTE: the original listing elides the initialization of `instances` (the training
    // data) and the accompanying R reference output here, as well as the header of the
    // test that follows; the structure below is a minimal reconstruction.
  }

  test("WLS over fitIntercept / regParam / standardizeFeatures combinations") {
    val expected = Seq(
      Vectors.dense(0.0, -3.727117, 3.009982),
      Vectors.dense(0.0, -3.727117, 3.009982),
      Vectors.dense(0.0, -3.307532, 2.924206),
      Vectors.dense(0.0, -2.914790, 2.840627),
      Vectors.dense(0.0, -1.526575, 2.558158),
      Vectors.dense(0.0, 0.06984238, 2.20488344),
      Vectors.dense(18.0799727, 6.0799832, -0.5999941),
      Vectors.dense(18.0799727, 6.0799832, -0.5999941),
      Vectors.dense(13.5356178, 3.2714044, 0.3770744),
      Vectors.dense(14.064629, 3.565802, 0.269593),
      Vectors.dense(10.1238013, 0.9708569, 1.1475466),
      Vectors.dense(13.1860638, 2.1761382, 0.6213134))

    var idx = 0
    for (fitIntercept <- Seq(false, true);
         regParam <- Seq(0.0, 0.1, 1.0);
         standardizeFeatures <- Seq(false, true)) {
      val wls = new WeightedLeastSquares(
        fitIntercept, regParam, standardizeFeatures, standardizeLabel = true)
        .fit(instances)
      val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1))
      assert(actual ~== expected(idx) absTol 1e-4)
      idx += 1
    }
  }
} 
Example 40
Source File: ANNSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.ann

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._


class ANNSuite extends SparkFunSuite with MLlibTestSparkContext {

  // TODO: test for weights comparison with Weka MLP
  test("ANN with Sigmoid learns XOR function with LBFGS optimizer") {
    val inputs = Array(
      Array(0.0, 0.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0),
      Array(1.0, 1.0)
    )
    val outputs = Array(0.0, 1.0, 1.0, 0.0)
    val data = inputs.zip(outputs).map { case (features, label) =>
      (Vectors.dense(features), Vectors.dense(label))
    }
    val rddData = sc.parallelize(data, 1)
    val hiddenLayersTopology = Array(5)
    val dataSample = rddData.first()
    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
    val initialWeights = FeedForwardModel(topology, 23124).weights()
    val trainer = new FeedForwardTrainer(topology, 2, 1)
    trainer.setWeights(initialWeights)
    trainer.LBFGSOptimizer.setNumIterations(20)
    val model = trainer.train(rddData)
    val predictionAndLabels = rddData.map { case (input, label) =>
      (model.predict(input)(0), label(0))
    }.collect()
    predictionAndLabels.foreach { case (p, l) =>
      assert(math.round(p) === l)
    }
  }

  test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") {
    val inputs = Array(
      Array(0.0, 0.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0),
      Array(1.0, 1.0)
    )
    val outputs = Array(
      Array(1.0, 0.0),
      Array(0.0, 1.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0)
    )
    val data = inputs.zip(outputs).map { case (features, label) =>
      (Vectors.dense(features), Vectors.dense(label))
    }
    val rddData = sc.parallelize(data, 1)
    val hiddenLayersTopology = Array(5)
    val dataSample = rddData.first()
    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
    val initialWeights = FeedForwardModel(topology, 23124).weights()
    val trainer = new FeedForwardTrainer(topology, 2, 2)
    trainer.SGDOptimizer.setNumIterations(2000)
    trainer.setWeights(initialWeights)
    val model = trainer.train(rddData)
    val predictionAndLabels = rddData.map { case (input, label) =>
      (model.predict(input), label)
    }.collect()
    predictionAndLabels.foreach { case (p, l) =>
      assert(p ~== l absTol 0.5)
    }
  }
} 
Example 41
Source File: MulticlassClassificationEvaluatorSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class MulticlassClassificationEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new MulticlassClassificationEvaluator)
  }

  test("read/write") {
    val evaluator = new MulticlassClassificationEvaluator()
      .setPredictionCol("myPrediction")
      .setLabelCol("myLabel")
      .setMetricName("accuracy")
    testDefaultReadWrite(evaluator)
  }

  test("should support all NumericType labels and not support other types") {
    MLTestingUtils.checkNumericTypes(new MulticlassClassificationEvaluator, spark)
  }
} 
Example 42
Source File: RegressionEvaluatorSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {
    
    // NOTE: as in Example 35, the construction of `dataset` from LinearDataGenerator
    // output is elided in the original listing.
    val trainer = new LinearRegression
    val model = trainer.fit(dataset)
    val predictions = model.transform(dataset)

    // default = rmse
    val evaluator = new RegressionEvaluator()
    assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01)

    // r2 score
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01)

    // mae
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01)
  }

  test("read/write") {
    val evaluator = new RegressionEvaluator()
      .setPredictionCol("myPrediction")
      .setLabelCol("myLabel")
      .setMetricName("r2")
    testDefaultReadWrite(evaluator)
  }

  test("should support all NumericType labels and not support other types") {
    MLTestingUtils.checkNumericTypes(new RegressionEvaluator, spark)
  }
} 
Example 43
Source File: RWrapperUtilsSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.r

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.{RFormula, RFormulaModel}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class RWrapperUtilsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("avoid libsvm data column name conflicting") {
    val rFormula = new RFormula().setFormula("label ~ features")
    val data = spark.read.format("libsvm").load("../data/mllib/sample_libsvm_data.txt")

    // if not checking column name, then IllegalArgumentException
    intercept[IllegalArgumentException] {
      rFormula.fit(data)
    }

    // after checking, model build is ok
    RWrapperUtils.checkDataColumns(rFormula, data)

    assert(rFormula.getLabelCol == "label")
    assert(rFormula.getFeaturesCol.startsWith("features_"))

    val model = rFormula.fit(data)
    assert(model.isInstanceOf[RFormulaModel])

    assert(model.getLabelCol == "label")
    assert(model.getFeaturesCol.startsWith("features_"))
  }

} 
Example 44
Source File: PredictorSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext {

  import PredictorSuite._

  test("should support all NumericType labels and not support other types") {
    val df = spark.createDataFrame(Seq(
      (0, Vectors.dense(0, 2, 3)),
      (1, Vectors.dense(0, 3, 9)),
      (0, Vectors.dense(0, 2, 6))
    )).toDF("label", "features")

    val types =
      Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0))

    val predictor = new MockPredictor()

    types.foreach { t =>
      predictor.fit(df.select(col("label").cast(t), col("features")))
    }

    intercept[IllegalArgumentException] {
      predictor.fit(df.select(col("label").cast(StringType), col("features")))
    }
  }
}

object PredictorSuite {

  class MockPredictor(override val uid: String)
    extends Predictor[Vector, MockPredictor, MockPredictionModel] {

    def this() = this(Identifiable.randomUID("mockpredictor"))

    override def train(dataset: Dataset[_]): MockPredictionModel = {
      require(dataset.schema("label").dataType == DoubleType)
      new MockPredictionModel(uid)
    }

    override def copy(extra: ParamMap): MockPredictor =
      throw new NotImplementedError()
  }

  class MockPredictionModel(override val uid: String)
    extends PredictionModel[Vector, MockPredictionModel] {

    def this() = this(Identifiable.randomUID("mockpredictormodel"))

    override def predict(features: Vector): Double =
      throw new NotImplementedError()

    override def copy(extra: ParamMap): MockPredictionModel =
      throw new NotImplementedError()
  }
} 
Example 45
Source File: LibSVMRelationSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.source.libsvm

import java.io.File
import java.nio.charset.StandardCharsets

import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.util.Utils


class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Path for dataset
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data")
    val file = new File(dir, "part-00000")
    Files.write(lines, file, StandardCharsets.UTF_8)
    path = dir.toURI.toString
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(new File(path))
    } finally {
      super.afterAll()
    }
  }

  test("select as sparse vector") {
    val df = spark.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = spark.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data and read it again") {
    val df = spark.read.format("libsvm").load(path)
    val tempDir2 = new File(tempDir, "read_write_test")
    val writepath = tempDir2.toURI.toString
    // TODO: Remove requirement to coalesce by supporting multiple reads.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath)

    val df2 = spark.read.format("libsvm").load(writepath)
    val row1 = df2.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data failed due to invalid schema") {
    val df = spark.read.format("text").load(path)
    intercept[SparkException] {
      df.write.format("libsvm").save(path + "_2")
    }
  }

  test("select features from libsvm relation") {
    val df = spark.read.format("libsvm").load(path)
    df.select("features").rdd.map { case Row(d: Vector) => d }.first
    df.select("features").collect
  }
} 
Example 46
Source File: StopwatchSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.util

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    assert(sw.name === "sw")
    assert(sw.elapsed() === 0L)
    assert(!sw.isRunning)
    intercept[AssertionError] {
      sw.stop()
    }
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    sw.start()
    assert(sw.isRunning)
    intercept[AssertionError] {
      sw.start()
    }
  }

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      acc.add(checkStopwatch(sw))
    }
    assert(!sw.isRunning)
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)
  }

  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
      .addLocal("local")
      .addDistributed("spark")
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
      sw("some")
    }
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      sw("local").start()
      val duration = checkStopwatch(sw("spark"))
      sw("local").stop()
      acc.add(duration)
    }
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)
  }
}

private object StopwatchSuite extends SparkFunSuite {

  
  // NOTE: the checkStopwatch helper used above is elided in this listing as well; see
  // the sketch in Example 38.
  private def now: Long = System.currentTimeMillis()
} 
Example 47
Source File: ANNSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.ann

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ANNSuite extends SparkFunSuite with MLlibTestSparkContext {

  // TODO: test for weights comparison with Weka MLP
  test("ANN with Sigmoid learns XOR function with LBFGS optimizer") {
    val inputs = Array(
      Array(0.0, 0.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0),
      Array(1.0, 1.0)
    )
    val outputs = Array(0.0, 1.0, 1.0, 0.0)
    val data = inputs.zip(outputs).map { case (features, label) =>
      (Vectors.dense(features), Vectors.dense(label))
    }
    val rddData = sc.parallelize(data, 1)
    val hiddenLayersTopology = Array(5)
    val dataSample = rddData.first()
    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
    val initialWeights = FeedForwardModel(topology, 23124).weights
    val trainer = new FeedForwardTrainer(topology, 2, 1)
    trainer.setWeights(initialWeights)
    trainer.LBFGSOptimizer.setNumIterations(20)
    val model = trainer.train(rddData)
    val predictionAndLabels = rddData.map { case (input, label) =>
      (model.predict(input)(0), label(0))
    }.collect()
    predictionAndLabels.foreach { case (p, l) =>
      assert(math.round(p) === l)
    }
  }

  test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") {
    val inputs = Array(
      Array(0.0, 0.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0),
      Array(1.0, 1.0)
    )
    val outputs = Array(
      Array(1.0, 0.0),
      Array(0.0, 1.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0)
    )
    val data = inputs.zip(outputs).map { case (features, label) =>
      (Vectors.dense(features), Vectors.dense(label))
    }
    val rddData = sc.parallelize(data, 1)
    val hiddenLayersTopology = Array(5)
    val dataSample = rddData.first()
    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
    val initialWeights = FeedForwardModel(topology, 23124).weights
    val trainer = new FeedForwardTrainer(topology, 2, 2)
    // TODO: add a test for SGD
    trainer.LBFGSOptimizer.setConvergenceTol(1e-4).setNumIterations(20)
    trainer.setWeights(initialWeights).setStackSize(1)
    val model = trainer.train(rddData)
    val predictionAndLabels = rddData.map { case (input, label) =>
      (model.predict(input), label)
    }.collect()
    predictionAndLabels.foreach { case (p, l) =>
      assert(p ~== l absTol 0.5)
    }
  }
} 
Example 48
Source File: GradientSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.ann

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext

class GradientSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("Gradient computation against numerical differentiation") {
    val input = new BDM[Double](3, 1, Array(1.0, 1.0, 1.0))
    // output must contain zeros and one 1 for SoftMax
    val target = new BDM[Double](2, 1, Array(0.0, 1.0))
    val topology = FeedForwardTopology.multiLayerPerceptron(Array(3, 4, 2), softmaxOnTop = false)
    val layersWithErrors = Seq(
      new SigmoidLayerWithSquaredError(),
      new SoftmaxLayerWithCrossEntropyLoss()
    )
    // check all layers that provide loss computation
    // 1) compute loss and gradient given the model and initial weights
    // 2) modify weights with small number epsilon (per dimension i)
    // 3) compute new loss
    // 4) ((newLoss - loss) / epsilon) should be close to the i-th component of the gradient
    for (layerWithError <- layersWithErrors) {
      topology.layers(topology.layers.length - 1) = layerWithError
      val model = topology.model(seed = 12L)
      val weights = model.weights.toArray
      val numWeights = weights.size
      val gradient = Vectors.dense(Array.fill[Double](numWeights)(0.0))
      val loss = model.computeGradient(input, target, gradient, 1)
      val eps = 1e-4
      var i = 0
      val tol = 1e-4
      while (i < numWeights) {
        val originalValue = weights(i)
        weights(i) += eps
        val newModel = topology.model(Vectors.dense(weights))
        val newLoss = computeLoss(input, target, newModel)
        val derivativeEstimate = (newLoss - loss) / eps
        assert(math.abs(gradient(i) - derivativeEstimate) < tol, "Layer failed gradient check: " +
          layerWithError.getClass)
        weights(i) = originalValue
        i += 1
      }
    }
  }

  private def computeLoss(input: BDM[Double], target: BDM[Double], model: TopologyModel): Double = {
    val outputs = model.forward(input)
    model.layerModels.last match {
      case layerWithLoss: LossFunction =>
        layerWithLoss.loss(outputs.last, target, new BDM[Double](target.rows, target.cols))
      case _ =>
        throw new UnsupportedOperationException("Top layer is required to have loss." +
          " Failed layer:" + model.layerModels.last.getClass)
    }
  }
} 
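In other words, the loop above checks the analytic gradient against a one-sided finite difference: for each weight w_i it verifies that |gradient(i) - (L(w + eps * e_i) - L(w)) / eps| < tol, with eps = tol = 1e-4, where L is the loss returned by computeGradient/computeLoss.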
Example 49
Source File: ChiSqSelectorSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.util.Utils

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {

  

  test("ChiSqSelector transform test (sparse & dense vector)") {
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2)
    val preFilteredData =
      Seq(LabeledPoint(0.0, Vectors.dense(Array(8.0))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0))))
    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
    val filteredData = labeledDiscreteData.map { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    }.collect().toSeq
    assert(filteredData === preFilteredData)
  }

  test("ChiSqSelector by fpr transform test (sparse & dense vector)") {
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2)
    val preFilteredData =
      Seq(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(4.0))),
        LabeledPoint(1.0, Vectors.dense(Array(4.0))),
        LabeledPoint(2.0, Vectors.dense(Array(9.0))))
    val model: ChiSqSelectorModel = new ChiSqSelector().setSelectorType("fpr")
      .setFpr(0.1).fit(labeledDiscreteData)
    val filteredData = labeledDiscreteData.map { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    }.collect().toSeq
    assert(filteredData === preFilteredData)
  }

  test("model load / save") {
    val model = ChiSqSelectorSuite.createModel()
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString
    try {
      model.save(sc, path)
      val sameModel = ChiSqSelectorModel.load(sc, path)
      ChiSqSelectorSuite.checkEqual(model, sameModel)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}

object ChiSqSelectorSuite extends SparkFunSuite {

  def createModel(): ChiSqSelectorModel = {
    val arr = Array(1, 2, 3, 4)
    new ChiSqSelectorModel(arr)
  }

  def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = {
    assert(a.selectedFeatures.deep == b.selectedFeatures.deep)
  }
} 
Example 50
Source File: ElementwiseProductSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("elementwise (hadamard) product should properly apply vector to dense data set") {
    val denseData = Array(
      Vectors.dense(1.0, 4.0, 1.9, -9.0)
    )
    val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25)
    val transformer = new ElementwiseProduct(scalingVec)
    val transformedData = transformer.transform(sc.makeRDD(denseData))
    val transformedVecs = transformedData.collect()
    val transformedVec = transformedVecs(0)
    val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25)
    assert(transformedVec ~== expectedVec absTol 1E-5,
      s"Expected transformed vector $expectedVec but found $transformedVec")
  }

  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
    val sparseData = Array(
      Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
    )
    val dataRDD = sc.parallelize(sparseData, 3)
    val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
    val transformer = new ElementwiseProduct(scalingVec)
    val data2 = sparseData.map(transformer.transform)
    val data2RDD = transformer.transform(dataRDD)

    assert((sparseData, data2, data2RDD.collect()).zipped.forall {
      case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after hadamard product")

    assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
    assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
  }
} 
Example 51
Source File: IDFSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("idf") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    )
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF
    val model = idf.fit(termFrequencies)
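    // Document frequencies of the 4 features across the 3 documents above are 0, 3, 1 and 2,
    // which is what Array(0, 3, 1, 2) below encodes.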
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((m + 1.0) / (x + 1.0))
    })
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    }
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    assertHelper(tfidf)
    // Transforms local vectors
    val localTfidf = localTermFrequencies.map(model.transform(_)).toArray
    assertHelper(localTfidf)
  }

  test("idf minimum document frequency filtering") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    )
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF(minDocFreq = 1)
    val model = idf.fit(termFrequencies)
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) {
        math.log((m + 1.0) / (x + 1.0))
      } else {
        0
      }
    })
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    }
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    assertHelper(tfidf)
    // Transforms local vectors
    val localTfidf = localTermFrequencies.map(model.transform(_)).toArray
    assertHelper(localTfidf)
  }

} 
Example 52
Source File: PCASuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class PCASuite extends SparkFunSuite with MLlibTestSparkContext {

  private val data = Array(
    Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
    Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
    Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
  )

  private lazy val dataRDD = sc.parallelize(data, 2)

  test("Correct computing use a PCA wrapper") {
    val k = dataRDD.count().toInt
    val pca = new PCA(k).fit(dataRDD)

    val mat = new RowMatrix(dataRDD)
    val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k)

    val pca_transform = pca.transform(dataRDD).collect()
    val mat_multiply = mat.multiply(pc).rows.collect()

    pca_transform.zip(mat_multiply).foreach { case (calculated, expected) =>
      assert(calculated ~== expected relTol 1e-8)
    }
    assert(pca.explainedVariance ~== explainedVariance relTol 1e-8)
  }
} 
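A common follow-on use of the PCA wrapper exercised above is dimensionality reduction before model training. The following is a minimal sketch, assuming a SparkContext is already in scope and that training is an existing RDD[LabeledPoint]:

import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

// Fit PCA on the feature vectors and project every point onto the top 2 principal components.
def projectToTop2(training: RDD[LabeledPoint]): RDD[LabeledPoint] = {
  val pca = new PCA(2).fit(training.map(_.features))
  training.map(p => p.copy(features = pca.transform(p.features)))
}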
Example 53
Source File: HashingTFSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("hashing tf on a single doc") {
    val hashingTF = new HashingTF(1000)
    val doc = "a a b b c d".split(" ")
    val n = hashingTF.numFeatures
    val termFreqs = Seq(
      (hashingTF.indexOf("a"), 2.0),
      (hashingTF.indexOf("b"), 2.0),
      (hashingTF.indexOf("c"), 1.0),
      (hashingTF.indexOf("d"), 1.0))
    assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n),
      "index must be in range [0, #features)")
    assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing")
    val expected = Vectors.sparse(n, termFreqs)
    assert(hashingTF.transform(doc) === expected)
  }

  test("hashing tf on an RDD") {
    val hashingTF = new HashingTF
    val localDocs: Seq[Seq[String]] = Seq(
      "a a b b b c d".split(" "),
      "a b c d a b c".split(" "),
      "c b a c b a a".split(" "))
    val docs = sc.parallelize(localDocs, 2)
    assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet)
  }

  test("applying binary term freqs") {
    val hashingTF = new HashingTF(100).setBinary(true)
    val doc = "a a b c c c".split(" ")
    val n = hashingTF.numFeatures
    val expected = Vectors.sparse(n, Seq(
      (hashingTF.indexOf("a"), 1.0),
      (hashingTF.indexOf("b"), 1.0),
      (hashingTF.indexOf("c"), 1.0)))
    assert(hashingTF.transform(doc) ~== expected absTol 1e-14)
  }
} 
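In application code the HashingTF transformer tested here is typically chained with IDF into a single TF-IDF pipeline. The following is a minimal sketch, assuming a SparkContext is already in scope and that docs is an existing RDD of tokenized documents:

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// Hash each document into a sparse term-frequency vector, then reweight by inverse document frequency.
def tfIdf(docs: RDD[Seq[String]]): RDD[Vector] = {
  val tf = new HashingTF(1 << 18).transform(docs)
  tf.cache() // both IDF.fit and IDFModel.transform make a pass over this data
  val idfModel = new IDF(minDocFreq = 2).fit(tf)
  idfModel.transform(tf)
}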
Example 54
Source File: MatrixFactorizationModelSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.recommendation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils

class MatrixFactorizationModelSuite extends SparkFunSuite with MLlibTestSparkContext {

  val rank = 2
  var userFeatures: RDD[(Int, Array[Double])] = _
  var prodFeatures: RDD[(Int, Array[Double])] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    userFeatures = sc.parallelize(Seq((0, Array(1.0, 2.0)), (1, Array(3.0, 4.0))))
    prodFeatures = sc.parallelize(Seq((2, Array(5.0, 6.0))))
  }

  test("constructor") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
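    // predict(0, 2) is the dot product of user 0's and product 2's feature vectors:
    // 1.0 * 5.0 + 2.0 * 6.0 = 17.0.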
    assert(model.predict(0, 2) ~== 17.0 relTol 1e-14)

    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(1, userFeatures, prodFeatures)
    }

    val userFeatures1 = sc.parallelize(Seq((0, Array(1.0)), (1, Array(3.0))))
    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(rank, userFeatures1, prodFeatures)
    }

    val prodFeatures1 = sc.parallelize(Seq((2, Array(5.0))))
    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(rank, userFeatures, prodFeatures1)
    }
  }

  test("save/load") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString
    def collect(features: RDD[(Int, Array[Double])]): Set[(Int, Seq[Double])] = {
      features.mapValues(_.toSeq).collect().toSet
    }
    try {
      model.save(sc, path)
      val newModel = MatrixFactorizationModel.load(sc, path)
      assert(newModel.rank === rank)
      assert(collect(newModel.userFeatures) === collect(userFeatures))
      assert(collect(newModel.productFeatures) === collect(prodFeatures))
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }

  test("batch predict API recommendProductsForUsers") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val topK = 10
    val recommendations = model.recommendProductsForUsers(topK).collectAsMap()

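    // Expected top ratings are dot products of the feature vectors:
    // user 0 -> 1.0 * 5.0 + 2.0 * 6.0 = 17.0, user 1 -> 3.0 * 5.0 + 4.0 * 6.0 = 39.0.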
    assert(recommendations(0)(0).rating ~== 17.0 relTol 1e-14)
    assert(recommendations(1)(0).rating ~== 39.0 relTol 1e-14)
  }

  test("batch predict API recommendUsersForProducts") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val topK = 10
    val recommendations = model.recommendUsersForProducts(topK).collectAsMap()

    assert(recommendations(2)(0).user == 1)
    assert(recommendations(2)(0).rating ~== 39.0 relTol 1e-14)
    assert(recommendations(2)(1).user == 0)
    assert(recommendations(2)(1).rating ~== 17.0 relTol 1e-14)
  }
} 
Example 55
Source File: RankingMetricsSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("Ranking metrics: MAP, NDCG") {
    val predictionAndLabels = sc.parallelize(
      Seq(
        (Array(1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Array(1, 2, 3, 4, 5)),
        (Array(4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Array(1, 2, 3)),
        (Array(1, 2, 3, 4, 5), Array.empty[Int])
      ), 2)
    val eps = 1.0E-5

    val metrics = new RankingMetrics(predictionAndLabels)
    val map = metrics.meanAveragePrecision

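    // Metrics are averaged over all three queries; the third query has an empty
    // ground-truth set, so it contributes 0 to each precision and NDCG value.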
    assert(metrics.precisionAt(1) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(2) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(3) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(4) ~== 0.75/3 absTol eps)
    assert(metrics.precisionAt(5) ~== 0.8/3 absTol eps)
    assert(metrics.precisionAt(10) ~== 0.8/3 absTol eps)
    assert(metrics.precisionAt(15) ~== 8.0/45 absTol eps)

    assert(map ~== 0.355026 absTol eps)

    assert(metrics.ndcgAt(3) ~== 1.0/3 absTol eps)
    assert(metrics.ndcgAt(5) ~== 0.328788 absTol eps)
    assert(metrics.ndcgAt(10) ~== 0.487913 absTol eps)
    assert(metrics.ndcgAt(15) ~== metrics.ndcgAt(10) absTol eps)
  }

  test("MAP, NDCG with few predictions (SPARK-14886)") {
    val predictionAndLabels = sc.parallelize(
      Seq(
        (Array(1, 6, 2), Array(1, 2, 3, 4, 5)),
        (Array.empty[Int], Array(1, 2, 3))
      ), 2)
    val eps = 1.0E-5

    val metrics = new RankingMetrics(predictionAndLabels)
    assert(metrics.precisionAt(1) ~== 0.5 absTol eps)
    assert(metrics.precisionAt(2) ~== 0.25 absTol eps)
    assert(metrics.ndcgAt(1) ~== 0.5 absTol eps)
    assert(metrics.ndcgAt(2) ~== 0.30657 absTol eps)
  }

} 
Example 56
Source File: AreaUnderCurveSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class AreaUnderCurveSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("auc computation") {
    val curve = Seq((0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 0.0))
    val auc = 4.0
    assert(AreaUnderCurve.of(curve) ~== auc absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== auc absTol 1E-5)
  }

  test("auc of an empty curve") {
    val curve = Seq.empty[(Double, Double)]
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }

  test("auc of a curve with a single point") {
    val curve = Seq((1.0, 1.0))
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }
} 
Example 57
Source File: FPTreeSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.fpm

import scala.language.existentials

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class FPTreeSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("add transaction") {
    val tree = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("b"))

    assert(tree.root.children.size == 2)
    assert(tree.root.children.contains("a"))
    assert(tree.root.children("a").item.equals("a"))
    assert(tree.root.children("a").count == 2)
    assert(tree.root.children.contains("b"))
    assert(tree.root.children("b").item.equals("b"))
    assert(tree.root.children("b").count == 1)
    var child = tree.root.children("a")
    assert(child.children.size == 1)
    assert(child.children.contains("b"))
    assert(child.children("b").item.equals("b"))
    assert(child.children("b").count == 2)
    child = child.children("b")
    assert(child.children.size == 2)
    assert(child.children.contains("c"))
    assert(child.children.contains("y"))
    assert(child.children("c").item.equals("c"))
    assert(child.children("y").item.equals("y"))
    assert(child.children("c").count == 1)
    assert(child.children("y").count == 1)
  }

  test("merge tree") {
    val tree1 = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("b"))

    val tree2 = new FPTree[String]
      .add(Seq("a", "b"))
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "c", "d"))
      .add(Seq("a", "x"))
      .add(Seq("a", "x", "y"))
      .add(Seq("c", "n"))
      .add(Seq("c", "m"))

    val tree3 = tree1.merge(tree2)

    assert(tree3.root.children.size == 3)
    assert(tree3.root.children("a").count == 7)
    assert(tree3.root.children("b").count == 1)
    assert(tree3.root.children("c").count == 2)
    val child1 = tree3.root.children("a")
    assert(child1.children.size == 2)
    assert(child1.children("b").count == 5)
    assert(child1.children("x").count == 2)
    val child2 = child1.children("b")
    assert(child2.children.size == 2)
    assert(child2.children("y").count == 1)
    assert(child2.children("c").count == 3)
    val child3 = child2.children("c")
    assert(child3.children.size == 1)
    assert(child3.children("d").count == 1)
    val child4 = child1.children("x")
    assert(child4.children.size == 1)
    assert(child4.children("y").count == 1)
    val child5 = tree3.root.children("c")
    assert(child5.children.size == 2)
    assert(child5.children("n").count == 1)
    assert(child5.children("m").count == 1)
  }

  test("extract freq itemsets") {
    val tree = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("a", "b"))
      .add(Seq("a"))
      .add(Seq("b"))
      .add(Seq("b", "n"))

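    // extract(3L) returns every itemset whose support count in the tree is at least 3.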
    val freqItemsets = tree.extract(3L).map { case (items, count) =>
      (items.toSet, count)
    }.toSet
    val expected = Set(
      (Set("a"), 4L),
      (Set("b"), 5L),
      (Set("a", "b"), 3L))
    assert(freqItemsets === expected)
  }
} 
Example 58
Source File: AssociationRulesSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.fpm

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("association rules using String type") {
    val freqItemsets = sc.parallelize(Seq(
      (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L),
      (Set("r"), 3L),
      (Set("x", "z"), 3L), (Set("t", "y"), 3L), (Set("t", "x"), 3L), (Set("s", "x"), 3L),
      (Set("y", "x"), 3L), (Set("y", "z"), 3L), (Set("t", "z"), 3L),
      (Set("y", "x", "z"), 3L), (Set("t", "x", "z"), 3L), (Set("t", "y", "z"), 3L),
      (Set("t", "y", "x"), 3L),
      (Set("t", "y", "x", "z"), 3L)
    ).map {
      case (items, freq) => new FPGrowth.FreqItemset(items.toArray, freq)
    })

    val ar = new AssociationRules()

    val results1 = ar
      .setMinConfidence(0.9)
      .run(freqItemsets)
      .collect()

    // Re-run with the confidence threshold removed so that every candidate rule is returned.
    val results2 = ar
      .setMinConfidence(0)
      .run(freqItemsets)
      .collect()

    assert(results2.size === 30)
    assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
  }
} 
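In application code the frequent itemsets fed to AssociationRules usually come from FPGrowth rather than being constructed by hand as in this test. The following is a minimal sketch, assuming a SparkContext is already in scope and that transactions is an existing RDD[Array[String]]:

import org.apache.spark.mllib.fpm.{AssociationRules, FPGrowth}
import org.apache.spark.rdd.RDD

// Mine frequent itemsets first, then derive association rules above a confidence threshold.
def mineRules(transactions: RDD[Array[String]]): RDD[AssociationRules.Rule[String]] = {
  val model = new FPGrowth()
    .setMinSupport(0.2)
    .setNumPartitions(4)
    .run(transactions)
  new AssociationRules()
    .setMinConfidence(0.8)
    .run(model.freqItemsets)
}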
Example 59
Source File: KernelDensitySuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
} 
Example 60
Source File: MultivariateGaussianSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.distribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("univariate") {
    val x1 = Vectors.dense(0.0)
    val x2 = Vectors.dense(1.5)

    val mu = Vectors.dense(0.0)
    val sigma1 = Matrices.dense(1, 1, Array(1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)

    val sigma2 = Matrices.dense(1, 1, Array(4.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
  }

  test("multivariate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)

    val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
  }

  test("multivariate degenerate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
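    // This covariance matrix is singular; MultivariateGaussian computes the density with the
    // pseudo-determinant and pseudo-inverse of sigma, so degenerate distributions are supported.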
    val dist = new MultivariateGaussian(mu, sigma)
    assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
    assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)
  }

  test("SPARK-11302") {
    val x = Vectors.dense(629, 640, 1.7188, 618.19)
    val mu = Vectors.dense(
      1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697)
    val sigma = Matrices.dense(4, 4, Array(
      166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053,
      169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484,
      12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373,
      164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207))
    val dist = new MultivariateGaussian(mu, sigma)
    // Agrees with R's dmvnorm: 7.154782e-05
    assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9)
  }

} 
Example 61
Source File: CoordinateMatrixSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.linalg.distributed

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext

class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {

  val m = 5
  val n = 4
  var mat: CoordinateMatrix = _

  override def beforeAll() {
    super.beforeAll()
    val entries = sc.parallelize(Seq(
      (0, 0, 1.0),
      (0, 1, 2.0),
      (1, 1, 3.0),
      (1, 2, 4.0),
      (2, 2, 5.0),
      (2, 3, 6.0),
      (3, 0, 7.0),
      (3, 3, 8.0),
      (4, 1, 9.0)), 3).map { case (i, j, value) =>
      MatrixEntry(i, j, value)
    }
    mat = new CoordinateMatrix(entries)
  }

  test("size") {
    assert(mat.numRows() === m)
    assert(mat.numCols() === n)
  }

  test("empty entries") {
    val entries = sc.parallelize(Seq[MatrixEntry](), 1)
    val emptyMat = new CoordinateMatrix(entries)
    intercept[RuntimeException] {
      emptyMat.numCols()
    }
    intercept[RuntimeException] {
      emptyMat.numRows()
    }
  }

  test("toBreeze") {
    val expected = BDM(
      (1.0, 2.0, 0.0, 0.0),
      (0.0, 3.0, 4.0, 0.0),
      (0.0, 0.0, 5.0, 6.0),
      (7.0, 0.0, 0.0, 8.0),
      (0.0, 9.0, 0.0, 0.0))
    assert(mat.toBreeze() === expected)
  }

  test("transpose") {
    val transposed = mat.transpose()
    assert(mat.toBreeze().t === transposed.toBreeze())
  }

  test("toIndexedRowMatrix") {
    val indexedRowMatrix = mat.toIndexedRowMatrix()
    val expected = BDM(
      (1.0, 2.0, 0.0, 0.0),
      (0.0, 3.0, 4.0, 0.0),
      (0.0, 0.0, 5.0, 6.0),
      (7.0, 0.0, 0.0, 8.0),
      (0.0, 9.0, 0.0, 0.0))
    assert(indexedRowMatrix.toBreeze() === expected)
  }

  test("toRowMatrix") {
    val rowMatrix = mat.toRowMatrix()
    val rows = rowMatrix.rows.collect().toSet
    val expected = Set(
      Vectors.dense(1.0, 2.0, 0.0, 0.0),
      Vectors.dense(0.0, 3.0, 4.0, 0.0),
      Vectors.dense(0.0, 0.0, 5.0, 6.0),
      Vectors.dense(7.0, 0.0, 0.0, 8.0),
      Vectors.dense(0.0, 9.0, 0.0, 0.0))
    assert(rows === expected)
  }

  test("toBlockMatrix") {
    val blockMat = mat.toBlockMatrix(2, 2)
    assert(blockMat.numRows() === m)
    assert(blockMat.numCols() === n)
    assert(blockMat.toBreeze() === mat.toBreeze())

    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(-1, 2)
    }
    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(2, 0)
    }
  }
} 
Example 62
Source File: MLPairRDDFunctionsSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.rdd.MLPairRDDFunctions._
import org.apache.spark.mllib.util.MLlibTestSparkContext

class MLPairRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("topByKey") {
    val topMap = sc.parallelize(Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7), (5,
      1), (3, 5)), 2)
      .topByKey(5)
      .collectAsMap()

    assert(topMap.size === 3)
    assert(topMap(1) === Array(7, 6, 3, 2, 1))
    assert(topMap(3) === Array(7, 5, 2))
    assert(topMap(5) === Array(1))
  }
} 
Example 63
Source File: RDDFunctionsSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.rdd.RDDFunctions._
import org.apache.spark.mllib.util.MLlibTestSparkContext

class RDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("sliding") {
    val data = 0 until 6
    for (numPartitions <- 1 to 8) {
      val rdd = sc.parallelize(data, numPartitions)
      for (windowSize <- 1 to 6) {
        for (step <- 1 to 3) {
          val sliding = rdd.sliding(windowSize, step).collect().map(_.toList).toList
          val expected = data.sliding(windowSize, step)
            .map(_.toList).toList.filter(l => l.size == windowSize)
          assert(sliding === expected)
        }
      }
      assert(rdd.sliding(7).collect().isEmpty,
        "Should return an empty RDD if the window size is greater than the number of items.")
    }
  }

  test("sliding with empty partitions") {
    val data = Seq(Seq(1, 2, 3), Seq.empty[Int], Seq(4), Seq.empty[Int], Seq(5, 6, 7))
    val rdd = sc.parallelize(data, data.length).flatMap(s => s)
    assert(rdd.partitions.length === data.length)
    val sliding = rdd.sliding(3).collect().toSeq.map(_.toSeq)
    val expected = data.flatMap(x => x).sliding(3).toSeq.map(_.toSeq)
    assert(sliding === expected)
  }
} 
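Beyond the window checks above, sliding is convenient for order-dependent statistics. The following is a minimal sketch, assuming a SparkContext is already in scope and that series is an existing RDD[Double] whose partition order reflects the order of the series:

import org.apache.spark.mllib.rdd.RDDFunctions._
import org.apache.spark.rdd.RDD

// Consecutive differences of a series, computed from sliding windows of size 2.
def diffs(series: RDD[Double]): RDD[Double] =
  series.sliding(2).map { case Array(prev, next) => next - prev }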
Example 64
Source File: OneHotEncoderSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext {

  def stringIndexed(): DataFrame = {
    val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2)
    val df = sqlContext.createDataFrame(data).toDF("id", "label")
    val indexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("labelIndex")
      .fit(df)
    indexer.transform(df)
  }

  test("params") {
    ParamsSuite.checkParams(new OneHotEncoder)
  }

  test("OneHotEncoder dropLast = false") {
    val transformed = stringIndexed()
    val encoder = new OneHotEncoder()
      .setInputCol("labelIndex")
      .setOutputCol("labelVec")
      .setDropLast(false)
    val encoded = encoder.transform(transformed)

    val output = encoded.select("id", "labelVec").map { r =>
      val vec = r.getAs[Vector](1)
      (r.getInt(0), vec(0), vec(1), vec(2))
    }.collect().toSet
    // a -> 0, b -> 2, c -> 1
    val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0),
      (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0))
    assert(output === expected)
  }

  test("OneHotEncoder dropLast = true") {
    val transformed = stringIndexed()
    val encoder = new OneHotEncoder()
      .setInputCol("labelIndex")
      .setOutputCol("labelVec")
    val encoded = encoder.transform(transformed)

    val output = encoded.select("id", "labelVec").map { r =>
      val vec = r.getAs[Vector](1)
      (r.getInt(0), vec(0), vec(1))
    }.collect().toSet
    // a -> 0, b -> 2, c -> 1
    val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0),
      (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0))
    assert(output === expected)
  }

  test("input column with ML attribute") {
    val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large")
    val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("size")
      .select(col("size").as("size", attr.toMetadata()))
    val encoder = new OneHotEncoder()
      .setInputCol("size")
      .setOutputCol("encoded")
    val output = encoder.transform(df)
    val group = AttributeGroup.fromStructField(output.schema("encoded"))
    assert(group.size === 2)
    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("size_is_small").withIndex(0))
    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("size_is_medium").withIndex(1))
  }

  test("input column without ML attribute") {
    val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("index")
    val encoder = new OneHotEncoder()
      .setInputCol("index")
      .setOutputCol("encoded")
    val output = encoder.transform(df)
    val group = AttributeGroup.fromStructField(output.schema("encoded"))
    assert(group.size === 2)
    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("index_is_0").withIndex(0))
    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("index_is_1").withIndex(1))
  }
} 
Example 65
Source File: Word2VecSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.mllib.feature.{Word2VecModel => OldWord2VecModel}

class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new Word2Vec)
    val model = new Word2VecModel("w2v", new OldWord2VecModel(Map("a" -> Array(0.0f))))
    ParamsSuite.checkParams(model)
  }

  test("Word2Vec") {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val sentence = "a b " * 100 + "a c " * 10
    val numOfWords = sentence.split(" ").size
    val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" "))

    val codes = Map(
      "a" -> Array(-0.2811822295188904, -0.6356269121170044, -0.3020961284637451),
      "b" -> Array(1.0309048891067505, -1.29472815990448, 0.22276712954044342),
      "c" -> Array(-0.08456747233867645, 0.5137411952018738, 0.11731560528278351)
    )

    val expected = doc.map { sentence =>
      Vectors.dense(sentence.map(codes.apply).reduce((word1, word2) =>
        word1.zip(word2).map { case (v1, v2) => v1 + v2 }
      ).map(_ / numOfWords))
    }

    val docDF = doc.zip(expected).toDF("text", "expected")

    val model = new Word2Vec()
      .setVectorSize(3)
      .setInputCol("text")
      .setOutputCol("result")
      .setSeed(42L)
      .fit(docDF)

    model.transform(docDF).select("result", "expected").collect().foreach {
      case Row(vector1: Vector, vector2: Vector) =>
        assert(vector1 ~== vector2 absTol 1E-5, "Transformed vector is different with expected.")
    }
  }
} 
Example 66
Source File: VectorAssemblerSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.col

class VectorAssemblerSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new VectorAssembler)
  }

  test("assemble") {
    import org.apache.spark.ml.feature.VectorAssembler.assemble
    assert(assemble(0.0) === Vectors.sparse(1, Array.empty, Array.empty))
    assert(assemble(0.0, 1.0) === Vectors.sparse(2, Array(1), Array(1.0)))
    val dv = Vectors.dense(2.0, 0.0)
    assert(assemble(0.0, dv, 1.0) === Vectors.sparse(4, Array(1, 3), Array(2.0, 1.0)))
    val sv = Vectors.sparse(2, Array(0, 1), Array(3.0, 4.0))
    assert(assemble(0.0, dv, 1.0, sv) ===
      Vectors.sparse(6, Array(1, 3, 4, 5), Array(2.0, 1.0, 3.0, 4.0)))
    for (v <- Seq(1, "a", null)) {
      intercept[SparkException](assemble(v))
      intercept[SparkException](assemble(1.0, v))
    }
  }

  test("assemble should compress vectors") {
    import org.apache.spark.ml.feature.VectorAssembler.assemble
    val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0))
    assert(v1.isInstanceOf[SparseVector])
    val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0)))
    assert(v2.isInstanceOf[DenseVector])
  }

  test("VectorAssembler") {
    val df = sqlContext.createDataFrame(Seq(
      (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L)
    )).toDF("id", "x", "y", "name", "z", "n")
    val assembler = new VectorAssembler()
      .setInputCols(Array("x", "y", "z", "n"))
      .setOutputCol("features")
    assembler.transform(df).select("features").collect().foreach {
      case Row(v: Vector) =>
        assert(v === Vectors.sparse(6, Array(1, 2, 4, 5), Array(1.0, 2.0, 3.0, 10.0)))
    }
  }

  test("ML attributes") {
    val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", "safari")
    val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0)
    val user = new AttributeGroup("user", Array(
      NominalAttribute.defaultAttr.withName("gender").withValues("male", "female"),
      NumericAttribute.defaultAttr.withName("salary")))
    val row = (1.0, 0.5, 1, Vectors.dense(1.0, 1000.0), Vectors.sparse(2, Array(1), Array(2.0)))
    val df = sqlContext.createDataFrame(Seq(row)).toDF("browser", "hour", "count", "user", "ad")
      .select(
        col("browser").as("browser", browser.toMetadata()),
        col("hour").as("hour", hour.toMetadata()),
        col("count"), // "count" is an integer column without ML attribute
        col("user").as("user", user.toMetadata()),
        col("ad")) // "ad" is a vector column without ML attribute
    val assembler = new VectorAssembler()
      .setInputCols(Array("browser", "hour", "count", "user", "ad"))
      .setOutputCol("features")
    val output = assembler.transform(df)
    val schema = output.schema
    val features = AttributeGroup.fromStructField(schema("features"))
    assert(features.size === 7)
    val browserOut = features.getAttr(0)
    assert(browserOut === browser.withIndex(0).withName("browser"))
    val hourOut = features.getAttr(1)
    assert(hourOut === hour.withIndex(1).withName("hour"))
    val countOut = features.getAttr(2)
    assert(countOut === NumericAttribute.defaultAttr.withName("count").withIndex(2))
    val userGenderOut = features.getAttr(3)
    assert(userGenderOut === user.getAttr("gender").withName("user_gender").withIndex(3))
    val userSalaryOut = features.getAttr(4)
    assert(userSalaryOut === user.getAttr("salary").withName("user_salary").withIndex(4))
    assert(features.getAttr(5) === NumericAttribute.defaultAttr.withIndex(5))
    assert(features.getAttr(6) === NumericAttribute.defaultAttr.withIndex(6))
  }
} 
Example 67
Source File: BinarizerSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext {

  @transient var data: Array[Double] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4)
  }

  test("params") {
    ParamsSuite.checkParams(new Binarizer)
  }

  test("Binarize continuous features with default parameter") {
    val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0)
    val dataFrame: DataFrame = sqlContext.createDataFrame(
      data.zip(defaultBinarized)).toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Double, y: Double) =>
        assert(x === y, "The feature value is not correct after binarization.")
    }
  }

  test("Binarize continuous features with setter") {
    val threshold: Double = 0.2
    val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0)
    val dataFrame: DataFrame = sqlContext.createDataFrame(
        data.zip(thresholdBinarized)).toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(threshold)

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Double, y: Double) =>
        assert(x === y, "The feature value is not correct after binarization.")
    }
  }
} 
Example 68
Source File: TokenizerSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class TokenizerSuite extends SparkFunSuite {

  test("params") {
    ParamsSuite.checkParams(new Tokenizer)
  }
}

class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext {
  import org.apache.spark.ml.feature.RegexTokenizerSuite._

  test("params") {
    ParamsSuite.checkParams(new RegexTokenizer)
  }

  test("RegexTokenizer") {
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset0 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
    ))
    testRegexTokenizer(tokenizer0, dataset0)

    val dataset1 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ))
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization.")),
      TokenizerTestData("Te,st.  punct", Array("Te,st.", "punct"))
    ))
    testRegexTokenizer(tokenizer2, dataset2)
  }
}

object RegexTokenizerSuite extends SparkFunSuite {

  def testRegexTokenizer(t: RegexTokenizer, dataset: DataFrame): Unit = {
    t.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
} 
Example 69
Source File: PolynomialExpansionSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.ml.param.ParamsSuite
import org.scalatest.exceptions.TestFailedException

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new PolynomialExpansion)
  }

  test("Polynomial expansion with default parameter") {
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())
    )

    val twoDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)),
      Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29),
      Vectors.dense(new Array[Double](9)),
      Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0),
      Vectors.sparse(9, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")

    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
    }
  }

  test("Polynomial expansion with setter") {
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())
    )

    val threeDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8),
        Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)),
      Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17),
      Vectors.dense(new Array[Double](19)),
      Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8,
        -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0),
      Vectors.sparse(19, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)

    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
    }
  }
} 
Example 70
Source File: IDFSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = {
    dataSet.map {
      case data: DenseVector =>
        val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y }
        Vectors.dense(res)
      case data: SparseVector =>
        val res = data.indices.zip(data.values).map { case (id, value) =>
          (id, value * model(id))
        }
        Vectors.sparse(data.size, res)
    }
  }

  test("params") {
    ParamsSuite.checkParams(new IDF)
    val model = new IDFModel("idf", new OldIDFModel(Vectors.dense(1.0)))
    ParamsSuite.checkParams(model)
  }

  test("compute IDF with default parameter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    )
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((numOfData + 1.0) / (x + 1.0))
    })
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")

    val idfModel = new IDF()
      .setInputCol("features")
      .setOutputCol("idfValue")
      .fit(df)

    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }

  test("compute IDF with setter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    )
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0
    })
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")

    val idfModel = new IDF()
      .setInputCol("features")
      .setOutputCol("idfValue")
      .setMinDocFreq(1)
      .fit(df)

    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }
} 
Example 71
Source File: NormalizerSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}


class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext {

  @transient var data: Array[Vector] = _
  @transient var dataFrame: DataFrame = _
  @transient var normalizer: Normalizer = _
  @transient var l1Normalized: Array[Vector] = _
  @transient var l2Normalized: Array[Vector] = _

  override def beforeAll(): Unit = {
    super.beforeAll()

    data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))),
      Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))),
      Vectors.sparse(3, Seq())
    )
    l1Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.12765957, -0.23404255, -0.63829787),
      Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))),
      Vectors.dense(0.625, 0.07894737, 0.29605263),
      Vectors.sparse(3, Seq())
    )
    l2Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.184549876, -0.3383414, -0.922749378),
      Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))),
      Vectors.dense(0.897906166, 0.113419726, 0.42532397),
      Vectors.sparse(3, Seq())
    )

    val sqlContext = new SQLContext(sc)
    dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(NormalizerSuite.FeatureData))
    normalizer = new Normalizer()
      .setInputCol("features")
      .setOutputCol("normalized_features")
  }

  def collectResult(result: DataFrame): Array[Vector] = {
    result.select("normalized_features").collect().map {
      case Row(features: Vector) => features
    }
  }

  def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
    assert((lhs, rhs).zipped.forall {
      case (v1: DenseVector, v2: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after normalization.")
  }

  def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
    assert((lhs, rhs).zipped.forall { (vector1, vector2) =>
      vector1 ~== vector2 absTol 1E-5
    }, "The vector value is not correct after normalization.")
  }

  test("Normalization with default parameter") {
    val result = collectResult(normalizer.transform(dataFrame))

    assertTypeOfVector(data, result)

    assertValues(result, l2Normalized)
  }

  test("Normalization with setter") {
    normalizer.setP(1)

    val result = collectResult(normalizer.transform(dataFrame))

    assertTypeOfVector(data, result)

    assertValues(result, l1Normalized)
  }
}

private object NormalizerSuite {
  case class FeatureData(features: Vector)
} 
Example 72
Source File: HashingTFSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.Utils

class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new HashingTF)
  }

  test("hashingTF") {
    val df = sqlContext.createDataFrame(Seq(
      (0, "a a b b c d".split(" ").toSeq)
    )).toDF("id", "words")
    val n = 100
    val hashingTF = new HashingTF()
      .setInputCol("words")
      .setOutputCol("features")
      .setNumFeatures(n)
    val output = hashingTF.transform(df)
    val attrGroup = AttributeGroup.fromStructField(output.schema("features"))
    require(attrGroup.numAttributes === Some(n))
    val features = output.select("features").first().getAs[Vector](0)
    // Assume perfect hash on "a", "b", "c", and "d".
    def idx(any: Any): Int = Utils.nonNegativeMod(any.##, n)
    val expected = Vectors.sparse(n,
      Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0)))
    assert(features ~== expected absTol 1e-14)
  }
} 
Example 73
Source File: StringIndexerSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, NominalAttribute}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class StringIndexerSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new StringIndexer)
    val model = new StringIndexerModel("indexer", Array("a", "b"))
    ParamsSuite.checkParams(model)
  }

  test("StringIndexer") {
    val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2)
    val df = sqlContext.createDataFrame(data).toDF("id", "label")
    val indexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("labelIndex")
      .fit(df)
    val transformed = indexer.transform(df)
    val attr = Attribute.fromStructField(transformed.schema("labelIndex"))
      .asInstanceOf[NominalAttribute]
    assert(attr.values.get === Array("a", "c", "b"))
    val output = transformed.select("id", "labelIndex").map { r =>
      (r.getInt(0), r.getDouble(1))
    }.collect().toSet
    // a -> 0, b -> 2, c -> 1
    val expected = Set((0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0))
    assert(output === expected)
  }

  test("StringIndexer with a numeric input column") {
    val data = sc.parallelize(Seq((0, 100), (1, 200), (2, 300), (3, 100), (4, 100), (5, 300)), 2)
    val df = sqlContext.createDataFrame(data).toDF("id", "label")
    val indexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("labelIndex")
      .fit(df)
    val transformed = indexer.transform(df)
    val attr = Attribute.fromStructField(transformed.schema("labelIndex"))
      .asInstanceOf[NominalAttribute]
    assert(attr.values.get === Array("100", "300", "200"))
    val output = transformed.select("id", "labelIndex").map { r =>
      (r.getInt(0), r.getDouble(1))
    }.collect().toSet
    // 100 -> 0, 200 -> 2, 300 -> 1
    val expected = Set((0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0))
    assert(output === expected)
  }

  test("StringIndexerModel should keep silent if the input column does not exist.") {
    val indexerModel = new StringIndexerModel("indexer", Array("a", "b", "c"))
      .setInputCol("label")
      .setOutputCol("labelIndex")
    val df = sqlContext.range(0L, 10L)
    assert(indexerModel.transform(df).eq(df))
  }
} 
Example 74
Source File: RegressionEvaluatorSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {
    // Synthetic regression data; the generator parameters follow the upstream Spark test
    // (intercept 6.3, weights (4.7, 7.2), 100 points, seed 42, noise 0.1).
    val dataset = sqlContext.createDataFrame(
      sc.parallelize(LinearDataGenerator.generateLinearInput(
        6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2))
    val trainer = new LinearRegression
    val model = trainer.fit(dataset)
    val predictions = model.transform(dataset)

    // default = rmse
    val evaluator = new RegressionEvaluator()
    assert(evaluator.evaluate(predictions) ~== -0.1019382 absTol 0.001)

    // r2 score
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)

    // mae
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== -0.08036075 absTol 0.001)
  }
} 
Example 75
Source File: RandomForestRegressorSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.regression

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.impl.TreeTests
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => OldRandomForest}
import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

private object RandomForestRegressorSuite extends SparkFunSuite {

  // Trains a model with both the old RDD-based API and the new DataFrame-based API,
  // converts the old model to the new format, and checks that the two models are equal.
  def compareAPIs(
      data: RDD[LabeledPoint],
      rf: RandomForestRegressor,
      categoricalFeatures: Map[Int, Int]): Unit = {
    val oldStrategy =
      rf.getOldStrategy(categoricalFeatures, numClasses = 0, OldAlgo.Regression, rf.getOldImpurity)
    val oldModel = OldRandomForest.trainRegressor(
      data, oldStrategy, rf.getNumTrees, rf.getFeatureSubsetStrategy, rf.getSeed.toInt)
    val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
    val newModel = rf.fit(newData)
    // Use parent from newTree since this is not checked anyways.
    val oldModelAsNew = RandomForestRegressionModel.fromOld(
      oldModel, newModel.parent.asInstanceOf[RandomForestRegressor], categoricalFeatures)
    TreeTests.checkEqual(oldModelAsNew, newModel)
  }
} 
Example 76
Source File: DecisionTreeRegressorSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.regression

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.impl.TreeTests
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree,
  DecisionTreeSuite => OldDecisionTreeSuite}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame


class DecisionTreeRegressorSuite extends SparkFunSuite with MLlibTestSparkContext {

  import DecisionTreeRegressorSuite.compareAPIs

  private var categoricalDataPointsRDD: RDD[LabeledPoint] = _

  override def beforeAll() {
    super.beforeAll()
    categoricalDataPointsRDD =
      sc.parallelize(OldDecisionTreeSuite.generateCategoricalDataPoints())
  }

  /////////////////////////////////////////////////////////////////////////////
  // Tests calling train()
  /////////////////////////////////////////////////////////////////////////////

  test("Regression stump with 3-ary (ordered) categorical features") {
    val dt = new DecisionTreeRegressor()
      .setImpurity("variance")
      .setMaxDepth(2)
      .setMaxBins(100)
    val categoricalFeatures = Map(0 -> 3, 1 -> 3)
    compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures)
  }

  test("Regression stump with binary (ordered) categorical features") {
    val dt = new DecisionTreeRegressor()
      .setImpurity("variance")
      .setMaxDepth(2)
      .setMaxBins(100)
    val categoricalFeatures = Map(0 -> 2, 1 -> 2)
    compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures)
  }

  /////////////////////////////////////////////////////////////////////////////
  // Tests of model save/load
  /////////////////////////////////////////////////////////////////////////////

  // TODO: test("model save/load")   SPARK-6725
}

private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite {

  
  def compareAPIs(
      data: RDD[LabeledPoint],
      dt: DecisionTreeRegressor,
      categoricalFeatures: Map[Int, Int]): Unit = {
    val oldStrategy = dt.getOldStrategy(categoricalFeatures)
    val oldTree = OldDecisionTree.train(data, oldStrategy)
    val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
    val newTree = dt.fit(newData)
    // Use parent from newTree since this is not checked anyways.
    val oldTreeAsNew = DecisionTreeRegressionModel.fromOld(
      oldTree, newTree.parent.asInstanceOf[DecisionTreeRegressor], categoricalFeatures)
    TreeTests.checkEqual(oldTreeAsNew, newTree)
  }
} 
Example 77
Source File: ChiSqSelectorSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {

  

  test("ChiSqSelector transform test (sparse & dense vector)") {
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2)
    val preFilteredData =
      Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(6.0))),
        LabeledPoint(1.0, Vectors.dense(Array(8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(5.0))))
    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
    val filteredData = labeledDiscreteData.map { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    }.collect().toSet
    assert(filteredData == preFilteredData)
  }
} 
Example 78
Source File: Word2VecSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.Utils

class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext {

  // TODO: add more tests

  test("Word2Vec") {
    val sentence = "a b " * 100 + "a c " * 10
    val localDoc = Seq(sentence, sentence)
    val doc = sc.parallelize(localDoc)
      .map(line => line.split(" ").toSeq)
    val model = new Word2Vec().setVectorSize(10).setSeed(42L).fit(doc)
    val syms = model.findSynonyms("a", 2)
    assert(syms.length == 2)
    assert(syms(0)._1 == "b")
    assert(syms(1)._1 == "c")
  }

  test("Word2VecModel") {
    val num = 2
    val word2VecMap = Map(
      ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)),
      ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)),
      ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)),
      ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f))
    )
    val model = new Word2VecModel(word2VecMap)
    val syms = model.findSynonyms("china", num)
    assert(syms.length == num)
    assert(syms(0)._1 == "taiwan")
    assert(syms(1)._1 == "japan")
  }

  test("model load / save") {

    val word2VecMap = Map(
      ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)),
      ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)),
      ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)),
      ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f))
    )
    val model = new Word2VecModel(word2VecMap)

    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString

    try {
      model.save(sc, path)
      val sameModel = Word2VecModel.load(sc, path)
      assert(sameModel.getVectors.mapValues(_.toSeq) === model.getVectors.mapValues(_.toSeq))
    } finally {
      Utils.deleteRecursively(tempDir)
    }

  }
} 
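A minimal standalone sketch (not taken from the project above) showing the same Word2Vec API outside a test suite; the SparkContext named sc and the corpus path are assumptions made for illustration.

// Hypothetical usage sketch: train Word2Vec on a whitespace-tokenized corpus.
import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}

object Word2VecUsageSketch {
  def run(sc: SparkContext): Unit = {
    // Word2Vec.fit expects an RDD of token sequences (RDD[Iterable[String]]).
    val corpus = sc.textFile("data/corpus.txt").map(_.split(" ").toSeq)
    val model: Word2VecModel = new Word2Vec()
      .setVectorSize(50)
      .setSeed(42L)
      .fit(corpus)
    // Nearest neighbours of a word by cosine similarity.
    model.findSynonyms("spark", 5).foreach { case (word, cosine) =>
      println(s"$word -> $cosine")
    }
  }
}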
Example 79
Source File: ElementwiseProductSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("elementwise (hadamard) product should properly apply vector to dense data set") {
    val denseData = Array(
      Vectors.dense(1.0, 4.0, 1.9, -9.0)
    )
    val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25)
    val transformer = new ElementwiseProduct(scalingVec)
    val transformedData = transformer.transform(sc.makeRDD(denseData))
    val transformedVecs = transformedData.collect()
    val transformedVec = transformedVecs(0)
    val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25)
    assert(transformedVec ~== expectedVec absTol 1E-5,
      s"Expected transformed vector $expectedVec but found $transformedVec")
  }

  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
    val sparseData = Array(
      Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
    )
    val dataRDD = sc.parallelize(sparseData, 3)
    val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
    val transformer = new ElementwiseProduct(scalingVec)
    val data2 = sparseData.map(transformer.transform)
    val data2RDD = transformer.transform(dataRDD)

    assert((sparseData, data2, data2RDD.collect()).zipped.forall {
      case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after hadamard product")

    assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
    assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
  }
} 
Example 80
Source File: IDFSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors, Vector}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("idf") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    )
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF
    val model = idf.fit(termFrequencies)
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((m + 1.0) / (x + 1.0))
    })
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    }
    // Transforms an RDD
    val tfidf = model.transform(termFrequencies).collect()
    assertHelper(tfidf)
    // Transforms local vectors
    val localTfidf = localTermFrequencies.map(model.transform(_)).toArray
    assertHelper(localTfidf)
  }

  test("idf minimum document frequency filtering") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    )
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF(minDocFreq = 1)
    val model = idf.fit(termFrequencies)
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) {
        math.log((m + 1.0) / (x + 1.0))
      } else {
        0
      }
    })
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    }
    // Transforms an RDD
    val tfidf = model.transform(termFrequencies).collect()
    assertHelper(tfidf)
    // Transforms local vectors
    val localTfidf = localTermFrequencies.map(model.transform(_)).toArray
    assertHelper(localTfidf)
  }

} 
Example 81
Source File: PCASuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.util.MLlibTestSparkContext

class PCASuite extends SparkFunSuite with MLlibTestSparkContext {

  private val data = Array(
    Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
    Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
    Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
  )

  private lazy val dataRDD = sc.parallelize(data, 2)

  test("Correct computing use a PCA wrapper") {
    val k = dataRDD.count().toInt
    val pca = new PCA(k).fit(dataRDD)

    val mat = new RowMatrix(dataRDD)
    val pc = mat.computePrincipalComponents(k)

    val pca_transform = pca.transform(dataRDD).collect()
    val mat_multiply = mat.multiply(pc).rows.collect()

    assert(pca_transform.toSet === mat_multiply.toSet)
  }
} 
Example 82
Source File: HashingTFSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext

class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("hashing tf on a single doc") {
    val hashingTF = new HashingTF(1000)
    val doc = "a a b b c d".split(" ")
    val n = hashingTF.numFeatures
    val termFreqs = Seq(
      (hashingTF.indexOf("a"), 2.0),
      (hashingTF.indexOf("b"), 2.0),
      (hashingTF.indexOf("c"), 1.0),
      (hashingTF.indexOf("d"), 1.0))
    assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n),
      "index must be in range [0, #features)")
    assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing")
    val expected = Vectors.sparse(n, termFreqs)
    assert(hashingTF.transform(doc) === expected)
  }

  test("hashing tf on an RDD") {
    val hashingTF = new HashingTF
    val localDocs: Seq[Seq[String]] = Seq(
      "a a b b b c d".split(" "),
      "a b c d a b c".split(" "),
      "c b a c b a a".split(" "))
    val docs = sc.parallelize(localDocs, 2)
    assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet)
  }
} 
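A minimal TF-IDF sketch (not part of the suites above) that chains HashingTF with the IDF model from Example 80; the SparkContext sc and the documents path are illustrative assumptions.

// Hypothetical usage sketch: hash term frequencies, then reweight them with IDF.
import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

object TfIdfUsageSketch {
  def run(sc: SparkContext): Unit = {
    val docs: RDD[Seq[String]] = sc.textFile("data/docs.txt").map(_.split(" ").toSeq)
    val hashingTF = new HashingTF(1 << 18)
    val tf: RDD[Vector] = hashingTF.transform(docs)
    tf.cache() // IDF.fit makes a full pass over the term-frequency vectors
    val idfModel = new IDF(minDocFreq = 2).fit(tf)
    val tfidf: RDD[Vector] = idfModel.transform(tf)
    tfidf.take(3).foreach(println)
  }
}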
Example 83
Source File: ImpuritySuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.tree.impurity.{EntropyAggregator, GiniAggregator}
import org.apache.spark.mllib.util.MLlibTestSparkContext


class ImpuritySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("Gini impurity does not support negative labels") {
    val gini = new GiniAggregator(2)
    intercept[IllegalArgumentException] {
      gini.update(Array(0.0, 1.0, 2.0), 0, -1, 0.0)
    }
  }

  test("Entropy does not support negative labels") {
    val entropy = new EntropyAggregator(2)
    intercept[IllegalArgumentException] {
      entropy.update(Array(0.0, 1.0, 2.0), 0, -1, 0.0)
    }
  }
} 
Example 84
Source File: BaggedPointSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.tree.EnsembleTestHelper
import org.apache.spark.mllib.util.MLlibTestSparkContext


class BaggedPointSuite extends SparkFunSuite with MLlibTestSparkContext  {

  test("BaggedPoint RDD: without subsampling") {
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false, 42)
    baggedRDD.collect().foreach { baggedPoint =>
      assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1)
    }
  }

  test("BaggedPoint RDD: with subsampling with replacement (fraction = 1.0)") {
    val numSubsamples = 100
    val (expectedMean, expectedStddev) = (1.0, 1.0)

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling with replacement (fraction = 0.5)") {
    val numSubsamples = 100
    val subsample = 0.5
    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample))

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling without replacement (fraction = 1.0)") {
    val numSubsamples = 100
    val (expectedMean, expectedStddev) = (1.0, 0)

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling without replacement (fraction = 0.5)") {
    val numSubsamples = 100
    val subsample = 0.5
    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample * (1 - subsample)))

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }
} 
Example 85
Source File: MatrixFactorizationModelSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.recommendation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils

class MatrixFactorizationModelSuite extends SparkFunSuite with MLlibTestSparkContext {

  val rank = 2
  var userFeatures: RDD[(Int, Array[Double])] = _
  var prodFeatures: RDD[(Int, Array[Double])] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    userFeatures = sc.parallelize(Seq((0, Array(1.0, 2.0)), (1, Array(3.0, 4.0))))
    prodFeatures = sc.parallelize(Seq((2, Array(5.0, 6.0))))
  }

  test("constructor") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    assert(model.predict(0, 2) ~== 17.0 relTol 1e-14)

    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(1, userFeatures, prodFeatures)
    }

    val userFeatures1 = sc.parallelize(Seq((0, Array(1.0)), (1, Array(3.0))))
    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(rank, userFeatures1, prodFeatures)
    }

    val prodFeatures1 = sc.parallelize(Seq((2, Array(5.0))))
    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(rank, userFeatures, prodFeatures1)
    }
  }

  test("save/load") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString
    def collect(features: RDD[(Int, Array[Double])]): Set[(Int, Seq[Double])] = {
      features.mapValues(_.toSeq).collect().toSet
    }
    try {
      model.save(sc, path)
      val newModel = MatrixFactorizationModel.load(sc, path)
      assert(newModel.rank === rank)
      assert(collect(newModel.userFeatures) === collect(userFeatures))
      assert(collect(newModel.productFeatures) === collect(prodFeatures))
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }

  test("batch predict API recommendProductsForUsers") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val topK = 10
    val recommendations = model.recommendProductsForUsers(topK).collectAsMap()

    assert(recommendations(0)(0).rating ~== 17.0 relTol 1e-14)
    assert(recommendations(1)(0).rating ~== 39.0 relTol 1e-14)
  }

  test("batch predict API recommendUsersForProducts") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val topK = 10
    val recommendations = model.recommendUsersForProducts(topK).collectAsMap()

    assert(recommendations(2)(0).user == 1)
    assert(recommendations(2)(0).rating ~== 39.0 relTol 1e-14)
    assert(recommendations(2)(1).user == 0)
    assert(recommendations(2)(1).rating ~== 17.0 relTol 1e-14)
  }
} 
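The factors above are built by hand; in practice a MatrixFactorizationModel usually comes out of ALS. A minimal sketch (the SparkContext sc, the ratings file and its "user,product,rating" format are assumptions):

// Hypothetical usage sketch: train ALS and ask for top-5 product recommendations.
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{ALS, Rating}

object AlsUsageSketch {
  def run(sc: SparkContext): Unit = {
    val ratings = sc.textFile("data/ratings.csv").map { line =>
      val Array(user, product, rating) = line.split(',')
      Rating(user.toInt, product.toInt, rating.toDouble)
    }
    // rank = 10 latent factors, 10 iterations, regularization 0.01
    val model = ALS.train(ratings, 10, 10, 0.01)
    model.recommendProducts(1, 5).foreach(println) // top 5 products for user 1
  }
}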
Example 86
Source File: RankingMetricsSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext

class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("Ranking metrics: map, ndcg") {
    val predictionAndLabels = sc.parallelize(
      Seq(
        (Array[Int](1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Array[Int](1, 2, 3, 4, 5)),
        (Array[Int](4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Array[Int](1, 2, 3)),
        (Array[Int](1, 2, 3, 4, 5), Array[Int]())
      ), 2)
    val eps: Double = 1E-5

    val metrics = new RankingMetrics(predictionAndLabels)
    val map = metrics.meanAveragePrecision

    assert(metrics.precisionAt(1) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(2) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(3) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(4) ~== 0.75/3 absTol eps)
    assert(metrics.precisionAt(5) ~== 0.8/3 absTol eps)
    assert(metrics.precisionAt(10) ~== 0.8/3 absTol eps)
    assert(metrics.precisionAt(15) ~== 8.0/45 absTol eps)

    assert(map ~== 0.355026 absTol eps)

    assert(metrics.ndcgAt(3) ~== 1.0/3 absTol eps)
    assert(metrics.ndcgAt(5) ~== 0.328788 absTol eps)
    assert(metrics.ndcgAt(10) ~== 0.487913 absTol eps)
    assert(metrics.ndcgAt(15) ~== metrics.ndcgAt(10) absTol eps)

  }
} 
Example 87
Source File: AreaUnderCurveSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class AreaUnderCurveSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("auc computation") {
    val curve = Seq((0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 0.0))
    val auc = 4.0
    assert(AreaUnderCurve.of(curve) ~== auc absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== auc absTol 1E-5)
  }

  test("auc of an empty curve") {
    val curve = Seq.empty[(Double, Double)]
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }

  test("auc of a curve with a single point") {
    val curve = Seq((1.0, 1.0))
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }
} 
Example 88
Source File: RegressionMetricsSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class RegressionMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("regression metrics") {
    val predictionAndObservations = sc.parallelize(
      Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)), 2)
    val metrics = new RegressionMetrics(predictionAndObservations)
    assert(metrics.explainedVariance ~== 0.95717 absTol 1E-5,
      "explained variance regression score mismatch")
    assert(metrics.meanAbsoluteError ~== 0.5 absTol 1E-5, "mean absolute error mismatch")
    assert(metrics.meanSquaredError ~== 0.375 absTol 1E-5, "mean squared error mismatch")
    assert(metrics.rootMeanSquaredError ~== 0.61237 absTol 1E-5,
      "root mean squared error mismatch")
    assert(metrics.r2 ~== 0.94861 absTol 1E-5, "r2 score mismatch")
  }

  test("regression metrics with complete fitting") {
    val predictionAndObservations = sc.parallelize(
      Seq((3.0, 3.0), (0.0, 0.0), (2.0, 2.0), (8.0, 8.0)), 2)
    val metrics = new RegressionMetrics(predictionAndObservations)
    assert(metrics.explainedVariance ~== 1.0 absTol 1E-5,
      "explained variance regression score mismatch")
    assert(metrics.meanAbsoluteError ~== 0.0 absTol 1E-5, "mean absolute error mismatch")
    assert(metrics.meanSquaredError ~== 0.0 absTol 1E-5, "mean squared error mismatch")
    assert(metrics.rootMeanSquaredError ~== 0.0 absTol 1E-5,
      "root mean squared error mismatch")
    assert(metrics.r2 ~== 1.0 absTol 1E-5, "r2 score mismatch")
  }
} 
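The same metrics can be computed outside a test for any model that yields (prediction, observation) pairs. A minimal sketch (the SparkContext sc and the hand-made pairs are assumptions):

// Hypothetical usage sketch: summarize prediction quality with RegressionMetrics.
import org.apache.spark.SparkContext
import org.apache.spark.mllib.evaluation.RegressionMetrics

object RegressionMetricsUsageSketch {
  def run(sc: SparkContext): Unit = {
    val predictionAndObservations = sc.parallelize(
      Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)), 2)
    val metrics = new RegressionMetrics(predictionAndObservations)
    println(s"RMSE = ${metrics.rootMeanSquaredError}")
    println(s"MAE  = ${metrics.meanAbsoluteError}")
    println(s"R2   = ${metrics.r2}")
  }
}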
Example 89
Source File: MulticlassMetricsSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.util.MLlibTestSparkContext

class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("Multiclass evaluation metrics") {
    
    val confusionMatrix = Matrices.dense(3, 3, Array(2, 1, 0, 1, 3, 0, 1, 0, 1))
    val labels = Array(0.0, 1.0, 2.0)
    val predictionAndLabels = sc.parallelize(
      Seq((0.0, 0.0), (0.0, 1.0), (0.0, 0.0), (1.0, 0.0), (1.0, 1.0),
        (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)), 2)
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val delta = 0.0000001
    val fpRate0 = 1.0 / (9 - 4)
    val fpRate1 = 1.0 / (9 - 4)
    val fpRate2 = 1.0 / (9 - 1)
    val precision0 = 2.0 / (2 + 1)
    val precision1 = 3.0 / (3 + 1)
    val precision2 = 1.0 / (1 + 1)
    val recall0 = 2.0 / (2 + 2)
    val recall1 = 3.0 / (3 + 1)
    val recall2 = 1.0 / (1 + 0)
    val f1measure0 = 2 * precision0 * recall0 / (precision0 + recall0)
    val f1measure1 = 2 * precision1 * recall1 / (precision1 + recall1)
    val f1measure2 = 2 * precision2 * recall2 / (precision2 + recall2)
    val f2measure0 = (1 + 2 * 2) * precision0 * recall0 / (2 * 2 * precision0 + recall0)
    val f2measure1 = (1 + 2 * 2) * precision1 * recall1 / (2 * 2 * precision1 + recall1)
    val f2measure2 = (1 + 2 * 2) * precision2 * recall2 / (2 * 2 * precision2 + recall2)

    assert(metrics.confusionMatrix.toArray.sameElements(confusionMatrix.toArray))
    assert(math.abs(metrics.falsePositiveRate(0.0) - fpRate0) < delta)
    assert(math.abs(metrics.falsePositiveRate(1.0) - fpRate1) < delta)
    assert(math.abs(metrics.falsePositiveRate(2.0) - fpRate2) < delta)
    assert(math.abs(metrics.precision(0.0) - precision0) < delta)
    assert(math.abs(metrics.precision(1.0) - precision1) < delta)
    assert(math.abs(metrics.precision(2.0) - precision2) < delta)
    assert(math.abs(metrics.recall(0.0) - recall0) < delta)
    assert(math.abs(metrics.recall(1.0) - recall1) < delta)
    assert(math.abs(metrics.recall(2.0) - recall2) < delta)
    assert(math.abs(metrics.fMeasure(0.0) - f1measure0) < delta)
    assert(math.abs(metrics.fMeasure(1.0) - f1measure1) < delta)
    assert(math.abs(metrics.fMeasure(2.0) - f1measure2) < delta)
    assert(math.abs(metrics.fMeasure(0.0, 2.0) - f2measure0) < delta)
    assert(math.abs(metrics.fMeasure(1.0, 2.0) - f2measure1) < delta)
    assert(math.abs(metrics.fMeasure(2.0, 2.0) - f2measure2) < delta)

    assert(math.abs(metrics.recall -
      (2.0 + 3.0 + 1.0) / ((2 + 3 + 1) + (1 + 1 + 1))) < delta)
    assert(math.abs(metrics.recall - metrics.precision) < delta)
    assert(math.abs(metrics.recall - metrics.fMeasure) < delta)
    assert(math.abs(metrics.recall - metrics.weightedRecall) < delta)
    assert(math.abs(metrics.weightedFalsePositiveRate -
      ((4.0 / 9) * fpRate0 + (4.0 / 9) * fpRate1 + (1.0 / 9) * fpRate2)) < delta)
    assert(math.abs(metrics.weightedPrecision -
      ((4.0 / 9) * precision0 + (4.0 / 9) * precision1 + (1.0 / 9) * precision2)) < delta)
    assert(math.abs(metrics.weightedRecall -
      ((4.0 / 9) * recall0 + (4.0 / 9) * recall1 + (1.0 / 9) * recall2)) < delta)
    assert(math.abs(metrics.weightedFMeasure -
      ((4.0 / 9) * f1measure0 + (4.0 / 9) * f1measure1 + (1.0 / 9) * f1measure2)) < delta)
    assert(math.abs(metrics.weightedFMeasure(2.0) -
      ((4.0 / 9) * f2measure0 + (4.0 / 9) * f2measure1 + (1.0 / 9) * f2measure2)) < delta)
    assert(metrics.labels.sameElements(labels))
  }
} 
Example 90
Source File: FPTreeSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.fpm

import scala.language.existentials

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class FPTreeSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("add transaction") {
    val tree = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("b"))

    assert(tree.root.children.size == 2)
    assert(tree.root.children.contains("a"))
    assert(tree.root.children("a").item.equals("a"))
    assert(tree.root.children("a").count == 2)
    assert(tree.root.children.contains("b"))
    assert(tree.root.children("b").item.equals("b"))
    assert(tree.root.children("b").count == 1)
    var child = tree.root.children("a")
    assert(child.children.size == 1)
    assert(child.children.contains("b"))
    assert(child.children("b").item.equals("b"))
    assert(child.children("b").count == 2)
    child = child.children("b")
    assert(child.children.size == 2)
    assert(child.children.contains("c"))
    assert(child.children.contains("y"))
    assert(child.children("c").item.equals("c"))
    assert(child.children("y").item.equals("y"))
    assert(child.children("c").count == 1)
    assert(child.children("y").count == 1)
  }

  test("merge tree") {
    val tree1 = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("b"))

    val tree2 = new FPTree[String]
      .add(Seq("a", "b"))
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "c", "d"))
      .add(Seq("a", "x"))
      .add(Seq("a", "x", "y"))
      .add(Seq("c", "n"))
      .add(Seq("c", "m"))

    val tree3 = tree1.merge(tree2)

    assert(tree3.root.children.size == 3)
    assert(tree3.root.children("a").count == 7)
    assert(tree3.root.children("b").count == 1)
    assert(tree3.root.children("c").count == 2)
    val child1 = tree3.root.children("a")
    assert(child1.children.size == 2)
    assert(child1.children("b").count == 5)
    assert(child1.children("x").count == 2)
    val child2 = child1.children("b")
    assert(child2.children.size == 2)
    assert(child2.children("y").count == 1)
    assert(child2.children("c").count == 3)
    val child3 = child2.children("c")
    assert(child3.children.size == 1)
    assert(child3.children("d").count == 1)
    val child4 = child1.children("x")
    assert(child4.children.size == 1)
    assert(child4.children("y").count == 1)
    val child5 = tree3.root.children("c")
    assert(child5.children.size == 2)
    assert(child5.children("n").count == 1)
    assert(child5.children("m").count == 1)
  }

  test("extract freq itemsets") {
    val tree = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("a", "b"))
      .add(Seq("a"))
      .add(Seq("b"))
      .add(Seq("b", "n"))

    val freqItemsets = tree.extract(3L).map { case (items, count) =>
      (items.toSet, count)
    }.toSet
    val expected = Set(
      (Set("a"), 4L),
      (Set("b"), 5L),
      (Set("a", "b"), 3L))
    assert(freqItemsets === expected)
  }
} 
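FPTree is an internal building block; the public entry point for frequent itemset mining is FPGrowth. A minimal sketch (the SparkContext sc, the transactions and the 0.5 support threshold are illustrative assumptions):

// Hypothetical usage sketch: mine frequent itemsets with FP-growth.
import org.apache.spark.SparkContext
import org.apache.spark.mllib.fpm.FPGrowth

object FPGrowthUsageSketch {
  def run(sc: SparkContext): Unit = {
    val transactions = sc.parallelize(Seq(
      Array("a", "b", "c"),
      Array("a", "b", "y"),
      Array("a", "b"),
      Array("b")))
    val model = new FPGrowth()
      .setMinSupport(0.5)   // keep itemsets present in at least half of the transactions
      .setNumPartitions(2)
      .run(transactions)
    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("{", ",", "}") + ": " + itemset.freq)
    }
  }
}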
Example 91
Source File: KernelDensitySuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
} 
Example 92
Source File: MultivariateGaussianSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.distribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{ Vectors, Matrices }
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("univariate") {
    val x1 = Vectors.dense(0.0)
    val x2 = Vectors.dense(1.5)

    val mu = Vectors.dense(0.0)
    val sigma1 = Matrices.dense(1, 1, Array(1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)

    val sigma2 = Matrices.dense(1, 1, Array(4.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
  }

  test("multivariate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)

    val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
  }

  test("multivariate degenerate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
    val dist = new MultivariateGaussian(mu, sigma)
    assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
    assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)
  }

  test("SPARK-11302") {
    val x = Vectors.dense(629, 640, 1.7188, 618.19)
    val mu = Vectors.dense(
      1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697)
    val sigma = Matrices.dense(4, 4, Array(
      166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053,
      169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484,
      12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373,
      164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207))
    val dist = new MultivariateGaussian(mu, sigma)
    // Agrees with R's dmvnorm: 7.154782e-05
    assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9)
  }

} 
Example 93
Source File: CoordinateMatrixSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.linalg.distributed

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.linalg.Vectors

class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {

  val m = 5
  val n = 4
  var mat: CoordinateMatrix = _

  override def beforeAll() {
    super.beforeAll()
    val entries = sc.parallelize(Seq(
      (0, 0, 1.0),
      (0, 1, 2.0),
      (1, 1, 3.0),
      (1, 2, 4.0),
      (2, 2, 5.0),
      (2, 3, 6.0),
      (3, 0, 7.0),
      (3, 3, 8.0),
      (4, 1, 9.0)), 3).map { case (i, j, value) =>
      MatrixEntry(i, j, value)
    }
    mat = new CoordinateMatrix(entries)
  }

  test("size") {
    assert(mat.numRows() === m)
    assert(mat.numCols() === n)
  }

  test("empty entries") {
    val entries = sc.parallelize(Seq[MatrixEntry](), 1)
    val emptyMat = new CoordinateMatrix(entries)
    intercept[RuntimeException] {
      emptyMat.numCols()
    }
    intercept[RuntimeException] {
      emptyMat.numRows()
    }
  }

  test("toBreeze") {
    val expected = BDM(
      (1.0, 2.0, 0.0, 0.0),
      (0.0, 3.0, 4.0, 0.0),
      (0.0, 0.0, 5.0, 6.0),
      (7.0, 0.0, 0.0, 8.0),
      (0.0, 9.0, 0.0, 0.0))
    assert(mat.toBreeze() === expected)
  }

  test("transpose") {
    val transposed = mat.transpose()
    assert(mat.toBreeze().t === transposed.toBreeze())
  }

  test("toIndexedRowMatrix") {
    val indexedRowMatrix = mat.toIndexedRowMatrix()
    val expected = BDM(
      (1.0, 2.0, 0.0, 0.0),
      (0.0, 3.0, 4.0, 0.0),
      (0.0, 0.0, 5.0, 6.0),
      (7.0, 0.0, 0.0, 8.0),
      (0.0, 9.0, 0.0, 0.0))
    assert(indexedRowMatrix.toBreeze() === expected)
  }

  test("toRowMatrix") {
    val rowMatrix = mat.toRowMatrix()
    val rows = rowMatrix.rows.collect().toSet
    val expected = Set(
      Vectors.dense(1.0, 2.0, 0.0, 0.0),
      Vectors.dense(0.0, 3.0, 4.0, 0.0),
      Vectors.dense(0.0, 0.0, 5.0, 6.0),
      Vectors.dense(7.0, 0.0, 0.0, 8.0),
      Vectors.dense(0.0, 9.0, 0.0, 0.0))
    assert(rows === expected)
  }

  test("toBlockMatrix") {
    val blockMat = mat.toBlockMatrix(2, 2)
    assert(blockMat.numRows() === m)
    assert(blockMat.numCols() === n)
    assert(blockMat.toBreeze() === mat.toBreeze())

    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(-1, 2)
    }
    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(2, 0)
    }
  }
} 
Example 94
Source File: MLPairRDDFunctionsSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.rdd.MLPairRDDFunctions._

class MLPairRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("topByKey") {
    val topMap = sc.parallelize(Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7), (5,
      1), (3, 5)), 2)
      .topByKey(5)
      .collectAsMap()

    assert(topMap.size === 3)
    assert(topMap(1) === Array(7, 6, 3, 2, 1))
    assert(topMap(3) === Array(7, 5, 2))
    assert(topMap(5) === Array(1))
  }
} 
Example 95
Source File: RDDFunctionsSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.rdd.RDDFunctions._

class RDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("sliding") {
    val data = 0 until 6
    for (numPartitions <- 1 to 8) {
      val rdd = sc.parallelize(data, numPartitions)
      for (windowSize <- 1 to 6) {
        val sliding = rdd.sliding(windowSize).collect().map(_.toList).toList
        val expected = data.sliding(windowSize).map(_.toList).toList
        assert(sliding === expected)
      }
      assert(rdd.sliding(7).collect().isEmpty,
        "Should return an empty RDD if the window size is greater than the number of items.")
    }
  }

  test("sliding with empty partitions") {
    val data = Seq(Seq(1, 2, 3), Seq.empty[Int], Seq(4), Seq.empty[Int], Seq(5, 6, 7))
    val rdd = sc.parallelize(data, data.length).flatMap(s => s)
    assert(rdd.partitions.size === data.length)
    val sliding = rdd.sliding(3).collect().toSeq.map(_.toSeq)
    val expected = data.flatMap(x => x).sliding(3).toSeq.map(_.toSeq)
    assert(sliding === expected)
  }
} 
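Outside the test, the sliding enrichment becomes available on any RDD once RDDFunctions is imported. A minimal sketch (sc and the window size of 3 are assumptions):

// Hypothetical usage sketch: windows of 3 consecutive elements across partitions.
import org.apache.spark.SparkContext
import org.apache.spark.mllib.rdd.RDDFunctions._

object SlidingUsageSketch {
  def run(sc: SparkContext): Unit = {
    val rdd = sc.parallelize(0 until 10, 2)
    rdd.sliding(3).collect().foreach(w => println(w.mkString("[", ",", "]")))
  }
}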
Example 96
Source File: VectorSlicerSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {//参数
    val slicer = new VectorSlicer
    ParamsSuite.checkParams(slicer)
    // no indices are selected by default
    assert(slicer.getIndices.length === 0)
    // no names are selected by default
    assert(slicer.getNames.length === 0)
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {
        slicer.validateParams()
      }
    }
  }

  test("feature validity checks") {//特征有效性检查
    import VectorSlicer._
    //如果给定的特征索引是有效的,返回true
    assert(validIndices(Array(0, 1, 8, 2)))
    assert(validIndices(Array.empty[Int]))
    assert(!validIndices(Array(-1)))
    assert(!validIndices(Array(1, 2, 1)))
    //如果给定的特征名称有效,返回true
    assert(validNames(Array("a", "b")))
    assert(validNames(Array.empty[String]))
    assert(!validNames(Array("", "b")))
    assert(!validNames(Array("a", "b", "a")))
  }

  test("Test vector slicer") {//测试向量机
    val sqlContext = new SQLContext(sc)

    val data = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())
    )

    // Expected output after selecting indices 1 and 4
    val expected = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]])

    val resultAttrs = Array("f1", "f4").map(defaultAttr.withName)
    val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]])

    val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) }
    
    val df = sqlContext.createDataFrame(rdd,
        StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField())))
    // VectorSlicer is a transformer that takes a feature vector and outputs a sub-array of the original features.
    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    def validateResults(df: DataFrame): Unit = {
      df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 === vec2)
      }
      val resultMetadata = AttributeGroup.fromStructField(df.schema("result"))
      val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected"))
      assert(resultMetadata.numAttributes === expectedMetadata.numAttributes)
      resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) =>
        assert(a === b)
      }
    }

    vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty)
    // transform() maps the input DataFrame to a new DataFrame with the sliced "result" column
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array(1)).setNames(Array("f4"))
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4"))
    validateResults(vectorSlicer.transform(df))
  }
} 
Example 97
Source File: DCTSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class DCTTestData(vec: Vector, wantedVec: Vector)

class DCTSuite extends SparkFunSuite with MLlibTestSparkContext {
  // forward DCT should match the jTransforms result
  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }
  // inverse DCT should match the jTransforms result
  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      (new DoubleDCT_1D(data.size)).inverse(expectedResultBuffer, true)
    } else {
      (new DoubleDCT_1D(data.size)).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = sqlContext.createDataFrame(Seq(
      DCTTestData(data, expectedResult)
    ))

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)
    // transform() maps the input DataFrame to a new DataFrame
    transformer.transform(dataset)
      .select("resultVec", "wantedVec")
      .collect()
      .foreach { case Row(resultVec: Vector, wantedVec: Vector) =>
      assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
    }
  }
} 
Example 98
Source File: StopWordsRemoverSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

object StopWordsRemoverSuite extends SparkFunSuite {
  def testStopWordsRemover(t: StopWordsRemover, dataset: DataFrame): Unit = {
    t.transform(dataset) // transform() maps the input DataFrame to a new DataFrame
      .select("filtered", "expected")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
    }
  }
}


class StopWordsRemoverSuite extends SparkFunSuite with MLlibTestSparkContext {
  import StopWordsRemoverSuite._

  // The original class body was truncated in this listing; this is a minimal
  // reconstruction exercising the default English stop-word list.
  test("StopWordsRemover default") {
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")
    val dataSet = sqlContext.createDataFrame(Seq(
      (Seq("test", "test"), Seq("test", "test")),
      (Seq("a", "b", "c", "d"), Seq("b", "c", "d")),
      (Seq("a", "the", "an"), Seq())
    )).toDF("raw", "expected")

    dataSet.show()
    testStopWordsRemover(remover, dataSet)
  }
} 
Example 99
Source File: TokenizerSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext {
  import RegexTokenizerSuite._

  test("RegexTokenizer") {
    // The class header and the first part of this test were truncated in this
    // listing; this is a minimal reconstruction of the remaining checks.
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    // With minTokenLength = 3, short tokens such as "Te" and "." are dropped.
    val dataset1 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ))

    dataset1.show()
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization.")),
      TokenizerTestData("Te,st.  punct", Array("Te,st.", "punct"))
    ))
    testRegexTokenizer(tokenizer2, dataset2)
  }
}

object RegexTokenizerSuite extends SparkFunSuite {

  def testRegexTokenizer(t: RegexTokenizer, dataset: DataFrame): Unit = {
    // transform() maps the input DataFrame to a new DataFrame
    t.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
} 
Example 100
Source File: MinMaxScalerSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.MLTestingUtils
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SQLContext}

        println(vector1+"|||"+vector2)
        assert(vector1.equals(vector2), "Transformed vector is different with expected.")
        }
    }

    // copied model must have the same parent.
    MLTestingUtils.checkCopy(model)
  }
  // MinMaxScaler linearly rescales all feature vectors into the user-specified [min, max] range
  test("MinMaxScaler arguments max must be larger than min") {
    withClue("arguments max must be larger than min") {
      intercept[IllegalArgumentException] {
        val scaler = new MinMaxScaler().setMin(10).setMax(0)
        scaler.validateParams()
      }
      intercept[IllegalArgumentException] {
        val scaler = new MinMaxScaler().setMin(0).setMax(0)
        scaler.validateParams()
      }
    }
  }
} 
Example 101
Source File: PolynomialExpansionSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.ml.param.ParamsSuite
import org.scalatest.exceptions.TestFailedException

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {//参数
    ParamsSuite.checkParams(new PolynomialExpansion)
  }

  test("Polynomial expansion with default parameter") {//带有默认参数的多项式展开
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())
    )

    val twoDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)),
      Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29),
      Vectors.dense(new Array[Double](9)),
      Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0),
      Vectors.sparse(9, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
    // transform() maps the input DataFrame to a new DataFrame
    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
    }
  }
  // Polynomial expansion with an explicitly set degree
  test("Polynomial expansion with setter") {
    val data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq())
    )

    val threeDegreeExpansion: Array[Vector] = Array(
      Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8),
        Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)),
      Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17),
      Vectors.dense(new Array[Double](19)),
      Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8,
        -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0),
      Vectors.sparse(19, Array.empty, Array.empty))

    val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected")

    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)
    // transform() maps the input DataFrame to a new DataFrame
    polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach {
      case Row(expanded: DenseVector, expected: DenseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case Row(expanded: SparseVector, expected: SparseVector) =>
        assert(expanded ~== expected absTol 1e-1)
      case _ =>
        throw new TestFailedException("Unmatched data types after polynomial expansion", 0)
    }
  }
} 
Example 102
Source File: IDFSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  // Element-wise product of each input vector with the given IDF vector, preserving
  // sparsity. The original helper and the first test were truncated in this listing;
  // this is a minimal reconstruction.
  def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = {
    dataSet.map {
      case data: DenseVector =>
        Vectors.dense(data.toArray.zip(model.toArray).map { case (x, y) => x * y })
      case data: SparseVector =>
        Vectors.sparse(data.size, data.indices.zip(data.values).map { case (i, v) =>
          (i, v * model(i))
        })
    }
  }

  test("compute IDF with default parameter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    )
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((numOfData + 1.0) / (x + 1.0))
    })
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")

    val idfModel = new IDF()
      .setInputCol("features")
      .setOutputCol("idfValue")
      .fit(df)

    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }

  test("compute IDF with setter") {//设置IDF计算
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    )
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0
    })
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")

    val idfModel = new IDF()
      .setInputCol("features")
      .setOutputCol("idfValue")
      .setMinDocFreq(1)
      .fit(df) // fit() turns the DataFrame into an IDFModel transformer
    // transform() maps the input DataFrame to a new DataFrame
    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }
} 
Example 103
Source File: NormalizerSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}


class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext {

  @transient var data: Array[Vector] = _
  @transient var dataFrame: DataFrame = _
  @transient var normalizer: Normalizer = _
  @transient var l1Normalized: Array[Vector] = _
  @transient var l2Normalized: Array[Vector] = _

  override def beforeAll(): Unit = {
    super.beforeAll()

    data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))),
      Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))),
      Vectors.sparse(3, Seq())
    )
     l1Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.12765957, -0.23404255, -0.63829787),
      Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))),
      Vectors.dense(0.625, 0.07894737, 0.29605263),
      Vectors.sparse(3, Seq())
    )
    l2Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.184549876, -0.3383414, -0.922749378),
      Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))),
      Vectors.dense(0.897906166, 0.113419726, 0.42532397),
      Vectors.sparse(3, Seq())
    )

    val sqlContext = new SQLContext(sc)
    dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(NormalizerSuite.FeatureData))
    normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized_features")
  }
  // Collect the normalized feature vectors from the result DataFrame
  def collectResult(result: DataFrame): Array[Vector] = {
    result.select("normalized_features").collect().map {
      case Row(features: Vector) => features
    }
  }
  // Assert that the vector type (dense/sparse) is preserved
  def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
    assert((lhs, rhs).zipped.forall {
      case (v1: DenseVector, v2: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after normalization.")
  }
  // Assert that the vector values match
  def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
    assert((lhs, rhs).zipped.forall { (vector1, vector2) =>
      vector1 ~== vector2 absTol 1E-5
    }, "The vector value is not correct after normalization.")
  }

  test("Normalization with default parameter") {//默认参数的正常化
  //transform()方法将DataFrame转化为另外一个DataFrame的算法
    normalizer.transform(dataFrame).show()
    val result = collectResult(normalizer.transform(dataFrame))

    assertTypeOfVector(data, result)

    assertValues(result, l2Normalized)
  }

  test("Normalization with setter") {//规范化设置
    normalizer.setP(1)
    //transform()方法将DataFrame转化为另外一个DataFrame的算法
    normalizer.transform(dataFrame).show()
    val result = collectResult(normalizer.transform(dataFrame))

    assertTypeOfVector(data, result)

    assertValues(result, l1Normalized)
  }
}

private object NormalizerSuite {
  case class FeatureData(features: Vector)
} 
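Normalizer also supports the max norm. A small hedged addition, not part of the original suite (the p value and the expected vector below are computed by hand from the third data row above), would exercise it like this:

  // Hypothetical extra check: L^infinity normalization divides by the largest absolute component.
  normalizer.setP(Double.PositiveInfinity)
  val lInfResult = collectResult(normalizer.transform(dataFrame))
  assertTypeOfVector(data, lInfResult)
  // Vectors.dense(0.6, -1.1, -3.0) has max-norm 3.0, so it becomes (0.2, -0.36666667, -1.0)
  assert(lInfResult(2) ~== Vectors.dense(0.2, -0.36666667, -1.0) absTol 1E-5)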
Example 104
Source File: RegressionEvaluatorSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {//评估回归:默认参数
    
    val trainer = new LinearRegression
    // fit() trains on the DataFrame and returns a Transformer (a LinearRegressionModel)
    val model = trainer.fit(dataset)
    // transform() scores the DataFrame and appends a prediction column
    val predictions = model.transform(dataset)
    predictions.collect()

    // default = rmse
    // the default metric is RMSE (root mean squared error)
    val evaluator = new RegressionEvaluator()
    println("==MetricName="+evaluator.getMetricName+"=LabelCol="+evaluator.getLabelCol+"=PredictionCol="+evaluator.getPredictionCol)
    //==MetricName=rmse=LabelCol=label=PredictionCol=prediction,默认rmse均方根误差说明样本的离散程度
    assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001)

    // r2 score
    // R2, also called the coefficient of determination, measures how well the model fits the data
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)

    // MAE (mean absolute error) is the average absolute difference between predictions and labels
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001)
  }
} 
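The `dataset` this test fits against is created in setup code that the excerpt leaves out. A rough sketch of that setup, assuming data produced with LinearDataGenerator (the generator arguments, field name, and the extra DataFrame import are assumptions, not the original source):

  import org.apache.spark.sql.DataFrame

  @transient var dataset: DataFrame = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    // arguments: intercept, weights, xMean, xVariance, nPoints, seed, eps (values are illustrative)
    val points = LinearDataGenerator.generateLinearInput(
      6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1)
    dataset = sqlContext.createDataFrame(sc.parallelize(points, 2))
  }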
Example 105
Source File: ANNSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.ann

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._


class ANNSuite extends SparkFunSuite with MLlibTestSparkContext {

  // TODO: test for weights comparison with Weka MLP
  // an ANN with sigmoid activations learns the XOR function using the LBFGS optimizer
  test("ANN with Sigmoid learns XOR function with LBFGS optimizer") {
    val inputs = Array(
      Array(0.0, 0.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0),
      Array(1.0, 1.0)
    )
    val outputs = Array(0.0, 1.0, 1.0, 0.0)
    val data = inputs.zip(outputs).map { case (features, label) =>
      (Vectors.dense(features), Vectors.dense(label))
    }
    val rddData = sc.parallelize(data, 1)
    val hiddenLayersTopology = Array(5)
    val dataSample = rddData.first()
    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
    val initialWeights = FeedForwardModel(topology, 23124).weights()
    val trainer = new FeedForwardTrainer(topology, 2, 1)
    // set the initial weights (generated above from a fixed seed)
    trainer.setWeights(initialWeights)
    trainer.LBFGSOptimizer.setNumIterations(20)
    val model = trainer.train(rddData)
    val predictionAndLabels = rddData.map { case (input, label) =>
      (model.predict(input)(0), label(0))
    }.collect()
    predictionAndLabels.foreach { case (p, l) =>
      assert(math.round(p) === l)
    }
  }
  // an ANN with softmax output learns the XOR function with a 2-bit output and the batch GD optimizer
  test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") {
    val inputs = Array(
      Array(0.0, 0.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0),
      Array(1.0, 1.0)
    )
    val outputs = Array(
      Array(1.0, 0.0),
      Array(0.0, 1.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0)
    )
    val data = inputs.zip(outputs).map { case (features, label) =>
      (Vectors.dense(features), Vectors.dense(label))
    }
    val rddData = sc.parallelize(data, 1)
    val hiddenLayersTopology = Array(5)
    val dataSample = rddData.first()
    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
    val initialWeights = FeedForwardModel(topology, 23124).weights()
    val trainer = new FeedForwardTrainer(topology, 2, 2)
    // SGD (stochastic gradient descent)
    trainer.SGDOptimizer.setNumIterations(2000)
    // set the initial weights (generated above from a fixed seed)
    trainer.setWeights(initialWeights)
    val model = trainer.train(rddData)
    val predictionAndLabels = rddData.map { case (input, label) =>
      (model.predict(input), label)
    }.collect()
    predictionAndLabels.foreach { case (p, l) =>
      assert(p ~== l absTol 0.5)
    }
  }
} 
Example 106
Source File: ChiSqSelectorSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext

  // feature selection with ChiSqSelector on sparse and dense vectors
  test("ChiSqSelector transform test (sparse & dense vector)") {
    val labeledDiscreteData = sc.parallelize( // labeled discrete data
      // a LabeledPoint pairs a label with a local vector, which may be dense or sparse
      Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2)
    val preFilteredData = // the expected result after keeping only the top feature
      Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(6.0))),
        LabeledPoint(1.0, Vectors.dense(Array(8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(5.0))))
    // fit() computes the chi-squared statistics and returns a ChiSqSelectorModel
    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
    val filteredData = labeledDiscreteData.map { lp =>
      // transform() keeps only the selected features of each vector
      LabeledPoint(lp.label, model.transform(lp.features))
    }.collect().toSet
    assert(filteredData == preFilteredData)
  }
} 
Example 107
Source File: ElementwiseProductSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext {
  // the elementwise (Hadamard) product should properly scale a dense data set
  test("elementwise (hadamard) product should properly apply vector to dense data set") {
    val denseData = Array(
      Vectors.dense(1.0, 4.0, 1.9, -9.0)
    )
    val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25)
    val transformer = new ElementwiseProduct(scalingVec)
    // apply the transformer to the whole dense data set at once
    val transformedData = transformer.transform(sc.makeRDD(denseData))
    val transformedVecs = transformedData.collect()
    val transformedVec = transformedVecs(0)
    val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25)
    assert(transformedVec ~== expectedVec absTol 1E-5,
      s"Expected transformed vector $expectedVec but found $transformedVec")
  }
  // the elementwise (Hadamard) product should properly scale a sparse data set
  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
    val sparseData = Array(
      Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
    )
    val dataRDD = sc.parallelize(sparseData, 3)
    val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
    val transformer = new ElementwiseProduct(scalingVec)
    // transforming each vector individually and transforming the whole RDD
    // should give the same result
    val data2 = sparseData.map(transformer.transform)
    val data2RDD = transformer.transform(dataRDD)

    assert((sparseData, data2, data2RDD.collect()).zipped.forall {
      case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after hadamard product")

    assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
    assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
  }
} 
Example 108
Source File: PCASuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.util.MLlibTestSparkContext

class PCASuite extends SparkFunSuite with MLlibTestSparkContext {

  private val data = Array(
    Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
    Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
    Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
  )

  private lazy val dataRDD = sc.parallelize(data, 2)

  test("Correct computing use a PCA wrapper") {//正确的计算使用一个主成分分析包装
    val k = dataRDD.count().toInt
    // fit() computes the principal components from the RDD
    val pca = new PCA(k).fit(dataRDD)
    // build a distributed RowMatrix from the same data
    val mat = new RowMatrix(dataRDD)
    // compute the principal components, reducing the dimensionality to k
    val pc = mat.computePrincipalComponents(k)
    // apply the PCA transformation to every row
    val pca_transform = pca.transform(dataRDD).collect()
    // multiply the row matrix by the principal components directly
    val mat_multiply = mat.multiply(pc).rows.collect()
    assert(pca_transform.toSet === mat_multiply.toSet)
  }
} 
Example 109
Source File: HashingTFSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext
// HashingTF hashes terms to feature indices and returns their term frequencies


class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("hashing tf on a single doc") {//散列在一个单一的文件
      
    val hashingTF = new HashingTF(1000)
    val doc = "a a b b c d".split(" ")
    val n = hashingTF.numFeatures
    // expected term frequencies
    val termFreqs = Seq(
      (hashingTF.indexOf("a"), 2.0),
      (hashingTF.indexOf("b"), 2.0),
      (hashingTF.indexOf("c"), 1.0),
      (hashingTF.indexOf("d"), 1.0))
    //termFreqs: Seq[(Int, Double)] = List((97,2.0), (98,2.0), (99,1.0), (100,1.0))
    assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n),
      "index must be in range [0, #features)")//索引必须在范围内
    assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing")//期待完美的哈希
    val expected = Vectors.sparse(n, termFreqs)
    // transform maps each input document to a sparse term-frequency Vector
    assert(hashingTF.transform(doc) === expected)
  }

  test("hashing tf on an RDD") {//散列TF在RDD
    val hashingTF = new HashingTF
    val localDocs: Seq[Seq[String]] = Seq(
      "a a b b b c d".split(" "),
      "a b c d a b c".split(" "),
      "c b a c b a a".split(" "))
    val docs = sc.parallelize(localDocs, 2)
    // transforming the RDD should give the same result as transforming each document locally
    assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet)
  }
} 
Example 110
Source File: ImpuritySuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.tree.impurity.{EntropyAggregator, GiniAggregator}
import org.apache.spark.mllib.util.MLlibTestSparkContext


class ImpuritySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("Gini impurity does not support negative labels") {//基尼杂质不支持负标签
    val gini = new GiniAggregator(2)
    intercept[IllegalArgumentException] {
      gini.update(Array(0.0, 1.0, 2.0), 0, -1, 0.0)
    }
  }

  test("Entropy does not support negative labels") {//熵不支持负标签
    val entropy = new EntropyAggregator(2)
    intercept[IllegalArgumentException] {
      entropy.update(Array(0.0, 1.0, 2.0), 0, -1, 0.0)
    }
  }
} 
Example 111
Source File: BaggedPointSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.tree.EnsembleTestHelper
import org.apache.spark.mllib.util.MLlibTestSparkContext


class BaggedPointSuite extends SparkFunSuite with MLlibTestSparkContext  {

  test("BaggedPoint RDD: without subsampling") {
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false, 42)
    baggedRDD.collect().foreach { baggedPoint =>
      assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1)
    }
  }

  test("BaggedPoint RDD: with subsampling with replacement (fraction = 1.0)") {
    val numSubsamples = 100
    val (expectedMean, expectedStddev) = (1.0, 1.0)

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      // epsilon is the tolerance used when comparing the observed statistics
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling with replacement (fraction = 0.5)") {
    val numSubsamples = 100
    val subsample = 0.5
    // with replacement, per-point counts are Poisson(subsample): mean = subsample, stddev = sqrt(subsample)
    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample))

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling without replacement (fraction = 1.0)") {
    val numSubsamples = 100
    val (expectedMean, expectedStddev) = (1.0, 0)

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling without replacement (fraction = 0.5)") {
    val numSubsamples = 100
    val subsample = 0.5
    // without replacement, per-point counts are Bernoulli(subsample): mean = subsample, stddev = sqrt(subsample * (1 - subsample))
    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample * (1 - subsample)))

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }
} 
Example 112
Source File: MatrixFactorizationModelSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.recommendation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils

    sqlContext.createDataFrame(prodFeatures).show()
  }

  test("constructor") {//构造函数
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    //预测得分,用户ID,产品ID
    println("========"+model.predict(0, 2))
    //17.0
    assert(model.predict(0, 2) ~== 17.0 relTol 1e-14)

    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(1, userFeatures, prodFeatures)
    }
    // user features with a mismatched rank should be rejected
    val userFeatures1 = sc.parallelize(Seq((0, Array(1.0)), (1, Array(3.0))))
    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(rank, userFeatures1, prodFeatures)
    }
    // product features with a mismatched rank should be rejected
    val prodFeatures1 = sc.parallelize(Seq((2, Array(5.0))))
    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(rank, userFeatures, prodFeatures1)
    }
  }

  test("save/load") {//保存/加载
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString
    def collect(features: RDD[(Int, Array[Double])]): Set[(Int, Seq[Double])] = {
      features.mapValues(_.toSeq).collect().toSet
    }
    try {
      model.save(sc, path)
      val newModel = MatrixFactorizationModel.load(sc, path)
      assert(newModel.rank === rank)
      // user features survive the round trip
      assert(collect(newModel.userFeatures) === collect(userFeatures))
      // product features survive the round trip
      assert(collect(newModel.productFeatures) === collect(prodFeatures))
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }

  test("batch predict API recommendProductsForUsers") {//批量预测API recommendproductsforusers
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val topK = 10
    // recommend the top K products for every user
    val recommendations = model.recommendProductsForUsers(topK).collectAsMap()

    assert(recommendations(0)(0).rating ~== 17.0 relTol 1e-14)
    assert(recommendations(1)(0).rating ~== 39.0 relTol 1e-14)
  }

  test("batch predict API recommendUsersForProducts") {
    
    // userFeatures holds the user factors, prodFeatures the product factors, and rank the
    // number of latent factors (commonly 10 to 200; more factors usually fit better)
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val topK = 10
    // recommend the top K users for every product
    val recommendations = model.recommendUsersForProducts(topK).collectAsMap()

    assert(recommendations(2)(0).user == 1)
    assert(recommendations(2)(0).rating ~== 39.0 relTol 1e-14)
    assert(recommendations(2)(1).user == 0)
    assert(recommendations(2)(1).rating ~== 17.0 relTol 1e-14)
  }
} 
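The fields rank, userFeatures and prodFeatures used throughout these tests come from setup code the excerpt drops. A sketch consistent with the asserted predictions (17.0 = 1*5 + 2*6 and 39.0 = 3*5 + 4*6); the exact values and field layout are assumptions:

  private val rank = 2
  private var userFeatures: RDD[(Int, Array[Double])] = _
  private var prodFeatures: RDD[(Int, Array[Double])] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    // two users with 2-dimensional factors, one product with a 2-dimensional factor
    userFeatures = sc.parallelize(Seq((0, Array(1.0, 2.0)), (1, Array(3.0, 4.0))))
    prodFeatures = sc.parallelize(Seq((2, Array(5.0, 6.0))))
  }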
Example 113
Source File: AreaUnderCurveSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class AreaUnderCurveSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("auc computation") {//AUC计算
    //曲线
    val curve = Seq((0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 0.0))
    val auc = 4.0
    assert(AreaUnderCurve.of(curve) ~== auc absTol 1E-5) // 1E-5 means 1 x 10^-5, i.e. 0.00001
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== auc absTol 1E-5)
  }

  test("auc of an empty curve") {//AUC空曲线
    //曲线
    val curve = Seq.empty[(Double, Double)]
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }

  test("auc of a curve with a single point") {//单点与曲线的AUC
    val curve = Seq((1.0, 1.0))
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }
} 
Example 114
Source File: FPTreeSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.fpm

import scala.language.existentials

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class FPTreeSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("add transaction") {//增加转换
    val tree = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("b"))

    assert(tree.root.children.size == 2)
    assert(tree.root.children.contains("a"))
    assert(tree.root.children("a").item.equals("a"))
    assert(tree.root.children("a").count == 2)
    assert(tree.root.children.contains("b"))
    assert(tree.root.children("b").item.equals("b"))
    assert(tree.root.children("b").count == 1)
    var child = tree.root.children("a")
    assert(child.children.size == 1)
    assert(child.children.contains("b"))
    assert(child.children("b").item.equals("b"))
    assert(child.children("b").count == 2)
    child = child.children("b")
    assert(child.children.size == 2)
    assert(child.children.contains("c"))
    assert(child.children.contains("y"))
    assert(child.children("c").item.equals("c"))
    assert(child.children("y").item.equals("y"))
    assert(child.children("c").count == 1)
    assert(child.children("y").count == 1)
  }

  test("merge tree") {//合并树
    val tree1 = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("b"))

    val tree2 = new FPTree[String]
      .add(Seq("a", "b"))
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "c", "d"))
      .add(Seq("a", "x"))
      .add(Seq("a", "x", "y"))
      .add(Seq("c", "n"))
      .add(Seq("c", "m"))

    val tree3 = tree1.merge(tree2)

    assert(tree3.root.children.size == 3)
    assert(tree3.root.children("a").count == 7)
    assert(tree3.root.children("b").count == 1)
    assert(tree3.root.children("c").count == 2)
    val child1 = tree3.root.children("a")
    assert(child1.children.size == 2)
    assert(child1.children("b").count == 5)
    assert(child1.children("x").count == 2)
    val child2 = child1.children("b")
    assert(child2.children.size == 2)
    assert(child2.children("y").count == 1)
    assert(child2.children("c").count == 3)
    val child3 = child2.children("c")
    assert(child3.children.size == 1)
    assert(child3.children("d").count == 1)
    val child4 = child1.children("x")
    assert(child4.children.size == 1)
    assert(child4.children("y").count == 1)
    val child5 = tree3.root.children("c")
    assert(child5.children.size == 2)
    assert(child5.children("n").count == 1)
    assert(child5.children("m").count == 1)
  }

  test("extract freq itemsets") {//频繁项集的提取物
    val tree = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("a", "b"))
      .add(Seq("a"))
      .add(Seq("b"))
      .add(Seq("b", "n"))

    val freqItemsets = tree.extract(3L).map { case (items, count) =>
      (items.toSet, count)
    }.toSet
    val expected = Set(
      (Set("a"), 4L),
      (Set("b"), 5L),
      (Set("a", "b"), 3L))
    assert(freqItemsets === expected)
  }
} 
Example 115
Source File: AssociationRulesSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.fpm

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
// frequent pattern mining: association rule generation
class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("association rules using String type") {//使用字符串类型的关联规则
    val freqItemsets = sc.parallelize(Seq(//频繁项集
      (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L),
      (Set("r"), 3L),
      (Set("x", "z"), 3L), (Set("t", "y"), 3L), (Set("t", "x"), 3L), (Set("s", "x"), 3L),
      (Set("y", "x"), 3L), (Set("y", "z"), 3L), (Set("t", "z"), 3L),
      (Set("y", "x", "z"), 3L), (Set("t", "x", "z"), 3L), (Set("t", "y", "z"), 3L),
      (Set("t", "y", "x"), 3L),
      (Set("t", "y", "x", "z"), 3L)
    ).map {
      case (items, freq) => new FPGrowth.FreqItemset(items.toArray, freq)
    })
    // build the association-rules generator
    val ar = new AssociationRules()

    val results1 = ar
      .setMinConfidence(0.9)
      .run(freqItemsets)
      .collect()

    
    assert(results2.size === 30)
    // 23 of the 30 rules have confidence exactly 1.0
    assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
  }
} 
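`results2` is asserted but never defined in the excerpt. A plausible sketch consistent with the assertions (the 0.0 threshold is an assumption): lowering minConfidence keeps every generated rule, 30 in total.

    // hypothetical second run with no confidence filtering
    val results2 = ar
      .setMinConfidence(0.0)
      .run(freqItemsets)
      .collect()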
Example 116
Source File: KernelDensitySuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {//核密度单样本
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    // compare against the analytic normal density within a small tolerance
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {//核密度多样本
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    // the estimate should match the average of the two component densities
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
} 
Example 117
Source File: MultivariateGaussianSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.distribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{ Vectors, Matrices }
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("univariate") {//单变量
    val x1 = Vectors.dense(0.0)
    val x2 = Vectors.dense(1.5)

    val mu = Vectors.dense(0.0)
    // 1x1 covariance matrix
    val sigma1 = Matrices.dense(1, 1, Array(1.0))
    // multivariate Gaussian distribution
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)

    val sigma2 = Matrices.dense(1, 1, Array(4.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
  }

  test("multivariate") {//多变量
    val x1 = Vectors.dense(0.0, 0.0)//创建密集向量
    val x2 = Vectors.dense(1.0, 1.0)//创建密集向量

    val mu = Vectors.dense(0.0, 0.0)//创建密集向量
    val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)

    val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
  }

  test("multivariate degenerate") {//多元退化
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
    val dist = new MultivariateGaussian(mu, sigma)
    assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
    assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)
  }

  test("SPARK-11302") {
    val x = Vectors.dense(629, 640, 1.7188, 618.19)
    val mu = Vectors.dense(
      1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697)
    val sigma = Matrices.dense(4, 4, Array(
      166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053,
      169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484,
      12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373,
      164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207))
    val dist = new MultivariateGaussian(mu, sigma)
    // Agrees with R's dmvnorm: 7.154782e-05
    assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9)
  }

} 
Example 118
Source File: CoordinateMatrixSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.linalg.distributed

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.linalg.Vectors

    val blockMat = mat.toBlockMatrix(2, 2)
    assert(blockMat.numRows() === m)
    assert(blockMat.numCols() === n)
    assert(blockMat.toBreeze() === mat.toBreeze())

    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(-1, 2)
    }
    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(2, 0)
    }
  }
} 
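Only the tail of this suite survives the excerpt; m, n and mat come from elided setup. A sketch of how such a CoordinateMatrix is typically built (the dimensions and entry values are illustrative, not the original test data):

  private val m = 5 // number of rows
  private val n = 4 // number of columns
  private var mat: CoordinateMatrix = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    // a sparse 5 x 4 matrix given as (row, column, value) entries
    val entries = sc.parallelize(Seq(
      MatrixEntry(0, 0, 1.0), MatrixEntry(1, 1, 3.0), MatrixEntry(2, 3, 6.0),
      MatrixEntry(3, 0, 7.0), MatrixEntry(4, 1, 9.0)), 2)
    mat = new CoordinateMatrix(entries, m, n)
  }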
Example 119
Source File: MLPairRDDFunctionsSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.rdd.MLPairRDDFunctions._

class MLPairRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("topByKey") {
    
    val topMap = sc.parallelize(Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7), (5,
      1), (3, 5)), 2)
      // keep at most the top 5 values per key
      .topByKey(5)
      // collect the result as a map keyed by the original key
      .collectAsMap()

    assert(topMap.size === 3)
    assert(topMap(1) === Array(7, 6, 3, 2, 1))
    assert(topMap(3) === Array(7, 5, 2))
    assert(topMap(5) === Array(1))
  }
} 
Example 120
Source File: RDDFunctionsSuite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.rdd.RDDFunctions._

class RDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("sliding") {//滑动
    val data = 0 until 6
    for (numPartitions <- 1 to 8) {
      val rdd = sc.parallelize(data, numPartitions)
      for (windowSize <- 1 to 6) {
        val sliding = rdd.sliding(windowSize).collect().map(_.toList).toList
        val expected = data.sliding(windowSize).map(_.toList).toList
        assert(sliding === expected)
      }
      assert(rdd.sliding(7).collect().isEmpty,
        "Should return an empty RDD if the window size is greater than the number of items.")
    }
  }

  test("sliding with empty partitions") {//带空分区的滑动
    val data = Seq(Seq(1, 2, 3), Seq.empty[Int], Seq(4), Seq.empty[Int], Seq(5, 6, 7))
    // Array(1, 2, 3, 4, 5, 6, 7)
    val rdd = sc.parallelize(data, data.length).flatMap(s => s)
    //data.length = 5
    assert(rdd.partitions.size === data.length)
    
    // slide a window of size 3 over the data
    val sliding = rdd.sliding(3).collect().toSeq.map(_.toSeq)
    //expected: Seq[Seq[Int]] = Stream(List(1, 2, 3), ?)
    val expected = data.flatMap(x => x).sliding(3).toSeq.map(_.toSeq)
    assert(sliding === expected)
  }
} 
Example 121
Source File: BaggedPointSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.tree.EnsembleTestHelper
import org.apache.spark.mllib.util.MLlibTestSparkContext


class BaggedPointSuite extends SparkFunSuite with MLlibTestSparkContext  {

  test("BaggedPoint RDD: without subsampling") {
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false, 42)
    baggedRDD.collect().foreach { baggedPoint =>
      assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1)
    }
  }

  test("BaggedPoint RDD: with subsampling with replacement (fraction = 1.0)") {
    val numSubsamples = 100
    val (expectedMean, expectedStddev) = (1.0, 1.0)

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling with replacement (fraction = 0.5)") {
    val numSubsamples = 100
    val subsample = 0.5
    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample))

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling without replacement (fraction = 1.0)") {
    val numSubsamples = 100
    val (expectedMean, expectedStddev) = (1.0, 0)

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling without replacement (fraction = 0.5)") {
    val numSubsamples = 100
    val subsample = 0.5
    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample * (1 - subsample)))

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }
} 
Example 122
Source File: GradientBoostedTreesSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.mllib.tree.{GradientBoostedTreesSuite => OldGBTSuite}
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Variance
import org.apache.spark.mllib.tree.loss.{AbsoluteError, LogLoss, SquaredError}
import org.apache.spark.mllib.util.MLlibTestSparkContext


class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  import testImplicits._

  test("runWithValidation stops early and performs better on a validation dataset") {
    // Set numIterations large enough so that it stops early.
    val numIterations = 20
    val trainRdd = sc.parallelize(OldGBTSuite.trainData, 2).map(_.asML)
    val validateRdd = sc.parallelize(OldGBTSuite.validateData, 2).map(_.asML)
    val trainDF = trainRdd.toDF()
    val validateDF = validateRdd.toDF()

    val algos = Array(Regression, Regression, Classification)
    val losses = Array(SquaredError, AbsoluteError, LogLoss)
    algos.zip(losses).foreach { case (algo, loss) =>
      val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2,
        categoricalFeaturesInfo = Map.empty)
      val boostingStrategy =
        new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0)
      val (validateTrees, validateTreeWeights) = GradientBoostedTrees
        .runWithValidation(trainRdd, validateRdd, boostingStrategy, 42L, "all")
      val numTrees = validateTrees.length
      assert(numTrees !== numIterations)

      // Test that it performs better on the validation dataset.
      val (trees, treeWeights) = GradientBoostedTrees.run(trainRdd, boostingStrategy, 42L, "all")
      val (errorWithoutValidation, errorWithValidation) = {
        if (algo == Classification) {
          val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
          (GradientBoostedTrees.computeError(remappedRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(remappedRdd, validateTrees,
              validateTreeWeights, loss))
        } else {
          (GradientBoostedTrees.computeError(validateRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(validateRdd, validateTrees,
              validateTreeWeights, loss))
        }
      }
      assert(errorWithValidation <= errorWithoutValidation)

      // Test that results from evaluateEachIteration comply with runWithValidation.
      // Note that convergenceTol is set to 0.0
      val evaluationArray = GradientBoostedTrees
        .evaluateEachIteration(validateRdd, trees, treeWeights, loss, algo)
      assert(evaluationArray.length === numIterations)
      assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1))
      var i = 1
      while (i < numTrees) {
        assert(evaluationArray(i) <= evaluationArray(i - 1))
        i += 1
      }
    }
  }

} 
Example 123
Source File: TopByKeyAggregatorSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.recommendation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Dataset


class TopByKeyAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {

  private def getTopK(k: Int): Dataset[(Int, Array[(Int, Float)])] = {
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._

    val topKAggregator = new TopByKeyAggregator[Int, Int, Float](k, Ordering.by(_._2))
    Seq(
      (0, 3, 54f),
      (0, 4, 44f),
      (0, 5, 42f),
      (0, 6, 28f),
      (1, 3, 39f),
      (2, 3, 51f),
      (2, 5, 45f),
      (2, 6, 18f)
    ).toDS().groupByKey(_._1).agg(topKAggregator.toColumn)
  }

  test("topByKey with k < #items") {
    val topK = getTopK(2)
    assert(topK.count() === 3)

    val expected = Map(
      0 -> Array((3, 54f), (4, 44f)),
      1 -> Array((3, 39f)),
      2 -> Array((3, 51f), (5, 45f))
    )
    checkTopK(topK, expected)
  }

  test("topByKey with k > #items") {
    val topK = getTopK(5)
    assert(topK.count() === 3)

    val expected = Map(
      0 -> Array((3, 54f), (4, 44f), (5, 42f), (6, 28f)),
      1 -> Array((3, 39f)),
      2 -> Array((3, 51f), (5, 45f), (6, 18f))
    )
    checkTopK(topK, expected)
  }

  private def checkTopK(
      topK: Dataset[(Int, Array[(Int, Float)])],
      expected: Map[Int, Array[(Int, Float)]]): Unit = {
    topK.collect().foreach { case (id, recs) => assert(recs === expected(id)) }
  }
} 
Example 124
Source File: BinaryClassificationEvaluatorSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class BinaryClassificationEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new BinaryClassificationEvaluator)
  }

  test("read/write") {
    val evaluator = new BinaryClassificationEvaluator()
      .setRawPredictionCol("myRawPrediction")
      .setLabelCol("myLabel")
      .setMetricName("areaUnderPR")
    testDefaultReadWrite(evaluator)
  }

  test("should accept both vector and double raw prediction col") {
    val evaluator = new BinaryClassificationEvaluator()
      .setMetricName("areaUnderPR")

    val vectorDF = Seq(
      (0d, Vectors.dense(12, 2.5)),
      (1d, Vectors.dense(1, 3)),
      (0d, Vectors.dense(10, 2))
    ).toDF("label", "rawPrediction")
    assert(evaluator.evaluate(vectorDF) === 1.0)

    val doubleDF = Seq(
      (0d, 0d),
      (1d, 1d),
      (0d, 0d)
    ).toDF("label", "rawPrediction")
    assert(evaluator.evaluate(doubleDF) === 1.0)

    val stringDF = Seq(
      (0d, "0d"),
      (1d, "1d"),
      (0d, "0d")
    ).toDF("label", "rawPrediction")
    val thrown = intercept[IllegalArgumentException] {
      evaluator.evaluate(stringDF)
    }
    assert(thrown.getMessage.replace("\n", "") contains "Column rawPrediction must be of type " +
      "equal to one of the following types: [DoubleType, ")
    assert(thrown.getMessage.replace("\n", "") contains "but was actually of type StringType.")
  }

  test("should support all NumericType labels and not support other types") {
    val evaluator = new BinaryClassificationEvaluator().setRawPredictionCol("prediction")
    MLTestingUtils.checkNumericTypes(evaluator, spark)
  }
} 
Example 125
Source File: MulticlassClassificationEvaluatorSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class MulticlassClassificationEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new MulticlassClassificationEvaluator)
  }

  test("read/write") {
    val evaluator = new MulticlassClassificationEvaluator()
      .setPredictionCol("myPrediction")
      .setLabelCol("myLabel")
      .setMetricName("accuracy")
    testDefaultReadWrite(evaluator)
  }

  test("should support all NumericType labels and not support other types") {
    MLTestingUtils.checkNumericTypes(new MulticlassClassificationEvaluator, spark)
  }
} 
Example 126
Source File: ClusteringEvaluatorSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Dataset


class ClusteringEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  @transient var irisDataset: Dataset[_] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    irisDataset = spark.read.format("libsvm").load("../data/mllib/iris_libsvm.txt")
  }

  test("params") {
    ParamsSuite.checkParams(new ClusteringEvaluator)
  }

  test("read/write") {
    val evaluator = new ClusteringEvaluator()
      .setPredictionCol("myPrediction")
      .setFeaturesCol("myLabel")
    testDefaultReadWrite(evaluator)
  }

  
  test("squared euclidean Silhouette") {
    val evaluator = new ClusteringEvaluator()
        .setFeaturesCol("features")
        .setPredictionCol("label")

    assert(evaluator.evaluate(irisDataset) ~== 0.6564679231 relTol 1e-5)
  }

  test("number of clusters must be greater than one") {
    val singleClusterDataset = irisDataset.where($"label" === 0.0)
    val evaluator = new ClusteringEvaluator()
      .setFeaturesCol("features")
      .setPredictionCol("label")

    val e = intercept[AssertionError]{
      evaluator.evaluate(singleClusterDataset)
    }
    assert(e.getMessage.contains("Number of clusters must be greater than one"))
  }

} 
Example 127
Source File: RegressionEvaluatorSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {
    
    val trainer = new LinearRegression
    val model = trainer.fit(dataset)
    val predictions = model.transform(dataset)

    // default = rmse
    val evaluator = new RegressionEvaluator()
    assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01)

    // r2 score
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01)

    // mae
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01)
  }

  test("read/write") {
    val evaluator = new RegressionEvaluator()
      .setPredictionCol("myPrediction")
      .setLabelCol("myLabel")
      .setMetricName("r2")
    testDefaultReadWrite(evaluator)
  }

  test("should support all NumericType labels and not support other types") {
    MLTestingUtils.checkNumericTypes(new RegressionEvaluator, spark)
  }
} 
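As in the earlier 1.x version of this suite, `dataset` comes from setup code that the excerpt omits. A rough sketch, assuming LinearDataGenerator plus the implicit toDF conversion already imported above (the generator arguments and the extra DataFrame import are assumptions):

  import org.apache.spark.sql.DataFrame

  @transient var dataset: DataFrame = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    // arguments: intercept, weights, xMean, xVariance, nPoints, seed, eps (values are illustrative)
    dataset = LinearDataGenerator.generateLinearInput(
      6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1)
      .map(_.asML).toDF()
  }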
Example 128
Source File: RWrapperUtilsSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.r

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.{RFormula, RFormulaModel}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class RWrapperUtilsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("avoid libsvm data column name conflicting") {
    val rFormula = new RFormula().setFormula("label ~ features")
    val data = spark.read.format("libsvm").load("../data/mllib/sample_libsvm_data.txt")

    // if not checking column name, then IllegalArgumentException
    intercept[IllegalArgumentException] {
      rFormula.fit(data)
    }

    // after checking, model build is ok
    RWrapperUtils.checkDataColumns(rFormula, data)

    assert(rFormula.getLabelCol == "label")
    assert(rFormula.getFeaturesCol.startsWith("features_"))

    val model = rFormula.fit(data)
    assert(model.isInstanceOf[RFormulaModel])

    assert(model.getLabelCol == "label")
    assert(model.getFeaturesCol.startsWith("features_"))
  }

} 
Example 129
Source File: PredictorSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasWeightCol
import org.apache.spark.ml.util._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext {

  import PredictorSuite._

  test("should support all NumericType labels and weights, and not support other types") {
    val df = spark.createDataFrame(Seq(
      (0, 1, Vectors.dense(0, 2, 3)),
      (1, 2, Vectors.dense(0, 3, 9)),
      (0, 3, Vectors.dense(0, 2, 6))
    )).toDF("label", "weight", "features")

    val types =
      Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0))

    val predictor = new MockPredictor().setWeightCol("weight")

    types.foreach { t =>
      predictor.fit(df.select(col("label").cast(t), col("weight").cast(t), col("features")))
    }

    intercept[IllegalArgumentException] {
      predictor.fit(df.select(col("label").cast(StringType), col("weight"), col("features")))
    }

    intercept[IllegalArgumentException] {
      predictor.fit(df.select(col("label"), col("weight").cast(StringType), col("features")))
    }
  }
}

object PredictorSuite {

  class MockPredictor(override val uid: String)
    extends Predictor[Vector, MockPredictor, MockPredictionModel] with HasWeightCol {

    def this() = this(Identifiable.randomUID("mockpredictor"))

    def setWeightCol(value: String): this.type = set(weightCol, value)

    override def train(dataset: Dataset[_]): MockPredictionModel = {
      require(dataset.schema("label").dataType == DoubleType)
      require(dataset.schema("weight").dataType == DoubleType)
      new MockPredictionModel(uid)
    }

    override def copy(extra: ParamMap): MockPredictor =
      throw new NotImplementedError()
  }

  class MockPredictionModel(override val uid: String)
    extends PredictionModel[Vector, MockPredictionModel] {

    def this() = this(Identifiable.randomUID("mockpredictormodel"))

    override def predict(features: Vector): Double =
      throw new NotImplementedError()

    override def copy(extra: ParamMap): MockPredictionModel =
      throw new NotImplementedError()
  }
} 
Example 130
Source File: CorrelationSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.stat

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vectors}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}


class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  val xData = Array(1.0, 0.0, -2.0)
  val yData = Array(4.0, 5.0, 3.0)
  val zeros = new Array[Double](3)
  val data = Seq(
    Vectors.dense(1.0, 0.0, 0.0, -2.0),
    Vectors.dense(4.0, 5.0, 0.0, 3.0),
    Vectors.dense(6.0, 7.0, 0.0, 8.0),
    Vectors.dense(9.0, 0.0, 0.0, 1.0)
  )

  private def X = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

  private def extract(df: DataFrame): BDM[Double] = {
    val Array(Row(mat: Matrix)) = df.collect()
    mat.asBreeze.toDenseMatrix
  }


  test("corr(X) default, pearson") {
    val defaultMat = Correlation.corr(X, "features")
    val pearsonMat = Correlation.corr(X, "features", "pearson")
    // scalastyle:off
    val expected = Matrices.fromBreeze(BDM(
      (1.00000000, 0.05564149, Double.NaN, 0.4004714),
      (0.05564149, 1.00000000, Double.NaN, 0.9135959),
      (Double.NaN, Double.NaN, 1.00000000, Double.NaN),
      (0.40047142, 0.91359586, Double.NaN, 1.0000000)))
    // scalastyle:on

    assert(Matrices.fromBreeze(extract(defaultMat)) ~== expected absTol 1e-4)
    assert(Matrices.fromBreeze(extract(pearsonMat)) ~== expected absTol 1e-4)
  }

  test("corr(X) spearman") {
    val spearmanMat = Correlation.corr(X, "features", "spearman")
    // scalastyle:off
    val expected = Matrices.fromBreeze(BDM(
      (1.0000000,  0.1054093,  Double.NaN, 0.4000000),
      (0.1054093,  1.0000000,  Double.NaN, 0.9486833),
      (Double.NaN, Double.NaN, 1.00000000, Double.NaN),
      (0.4000000,  0.9486833,  Double.NaN, 1.0000000)))
    // scalastyle:on
    assert(Matrices.fromBreeze(extract(spearmanMat)) ~== expected absTol 1e-4)
  }

} 
Example 131
Source File: ChiSquareTestSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.stat

import java.util.Random

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.stat.test.ChiSqTest
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ChiSquareTestSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("test DataFrame of labeled points") {
    // labels: 1.0 (2 / 6), 0.0 (4 / 6)
    // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6)
    // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6)
    val data = Seq(
      LabeledPoint(0.0, Vectors.dense(0.5, 10.0)),
      LabeledPoint(0.0, Vectors.dense(1.5, 20.0)),
      LabeledPoint(1.0, Vectors.dense(1.5, 30.0)),
      LabeledPoint(0.0, Vectors.dense(3.5, 30.0)),
      LabeledPoint(0.0, Vectors.dense(3.5, 40.0)),
      LabeledPoint(1.0, Vectors.dense(3.5, 40.0)))
    for (numParts <- List(2, 4, 6, 8)) {
      val df = spark.createDataFrame(sc.parallelize(data, numParts))
      val chi = ChiSquareTest.test(df, "features", "label")
      val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) =
        chi.select("pValues", "degreesOfFreedom", "statistics")
          .as[(Vector, Array[Int], Vector)].head()
      assert(pValues ~== Vectors.dense(0.6873, 0.6823) relTol 1e-4)
      assert(degreesOfFreedom === Array(2, 3))
      assert(statistics ~== Vectors.dense(0.75, 1.5) relTol 1e-4)
    }
  }

  test("large number of features (SPARK-3087)") {
    // Test that the right number of results is returned
    val numCols = 1001
    val sparseData = Array(
      LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))),
      LabeledPoint(0.1, Vectors.sparse(numCols, Seq((200, 1.0)))))
    val df = spark.createDataFrame(sparseData)
    val chi = ChiSquareTest.test(df, "features", "label")
    val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) =
      chi.select("pValues", "degreesOfFreedom", "statistics")
        .as[(Vector, Array[Int], Vector)].head()
    assert(pValues.size === numCols)
    assert(degreesOfFreedom.length === numCols)
    assert(statistics.size === numCols)
    assert(pValues(1000) !== null)  // SPARK-3087
  }

  test("fail on continuous features or labels") {
    val tooManyCategories: Int = 100000
    assert(tooManyCategories > ChiSqTest.maxCategories, "This unit test requires that " +
      "tooManyCategories be large enough to cause ChiSqTest to throw an exception.")

    val random = new Random(11L)
    val continuousLabel = Seq.fill(tooManyCategories)(
      LabeledPoint(random.nextDouble(), Vectors.dense(random.nextInt(2))))
    withClue("ChiSquare should throw an exception when given a continuous-valued label") {
      intercept[SparkException] {
        val df = spark.createDataFrame(continuousLabel)
        ChiSquareTest.test(df, "features", "label")
      }
    }
    val continuousFeature = Seq.fill(tooManyCategories)(
      LabeledPoint(random.nextInt(2), Vectors.dense(random.nextDouble())))
    withClue("ChiSquare should throw an exception when given continuous-valued features") {
      intercept[SparkException] {
        val df = spark.createDataFrame(continuousFeature)
        ChiSquareTest.test(df, "features", "label")
      }
    }
  }
} 
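Outside a test harness, ChiSquareTest.test can be driven the same way from a plain SparkSession. A minimal sketch follows; the object name and toy data are assumptions for illustration, and the result columns (pValues, degreesOfFreedom, statistics) are the same ones read in the suite above.

import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.ChiSquareTest
import org.apache.spark.sql.SparkSession

object ChiSquareSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("chi-square-sketch").getOrCreate()
    import spark.implicits._

    // Toy categorical data: each feature value and label is treated as a category.
    val df = Seq(
      LabeledPoint(0.0, Vectors.dense(0.5, 10.0)),
      LabeledPoint(1.0, Vectors.dense(1.5, 30.0)),
      LabeledPoint(0.0, Vectors.dense(3.5, 40.0))
    ).toDF()

    // One p-value, degrees-of-freedom entry and statistic per feature column.
    val result = ChiSquareTest.test(df, "features", "label").head()
    println(s"pValues = ${result.getAs[Vector]("pValues")}")
    println(s"degreesOfFreedom = ${result.getAs[Seq[Int]]("degreesOfFreedom").mkString("[", ",", "]")}")
    println(s"statistics = ${result.getAs[Vector]("statistics")}")

    spark.stop()
  }
}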
Example 132
Source File: StopwatchSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.ml.util

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    assert(sw.name === "sw")
    assert(sw.elapsed() === 0L)
    assert(!sw.isRunning)
    intercept[AssertionError] {
      sw.stop()
    }
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    sw.start()
    assert(sw.isRunning)
    intercept[AssertionError] {
      sw.start()
    }
  }

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      acc.add(checkStopwatch(sw))
    }
    assert(!sw.isRunning)
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)
  }

  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
      .addLocal("local")
      .addDistributed("spark")
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
      sw("some")
    }
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      sw("local").start()
      val duration = checkStopwatch(sw("spark"))
      sw("local").stop()
      acc.add(duration)
    }
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)
  }
}

private object StopwatchSuite extends SparkFunSuite {

  // The example extractor dropped the helper that the tests above call. The version below is
  // reconstructed from the Spark source (an approximation, not guaranteed verbatim): it runs the
  // stopwatch around a short random sleep and sanity-checks the duration it reports.
  def checkStopwatch(sw: Stopwatch): Long = {
    val ubStart = now
    sw.start()
    val lbStart = now
    Thread.sleep(new Random().nextInt(10))
    val lb = now - lbStart
    sw.stop()
    val ub = now - ubStart
    val duration = sw.elapsed()
    assert(duration >= lb && duration <= ub)
    duration
  }

  private def now: Long = System.currentTimeMillis()
} 
Example 133
Source File: RDDLossFunctionSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.ml.optim.loss

import org.apache.spark.SparkFunSuite
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.ml.optim.aggregator.DifferentiableLossAggregatorSuite.TestAggregator
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD

class RDDLossFunctionSuite extends SparkFunSuite with MLlibTestSparkContext {

  @transient var instances: RDD[Instance] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    instances = sc.parallelize(Seq(
      Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)),
      Instance(1.0, 0.5, Vectors.dense(1.5, 1.0)),
      Instance(2.0, 0.3, Vectors.dense(4.0, 0.5))
    ))
  }

  test("regularization") {
    val coefficients = Vectors.dense(0.5, -0.1)
    val regLossFun = new L2Regularization(0.1, (_: Int) => true, None)
    val getAgg = (bvec: Broadcast[Vector]) => new TestAggregator(2)(bvec.value)
    val lossNoReg = new RDDLossFunction(instances, getAgg, None)
    val lossWithReg = new RDDLossFunction(instances, getAgg, Some(regLossFun))

    val (loss1, grad1) = lossNoReg.calculate(coefficients.asBreeze.toDenseVector)
    val (regLoss, regGrad) = regLossFun.calculate(coefficients)
    val (loss2, grad2) = lossWithReg.calculate(coefficients.asBreeze.toDenseVector)

    BLAS.axpy(1.0, Vectors.fromBreeze(grad1), regGrad)
    assert(regGrad ~== Vectors.fromBreeze(grad2) relTol 1e-5)
    assert(loss1 + regLoss === loss2)
  }

  test("empty RDD") {
    val rdd = sc.parallelize(Seq.empty[Instance])
    val coefficients = Vectors.dense(0.5, -0.1)
    val getAgg = (bv: Broadcast[Vector]) => new TestAggregator(2)(bv.value)
    val lossFun = new RDDLossFunction(rdd, getAgg, None)
    withClue("cannot calculate cost for empty dataset") {
      intercept[IllegalArgumentException]{
        lossFun.calculate(coefficients.asBreeze.toDenseVector)
      }
    }
  }

  test("versus aggregating on an iterable") {
    val coefficients = Vectors.dense(0.5, -0.1)
    val getAgg = (bv: Broadcast[Vector]) => new TestAggregator(2)(bv.value)
    val lossFun = new RDDLossFunction(instances, getAgg, None)
    val (loss, grad) = lossFun.calculate(coefficients.asBreeze.toDenseVector)

    // just map the aggregator over the instances array
    val agg = new TestAggregator(2)(coefficients)
    instances.collect().foreach(agg.add)

    assert(loss === agg.loss)
    assert(Vectors.fromBreeze(grad) === agg.gradient)
  }

} 
Example 134
Source File: ANNSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.ml.ann

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ANNSuite extends SparkFunSuite with MLlibTestSparkContext {

  // TODO: test for weights comparison with Weka MLP
  test("ANN with Sigmoid learns XOR function with LBFGS optimizer") {
    val inputs = Array(
      Array(0.0, 0.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0),
      Array(1.0, 1.0)
    )
    val outputs = Array(0.0, 1.0, 1.0, 0.0)
    val data = inputs.zip(outputs).map { case (features, label) =>
      (Vectors.dense(features), Vectors.dense(label))
    }
    val rddData = sc.parallelize(data, 1)
    val hiddenLayersTopology = Array(5)
    val dataSample = rddData.first()
    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
    val initialWeights = FeedForwardModel(topology, 23124).weights
    val trainer = new FeedForwardTrainer(topology, 2, 1)
    trainer.setWeights(initialWeights)
    trainer.LBFGSOptimizer.setNumIterations(20)
    val model = trainer.train(rddData)
    val predictionAndLabels = rddData.map { case (input, label) =>
      (model.predict(input)(0), label(0))
    }.collect()
    predictionAndLabels.foreach { case (p, l) =>
      assert(math.round(p) === l)
    }
  }

  test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") {
    val inputs = Array(
      Array(0.0, 0.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0),
      Array(1.0, 1.0)
    )
    val outputs = Array(
      Array(1.0, 0.0),
      Array(0.0, 1.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0)
    )
    val data = inputs.zip(outputs).map { case (features, label) =>
      (Vectors.dense(features), Vectors.dense(label))
    }
    val rddData = sc.parallelize(data, 1)
    val hiddenLayersTopology = Array(5)
    val dataSample = rddData.first()
    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
    val initialWeights = FeedForwardModel(topology, 23124).weights
    val trainer = new FeedForwardTrainer(topology, 2, 2)
    // TODO: add a test for SGD
    trainer.LBFGSOptimizer.setConvergenceTol(1e-4).setNumIterations(20)
    trainer.setWeights(initialWeights).setStackSize(1)
    val model = trainer.train(rddData)
    val predictionAndLabels = rddData.map { case (input, label) =>
      (model.predict(input), label)
    }.collect()
    predictionAndLabels.foreach { case (p, l) =>
      assert(p ~== l absTol 0.5)
    }
  }
} 
Example 135
Source File: GradientSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.ml.ann

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext

class GradientSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("Gradient computation against numerical differentiation") {
    val input = new BDM[Double](3, 1, Array(1.0, 1.0, 1.0))
    // output must contain zeros and one 1 for SoftMax
    val target = new BDM[Double](2, 1, Array(0.0, 1.0))
    val topology = FeedForwardTopology.multiLayerPerceptron(Array(3, 4, 2), softmaxOnTop = false)
    val layersWithErrors = Seq(
      new SigmoidLayerWithSquaredError(),
      new SoftmaxLayerWithCrossEntropyLoss()
    )
    // check all layers that provide loss computation
    // 1) compute loss and gradient given the model and initial weights
    // 2) modify weights with small number epsilon (per dimension i)
    // 3) compute new loss
    // 4) ((newLoss - loss) / epsilon) should be close to the i-th component of the gradient
    for (layerWithError <- layersWithErrors) {
      topology.layers(topology.layers.length - 1) = layerWithError
      val model = topology.model(seed = 12L)
      val weights = model.weights.toArray
      val numWeights = weights.size
      val gradient = Vectors.dense(Array.fill[Double](numWeights)(0.0))
      val loss = model.computeGradient(input, target, gradient, 1)
      val eps = 1e-4
      var i = 0
      val tol = 1e-4
      while (i < numWeights) {
        val originalValue = weights(i)
        weights(i) += eps
        val newModel = topology.model(Vectors.dense(weights))
        val newLoss = computeLoss(input, target, newModel)
        val derivativeEstimate = (newLoss - loss) / eps
        assert(math.abs(gradient(i) - derivativeEstimate) < tol, "Layer failed gradient check: " +
          layerWithError.getClass)
        weights(i) = originalValue
        i += 1
      }
    }
  }

  private def computeLoss(input: BDM[Double], target: BDM[Double], model: TopologyModel): Double = {
    val outputs = model.forward(input, true)
    model.layerModels.last match {
      case layerWithLoss: LossFunction =>
        layerWithLoss.loss(outputs.last, target, new BDM[Double](target.rows, target.cols))
      case _ =>
        throw new UnsupportedOperationException("Top layer is required to have loss." +
          " Failed layer:" + model.layerModels.last.getClass)
    }
  }
} 
Example 136
Source File: ImageSchemaSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.ml.image

import java.nio.file.Paths
import java.util.Arrays

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.image.ImageSchema._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

class ImageSchemaSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Single column of images named "image"
  private lazy val imagePath = "../data/mllib/images"

  test("Smoke test: create basic ImageSchema dataframe") {
    val origin = "path"
    val width = 1
    val height = 1
    val nChannels = 3
    val data = Array[Byte](0, 0, 0)
    val mode = ocvTypes("CV_8UC3")

    // Internal Row corresponds to image StructType
    val rows = Seq(Row(Row(origin, height, width, nChannels, mode, data)),
      Row(Row(null, height, width, nChannels, mode, data)))
    val rdd = sc.makeRDD(rows)
    val df = spark.createDataFrame(rdd, ImageSchema.imageSchema)

    assert(df.count === 2, "incorrect image count")
    assert(df.schema("image").dataType == columnSchema, "data do not fit ImageSchema")
  }

  test("readImages count test") {
    var df = readImages(imagePath)
    assert(df.count === 1)

    df = readImages(imagePath, null, true, -1, false, 1.0, 0)
    assert(df.count === 10)

    df = readImages(imagePath, null, true, -1, true, 1.0, 0)
    val countTotal = df.count
    assert(countTotal === 8)

    df = readImages(imagePath, null, true, -1, true, 0.5, 0)
    // Random number about half of the size of the original dataset
    val count50 = df.count
    assert(count50 > 0 && count50 < countTotal)
  }

  test("readImages partition test") {
    val df = readImages(imagePath, null, true, 3, true, 1.0, 0)
    assert(df.rdd.getNumPartitions === 3)
  }

  // Images with the different number of channels
  test("readImages pixel values test") {

    val images = readImages(imagePath + "/multi-channel/").collect

    images.foreach { rrow =>
      val row = rrow.getAs[Row](0)
      val filename = Paths.get(getOrigin(row)).getFileName().toString()
      if (firstBytes20.contains(filename)) {
        val mode = getMode(row)
        val bytes20 = getData(row).slice(0, 20)

        val (expectedMode, expectedBytes) = firstBytes20(filename)
        assert(ocvTypes(expectedMode) === mode, "mode of the image is not read correctly")
        assert(Arrays.equals(expectedBytes, bytes20), "incorrect numeric value for flattened image")
      }
    }
  }

  // number of channels and first 20 bytes of OpenCV representation
  // - default representation for 3-channel RGB images is BGR row-wise:
  //   (B00, G00, R00,      B10, G10, R10,      ...)
  // - default representation for 4-channel RGB images is BGRA row-wise:
  //   (B00, G00, R00, A00, B10, G10, R10, A00, ...)
  private val firstBytes20 = Map(
    "grayscale.jpg" ->
      (("CV_8UC1", Array[Byte](-2, -33, -61, -60, -59, -59, -64, -59, -66, -67, -73, -73, -62,
        -57, -60, -63, -53, -49, -55, -69))),
    "chr30.4.184.jpg" -> (("CV_8UC3",
      Array[Byte](-9, -3, -1, -43, -32, -28, -75, -60, -57, -78, -59, -56, -74, -59, -57,
        -71, -58, -56, -73, -64))),
    "BGRA.png" -> (("CV_8UC4",
      Array[Byte](-128, -128, -8, -1, -128, -128, -8, -1, -128,
        -128, -8, -1, 127, 127, -9, -1, 127, 127, -9, -1))),
    "BGRA_alpha_60.png" -> (("CV_8UC4",
      Array[Byte](-128, -128, -8, 60, -128, -128, -8, 60, -128,
        -128, -8, 60, 127, 127, -9, 60, 127, 127, -9, 60)))
  )
} 
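As a standalone counterpart to the suite above, the single-argument ImageSchema.readImages(path) overload loads a directory of images into a DataFrame with the image struct column. The sketch below is illustrative only; the path is a placeholder and the object name is made up.

import org.apache.spark.ml.image.ImageSchema
import org.apache.spark.sql.SparkSession

object ReadImagesSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("read-images-sketch").getOrCreate()

    // Placeholder path; point it at any directory of images.
    val images = ImageSchema.readImages("data/mllib/images/multi-channel")

    // Each row carries a struct with origin, width, height, nChannels, mode and raw bytes.
    images.select("image.origin", "image.width", "image.height", "image.nChannels")
      .show(truncate = false)

    spark.stop()
  }
}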
Example 137
Source File: ElementwiseProductSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("elementwise (hadamard) product should properly apply vector to dense data set") {
    val denseData = Array(
      Vectors.dense(1.0, 4.0, 1.9, -9.0)
    )
    val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25)
    val transformer = new ElementwiseProduct(scalingVec)
    val transformedData = transformer.transform(sc.makeRDD(denseData))
    val transformedVecs = transformedData.collect()
    val transformedVec = transformedVecs(0)
    val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25)
    assert(transformedVec ~== expectedVec absTol 1E-5,
      s"Expected transformed vector $expectedVec but found $transformedVec")
  }

  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
    val sparseData = Array(
      Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
    )
    val dataRDD = sc.parallelize(sparseData, 3)
    val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
    val transformer = new ElementwiseProduct(scalingVec)
    val data2 = sparseData.map(transformer.transform)
    val data2RDD = transformer.transform(dataRDD)

    assert((sparseData, data2, data2RDD.collect()).zipped.forall {
      case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after hadamard product")

    assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
    assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
  }
} 
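The same transformer works identically outside a test; a minimal sketch with a hand-built local SparkContext and made-up vectors:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.ElementwiseProduct
import org.apache.spark.mllib.linalg.Vectors

object ElementwiseProductSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("hadamard-sketch"))

    val data = sc.parallelize(Seq(
      Vectors.dense(1.0, 4.0, 1.9, -9.0),
      Vectors.dense(2.0, 2.0, 2.0, 2.0)))

    // Each input vector is multiplied component-wise by the scaling vector.
    val transformer = new ElementwiseProduct(Vectors.dense(2.0, 0.5, 0.0, 0.25))
    transformer.transform(data).collect().foreach(println)

    sc.stop()
  }
}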
Example 138
Source File: IDFSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("idf") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    )
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF
    val model = idf.fit(termFrequencies)
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((m + 1.0) / (x + 1.0))
    })
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    }
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    assertHelper(tfidf)
    // Transforms local vectors
    val localTfidf = localTermFrequencies.map(model.transform(_)).toArray
    assertHelper(localTfidf)
  }

  test("idf minimum document frequency filtering") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    )
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF(minDocFreq = 1)
    val model = idf.fit(termFrequencies)
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) {
        math.log((m + 1.0) / (x + 1.0))
      } else {
        0
      }
    })
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    }
    // Transforms a RDD
    val tfidf = model.transform(termFrequencies).collect()
    assertHelper(tfidf)
    // Transforms local vectors
    val localTfidf = localTermFrequencies.map(model.transform(_)).toArray
    assertHelper(localTfidf)
  }

} 
Example 139
Source File: PCASuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class PCASuite extends SparkFunSuite with MLlibTestSparkContext {

  private val data = Array(
    Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
    Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
    Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
  )

  private lazy val dataRDD = sc.parallelize(data, 2)

  test("Correct computing use a PCA wrapper") {
    val k = dataRDD.count().toInt
    val pca = new PCA(k).fit(dataRDD)

    val mat = new RowMatrix(dataRDD)
    val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k)

    val pca_transform = pca.transform(dataRDD).collect()
    val mat_multiply = mat.multiply(pc).rows.collect()

    pca_transform.zip(mat_multiply).foreach { case (calculated, expected) =>
      assert(calculated ~== expected relTol 1e-8)
    }
    assert(pca.explainedVariance ~== explainedVariance relTol 1e-8)
  }

  test("memory cost computation") {
    assert(PCAUtil.memoryCost(10, 100) < Int.MaxValue)
    // check overflowing
    assert(PCAUtil.memoryCost(40000, 60000) > Int.MaxValue)
  }
} 
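A standalone version of the PCA wrapper exercised above, fitting k principal components on an RDD of vectors (local-mode context and sample data are illustrative):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors

object PCASketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("pca-sketch"))

    val rows = sc.parallelize(Seq(
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0),
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0)))))

    // Project the 5-dimensional vectors onto the top 2 principal components.
    val pca = new PCA(2).fit(rows)
    pca.transform(rows).collect().foreach(println)

    sc.stop()
  }
}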
Example 140
Source File: HashingTFSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("hashing tf on a single doc") {
    val hashingTF = new HashingTF(1000)
    val doc = "a a b b c d".split(" ")
    val n = hashingTF.numFeatures
    val termFreqs = Seq(
      (hashingTF.indexOf("a"), 2.0),
      (hashingTF.indexOf("b"), 2.0),
      (hashingTF.indexOf("c"), 1.0),
      (hashingTF.indexOf("d"), 1.0))
    assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n),
      "index must be in range [0, #features)")
    assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing")
    val expected = Vectors.sparse(n, termFreqs)
    assert(hashingTF.transform(doc) === expected)
  }

  test("hashing tf on an RDD") {
    val hashingTF = new HashingTF
    val localDocs: Seq[Seq[String]] = Seq(
      "a a b b b c d".split(" "),
      "a b c d a b c".split(" "),
      "c b a c b a a".split(" "))
    val docs = sc.parallelize(localDocs, 2)
    assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet)
  }

  test("applying binary term freqs") {
    val hashingTF = new HashingTF(100).setBinary(true)
    val doc = "a a b c c c".split(" ")
    val n = hashingTF.numFeatures
    val expected = Vectors.sparse(n, Seq(
      (hashingTF.indexOf("a"), 1.0),
      (hashingTF.indexOf("b"), 1.0),
      (hashingTF.indexOf("c"), 1.0)))
    assert(hashingTF.transform(doc) ~== expected absTol 1e-14)
  }
} 
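HashingTF and the IDF model from Example 138 are typically chained into a TF-IDF pipeline; a minimal sketch of that combination (sample documents and object name are made up):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.{HashingTF, IDF}

object TfIdfSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("tfidf-sketch"))

    val docs = sc.parallelize(Seq(
      "a a b b b c d".split(" ").toSeq,
      "a b c d a b c".split(" ").toSeq,
      "c b a c b a a".split(" ").toSeq))

    // Hash terms into a fixed-size feature space, then reweight by inverse document frequency.
    val tf = new HashingTF(1000).transform(docs)
    tf.cache()
    val idfModel = new IDF().fit(tf)
    val tfidf = idfModel.transform(tf)
    tfidf.collect().foreach(println)

    sc.stop()
  }
}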
Example 141
Source File: MatrixFactorizationModelSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.recommendation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils

class MatrixFactorizationModelSuite extends SparkFunSuite with MLlibTestSparkContext {

  val rank = 2
  var userFeatures: RDD[(Int, Array[Double])] = _
  var prodFeatures: RDD[(Int, Array[Double])] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    userFeatures = sc.parallelize(Seq((0, Array(1.0, 2.0)), (1, Array(3.0, 4.0))))
    prodFeatures = sc.parallelize(Seq((2, Array(5.0, 6.0))))
  }

  test("constructor") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    assert(model.predict(0, 2) ~== 17.0 relTol 1e-14)

    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(1, userFeatures, prodFeatures)
    }

    val userFeatures1 = sc.parallelize(Seq((0, Array(1.0)), (1, Array(3.0))))
    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(rank, userFeatures1, prodFeatures)
    }

    val prodFeatures1 = sc.parallelize(Seq((2, Array(5.0))))
    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(rank, userFeatures, prodFeatures1)
    }
  }

  test("save/load") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString
    def collect(features: RDD[(Int, Array[Double])]): Set[(Int, Seq[Double])] = {
      features.mapValues(_.toSeq).collect().toSet
    }
    try {
      model.save(sc, path)
      val newModel = MatrixFactorizationModel.load(sc, path)
      assert(newModel.rank === rank)
      assert(collect(newModel.userFeatures) === collect(userFeatures))
      assert(collect(newModel.productFeatures) === collect(prodFeatures))
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }

  test("batch predict API recommendProductsForUsers") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val topK = 10
    val recommendations = model.recommendProductsForUsers(topK).collectAsMap()

    assert(recommendations(0)(0).rating ~== 17.0 relTol 1e-14)
    assert(recommendations(1)(0).rating ~== 39.0 relTol 1e-14)
  }

  test("batch predict API recommendUsersForProducts") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val topK = 10
    val recommendations = model.recommendUsersForProducts(topK).collectAsMap()

    assert(recommendations(2)(0).user == 1)
    assert(recommendations(2)(0).rating ~== 39.0 relTol 1e-14)
    assert(recommendations(2)(1).user == 0)
    assert(recommendations(2)(1).rating ~== 17.0 relTol 1e-14)
  }
} 
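In practice a MatrixFactorizationModel is produced by ALS rather than constructed from hand-built factor RDDs as in the test; a minimal training-and-recommendation sketch (toy ratings and hyperparameters are illustrative):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}

object AlsRecommendSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("als-sketch"))

    val ratings = sc.parallelize(Seq(
      Rating(0, 2, 3.0), Rating(0, 3, 1.0),
      Rating(1, 2, 4.0), Rating(1, 3, 2.0)))

    // rank = 2 latent factors, 5 iterations, regularization 0.01 (illustrative values).
    val model = ALS.train(ratings, 2, 5, 0.01)

    // Top-2 product recommendations per user, analogous to the batch predict API tested above.
    model.recommendProductsForUsers(2).collect().foreach { case (user, recs) =>
      println(s"user $user -> ${recs.map(r => s"${r.product}:${r.rating}").mkString(", ")}")
    }

    sc.stop()
  }
}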
Example 142
Source File: RankingMetricsSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("Ranking metrics: MAP, NDCG") {
    val predictionAndLabels = sc.parallelize(
      Seq(
        (Array(1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Array(1, 2, 3, 4, 5)),
        (Array(4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Array(1, 2, 3)),
        (Array(1, 2, 3, 4, 5), Array.empty[Int])
      ), 2)
    val eps = 1.0E-5

    val metrics = new RankingMetrics(predictionAndLabels)
    val map = metrics.meanAveragePrecision

    assert(metrics.precisionAt(1) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(2) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(3) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(4) ~== 0.75/3 absTol eps)
    assert(metrics.precisionAt(5) ~== 0.8/3 absTol eps)
    assert(metrics.precisionAt(10) ~== 0.8/3 absTol eps)
    assert(metrics.precisionAt(15) ~== 8.0/45 absTol eps)

    assert(map ~== 0.355026 absTol eps)

    assert(metrics.ndcgAt(3) ~== 1.0/3 absTol eps)
    assert(metrics.ndcgAt(5) ~== 0.328788 absTol eps)
    assert(metrics.ndcgAt(10) ~== 0.487913 absTol eps)
    assert(metrics.ndcgAt(15) ~== metrics.ndcgAt(10) absTol eps)
  }

  test("MAP, NDCG with few predictions (SPARK-14886)") {
    val predictionAndLabels = sc.parallelize(
      Seq(
        (Array(1, 6, 2), Array(1, 2, 3, 4, 5)),
        (Array.empty[Int], Array(1, 2, 3))
      ), 2)
    val eps = 1.0E-5

    val metrics = new RankingMetrics(predictionAndLabels)
    assert(metrics.precisionAt(1) ~== 0.5 absTol eps)
    assert(metrics.precisionAt(2) ~== 0.25 absTol eps)
    assert(metrics.ndcgAt(1) ~== 0.5 absTol eps)
    assert(metrics.ndcgAt(2) ~== 0.30657 absTol eps)
  }

} 
Example 143
Source File: AreaUnderCurveSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class AreaUnderCurveSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("auc computation") {
    val curve = Seq((0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 0.0))
    val auc = 4.0
    assert(AreaUnderCurve.of(curve) ~== auc absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== auc absTol 1E-5)
  }

  test("auc of an empty curve") {
    val curve = Seq.empty[(Double, Double)]
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }

  test("auc of a curve with a single point") {
    val curve = Seq((1.0, 1.0))
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }
} 
Example 144
Source File: FPTreeSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.fpm

import scala.language.existentials

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class FPTreeSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("add transaction") {
    val tree = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("b"))

    assert(tree.root.children.size == 2)
    assert(tree.root.children.contains("a"))
    assert(tree.root.children("a").item.equals("a"))
    assert(tree.root.children("a").count == 2)
    assert(tree.root.children.contains("b"))
    assert(tree.root.children("b").item.equals("b"))
    assert(tree.root.children("b").count == 1)
    var child = tree.root.children("a")
    assert(child.children.size == 1)
    assert(child.children.contains("b"))
    assert(child.children("b").item.equals("b"))
    assert(child.children("b").count == 2)
    child = child.children("b")
    assert(child.children.size == 2)
    assert(child.children.contains("c"))
    assert(child.children.contains("y"))
    assert(child.children("c").item.equals("c"))
    assert(child.children("y").item.equals("y"))
    assert(child.children("c").count == 1)
    assert(child.children("y").count == 1)
  }

  test("merge tree") {
    val tree1 = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("b"))

    val tree2 = new FPTree[String]
      .add(Seq("a", "b"))
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "c", "d"))
      .add(Seq("a", "x"))
      .add(Seq("a", "x", "y"))
      .add(Seq("c", "n"))
      .add(Seq("c", "m"))

    val tree3 = tree1.merge(tree2)

    assert(tree3.root.children.size == 3)
    assert(tree3.root.children("a").count == 7)
    assert(tree3.root.children("b").count == 1)
    assert(tree3.root.children("c").count == 2)
    val child1 = tree3.root.children("a")
    assert(child1.children.size == 2)
    assert(child1.children("b").count == 5)
    assert(child1.children("x").count == 2)
    val child2 = child1.children("b")
    assert(child2.children.size == 2)
    assert(child2.children("y").count == 1)
    assert(child2.children("c").count == 3)
    val child3 = child2.children("c")
    assert(child3.children.size == 1)
    assert(child3.children("d").count == 1)
    val child4 = child1.children("x")
    assert(child4.children.size == 1)
    assert(child4.children("y").count == 1)
    val child5 = tree3.root.children("c")
    assert(child5.children.size == 2)
    assert(child5.children("n").count == 1)
    assert(child5.children("m").count == 1)
  }

  test("extract freq itemsets") {
    val tree = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("a", "b"))
      .add(Seq("a"))
      .add(Seq("b"))
      .add(Seq("b", "n"))

    val freqItemsets = tree.extract(3L).map { case (items, count) =>
      (items.toSet, count)
    }.toSet
    val expected = Set(
      (Set("a"), 4L),
      (Set("b"), 5L),
      (Set("a", "b"), 3L))
    assert(freqItemsets === expected)
  }
} 
Example 145
Source File: AssociationRulesSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.fpm

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("association rules using String type") {
    val freqItemsets = sc.parallelize(Seq(
      (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L),
      (Set("r"), 3L),
      (Set("x", "z"), 3L), (Set("t", "y"), 3L), (Set("t", "x"), 3L), (Set("s", "x"), 3L),
      (Set("y", "x"), 3L), (Set("y", "z"), 3L), (Set("t", "z"), 3L),
      (Set("y", "x", "z"), 3L), (Set("t", "x", "z"), 3L), (Set("t", "y", "z"), 3L),
      (Set("t", "y", "x"), 3L),
      (Set("t", "y", "x", "z"), 3L)
    ).map {
      case (items, freq) => new FPGrowth.FreqItemset(items.toArray, freq)
    })

    val ar = new AssociationRules()

    val results1 = ar
      .setMinConfidence(0.9)
      .run(freqItemsets)
      .collect()

    // The block dropped by the example extractor verified results1 and then recomputed the rules
    // with no confidence threshold; results2 is reconstructed here (an approximation of the
    // original Spark test) so that the assertions below have something to refer to.
    val results2 = ar
      .setMinConfidence(0.0)
      .run(freqItemsets)
      .collect()
    assert(results2.size === 30)
    assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
  }
} 
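The FreqItemset inputs above are normally mined by FPGrowth first; a minimal end-to-end sketch that mines frequent itemsets and then generates rules (the transactions and thresholds are illustrative, loosely following the standard FP-growth example data):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.fpm.FPGrowth

object FPGrowthSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("fpgrowth-sketch"))

    val transactions = sc.parallelize(Seq(
      Array("r", "z", "h", "k", "p"),
      Array("z", "y", "x", "w", "v", "u", "t", "s"),
      Array("s", "x", "o", "n", "r"),
      Array("x", "z", "y", "m", "t", "s", "q", "e"),
      Array("z"),
      Array("x", "z", "y", "r", "q", "t", "p")))

    // Mine frequent itemsets first, then derive rules above a confidence threshold.
    val model = new FPGrowth().setMinSupport(0.5).setNumPartitions(2).run(transactions)
    model.generateAssociationRules(0.9).collect().foreach { rule =>
      println(s"${rule.antecedent.mkString("[", ",", "]")} => " +
        s"${rule.consequent.mkString("[", ",", "]")} (${rule.confidence})")
    }

    sc.stop()
  }
}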
Example 146
Source File: KernelDensitySuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
} 
Example 147
Source File: MultivariateGaussianSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.stat.distribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("univariate") {
    val x1 = Vectors.dense(0.0)
    val x2 = Vectors.dense(1.5)

    val mu = Vectors.dense(0.0)
    val sigma1 = Matrices.dense(1, 1, Array(1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)

    val sigma2 = Matrices.dense(1, 1, Array(4.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
  }

  test("multivariate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)

    val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
  }

  test("multivariate degenerate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
    val dist = new MultivariateGaussian(mu, sigma)
    assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
    assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)
  }

  test("SPARK-11302") {
    val x = Vectors.dense(629, 640, 1.7188, 618.19)
    val mu = Vectors.dense(
      1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697)
    val sigma = Matrices.dense(4, 4, Array(
      166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053,
      169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484,
      12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373,
      164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207))
    val dist = new MultivariateGaussian(mu, sigma)
    // Agrees with R's dmvnorm: 7.154782e-05
    assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9)
  }

} 
Example 148
Source File: CoordinateMatrixSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.linalg.distributed

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext

class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {

  val m = 5
  val n = 4
  var mat: CoordinateMatrix = _

  override def beforeAll() {
    super.beforeAll()
    val entries = sc.parallelize(Seq(
      (0, 0, 1.0),
      (0, 1, 2.0),
      (1, 1, 3.0),
      (1, 2, 4.0),
      (2, 2, 5.0),
      (2, 3, 6.0),
      (3, 0, 7.0),
      (3, 3, 8.0),
      (4, 1, 9.0)), 3).map { case (i, j, value) =>
      MatrixEntry(i, j, value)
    }
    mat = new CoordinateMatrix(entries)
  }

  test("size") {
    assert(mat.numRows() === m)
    assert(mat.numCols() === n)
  }

  test("empty entries") {
    val entries = sc.parallelize(Seq[MatrixEntry](), 1)
    val emptyMat = new CoordinateMatrix(entries)
    intercept[RuntimeException] {
      emptyMat.numCols()
    }
    intercept[RuntimeException] {
      emptyMat.numRows()
    }
  }

  test("toBreeze") {
    val expected = BDM(
      (1.0, 2.0, 0.0, 0.0),
      (0.0, 3.0, 4.0, 0.0),
      (0.0, 0.0, 5.0, 6.0),
      (7.0, 0.0, 0.0, 8.0),
      (0.0, 9.0, 0.0, 0.0))
    assert(mat.toBreeze() === expected)
  }

  test("transpose") {
    val transposed = mat.transpose()
    assert(mat.toBreeze().t === transposed.toBreeze())
  }

  test("toIndexedRowMatrix") {
    val indexedRowMatrix = mat.toIndexedRowMatrix()
    val expected = BDM(
      (1.0, 2.0, 0.0, 0.0),
      (0.0, 3.0, 4.0, 0.0),
      (0.0, 0.0, 5.0, 6.0),
      (7.0, 0.0, 0.0, 8.0),
      (0.0, 9.0, 0.0, 0.0))
    assert(indexedRowMatrix.toBreeze() === expected)
  }

  test("toRowMatrix") {
    val rowMatrix = mat.toRowMatrix()
    val rows = rowMatrix.rows.collect().toSet
    val expected = Set(
      Vectors.dense(1.0, 2.0, 0.0, 0.0),
      Vectors.dense(0.0, 3.0, 4.0, 0.0),
      Vectors.dense(0.0, 0.0, 5.0, 6.0),
      Vectors.dense(7.0, 0.0, 0.0, 8.0),
      Vectors.dense(0.0, 9.0, 0.0, 0.0))
    assert(rows === expected)
  }

  test("toBlockMatrix") {
    val blockMat = mat.toBlockMatrix(2, 2)
    assert(blockMat.numRows() === m)
    assert(blockMat.numCols() === n)
    assert(blockMat.toBreeze() === mat.toBreeze())

    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(-1, 2)
    }
    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(2, 0)
    }
  }
} 
Example 149
Source File: MLPairRDDFunctionsSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.rdd.MLPairRDDFunctions._
import org.apache.spark.mllib.util.MLlibTestSparkContext

class MLPairRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("topByKey") {
    val topMap = sc.parallelize(Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7), (5,
      1), (3, 5)), 2)
      .topByKey(5)
      .collectAsMap()

    assert(topMap.size === 3)
    assert(topMap(1) === Array(7, 6, 3, 2, 1))
    assert(topMap(3) === Array(7, 5, 2))
    assert(topMap(5) === Array(1))
  }
} 
Example 150
Source File: RDDFunctionsSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.rdd.RDDFunctions._
import org.apache.spark.mllib.util.MLlibTestSparkContext

class RDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("sliding") {
    val data = 0 until 6
    for (numPartitions <- 1 to 8) {
      val rdd = sc.parallelize(data, numPartitions)
      for (windowSize <- 1 to 6) {
        for (step <- 1 to 3) {
          val sliding = rdd.sliding(windowSize, step).collect().map(_.toList).toList
          val expected = data.sliding(windowSize, step)
            .map(_.toList).toList.filter(l => l.size == windowSize)
          assert(sliding === expected)
        }
      }
      assert(rdd.sliding(7).collect().isEmpty,
        "Should return an empty RDD if the window size is greater than the number of items.")
    }
  }

  test("sliding with empty partitions") {
    val data = Seq(Seq(1, 2, 3), Seq.empty[Int], Seq(4), Seq.empty[Int], Seq(5, 6, 7))
    val rdd = sc.parallelize(data, data.length).flatMap(s => s)
    assert(rdd.partitions.length === data.length)
    val sliding = rdd.sliding(3).collect().toSeq.map(_.toSeq)
    val expected = data.flatMap(x => x).sliding(3).toSeq.map(_.toSeq)
    assert(sliding === expected)
  }
} 
Example 151
Source File: OneHotEncoderSuite.scala    From BigDatalog   with Apache License 2.0 5 votes
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

class OneHotEncoderSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  def stringIndexed(): DataFrame = {
    val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2)
    val df = sqlContext.createDataFrame(data).toDF("id", "label")
    val indexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("labelIndex")
      .fit(df)
    indexer.transform(df)
  }

  test("params") {
    ParamsSuite.checkParams(new OneHotEncoder)
  }

  test("OneHotEncoder dropLast = false") {
    val transformed = stringIndexed()
    val encoder = new OneHotEncoder()
      .setInputCol("labelIndex")
      .setOutputCol("labelVec")
      .setDropLast(false)
    val encoded = encoder.transform(transformed)

    val output = encoded.select("id", "labelVec").map { r =>
      val vec = r.getAs[Vector](1)
      (r.getInt(0), vec(0), vec(1), vec(2))
    }.collect().toSet
    // a -> 0, b -> 2, c -> 1
    val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0),
      (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0))
    assert(output === expected)
  }

  test("OneHotEncoder dropLast = true") {
    val transformed = stringIndexed()
    val encoder = new OneHotEncoder()
      .setInputCol("labelIndex")
      .setOutputCol("labelVec")
    val encoded = encoder.transform(transformed)

    val output = encoded.select("id", "labelVec").map { r =>
      val vec = r.getAs[Vector](1)
      (r.getInt(0), vec(0), vec(1))
    }.collect().toSet
    // a -> 0, b -> 2, c -> 1
    val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0),
      (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0))
    assert(output === expected)
  }

  test("input column with ML attribute") {
    val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large")
    val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("size")
      .select(col("size").as("size", attr.toMetadata()))
    val encoder = new OneHotEncoder()
      .setInputCol("size")
      .setOutputCol("encoded")
    val output = encoder.transform(df)
    val group = AttributeGroup.fromStructField(output.schema("encoded"))
    assert(group.size === 2)
    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("small").withIndex(0))
    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("medium").withIndex(1))
  }

  test("input column without ML attribute") {
    val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("index")
    val encoder = new OneHotEncoder()
      .setInputCol("index")
      .setOutputCol("encoded")
    val output = encoder.transform(df)
    val group = AttributeGroup.fromStructField(output.schema("encoded"))
    assert(group.size === 2)
    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("0").withIndex(0))
    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("1").withIndex(1))
  }

  test("read/write") {
    val t = new OneHotEncoder()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setDropLast(false)
    testDefaultReadWrite(t)
  }
} 
Example 152
Source File: ChiSqSelectorSuite.scala    From BigDatalog   with Apache License 2.0 5 votes
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{Row, SQLContext}

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
  with DefaultReadWriteTest {

  test("Test Chi-Square selector") {
    val sqlContext = SQLContext.getOrCreate(sc)
    import sqlContext.implicits._

    val data = Seq(
      LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
      LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
      LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
      LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))
    )

    val preFilteredData = Seq(
      Vectors.dense(0.0),
      Vectors.dense(6.0),
      Vectors.dense(8.0),
      Vectors.dense(5.0)
    )

    val df = sc.parallelize(data.zip(preFilteredData))
      .map(x => (x._1.label, x._1.features, x._2))
      .toDF("label", "data", "preFilteredData")

    val model = new ChiSqSelector()
      .setNumTopFeatures(1)
      .setFeaturesCol("data")
      .setLabelCol("label")
      .setOutputCol("filtered")

    model.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach {
      case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 ~== vec2 absTol 1e-1)
    }
  }

  test("ChiSqSelector read/write") {
    val t = new ChiSqSelector()
      .setFeaturesCol("myFeaturesCol")
      .setLabelCol("myLabelCol")
      .setOutputCol("myOutputCol")
      .setNumTopFeatures(2)
    testDefaultReadWrite(t)
  }

  test("ChiSqSelectorModel read/write") {
    val oldModel = new feature.ChiSqSelectorModel(Array(1, 3))
    val instance = new ChiSqSelectorModel("myChiSqSelectorModel", oldModel)
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.selectedFeatures === instance.selectedFeatures)
  }
} 
Example 153
Source File: DCTSuite.scala    From BigDatalog   with Apache License 2.0 5 votes
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class DCTTestData(vec: Vector, wantedVec: Vector)

class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }

  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  test("read/write") {
    val t = new DCT()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setInverse(true)
    testDefaultReadWrite(t)
  }

  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      (new DoubleDCT_1D(data.size)).inverse(expectedResultBuffer, true)
    } else {
      (new DoubleDCT_1D(data.size)).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = sqlContext.createDataFrame(Seq(
      DCTTestData(data, expectedResult)
    ))

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)

    transformer.transform(dataset)
      .select("resultVec", "wantedVec")
      .collect()
      .foreach { case Row(resultVec: Vector, wantedVec: Vector) =>
      assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
    }
  }
} 
Example 154
Source File: VectorSlicerSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{StructField, StructType}

class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    val slicer = new VectorSlicer().setInputCol("feature")
    ParamsSuite.checkParams(slicer)
    assert(slicer.getIndices.length === 0)
    assert(slicer.getNames.length === 0)
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {
        slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true))))
      }
    }
  }

  test("feature validity checks") {
    import VectorSlicer._
    assert(validIndices(Array(0, 1, 8, 2)))
    assert(validIndices(Array.empty[Int]))
    assert(!validIndices(Array(-1)))
    assert(!validIndices(Array(1, 2, 1)))

    assert(validNames(Array("a", "b")))
    assert(validNames(Array.empty[String]))
    assert(!validNames(Array("", "b")))
    assert(!validNames(Array("a", "b", "a")))
  }

  test("Test vector slicer") {
    val data = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())
    )

    // Expected after selecting indices 1, 4
    val expected = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]])

    val resultAttrs = Array("f1", "f4").map(defaultAttr.withName)
    val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]])

    val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) }
    val df = spark.createDataFrame(rdd,
      StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField())))

    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    def validateResults(df: DataFrame): Unit = {
      df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 === vec2)
      }
      val resultMetadata = AttributeGroup.fromStructField(df.schema("result"))
      val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected"))
      assert(resultMetadata.numAttributes === expectedMetadata.numAttributes)
      resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) =>
        assert(a === b)
      }
    }

    vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty)
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array(1)).setNames(Array("f4"))
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4"))
    validateResults(vectorSlicer.transform(df))
  }

  test("read/write") {
    val t = new VectorSlicer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setIndices(Array(1, 3))
      .setNames(Array("a", "d"))
    testDefaultReadWrite(t)
  }
} 
Example 155
Source File: MaxAbsScalerSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row

class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("MaxAbsScaler fit basic case") {
    val data = Array(
      Vectors.dense(1, 0, 100),
      Vectors.dense(2, 0, 0),
      Vectors.sparse(3, Array(0, 2), Array(-2, -100)),
      Vectors.sparse(3, Array(0), Array(-1.5)))

    val expected: Array[Vector] = Array(
      Vectors.dense(0.5, 0, 1),
      Vectors.dense(1, 0, 0),
      Vectors.sparse(3, Array(0, 2), Array(-1, -1)),
      Vectors.sparse(3, Array(0), Array(-0.75)))

    val df = data.zip(expected).toSeq.toDF("features", "expected")
    val scaler = new MaxAbsScaler()
      .setInputCol("features")
      .setOutputCol("scaled")

    val model = scaler.fit(df)
    model.transform(df).select("expected", "scaled").collect()
      .foreach { case Row(vector1: Vector, vector2: Vector) =>
      assert(vector1.equals(vector2), s"MaxAbsScaler ut error: $vector2 should be $vector1")
    }

    // copied model must have the same parent.
    MLTestingUtils.checkCopy(model)
  }

  test("MaxAbsScaler read/write") {
    val t = new MaxAbsScaler()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }

  test("MaxAbsScalerModel read/write") {
    val instance = new MaxAbsScalerModel(
      "myMaxAbsScalerModel", Vectors.dense(1.0, 10.0))
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.maxAbs === instance.maxAbs)
  }

} 
Example 156
Source File: ChiSqSelectorSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
  with DefaultReadWriteTest {

  test("Test Chi-Square selector") {
    import testImplicits._
    val data = Seq(
      LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
      LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
      LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
      LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))
    )

    val preFilteredData = Seq(
      Vectors.dense(8.0),
      Vectors.dense(0.0),
      Vectors.dense(0.0),
      Vectors.dense(8.0)
    )

    val df = sc.parallelize(data.zip(preFilteredData))
      .map(x => (x._1.label, x._1.features, x._2))
      .toDF("label", "data", "preFilteredData")

    val selector = new ChiSqSelector()
      .setSelectorType("kbest")
      .setNumTopFeatures(1)
      .setFeaturesCol("data")
      .setLabelCol("label")
      .setOutputCol("filtered")

    selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach {
      case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 ~== vec2 absTol 1e-1)
    }

    selector.setSelectorType("percentile").setPercentile(0.34).fit(df).transform(df)
      .select("filtered", "preFilteredData").collect().foreach {
        case Row(vec1: Vector, vec2: Vector) =>
          assert(vec1 ~== vec2 absTol 1e-1)
      }

    val preFilteredData2 = Seq(
      Vectors.dense(8.0, 7.0),
      Vectors.dense(0.0, 9.0),
      Vectors.dense(0.0, 9.0),
      Vectors.dense(8.0, 9.0)
    )

    val df2 = sc.parallelize(data.zip(preFilteredData2))
      .map(x => (x._1.label, x._1.features, x._2))
      .toDF("label", "data", "preFilteredData")

    selector.setSelectorType("fpr").setAlpha(0.2).fit(df2).transform(df2)
      .select("filtered", "preFilteredData").collect().foreach {
        case Row(vec1: Vector, vec2: Vector) =>
          assert(vec1 ~== vec2 absTol 1e-1)
      }
  }

  test("ChiSqSelector read/write") {
    val t = new ChiSqSelector()
      .setFeaturesCol("myFeaturesCol")
      .setLabelCol("myLabelCol")
      .setOutputCol("myOutputCol")
      .setNumTopFeatures(2)
    testDefaultReadWrite(t)
  }

  test("ChiSqSelectorModel read/write") {
    val oldModel = new feature.ChiSqSelectorModel(Array(1, 3))
    val instance = new ChiSqSelectorModel("myChiSqSelectorModel", oldModel)
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.selectedFeatures === instance.selectedFeatures)
  }

  test("should support all NumericType labels and not support other types") {
    val css = new ChiSqSelector()
    MLTestingUtils.checkNumericTypes[ChiSqSelectorModel, ChiSqSelector](
      css, spark) { (expected, actual) =>
        assert(expected.selectedFeatures === actual.selectedFeatures)
      }
  }
} 
Example 157
Source File: DCTSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row

@BeanInfo
case class DCTTestData(vec: Vector, wantedVec: Vector)

class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }

  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  test("read/write") {
    val t = new DCT()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setInverse(true)
    testDefaultReadWrite(t)
  }

  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true)
    } else {
      new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = Seq(DCTTestData(data, expectedResult)).toDF()

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)

    transformer.transform(dataset)
      .select("resultVec", "wantedVec")
      .collect()
      .foreach { case Row(resultVec: Vector, wantedVec: Vector) =>
      assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
    }
  }
} 
Example 158
Source File: ElementwiseProductSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ElementwiseProductSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("read/write") {
    val ep = new ElementwiseProduct()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setScalingVec(Vectors.dense(0.1, 0.2))
    testDefaultReadWrite(ep)
  }
} 
Example 159
Source File: BinarizerSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  @transient var data: Array[Double] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4)
  }

  test("params") {
    ParamsSuite.checkParams(new Binarizer)
  }

  test("Binarize continuous features with default parameter") {
    val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0)
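    // With the default threshold (0.0), only values strictly greater than 0.0 binarize to 1.0.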
    val dataFrame: DataFrame = data.zip(defaultBinarized).toSeq.toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Double, y: Double) =>
        assert(x === y, "The feature value is not correct after binarization.")
    }
  }

  test("Binarize continuous features with setter") {
    val threshold: Double = 0.2
    val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0)
    val dataFrame: DataFrame = data.zip(thresholdBinarized).toSeq.toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(threshold)

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Double, y: Double) =>
        assert(x === y, "The feature value is not correct after binarization.")
    }
  }

  test("Binarize vector of continuous features with default parameter") {
    val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0)
    val dataFrame: DataFrame = Seq(
      (Vectors.dense(data), Vectors.dense(defaultBinarized))
    ).toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x == y, "The feature value is not correct after binarization.")
    }
  }

  test("Binarize vector of continuous features with setter") {
    val threshold: Double = 0.2
    val defaultBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0)
    val dataFrame: DataFrame = Seq(
      (Vectors.dense(data), Vectors.dense(defaultBinarized))
    ).toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(threshold)

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x == y, "The feature value is not correct after binarization.")
    }
  }


  test("read/write") {
    val t = new Binarizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setThreshold(0.1)
    testDefaultReadWrite(t)
  }
} 
Example 160
Source File: SQLTransformerSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.types.{LongType, StructField, StructType}

class SQLTransformerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new SQLTransformer())
  }

  test("transform numeric data") {
    val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
    val sqlTrans = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    val result = sqlTrans.transform(original)
    val resultSchema = sqlTrans.transformSchema(original.schema)
    val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0))
      .toDF("id", "v1", "v2", "v3", "v4")
    assert(result.schema.toString == resultSchema.toString)
    assert(resultSchema == expected.schema)
    assert(result.collect().toSeq == expected.collect().toSeq)
    assert(original.sparkSession.catalog.listTables().count() == 0)
  }

  test("read/write") {
    val t = new SQLTransformer()
      .setStatement("select * from __THIS__")
    testDefaultReadWrite(t)
  }

  test("transformSchema") {
    val df = spark.range(10)
    val outputSchema = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")
      .transformSchema(df.schema)
    val expected = StructType(Seq(StructField("id1", LongType, nullable = false)))
    assert(outputSchema === expected)
  }
} 
Example 161
Source File: TokenizerSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new Tokenizer)
  }

  test("read/write") {
    val t = new Tokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }
}

class RegexTokenizerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.RegexTokenizerSuite._
  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new RegexTokenizer)
  }

  test("RegexTokenizer") {
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset0 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer0, dataset0)

    val dataset1 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ).toDF()
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")),
      TokenizerTestData("Te,st.  punct", Array("te,st.", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer2, dataset2)
  }

  test("RegexTokenizer with toLowercase false") {
    val tokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setToLowercase(false)
    val dataset = Seq(
      TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")),
      TokenizerTestData("java scala", Array("java", "scala"))
    ).toDF()
    testRegexTokenizer(tokenizer, dataset)
  }

  test("read/write") {
    val t = new RegexTokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMinTokenLength(2)
      .setGaps(false)
      .setPattern("hi")
      .setToLowercase(false)
    testDefaultReadWrite(t)
  }
}

object RegexTokenizerSuite extends SparkFunSuite {

  def testRegexTokenizer(t: RegexTokenizer, dataset: Dataset[_]): Unit = {
    t.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
} 
Example 162
Source File: NGramSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

@BeanInfo
case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])

class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.NGramSuite._
  import testImplicits._

  test("default behavior yields bigram features") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
    val dataset = Seq(NGramTestData(
      Array("Test", "for", "ngram", "."),
      Array("Test for", "for ngram", "ngram .")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("NGramLength=4 yields length 4 n-grams") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array("a b c d", "b c d e")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("empty input yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(Array(), Array())).toDF()
    testNGram(nGram, dataset)
  }

  test("input array < n yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(6)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array()
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("read/write") {
    val t = new NGram()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setN(3)
    testDefaultReadWrite(t)
  }
}

object NGramSuite extends SparkFunSuite {

  def testNGram(t: NGram, dataset: Dataset[_]): Unit = {
    t.transform(dataset)
      .select("nGrams", "wantedNGrams")
      .collect()
      .foreach { case Row(actualNGrams, wantedNGrams) =>
        assert(actualNGrams === wantedNGrams)
      }
  }
} 
Example 163
Source File: PCASuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row

class PCASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new PCA)
    val mat = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix]
    val explainedVariance = Vectors.dense(0.5, 0.5).asInstanceOf[DenseVector]
    val model = new PCAModel("pca", mat, explainedVariance)
    ParamsSuite.checkParams(model)
  }

  test("pca") {
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )

    val dataRDD = sc.parallelize(data, 2)

    val mat = new RowMatrix(dataRDD.map(OldVectors.fromML))
    val pc = mat.computePrincipalComponents(3)
    val expected = mat.multiply(pc).rows.map(_.asML)

    val df = dataRDD.zip(expected).toDF("features", "expected")

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pca_features")
      .setK(3)
      .fit(df)

    // copied model must have the same parent.
    MLTestingUtils.checkCopy(pca)

    pca.transform(df).select("pca_features", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }

  test("PCA read/write") {
    val t = new PCA()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setK(3)
    testDefaultReadWrite(t)
  }

  test("PCAModel read/write") {
    val instance = new PCAModel("myPCAModel",
      Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix],
      Vectors.dense(0.5, 0.5).asInstanceOf[DenseVector])
    val newInstance = testDefaultReadWrite(instance)
    assert(newInstance.pc === instance.pc)
  }
} 
Example 164
Source File: HashingTFSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.feature.{HashingTF => MLlibHashingTF}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.util.Utils

class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new HashingTF)
  }

  test("hashingTF") {
    val df = Seq((0, "a a b b c d".split(" ").toSeq)).toDF("id", "words")
    val n = 100
    val hashingTF = new HashingTF()
      .setInputCol("words")
      .setOutputCol("features")
      .setNumFeatures(n)
    val output = hashingTF.transform(df)
    val attrGroup = AttributeGroup.fromStructField(output.schema("features"))
    require(attrGroup.numAttributes === Some(n))
    val features = output.select("features").first().getAs[Vector](0)
    // Assume perfect hash on "a", "b", "c", and "d".
    def idx: Any => Int = murmur3FeatureIdx(n)
    val expected = Vectors.sparse(n,
      Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0)))
    assert(features ~== expected absTol 1e-14)
  }

  test("applying binary term freqs") {
    val df = Seq((0, "a a b c c c".split(" ").toSeq)).toDF("id", "words")
    val n = 100
    val hashingTF = new HashingTF()
        .setInputCol("words")
        .setOutputCol("features")
        .setNumFeatures(n)
        .setBinary(true)
    val output = hashingTF.transform(df)
    val features = output.select("features").first().getAs[Vector](0)
    def idx: Any => Int = murmur3FeatureIdx(n)  // Assume perfect hash on input features
    val expected = Vectors.sparse(n,
      Seq((idx("a"), 1.0), (idx("b"), 1.0), (idx("c"), 1.0)))
    assert(features ~== expected absTol 1e-14)
  }

  test("read/write") {
    val t = new HashingTF()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setNumFeatures(10)
    testDefaultReadWrite(t)
  }

  private def murmur3FeatureIdx(numFeatures: Int)(term: Any): Int = {
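    // Fold the term's MurmurHash3 value into [0, numFeatures), matching HashingTF's bucketing.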
    Utils.nonNegativeMod(MLlibHashingTF.murmur3Hash(term), numFeatures)
  }
} 
Example 165
Source File: BaggedPointSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.tree.EnsembleTestHelper
import org.apache.spark.mllib.util.MLlibTestSparkContext


class BaggedPointSuite extends SparkFunSuite with MLlibTestSparkContext  {

  test("BaggedPoint RDD: without subsampling") {
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false, 42)
    baggedRDD.collect().foreach { baggedPoint =>
      assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1)
    }
  }

  test("BaggedPoint RDD: with subsampling with replacement (fraction = 1.0)") {
    val numSubsamples = 100
    val (expectedMean, expectedStddev) = (1.0, 1.0)
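    // Sampling with replacement at fraction 1.0 draws Poisson(1) copies of each point: mean 1, stddev 1.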

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling with replacement (fraction = 0.5)") {
    val numSubsamples = 100
    val subsample = 0.5
    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample))
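    // With replacement, per-point counts follow Poisson(subsample): mean = subsample, stddev = sqrt(subsample).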

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling without replacement (fraction = 1.0)") {
    val numSubsamples = 100
    val (expectedMean, expectedStddev) = (1.0, 0)
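    // Without replacement at fraction 1.0 every point is included exactly once, so the stddev is 0.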

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }

  test("BaggedPoint RDD: with subsampling without replacement (fraction = 0.5)") {
    val numSubsamples = 100
    val subsample = 0.5
    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample * (1 - subsample)))
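    // Without replacement, inclusion is Bernoulli(subsample): mean = subsample, stddev = sqrt(subsample * (1 - subsample)).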

    val seeds = Array(123, 5354, 230, 349867, 23987)
    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
    val rdd = sc.parallelize(arr)
    seeds.foreach { seed =>
      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false, seed)
      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
        expectedStddev, epsilon = 0.01)
    }
  }
} 
Example 166
Source File: GradientBoostedTreesSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.mllib.tree.{GradientBoostedTreesSuite => OldGBTSuite}
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Variance
import org.apache.spark.mllib.tree.loss.{AbsoluteError, LogLoss, SquaredError}
import org.apache.spark.mllib.util.MLlibTestSparkContext


class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  import testImplicits._

  test("runWithValidation stops early and performs better on a validation dataset") {
    // Set numIterations large enough so that it stops early.
    val numIterations = 20
    val trainRdd = sc.parallelize(OldGBTSuite.trainData, 2).map(_.asML)
    val validateRdd = sc.parallelize(OldGBTSuite.validateData, 2).map(_.asML)
    val trainDF = trainRdd.toDF()
    val validateDF = validateRdd.toDF()

    val algos = Array(Regression, Regression, Classification)
    val losses = Array(SquaredError, AbsoluteError, LogLoss)
    algos.zip(losses).foreach { case (algo, loss) =>
      val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2,
        categoricalFeaturesInfo = Map.empty)
      val boostingStrategy =
        new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0)
      val (validateTrees, validateTreeWeights) = GradientBoostedTrees
        .runWithValidation(trainRdd, validateRdd, boostingStrategy, 42L)
      val numTrees = validateTrees.length
      assert(numTrees !== numIterations)

      // Test that it performs better on the validation dataset.
      val (trees, treeWeights) = GradientBoostedTrees.run(trainRdd, boostingStrategy, 42L)
      val (errorWithoutValidation, errorWithValidation) = {
        if (algo == Classification) {
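          // LogLoss is defined on labels in {-1, +1}, so remap the {0, 1} labels before computing error.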
          val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
          (GradientBoostedTrees.computeError(remappedRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(remappedRdd, validateTrees,
              validateTreeWeights, loss))
        } else {
          (GradientBoostedTrees.computeError(validateRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(validateRdd, validateTrees,
              validateTreeWeights, loss))
        }
      }
      assert(errorWithValidation <= errorWithoutValidation)

      // Test that the results from evaluateEachIteration are consistent with runWithValidation.
      // Note that validationTol is set to 0.0.
      val evaluationArray = GradientBoostedTrees
        .evaluateEachIteration(validateRdd, trees, treeWeights, loss, algo)
      assert(evaluationArray.length === numIterations)
      assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1))
      var i = 1
      while (i < numTrees) {
        assert(evaluationArray(i) <= evaluationArray(i - 1))
        i += 1
      }
    }
  }

} 
Example 167
Source File: BinaryClassificationEvaluatorSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class BinaryClassificationEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new BinaryClassificationEvaluator)
  }

  test("read/write") {
    val evaluator = new BinaryClassificationEvaluator()
      .setRawPredictionCol("myRawPrediction")
      .setLabelCol("myLabel")
      .setMetricName("areaUnderPR")
    testDefaultReadWrite(evaluator)
  }

  test("should accept both vector and double raw prediction col") {
    val evaluator = new BinaryClassificationEvaluator()
      .setMetricName("areaUnderPR")

    val vectorDF = Seq(
      (0d, Vectors.dense(12, 2.5)),
      (1d, Vectors.dense(1, 3)),
      (0d, Vectors.dense(10, 2))
    ).toDF("label", "rawPrediction")
    assert(evaluator.evaluate(vectorDF) === 1.0)

    val doubleDF = Seq(
      (0d, 0d),
      (1d, 1d),
      (0d, 0d)
    ).toDF("label", "rawPrediction")
    assert(evaluator.evaluate(doubleDF) === 1.0)

    val stringDF = Seq(
      (0d, "0d"),
      (1d, "1d"),
      (0d, "0d")
    ).toDF("label", "rawPrediction")
    val thrown = intercept[IllegalArgumentException] {
      evaluator.evaluate(stringDF)
    }
    assert(thrown.getMessage.replace("\n", "") contains "Column rawPrediction must be of type " +
      "equal to one of the following types: [DoubleType, ")
    assert(thrown.getMessage.replace("\n", "") contains "but was actually of type StringType.")
  }

  test("should support all NumericType labels and not support other types") {
    val evaluator = new BinaryClassificationEvaluator().setRawPredictionCol("prediction")
    MLTestingUtils.checkNumericTypes(evaluator, spark)
  }
} 
Example 168
Source File: MulticlassClassificationEvaluatorSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class MulticlassClassificationEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new MulticlassClassificationEvaluator)
  }

  test("read/write") {
    val evaluator = new MulticlassClassificationEvaluator()
      .setPredictionCol("myPrediction")
      .setLabelCol("myLabel")
      .setMetricName("accuracy")
    testDefaultReadWrite(evaluator)
  }

  test("should support all NumericType labels and not support other types") {
    MLTestingUtils.checkNumericTypes(new MulticlassClassificationEvaluator, spark)
  }
} 
Example 169
Source File: RegressionEvaluatorSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {
    // NOTE: the data-generation block from the upstream suite was elided by the example
    // extractor; the setup below is a reconstruction (assumed parameters) consistent with
    // the expected metric values asserted further down.
    val dataset = LinearDataGenerator.generateLinearInput(
      6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1)
      .map(_.asML).toDF()
    val trainer = new LinearRegression
    val model = trainer.fit(dataset)
    val predictions = model.transform(dataset)

    // default = rmse
    val evaluator = new RegressionEvaluator()
    assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01)

    // r2 score
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01)

    // mae
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01)
  }

  test("read/write") {
    val evaluator = new RegressionEvaluator()
      .setPredictionCol("myPrediction")
      .setLabelCol("myLabel")
      .setMetricName("r2")
    testDefaultReadWrite(evaluator)
  }

  test("should support all NumericType labels and not support other types") {
    MLTestingUtils.checkNumericTypes(new RegressionEvaluator, spark)
  }
} 
Example 170
Source File: RWrapperUtilsSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.r

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.{RFormula, RFormulaModel}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class RWrapperUtilsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("avoid libsvm data column name conflicting") {
    val rFormula = new RFormula().setFormula("label ~ features")
    val data = spark.read.format("libsvm").load("../data/mllib/sample_libsvm_data.txt")

    // without first resolving the conflicting column name, fit() throws IllegalArgumentException
    intercept[IllegalArgumentException] {
      rFormula.fit(data)
    }

    // after checkDataColumns resolves the conflict (by choosing a fresh features column name),
    // the model builds successfully
    RWrapperUtils.checkDataColumns(rFormula, data)

    assert(rFormula.getLabelCol == "label")
    assert(rFormula.getFeaturesCol.startsWith("features_"))

    val model = rFormula.fit(data)
    assert(model.isInstanceOf[RFormulaModel])

    assert(model.getLabelCol == "label")
    assert(model.getFeaturesCol.startsWith("features_"))
  }

} 
Example 171
Source File: LibSVMRelationSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.source.libsvm

import java.io.File
import java.nio.charset.StandardCharsets

import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.util.Utils


class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Path for dataset
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data")
    val file = new File(dir, "part-00000")
    Files.write(lines, file, StandardCharsets.UTF_8)
    path = dir.toURI.toString
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(new File(path))
    } finally {
      super.afterAll()
    }
  }

  test("select as sparse vector") {
    val df = spark.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = spark.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data and read it again") {
    val df = spark.read.format("libsvm").load(path)
    val tempDir2 = new File(tempDir, "read_write_test")
    val writepath = tempDir2.toURI.toString
    // TODO: Remove requirement to coalesce by supporting multiple reads.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath)

    val df2 = spark.read.format("libsvm").load(writepath)
    val row1 = df2.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data failed due to invalid schema") {
    val df = spark.read.format("text").load(path)
    intercept[SparkException] {
      df.write.format("libsvm").save(path + "_2")
    }
  }

  test("select features from libsvm relation") {
    val df = spark.read.format("libsvm").load(path)
    df.select("features").rdd.map { case Row(d: Vector) => d }.first
    df.select("features").collect
  }
} 
Example 172
Source File: StopwatchSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.util

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    assert(sw.name === "sw")
    assert(sw.elapsed() === 0L)
    assert(!sw.isRunning)
    intercept[AssertionError] {
      sw.stop()
    }
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    sw.start()
    assert(sw.isRunning)
    intercept[AssertionError] {
      sw.start()
    }
  }

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      acc.add(checkStopwatch(sw))
    }
    assert(!sw.isRunning)
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)
  }

  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
      .addLocal("local")
      .addDistributed("spark")
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
      sw("some")
    }
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      sw("local").start()
      val duration = checkStopwatch(sw("spark"))
      sw("local").stop()
      acc.add(duration)
    }
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)
  }
}

private object StopwatchSuite extends SparkFunSuite {

  // NOTE: the helper body was elided by the example extractor; this is a reconstruction of its
  // contract as used above: run the stopwatch briefly and return the duration it reports.
  def checkStopwatch(sw: Stopwatch): Long = {
    val ubStart = now
    sw.start()
    val lbStart = now
    Thread.sleep(50)
    val lbEnd = now
    val duration = sw.stop()
    val ubEnd = now
    // the reported duration must fall between the inner and outer wall-clock bounds
    assert(duration >= lbEnd - lbStart && duration <= ubEnd - ubStart)
    duration
  }
  private def now: Long = System.currentTimeMillis()
} 
Example 173
Source File: ANNSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.ann

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ANNSuite extends SparkFunSuite with MLlibTestSparkContext {

  // TODO: test for weights comparison with Weka MLP
  test("ANN with Sigmoid learns XOR function with LBFGS optimizer") {
    val inputs = Array(
      Array(0.0, 0.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0),
      Array(1.0, 1.0)
    )
    val outputs = Array(0.0, 1.0, 1.0, 0.0)
    val data = inputs.zip(outputs).map { case (features, label) =>
      (Vectors.dense(features), Vectors.dense(label))
    }
    val rddData = sc.parallelize(data, 1)
    val hiddenLayersTopology = Array(5)
    val dataSample = rddData.first()
    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
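    // Network topology: 2 inputs -> 5 hidden units -> 1 output.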
    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
    val initialWeights = FeedForwardModel(topology, 23124).weights
    val trainer = new FeedForwardTrainer(topology, 2, 1)
    trainer.setWeights(initialWeights)
    trainer.LBFGSOptimizer.setNumIterations(20)
    val model = trainer.train(rddData)
    val predictionAndLabels = rddData.map { case (input, label) =>
      (model.predict(input)(0), label(0))
    }.collect()
    predictionAndLabels.foreach { case (p, l) =>
      assert(math.round(p) === l)
    }
  }

  test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") {
    val inputs = Array(
      Array(0.0, 0.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0),
      Array(1.0, 1.0)
    )
    val outputs = Array(
      Array(1.0, 0.0),
      Array(0.0, 1.0),
      Array(0.0, 1.0),
      Array(1.0, 0.0)
    )
    val data = inputs.zip(outputs).map { case (features, label) =>
      (Vectors.dense(features), Vectors.dense(label))
    }
    val rddData = sc.parallelize(data, 1)
    val hiddenLayersTopology = Array(5)
    val dataSample = rddData.first()
    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
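    // Network topology: 2 inputs -> 5 hidden units -> 2 (one-hot) outputs.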
    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
    val initialWeights = FeedForwardModel(topology, 23124).weights
    val trainer = new FeedForwardTrainer(topology, 2, 2)
    // TODO: add a test for SGD
    trainer.LBFGSOptimizer.setConvergenceTol(1e-4).setNumIterations(20)
    trainer.setWeights(initialWeights).setStackSize(1)
    val model = trainer.train(rddData)
    val predictionAndLabels = rddData.map { case (input, label) =>
      (model.predict(input), label)
    }.collect()
    predictionAndLabels.foreach { case (p, l) =>
      assert(p ~== l absTol 0.5)
    }
  }
} 
Example 174
Source File: GradientSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.ann

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext

class GradientSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("Gradient computation against numerical differentiation") {
    val input = new BDM[Double](3, 1, Array(1.0, 1.0, 1.0))
    // the target must be a one-hot vector (all zeros with a single 1) for SoftMax
    val target = new BDM[Double](2, 1, Array(0.0, 1.0))
    val topology = FeedForwardTopology.multiLayerPerceptron(Array(3, 4, 2), softmaxOnTop = false)
    val layersWithErrors = Seq(
      new SigmoidLayerWithSquaredError(),
      new SoftmaxLayerWithCrossEntropyLoss()
    )
    // check all layers that provide loss computation
    // 1) compute loss and gradient given the model and initial weights
    // 2) modify weights with small number epsilon (per dimension i)
    // 3) compute new loss
    // 4) ((newLoss - loss) / epsilon) should be close to the i-th component of the gradient
    for (layerWithError <- layersWithErrors) {
      topology.layers(topology.layers.length - 1) = layerWithError
      val model = topology.model(seed = 12L)
      val weights = model.weights.toArray
      val numWeights = weights.size
      val gradient = Vectors.dense(Array.fill[Double](numWeights)(0.0))
      val loss = model.computeGradient(input, target, gradient, 1)
      val eps = 1e-4
      var i = 0
      val tol = 1e-4
      while (i < numWeights) {
        val originalValue = weights(i)
        weights(i) += eps
        val newModel = topology.model(Vectors.dense(weights))
        val newLoss = computeLoss(input, target, newModel)
        val derivativeEstimate = (newLoss - loss) / eps
        assert(math.abs(gradient(i) - derivativeEstimate) < tol, "Layer failed gradient check: " +
          layerWithError.getClass)
        weights(i) = originalValue
        i += 1
      }
    }
  }

  private def computeLoss(input: BDM[Double], target: BDM[Double], model: TopologyModel): Double = {
    val outputs = model.forward(input)
    model.layerModels.last match {
      case layerWithLoss: LossFunction =>
        layerWithLoss.loss(outputs.last, target, new BDM[Double](target.rows, target.cols))
      case _ =>
        throw new UnsupportedOperationException("Top layer is required to have loss." +
          " Failed layer:" + model.layerModels.last.getClass)
    }
  }
} 
Example 175
Source File: RandomForestRegressorSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.regression

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.tree.impl.TreeTests
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => OldRandomForest}
import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

// NOTE: the test class body was elided by the example extractor; only the helper object that
// compares the old mllib and new ml RandomForest APIs is shown.
private object RandomForestRegressorSuite extends SparkFunSuite {
  def compareAPIs(
      data: RDD[LabeledPoint],
      rf: RandomForestRegressor,
      categoricalFeatures: Map[Int, Int]): Unit = {
    val numFeatures = data.first().features.size
    val oldStrategy =
      rf.getOldStrategy(categoricalFeatures, numClasses = 0, OldAlgo.Regression, rf.getOldImpurity)
    val oldModel = OldRandomForest.trainRegressor(data.map(OldLabeledPoint.fromML), oldStrategy,
      rf.getNumTrees, rf.getFeatureSubsetStrategy, rf.getSeed.toInt)
    val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
    val newModel = rf.fit(newData)
    // Use the parent from newModel since it is not checked anyway.
    val oldModelAsNew = RandomForestRegressionModel.fromOld(
      oldModel, newModel.parent.asInstanceOf[RandomForestRegressor], categoricalFeatures)
    TreeTests.checkEqual(oldModelAsNew, newModel)
    assert(newModel.numFeatures === numFeatures)
  }
} 
Example 176
Source File: ChiSqSelectorSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.util.Utils

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {

  

  test("ChiSqSelector transform test (sparse & dense vector)") {
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2)
    val preFilteredData =
      Set(LabeledPoint(0.0, Vectors.dense(Array(8.0))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0))))
    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
    val filteredData = labeledDiscreteData.map { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    }.collect().toSet
    assert(filteredData == preFilteredData)
  }

  test("ChiSqSelector by FPR transform test (sparse & dense vector)") {
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2)
    val preFilteredData =
      Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(4.0))),
        LabeledPoint(1.0, Vectors.dense(Array(4.0))),
        LabeledPoint(2.0, Vectors.dense(Array(9.0))))
    val model = new ChiSqSelector().setSelectorType("fpr").setAlpha(0.1).fit(labeledDiscreteData)
    val filteredData = labeledDiscreteData.map { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    }.collect().toSet
    assert(filteredData == preFilteredData)
  }

  test("model load / save") {
    val model = ChiSqSelectorSuite.createModel()
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString
    try {
      model.save(sc, path)
      val sameModel = ChiSqSelectorModel.load(sc, path)
      ChiSqSelectorSuite.checkEqual(model, sameModel)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}

object ChiSqSelectorSuite extends SparkFunSuite {

  def createModel(): ChiSqSelectorModel = {
    val arr = Array(1, 2, 3, 4)
    new ChiSqSelectorModel(arr)
  }

  def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = {
    assert(a.selectedFeatures.deep == b.selectedFeatures.deep)
  }
} 
Example 177
Source File: ElementwiseProductSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("elementwise (hadamard) product should properly apply vector to dense data set") {
    val denseData = Array(
      Vectors.dense(1.0, 4.0, 1.9, -9.0)
    )
    val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25)
    val transformer = new ElementwiseProduct(scalingVec)
    val transformedData = transformer.transform(sc.makeRDD(denseData))
    val transformedVecs = transformedData.collect()
    val transformedVec = transformedVecs(0)
    val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25)
    assert(transformedVec ~== expectedVec absTol 1E-5,
      s"Expected transformed vector $expectedVec but found $transformedVec")
  }

  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
    val sparseData = Array(
      Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
    )
    val dataRDD = sc.parallelize(sparseData, 3)
    val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
    val transformer = new ElementwiseProduct(scalingVec)
    val data2 = sparseData.map(transformer.transform)
    val data2RDD = transformer.transform(dataRDD)

    assert((sparseData, data2, data2RDD.collect()).zipped.forall {
      case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after hadamard product")

    assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
    assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
  }
} 
Example 178
Source File: IDFSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class IDFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("idf") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    )
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF
    val model = idf.fit(termFrequencies)
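    // Per-feature document frequencies are (0, 3, 1, 2); the IDF formula is log((m + 1) / (df + 1)).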
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((m + 1.0) / (x + 1.0))
    })
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    }
    // Transforms an RDD
    val tfidf = model.transform(termFrequencies).collect()
    assertHelper(tfidf)
    // Transforms local vectors
    val localTfidf = localTermFrequencies.map(model.transform(_)).toArray
    assertHelper(localTfidf)
  }

  test("idf minimum document frequency filtering") {
    val n = 4
    val localTermFrequencies = Seq(
      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(n, Array(1), Array(1.0))
    )
    val m = localTermFrequencies.size
    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
    val idf = new IDF(minDocFreq = 1)
    val model = idf.fit(termFrequencies)
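    // With minDocFreq = 1, features seen in fewer than 1 document (here feature 0) get an IDF of 0.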
    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) {
        math.log((m + 1.0) / (x + 1.0))
      } else {
        0
      }
    })
    assert(model.idf ~== expected absTol 1e-12)

    val assertHelper = (tfidf: Array[Vector]) => {
      assert(tfidf.size === 3)
      val tfidf0 = tfidf(0).asInstanceOf[SparseVector]
      assert(tfidf0.indices === Array(1, 3))
      assert(Vectors.dense(tfidf0.values) ~==
          Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
      val tfidf1 = tfidf(1).asInstanceOf[DenseVector]
      assert(Vectors.dense(tfidf1.values) ~==
          Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
      val tfidf2 = tfidf(2).asInstanceOf[SparseVector]
      assert(tfidf2.indices === Array(1))
      assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
    }
    // Transforms an RDD
    val tfidf = model.transform(termFrequencies).collect()
    assertHelper(tfidf)
    // Transforms local vectors
    val localTfidf = localTermFrequencies.map(model.transform(_)).toArray
    assertHelper(localTfidf)
  }

} 
Example 179
Source File: PCASuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class PCASuite extends SparkFunSuite with MLlibTestSparkContext {

  private val data = Array(
    Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
    Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
    Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
  )

  private lazy val dataRDD = sc.parallelize(data, 2)

  test("Correct computing use a PCA wrapper") {
    val k = dataRDD.count().toInt
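    // k is set to the number of rows (3); the PCA wrapper and the RowMatrix path should agree on the top-k projection.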
    val pca = new PCA(k).fit(dataRDD)

    val mat = new RowMatrix(dataRDD)
    val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k)

    val pca_transform = pca.transform(dataRDD).collect()
    val mat_multiply = mat.multiply(pc).rows.collect()

    pca_transform.zip(mat_multiply).foreach { case (calculated, expected) =>
      assert(calculated ~== expected relTol 1e-8)
    }
    assert(pca.explainedVariance ~== explainedVariance relTol 1e-8)
  }
} 
Example 180
Source File: HashingTFSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("hashing tf on a single doc") {
    val hashingTF = new HashingTF(1000)
    val doc = "a a b b c d".split(" ")
    val n = hashingTF.numFeatures
    val termFreqs = Seq(
      (hashingTF.indexOf("a"), 2.0),
      (hashingTF.indexOf("b"), 2.0),
      (hashingTF.indexOf("c"), 1.0),
      (hashingTF.indexOf("d"), 1.0))
    assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n),
      "index must be in range [0, #features)")
    assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing")
    val expected = Vectors.sparse(n, termFreqs)
    assert(hashingTF.transform(doc) === expected)
  }

  test("hashing tf on an RDD") {
    val hashingTF = new HashingTF
    val localDocs: Seq[Seq[String]] = Seq(
      "a a b b b c d".split(" "),
      "a b c d a b c".split(" "),
      "c b a c b a a".split(" "))
    val docs = sc.parallelize(localDocs, 2)
    assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet)
  }

  test("applying binary term freqs") {
    val hashingTF = new HashingTF(100).setBinary(true)
    val doc = "a a b c c c".split(" ")
    val n = hashingTF.numFeatures
    val expected = Vectors.sparse(n, Seq(
      (hashingTF.indexOf("a"), 1.0),
      (hashingTF.indexOf("b"), 1.0),
      (hashingTF.indexOf("c"), 1.0)))
    assert(hashingTF.transform(doc) ~== expected absTol 1e-14)
  }
} 
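The IDF test shown earlier and the HashingTF suite above exercise the two halves of the usual TF-IDF pipeline. A minimal end-to-end sketch, assuming a SparkContext `sc` is in scope (as provided by MLlibTestSparkContext); the documents and feature count below are illustrative, not taken from the suites:

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// Tokenized documents (illustrative data).
val docs: RDD[Seq[String]] = sc.parallelize(Seq(
  "a a b b c".split(" ").toSeq,
  "a b c d".split(" ").toSeq), 2)

val hashingTF = new HashingTF(1 << 10)            // 1024 hashed term features
val tf: RDD[Vector] = hashingTF.transform(docs)   // raw term frequencies per document
tf.cache()                                        // tf is reused by fit and transform below
val idfModel = new IDF(minDocFreq = 1).fit(tf)    // learn inverse document frequencies
val tfidf: RDD[Vector] = idfModel.transform(tf)   // TF-IDF weighted vectors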
Example 181
Source File: MatrixFactorizationModelSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.recommendation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils

class MatrixFactorizationModelSuite extends SparkFunSuite with MLlibTestSparkContext {

  val rank = 2
  var userFeatures: RDD[(Int, Array[Double])] = _
  var prodFeatures: RDD[(Int, Array[Double])] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    userFeatures = sc.parallelize(Seq((0, Array(1.0, 2.0)), (1, Array(3.0, 4.0))))
    prodFeatures = sc.parallelize(Seq((2, Array(5.0, 6.0))))
  }

  test("constructor") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    assert(model.predict(0, 2) ~== 17.0 relTol 1e-14)

    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(1, userFeatures, prodFeatures)
    }

    val userFeatures1 = sc.parallelize(Seq((0, Array(1.0)), (1, Array(3.0))))
    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(rank, userFeatures1, prodFeatures)
    }

    val prodFeatures1 = sc.parallelize(Seq((2, Array(5.0))))
    intercept[IllegalArgumentException] {
      new MatrixFactorizationModel(rank, userFeatures, prodFeatures1)
    }
  }

  test("save/load") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val tempDir = Utils.createTempDir()
    val path = tempDir.toURI.toString
    def collect(features: RDD[(Int, Array[Double])]): Set[(Int, Seq[Double])] = {
      features.mapValues(_.toSeq).collect().toSet
    }
    try {
      model.save(sc, path)
      val newModel = MatrixFactorizationModel.load(sc, path)
      assert(newModel.rank === rank)
      assert(collect(newModel.userFeatures) === collect(userFeatures))
      assert(collect(newModel.productFeatures) === collect(prodFeatures))
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }

  test("batch predict API recommendProductsForUsers") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val topK = 10
    val recommendations = model.recommendProductsForUsers(topK).collectAsMap()

    assert(recommendations(0)(0).rating ~== 17.0 relTol 1e-14)
    assert(recommendations(1)(0).rating ~== 39.0 relTol 1e-14)
  }

  test("batch predict API recommendUsersForProducts") {
    val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)
    val topK = 10
    val recommendations = model.recommendUsersForProducts(topK).collectAsMap()

    assert(recommendations(2)(0).user == 1)
    assert(recommendations(2)(0).rating ~== 39.0 relTol 1e-14)
    assert(recommendations(2)(1).user == 0)
    assert(recommendations(2)(1).rating ~== 17.0 relTol 1e-14)
  }
} 
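Besides the batch recommendProductsForUsers / recommendUsersForProducts calls tested above, MatrixFactorizationModel also supports pointwise and RDD-based scoring. A minimal sketch reusing rank, userFeatures, and prodFeatures from the suite above (a hedged illustration, not part of the original test):

val model = new MatrixFactorizationModel(rank, userFeatures, prodFeatures)

// Pointwise score for (user 0, product 2): dot(Array(1.0, 2.0), Array(5.0, 6.0)) = 17.0
val score: Double = model.predict(0, 2)

// Score a batch of (user, product) pairs as an RDD of Ratings.
val pairs = sc.parallelize(Seq((0, 2), (1, 2)))
val ratings = model.predict(pairs).collect()

// Top-N recommendations for a single user.
val topForUser0 = model.recommendProducts(0, 1)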
Example 182
Source File: MultilabelMetricsSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD

class MultilabelMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("Multilabel evaluation metrics") {
    
    val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize(
      Seq((Array(0.0, 1.0), Array(0.0, 2.0)),
        (Array(0.0, 2.0), Array(0.0, 1.0)),
        (Array.empty[Double], Array(0.0)),
        (Array(2.0), Array(2.0)),
        (Array(2.0, 0.0), Array(2.0, 0.0)),
        (Array(0.0, 1.0, 2.0), Array(0.0, 1.0)),
        (Array(1.0), Array(1.0, 2.0))), 2)
    val metrics = new MultilabelMetrics(scoreAndLabels)
    val delta = 0.00001
    val precision0 = 4.0 / (4 + 0)
    val precision1 = 2.0 / (2 + 1)
    val precision2 = 2.0 / (2 + 2)
    val recall0 = 4.0 / (4 + 1)
    val recall1 = 2.0 / (2 + 1)
    val recall2 = 2.0 / (2 + 2)
    val f1measure0 = 2 * precision0 * recall0 / (precision0 + recall0)
    val f1measure1 = 2 * precision1 * recall1 / (precision1 + recall1)
    val f1measure2 = 2 * precision2 * recall2 / (precision2 + recall2)
    val sumTp = 4 + 2 + 2
    assert(sumTp == (1 + 1 + 0 + 1 + 2 + 2 + 1))
    val microPrecisionClass = sumTp.toDouble / (4 + 0 + 2 + 1 + 2 + 2)
    val microRecallClass = sumTp.toDouble / (4 + 1 + 2 + 1 + 2 + 2)
    val microF1MeasureClass = 2.0 * sumTp.toDouble /
      (2 * sumTp.toDouble + (1 + 1 + 2) + (0 + 1 + 2))
    val macroPrecisionDoc = 1.0 / 7 *
      (1.0 / 2 + 1.0 / 2 + 0 + 1.0 / 1 + 2.0 / 2 + 2.0 / 3 + 1.0 / 1.0)
    val macroRecallDoc = 1.0 / 7 *
      (1.0 / 2 + 1.0 / 2 + 0 / 1 + 1.0 / 1 + 2.0 / 2 + 2.0 / 2 + 1.0 / 2)
    val macroF1MeasureDoc = (1.0 / 7) *
      2 * ( 1.0 / (2 + 2) + 1.0 / (2 + 2) + 0 + 1.0 / (1 + 1) +
        2.0 / (2 + 2) + 2.0 / (3 + 2) + 1.0 / (1 + 2) )
    val hammingLoss = (1.0 / (7 * 3)) * (2 + 2 + 1 + 0 + 0 + 1 + 1)
    val strictAccuracy = 2.0 / 7
    val accuracy = 1.0 / 7 * (1.0 / 3 + 1.0 / 3 + 0 + 1.0 / 1 + 2.0 / 2 + 2.0 / 3 + 1.0 / 2)
    assert(math.abs(metrics.precision(0.0) - precision0) < delta)
    assert(math.abs(metrics.precision(1.0) - precision1) < delta)
    assert(math.abs(metrics.precision(2.0) - precision2) < delta)
    assert(math.abs(metrics.recall(0.0) - recall0) < delta)
    assert(math.abs(metrics.recall(1.0) - recall1) < delta)
    assert(math.abs(metrics.recall(2.0) - recall2) < delta)
    assert(math.abs(metrics.f1Measure(0.0) - f1measure0) < delta)
    assert(math.abs(metrics.f1Measure(1.0) - f1measure1) < delta)
    assert(math.abs(metrics.f1Measure(2.0) - f1measure2) < delta)
    assert(math.abs(metrics.microPrecision - microPrecisionClass) < delta)
    assert(math.abs(metrics.microRecall - microRecallClass) < delta)
    assert(math.abs(metrics.microF1Measure - microF1MeasureClass) < delta)
    assert(math.abs(metrics.precision - macroPrecisionDoc) < delta)
    assert(math.abs(metrics.recall - macroRecallDoc) < delta)
    assert(math.abs(metrics.f1Measure - macroF1MeasureDoc) < delta)
    assert(math.abs(metrics.hammingLoss - hammingLoss) < delta)
    assert(math.abs(metrics.subsetAccuracy - strictAccuracy) < delta)
    assert(math.abs(metrics.accuracy - accuracy) < delta)
    assert(metrics.labels.sameElements(Array(0.0, 1.0, 2.0)))
  }
} 
Example 183
Source File: RankingMetricsSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("Ranking metrics: MAP, NDCG") {
    val predictionAndLabels = sc.parallelize(
      Seq(
        (Array(1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Array(1, 2, 3, 4, 5)),
        (Array(4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Array(1, 2, 3)),
        (Array(1, 2, 3, 4, 5), Array.empty[Int])
      ), 2)
    val eps = 1.0E-5

    val metrics = new RankingMetrics(predictionAndLabels)
    val map = metrics.meanAveragePrecision

    assert(metrics.precisionAt(1) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(2) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(3) ~== 1.0/3 absTol eps)
    assert(metrics.precisionAt(4) ~== 0.75/3 absTol eps)
    assert(metrics.precisionAt(5) ~== 0.8/3 absTol eps)
    assert(metrics.precisionAt(10) ~== 0.8/3 absTol eps)
    assert(metrics.precisionAt(15) ~== 8.0/45 absTol eps)

    assert(map ~== 0.355026 absTol eps)

    assert(metrics.ndcgAt(3) ~== 1.0/3 absTol eps)
    assert(metrics.ndcgAt(5) ~== 0.328788 absTol eps)
    assert(metrics.ndcgAt(10) ~== 0.487913 absTol eps)
    assert(metrics.ndcgAt(15) ~== metrics.ndcgAt(10) absTol eps)
  }

  test("MAP, NDCG with few predictions (SPARK-14886)") {
    val predictionAndLabels = sc.parallelize(
      Seq(
        (Array(1, 6, 2), Array(1, 2, 3, 4, 5)),
        (Array.empty[Int], Array(1, 2, 3))
      ), 2)
    val eps = 1.0E-5

    val metrics = new RankingMetrics(predictionAndLabels)
    assert(metrics.precisionAt(1) ~== 0.5 absTol eps)
    assert(metrics.precisionAt(2) ~== 0.25 absTol eps)
    assert(metrics.ndcgAt(1) ~== 0.5 absTol eps)
    assert(metrics.ndcgAt(2) ~== 0.30657 absTol eps)
  }

} 
Example 184
Source File: AreaUnderCurveSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class AreaUnderCurveSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("auc computation") {
    val curve = Seq((0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 0.0))
    val auc = 4.0
    assert(AreaUnderCurve.of(curve) ~== auc absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== auc absTol 1E-5)
  }

  test("auc of an empty curve") {
    val curve = Seq.empty[(Double, Double)]
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }

  test("auc of a curve with a single point") {
    val curve = Seq((1.0, 1.0))
    assert(AreaUnderCurve.of(curve) ~== 0.0 absTol 1E-5)
    val rddCurve = sc.parallelize(curve, 2)
    assert(AreaUnderCurve.of(rddCurve) ~== 0.0 absTol 1E-5)
  }
} 
Example 185
Source File: RegressionMetricsSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class RegressionMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
  val obs = List[Double](77, 85, 62, 55, 63, 88, 57, 81, 51)
  val eps = 1E-5

  test("regression metrics for unbiased (includes intercept term) predictor") {
    
    val preds = obs
    val predictionAndObservations = sc.parallelize(preds.zip(obs), 2)
    val metrics = new RegressionMetrics(predictionAndObservations)
    assert(metrics.explainedVariance ~== 174.83951 absTol eps,
      "explained variance regression score mismatch")
    assert(metrics.meanAbsoluteError ~== 0.0 absTol eps, "mean absolute error mismatch")
    assert(metrics.meanSquaredError ~== 0.0 absTol eps, "mean squared error mismatch")
    assert(metrics.rootMeanSquaredError ~== 0.0 absTol eps,
      "root mean squared error mismatch")
    assert(metrics.r2 ~== 1.0 absTol eps, "r2 score mismatch")
  }
} 
Example 186
Source File: MulticlassMetricsSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.util.MLlibTestSparkContext

class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("Multiclass evaluation metrics") {
    
    val confusionMatrix = Matrices.dense(3, 3, Array(2, 1, 0, 1, 3, 0, 1, 0, 1))
    val labels = Array(0.0, 1.0, 2.0)
    val predictionAndLabels = sc.parallelize(
      Seq((0.0, 0.0), (0.0, 1.0), (0.0, 0.0), (1.0, 0.0), (1.0, 1.0),
        (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)), 2)
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val delta = 0.0000001
    val tpRate0 = 2.0 / (2 + 2)
    val tpRate1 = 3.0 / (3 + 1)
    val tpRate2 = 1.0 / (1 + 0)
    val fpRate0 = 1.0 / (9 - 4)
    val fpRate1 = 1.0 / (9 - 4)
    val fpRate2 = 1.0 / (9 - 1)
    val precision0 = 2.0 / (2 + 1)
    val precision1 = 3.0 / (3 + 1)
    val precision2 = 1.0 / (1 + 1)
    val recall0 = 2.0 / (2 + 2)
    val recall1 = 3.0 / (3 + 1)
    val recall2 = 1.0 / (1 + 0)
    val f1measure0 = 2 * precision0 * recall0 / (precision0 + recall0)
    val f1measure1 = 2 * precision1 * recall1 / (precision1 + recall1)
    val f1measure2 = 2 * precision2 * recall2 / (precision2 + recall2)
    val f2measure0 = (1 + 2 * 2) * precision0 * recall0 / (2 * 2 * precision0 + recall0)
    val f2measure1 = (1 + 2 * 2) * precision1 * recall1 / (2 * 2 * precision1 + recall1)
    val f2measure2 = (1 + 2 * 2) * precision2 * recall2 / (2 * 2 * precision2 + recall2)

    assert(metrics.confusionMatrix.toArray.sameElements(confusionMatrix.toArray))
    assert(math.abs(metrics.truePositiveRate(0.0) - tpRate0) < delta)
    assert(math.abs(metrics.truePositiveRate(1.0) - tpRate1) < delta)
    assert(math.abs(metrics.truePositiveRate(2.0) - tpRate2) < delta)
    assert(math.abs(metrics.falsePositiveRate(0.0) - fpRate0) < delta)
    assert(math.abs(metrics.falsePositiveRate(1.0) - fpRate1) < delta)
    assert(math.abs(metrics.falsePositiveRate(2.0) - fpRate2) < delta)
    assert(math.abs(metrics.precision(0.0) - precision0) < delta)
    assert(math.abs(metrics.precision(1.0) - precision1) < delta)
    assert(math.abs(metrics.precision(2.0) - precision2) < delta)
    assert(math.abs(metrics.recall(0.0) - recall0) < delta)
    assert(math.abs(metrics.recall(1.0) - recall1) < delta)
    assert(math.abs(metrics.recall(2.0) - recall2) < delta)
    assert(math.abs(metrics.fMeasure(0.0) - f1measure0) < delta)
    assert(math.abs(metrics.fMeasure(1.0) - f1measure1) < delta)
    assert(math.abs(metrics.fMeasure(2.0) - f1measure2) < delta)
    assert(math.abs(metrics.fMeasure(0.0, 2.0) - f2measure0) < delta)
    assert(math.abs(metrics.fMeasure(1.0, 2.0) - f2measure1) < delta)
    assert(math.abs(metrics.fMeasure(2.0, 2.0) - f2measure2) < delta)

    assert(math.abs(metrics.accuracy -
      (2.0 + 3.0 + 1.0) / ((2 + 3 + 1) + (1 + 1 + 1))) < delta)
    assert(math.abs(metrics.accuracy - metrics.precision) < delta)
    assert(math.abs(metrics.accuracy - metrics.recall) < delta)
    assert(math.abs(metrics.accuracy - metrics.fMeasure) < delta)
    assert(math.abs(metrics.accuracy - metrics.weightedRecall) < delta)
    assert(math.abs(metrics.weightedTruePositiveRate -
      ((4.0 / 9) * tpRate0 + (4.0 / 9) * tpRate1 + (1.0 / 9) * tpRate2)) < delta)
    assert(math.abs(metrics.weightedFalsePositiveRate -
      ((4.0 / 9) * fpRate0 + (4.0 / 9) * fpRate1 + (1.0 / 9) * fpRate2)) < delta)
    assert(math.abs(metrics.weightedPrecision -
      ((4.0 / 9) * precision0 + (4.0 / 9) * precision1 + (1.0 / 9) * precision2)) < delta)
    assert(math.abs(metrics.weightedRecall -
      ((4.0 / 9) * recall0 + (4.0 / 9) * recall1 + (1.0 / 9) * recall2)) < delta)
    assert(math.abs(metrics.weightedFMeasure -
      ((4.0 / 9) * f1measure0 + (4.0 / 9) * f1measure1 + (1.0 / 9) * f1measure2)) < delta)
    assert(math.abs(metrics.weightedFMeasure(2.0) -
      ((4.0 / 9) * f2measure0 + (4.0 / 9) * f2measure1 + (1.0 / 9) * f2measure2)) < delta)
    assert(metrics.labels.sameElements(labels))
  }
} 
Example 187
Source File: FPTreeSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.fpm

import scala.language.existentials

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class FPTreeSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("add transaction") {
    val tree = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("b"))

    assert(tree.root.children.size == 2)
    assert(tree.root.children.contains("a"))
    assert(tree.root.children("a").item.equals("a"))
    assert(tree.root.children("a").count == 2)
    assert(tree.root.children.contains("b"))
    assert(tree.root.children("b").item.equals("b"))
    assert(tree.root.children("b").count == 1)
    var child = tree.root.children("a")
    assert(child.children.size == 1)
    assert(child.children.contains("b"))
    assert(child.children("b").item.equals("b"))
    assert(child.children("b").count == 2)
    child = child.children("b")
    assert(child.children.size == 2)
    assert(child.children.contains("c"))
    assert(child.children.contains("y"))
    assert(child.children("c").item.equals("c"))
    assert(child.children("y").item.equals("y"))
    assert(child.children("c").count == 1)
    assert(child.children("y").count == 1)
  }

  test("merge tree") {
    val tree1 = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("b"))

    val tree2 = new FPTree[String]
      .add(Seq("a", "b"))
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "c", "d"))
      .add(Seq("a", "x"))
      .add(Seq("a", "x", "y"))
      .add(Seq("c", "n"))
      .add(Seq("c", "m"))

    val tree3 = tree1.merge(tree2)

    assert(tree3.root.children.size == 3)
    assert(tree3.root.children("a").count == 7)
    assert(tree3.root.children("b").count == 1)
    assert(tree3.root.children("c").count == 2)
    val child1 = tree3.root.children("a")
    assert(child1.children.size == 2)
    assert(child1.children("b").count == 5)
    assert(child1.children("x").count == 2)
    val child2 = child1.children("b")
    assert(child2.children.size == 2)
    assert(child2.children("y").count == 1)
    assert(child2.children("c").count == 3)
    val child3 = child2.children("c")
    assert(child3.children.size == 1)
    assert(child3.children("d").count == 1)
    val child4 = child1.children("x")
    assert(child4.children.size == 1)
    assert(child4.children("y").count == 1)
    val child5 = tree3.root.children("c")
    assert(child5.children.size == 2)
    assert(child5.children("n").count == 1)
    assert(child5.children("m").count == 1)
  }

  test("extract freq itemsets") {
    val tree = new FPTree[String]
      .add(Seq("a", "b", "c"))
      .add(Seq("a", "b", "y"))
      .add(Seq("a", "b"))
      .add(Seq("a"))
      .add(Seq("b"))
      .add(Seq("b", "n"))

    val freqItemsets = tree.extract(3L).map { case (items, count) =>
      (items.toSet, count)
    }.toSet
    val expected = Set(
      (Set("a"), 4L),
      (Set("b"), 5L),
      (Set("a", "b"), 3L))
    assert(freqItemsets === expected)
  }
} 
Example 188
Source File: AssociationRulesSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.fpm

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("association rules using String type") {
    val freqItemsets = sc.parallelize(Seq(
      (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L),
      (Set("r"), 3L),
      (Set("x", "z"), 3L), (Set("t", "y"), 3L), (Set("t", "x"), 3L), (Set("s", "x"), 3L),
      (Set("y", "x"), 3L), (Set("y", "z"), 3L), (Set("t", "z"), 3L),
      (Set("y", "x", "z"), 3L), (Set("t", "x", "z"), 3L), (Set("t", "y", "z"), 3L),
      (Set("t", "y", "x"), 3L),
      (Set("t", "y", "x", "z"), 3L)
    ).map {
      case (items, freq) => new FPGrowth.FreqItemset(items.toArray, freq)
    })

    val ar = new AssociationRules()

    val results1 = ar
      .setMinConfidence(0.9)
      .run(freqItemsets)
      .collect()

    
    assert(results2.size === 30)
    assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
  }
} 
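Each AssociationRules.Rule produced above carries an antecedent, a consequent, and a confidence. A small inspection sketch over the rules collected in the test (the printed format is illustrative):

// Rank rules by confidence and print the strongest ones.
val ranked = results1.sortBy(-_.confidence)
ranked.take(5).foreach { rule =>
  println(s"${rule.antecedent.mkString("{", ",", "}")} => " +
    s"${rule.consequent.mkString("{", ",", "}")} (confidence = ${rule.confidence})")
}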
Example 189
Source File: KernelDensitySuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
} 
Example 190
Source File: MultivariateGaussianSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat.distribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("univariate") {
    val x1 = Vectors.dense(0.0)
    val x2 = Vectors.dense(1.5)

    val mu = Vectors.dense(0.0)
    val sigma1 = Matrices.dense(1, 1, Array(1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)

    val sigma2 = Matrices.dense(1, 1, Array(4.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
  }

  test("multivariate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)

    val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
  }

  test("multivariate degenerate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
    val dist = new MultivariateGaussian(mu, sigma)
    assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
    assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)
  }

  test("SPARK-11302") {
    val x = Vectors.dense(629, 640, 1.7188, 618.19)
    val mu = Vectors.dense(
      1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697)
    val sigma = Matrices.dense(4, 4, Array(
      166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053,
      169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484,
      12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373,
      164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207))
    val dist = new MultivariateGaussian(mu, sigma)
    // Agrees with R's dmvnorm: 7.154782e-05
    assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9)
  }

} 
Example 191
Source File: CoordinateMatrixSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.linalg.distributed

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext

class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {

  val m = 5
  val n = 4
  var mat: CoordinateMatrix = _

  override def beforeAll() {
    super.beforeAll()
    val entries = sc.parallelize(Seq(
      (0, 0, 1.0),
      (0, 1, 2.0),
      (1, 1, 3.0),
      (1, 2, 4.0),
      (2, 2, 5.0),
      (2, 3, 6.0),
      (3, 0, 7.0),
      (3, 3, 8.0),
      (4, 1, 9.0)), 3).map { case (i, j, value) =>
      MatrixEntry(i, j, value)
    }
    mat = new CoordinateMatrix(entries)
  }

  test("size") {
    assert(mat.numRows() === m)
    assert(mat.numCols() === n)
  }

  test("empty entries") {
    val entries = sc.parallelize(Seq[MatrixEntry](), 1)
    val emptyMat = new CoordinateMatrix(entries)
    intercept[RuntimeException] {
      emptyMat.numCols()
    }
    intercept[RuntimeException] {
      emptyMat.numRows()
    }
  }

  test("toBreeze") {
    val expected = BDM(
      (1.0, 2.0, 0.0, 0.0),
      (0.0, 3.0, 4.0, 0.0),
      (0.0, 0.0, 5.0, 6.0),
      (7.0, 0.0, 0.0, 8.0),
      (0.0, 9.0, 0.0, 0.0))
    assert(mat.toBreeze() === expected)
  }

  test("transpose") {
    val transposed = mat.transpose()
    assert(mat.toBreeze().t === transposed.toBreeze())
  }

  test("toIndexedRowMatrix") {
    val indexedRowMatrix = mat.toIndexedRowMatrix()
    val expected = BDM(
      (1.0, 2.0, 0.0, 0.0),
      (0.0, 3.0, 4.0, 0.0),
      (0.0, 0.0, 5.0, 6.0),
      (7.0, 0.0, 0.0, 8.0),
      (0.0, 9.0, 0.0, 0.0))
    assert(indexedRowMatrix.toBreeze() === expected)
  }

  test("toRowMatrix") {
    val rowMatrix = mat.toRowMatrix()
    val rows = rowMatrix.rows.collect().toSet
    val expected = Set(
      Vectors.dense(1.0, 2.0, 0.0, 0.0),
      Vectors.dense(0.0, 3.0, 4.0, 0.0),
      Vectors.dense(0.0, 0.0, 5.0, 6.0),
      Vectors.dense(7.0, 0.0, 0.0, 8.0),
      Vectors.dense(0.0, 9.0, 0.0, 0.0))
    assert(rows === expected)
  }

  test("toBlockMatrix") {
    val blockMat = mat.toBlockMatrix(2, 2)
    assert(blockMat.numRows() === m)
    assert(blockMat.numCols() === n)
    assert(blockMat.toBreeze() === mat.toBreeze())

    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(-1, 2)
    }
    intercept[IllegalArgumentException] {
      mat.toBlockMatrix(2, 0)
    }
  }
} 
Example 192
Source File: MLPairRDDFunctionsSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.rdd.MLPairRDDFunctions._
import org.apache.spark.mllib.util.MLlibTestSparkContext

class MLPairRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {
  test("topByKey") {
    val topMap = sc.parallelize(Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7),
      (5, 1), (3, 5)), 2)
      .topByKey(5)
      .collectAsMap()

    assert(topMap.size === 3)
    assert(topMap(1) === Array(7, 6, 3, 2, 1))
    assert(topMap(3) === Array(7, 5, 2))
    assert(topMap(5) === Array(1))
  }
} 
Example 193
Source File: RDDFunctionsSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.rdd

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.rdd.RDDFunctions._
import org.apache.spark.mllib.util.MLlibTestSparkContext

class RDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("sliding") {
    val data = 0 until 6
    for (numPartitions <- 1 to 8) {
      val rdd = sc.parallelize(data, numPartitions)
      for (windowSize <- 1 to 6) {
        for (step <- 1 to 3) {
          val sliding = rdd.sliding(windowSize, step).collect().map(_.toList).toList
          val expected = data.sliding(windowSize, step)
            .map(_.toList).toList.filter(l => l.size == windowSize)
          assert(sliding === expected)
        }
      }
      assert(rdd.sliding(7).collect().isEmpty,
        "Should return an empty RDD if the window size is greater than the number of items.")
    }
  }

  test("sliding with empty partitions") {
    val data = Seq(Seq(1, 2, 3), Seq.empty[Int], Seq(4), Seq.empty[Int], Seq(5, 6, 7))
    val rdd = sc.parallelize(data, data.length).flatMap(s => s)
    assert(rdd.partitions.length === data.length)
    val sliding = rdd.sliding(3).collect().toSeq.map(_.toSeq)
    val expected = data.flatMap(x => x).sliding(3).toSeq.map(_.toSeq)
    assert(sliding === expected)
  }
} 
Example 194
Source File: SpLinopMatrixSuite.scala    From spark-lp   with Apache License 2.0 5 votes vote down vote up

package org.apache.spark.mllib.optimization.lp

import org.scalatest.FunSuite

import org.apache.spark.mllib.linalg.{ DenseVector, Vectors }
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace
import org.apache.spark.mllib.optimization.lp.VectorSpace._
import org.apache.spark.mllib.optimization.lp.fs.dvector.dmatrix._

class SpLinopMatrixSuite extends FunSuite with MLlibTestSparkContext {

  test("SpLinopMatrix.apply is implemented properly") {

    val matrix: DMatrix = sc.parallelize(Array(
      Vectors.dense(1.0, 2.0, 3.0),
      Vectors.dense(4.0, 5.0, 6.0)),
      2)

    val vector: DVector = sc.parallelize(Array(2.0, 3.0), 2).glom.map(new DenseVector(_))

    val expectApply: DMatrix = sc.parallelize(Array(
      Vectors.dense(2.0 * 1.0, 2.0 * 2.0, 2.0 * 3.0),
      Vectors.dense(3.0 * 4.0, 3.0 * 5.0, 3.0 * 6.0)),
      2)
    assert((new SpLinopMatrix(vector))(matrix).collect().deep == expectApply.collect().deep, // or sameElements
      "SpLinopMatrix.apply should return the correct result.")
  }
} 
Example 195
Source File: InitializeSuite.scala    From spark-lp   with Apache License 2.0 5 votes vote down vote up

package org.apache.spark.mllib.optimization.lp

import org.scalatest.FunSuite

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.mllib.optimization.lp.VectorSpace._
import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace
import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace
import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, _}
import org.apache.spark.mllib.optimization.tfocs.VectorSpace.{DMatrix, DVector}

class InitializeSuite extends FunSuite with MLlibTestSparkContext {

  val numPartitions = 2
  val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0)
  val BArray = Array(
    Array(12.0, 16.0, 30.0, 1.0, 0.0),
    Array(24.0, 16.0, 12.0, 0.0, 1.0),
    Array(-1.0, 0.0, 0.0, 0.0, 0.0),
    Array(0.0, -1.0, 0.0, 0.0, 0.0),
    Array(0.0, 0.0, -1.0, 0.0, 0.0),
    Array(0.0, 0.0, 0.0, 1.0, 0.0),
    Array(0.0, 0.0, 0.0, 0.0, 1.0))
  val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0)

  lazy val c: DVector = sc.parallelize(cArray, numPartitions).glom.map(new DenseVector(_))
  lazy val rows: DMatrix = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_))
  lazy val b: DenseVector = new DenseVector(bArray)

  val cBrz = new BDV[Double](cArray)
  val BBrz = new BDM[Double](7, 5,
    BArray.flatMap(x => x),
    offset = 0,
    majorStride = 5,
    isTranspose = true)
  val bBrz = new BDV[Double](bArray)
  // (BT * B) ^(-1)
  val BTBInv = inv(BBrz.t * BBrz)
  // xTilda = B * BTBInv * b
  val xTilda: BDV[Double] = BBrz * (BTBInv * bBrz)
  // lambdaTilda = BTBInv * (B^T * c)
  val lambdaTilda: BDV[Double] = BTBInv * (BBrz.t * cBrz)
  // sTilda = c - B * lambdaTilda
  val sTilda = cBrz - BBrz * lambdaTilda
  val deltax = Math.max(1.5 * max(xTilda), 0)
  val deltas = Math.max(1.5 * max(sTilda), 0)
  val xHat = xTilda :+ deltax
  val sHat = sTilda :+ deltas
  val deltaxHat: Double = 0.5 * (xHat.t * sHat) / sum(sHat)
  val deltasHat: Double = 0.5 * (xHat.t * sHat) / sum(xHat)
  // x = xHat + deltaxHat * e
  val expectedx: BDV[Double] = xHat :+ deltaxHat
  // val expectedLambda = lambdaTilda
  val expecteds: BDV[Double] = sHat :+ deltasHat


  test("Initialize.init is implemented properly") {

    val result = Initialize.init(c, rows, b)
    //println(LP.solve(c, rows, b, 1e-4, 1).collect())
    assert(Vectors.dense(expectedx.toArray) ~= Vectors.dense(result._1.flatMap(_.toArray).collect()) relTol 1e-6,
      "Initialize.init x0 is not computed correctly.")
    assert(Vectors.dense(lambdaTilda.toArray) ~= Vectors.dense(result._2.toArray) relTol 1e-6,
      "Initialize.init lambda0 is not computed correctly.")
    assert(Vectors.dense(expecteds.toArray) ~= Vectors.dense(result._3.flatMap(_.toArray).collect()) relTol 1e-6,
      "Initialize.init s0 should return the correct answer.")
  }
} 
Example 196
Source File: LPSuite.scala    From spark-lp   with Apache License 2.0 5 votes vote down vote up

package org.apache.spark.mllib.optimization.lp

import org.scalatest.FunSuite

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.mllib.optimization.lp.VectorSpace._
import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace
import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace

class LPSuite extends FunSuite with MLlibTestSparkContext {

  val numPartitions = 2
  val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0)
  val BArray = Array(
    Array(12.0, 16.0, 30.0, 1.0, 0.0),
    Array(24.0, 16.0, 12.0, 0.0, 1.0),
    Array(-1.0, 0.0, 0.0, 0.0, 0.0),
    Array(0.0, -1.0, 0.0, 0.0, 0.0),
    Array(0.0, 0.0, -1.0, 0.0, 0.0),
    Array(0.0, 0.0, 0.0, 1.0, 0.0),
    Array(0.0, 0.0, 0.0, 0.0, 1.0))
  val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0)

  lazy val c = sc.parallelize(cArray, numPartitions).glom.map(new DenseVector(_))
  lazy val rows = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_))
  lazy val b = new DenseVector(bArray)

  test("LP solve is implemented properly") {
    val (v, x) = LP.solve(c, rows, b, sc=sc)
    // solution obtained from scipy.optimize.linprog and Octave's glpk LP solver, with fun_val = 12.083
    val expectedSol = Vectors.dense(
      Array(1.66666667, 5.83333333, 40.0, 0.0, 0.0, 13.33333333, 9.16666667))
    val xx = Vectors.dense(x.flatMap(_.toArray).collect())
    println(s"$xx")
    println("optimal min value: " + v)
    assert(xx ~== expectedSol absTol 1e-6, "LP.solve x should return the correct answer.")

  }

} 
Example 197
Source File: VectorSpaceSuite.scala    From spark-tfocs   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.optimization.tfocs

import org.scalatest.FunSuite

import org.apache.spark.mllib.linalg.{ DenseVector, Vectors }
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._
import org.apache.spark.mllib.optimization.tfocs.VectorSpace._
import org.apache.spark.mllib.optimization.tfocs.vs.dvector.DVectorSpace
import org.apache.spark.mllib.optimization.tfocs.vs.dvectordouble.DVectorDoubleSpace
import org.apache.spark.mllib.optimization.tfocs.vs.vector.DenseVectorSpace

class VectorSpaceSuite extends FunSuite with MLlibTestSparkContext {

  test("DenseVectorSpace.combine is implemented properly") {
    val alpha = 1.1
    val a = new DenseVector(Array(2.0, 3.0))
    val beta = 4.0
    val b = new DenseVector(Array(5.0, 6.0))
    val expectedCombination = Vectors.dense(1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0)
    assert(DenseVectorSpace.combine(alpha, a, beta, b) == expectedCombination,
      "DenseVectorSpace.combine should return the correct result.")
  }

  test("DenseVectorSpace.dot is implemented properly") {
    val a = new DenseVector(Array(2.0, 3.0))
    val b = new DenseVector(Array(5.0, 6.0))
    val expectedDot = 2.0 * 5.0 + 3.0 * 6.0
    assert(DenseVectorSpace.dot(a, b) == expectedDot,
      "DenseVectorSpace.dot should return the correct result.")
  }

  test("DVectorSpace.combine is implemented properly") {
    val alpha = 1.1
    val a = sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2)
    val beta = 4.0
    val b = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2)
    val combination = DVectorSpace.combine(alpha, a, beta, b)
    val expectedCombination =
      Vectors.dense(1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0, 1.1 * 4.0 + 4.0 * 7.0)
    assert(Vectors.dense(combination.collectElements) == expectedCombination,
      "DVectorSpace.combine should return the correct result.")
  }

  test("DVectorSpace.dot is implemented properly") {
    val a = sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2)
    val b = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2)
    val expectedDot = 2.0 * 5.0 + 3.0 * 6.0 + 4.0 * 7.0
    assert(DVectorSpace.dot(a, b) == expectedDot,
      "DVectorSpace.dot should return the correct result.")
  }

  test("DVectorDoubleSpace.combine is implemented properly") {
    val alpha = 1.1
    val a = (sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))),
      2), 9.9)
    val beta = 4.0
    val b = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))),
      2), 11.11)
    val combination = DVectorDoubleSpace.combine(alpha, a, beta, b)
    val expectedCombination =
      (Vectors.dense(1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0, 1.1 * 4.0 + 4.0 * 7.0),
        1.1 * 9.9 + 4.0 * 11.11)
    assert(Vectors.dense(combination._1.collectElements) == expectedCombination._1,
      "DVectorVectorSpace.combine should return the correct result.")
    assert(combination._2 == expectedCombination._2,
      "DVectorVectorSpace.combine should return the correct result.")
  }

  test("DVectorDoubleSpace.dot is implemented properly") {
    val a = (sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))),
      2), 9.9)
    val b = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))),
      2), 11.11)
    val expectedDot = 2.0 * 5.0 + 3.0 * 6.0 + 4.0 * 7.0 + 9.9 * 11.11
    assert(DVectorDoubleSpace.dot(a, b) == expectedDot,
      "DVectorVectorSpace.dot should return the correct result.")
  }
} 
Example 198
Source File: LinearOperatorSuite.scala    From spark-tfocs   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.optimization.tfocs

import org.scalatest.FunSuite

import org.apache.spark.SparkException
import org.apache.spark.mllib.linalg.{ DenseVector, Vectors }
import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._
import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector.LinopMatrix
import org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector.LinopMatrixAdjoint
import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvectordouble.{ LinopMatrix => LinopMatrixVector }
import org.apache.spark.mllib.optimization.tfocs.fs.dvectordouble.vector.{ LinopMatrixAdjoint => LinopMatrixVectorAdjoint }
import org.apache.spark.mllib.util.MLlibTestSparkContext

class LinearOperatorSuite extends FunSuite with MLlibTestSparkContext {

  lazy val matrix = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0),
    Vectors.dense(4.0, 5.0, 6.0)), 2)

  lazy val vector = new DenseVector(Array(2.2, 3.3, 4.4))

  test("LinopMatrix multiplies properly") {

    val f = new LinopMatrix(matrix)
    val x = new DenseVector(Array(7.0, 8.0, 9.0))
    val result = f(x)
    val expectedResult = Vectors.dense(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9)
    assert(Vectors.dense(result.collectElements) == expectedResult,
      "should return the correct product")
  }

  test("LinopMatrixAdjoint multiplies properly") {

    val f = new LinopMatrixAdjoint(matrix)
    val y = sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2)
    val result = f(y)
    val expectedResult = Vectors.dense(1 * 5 + 4 * 6, 2 * 5 + 5 * 6, 3 * 5 + 6 * 6)
    assert(result == expectedResult, "should return the correct product")
  }

  test("LinopMatrixAdjoint checks for mismatched partition vectors") {

    val f = new LinopMatrixAdjoint(matrix)
    val y = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2)
    intercept[SparkException] {
      f(y)
    }
  }

  test("LinopMatrixVector multiplies properly") {

    val f = new LinopMatrixVector(matrix, vector)
    val x = new DenseVector(Array(7.0, 8.0, 9.0))
    val result = f(x)
    val expectedResult = (new DenseVector(Array(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9)),
      7.0 * 2.2 + 8.0 * 3.3 + 9.0 * 4.4)
    assert(Vectors.dense(result._1.collectElements) == expectedResult._1,
      "should return the correct product")
    assert(result._2 == expectedResult._2, "should return the correct product")
  }

  test("LinopMatrixVectorAdjoint multiplies properly") {

    var f = new LinopMatrixVectorAdjoint(matrix, vector)
    val y = (sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2),
      8.8)
    val result = f(y)
    val expectedResult =
      Vectors.dense(1 * 5 + 4 * 6 + 2.2, 2 * 5 + 5 * 6 + 3.3, 3 * 5 + 6 * 6 + 4.4)
    assert(result == expectedResult, "should return the correct product")
  }

  test("LinopMatrixVectorAdjoint checks for mismatched partition vectors") {

    val f = new LinopMatrixVectorAdjoint(matrix, vector)
    val y = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2),
      8.8)
    intercept[SparkException] {
      f(y)
    }
  }
} 
Example 199
Source File: LocalTrainingPlanSuite.scala    From oraf   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.tree.OptimizedLearningNode
import org.apache.spark.mllib.tree.configuration.{DefaultTimePredictionStrategy, TimePredictionStrategy}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class LocalTrainingPlanSuite extends SparkFunSuite with MLlibTestSparkContext {

  val timePredictionStrategy: TimePredictionStrategy = new DefaultTimePredictionStrategy

  test("memory restriction") {
    val plan = new LocalTrainingPlan(10, timePredictionStrategy, Int.MaxValue)

    plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1),
      treeIndex = 1, rows = 2, impurity = 1.0))
    plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1),
      treeIndex = 1, rows = 2, impurity = 1.0))
    plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1),
      treeIndex = 1, rows = 2, impurity = 1.0))

    plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1),
      treeIndex = 1, rows = 9, impurity = 1.0))

    assert(plan.bins.length == 2)
    assert(plan.bins.head.tasks.length == 3)
    assert(plan.bins(1).tasks.length == 1)
  }

  test("count restriction") {
    val plan = new LocalTrainingPlan(10, timePredictionStrategy, 2)

    plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1),
      treeIndex = 1, rows = 2, impurity = 1.0))
    plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1),
      treeIndex = 1, rows = 2, impurity = 1.0))
    plan.scheduleTask(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1),
      treeIndex = 1, rows = 2, impurity = 1.0))

    assert(plan.bins.length == 2)
    assert(plan.bins.head.tasks.length == 2)
    assert(plan.bins(1).tasks.length == 1)
  }

  test("task implicit ordering by memory usage descending") {
    val l = List(new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1),
      treeIndex = 1, rows = 1, impurity = 1.0),
      new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1),
      treeIndex = 2, rows = 5, impurity = 1.0),
      new LocalTrainingTask(node = OptimizedLearningNode.emptyNode(1),
      treeIndex = 3, rows = 3, impurity = 1.0)
    )

    val sorted = l.sorted

    assert(sorted.head.treeIndex == 2)
  }
} 
Example 200
Source File: LocalTreeIntegrationSuite.scala    From oraf   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame


class LocalTreeIntegrationSuite extends SparkFunSuite with MLlibTestSparkContext {

  // setParams and medDepthTreeSettings, used below, are suite members omitted from this
  // excerpt (a sketch follows the example).
  private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val distribTree = setParams(new DecisionTreeRegressor(), testParams)
    val localTree = setParams(new LocalDecisionTreeRegressor(), testParams)
    val localModel = localTree.fit(train)
    val model = distribTree.fit(train)
    OptimizedTreeTests.checkEqual(model, localModel)
  }


  test("Local & distributed training produce the same tree on a toy dataset") {
    val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
    val data = sc.parallelize(Range(0, 16).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce the same tree on a larger toy dataset") {
    val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce same tree on a dataset of categorical features") {
    val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
    // Create a map of categorical feature index to arity; each feature has arity nclasses
    val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
    // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
    // categorical features
    val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a dataset of continuous features") {
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._
    // Use maxDepth = 5 and default params
    val params = medDepthTreeSettings
    val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
      nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
      .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
      .toDF().cache()
    testEquivalence(data, params)
  }

  test("Local & distributed training produce the same tree on a dataset of constant features") {
    // Generate constant, continuous data
    val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

}
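The LocalTreeIntegrationSuite excerpt above calls a setParams helper and a medDepthTreeSettings map that the snippet omits. A hypothetical sketch of class members that would satisfy those call sites, assuming OptimizedTreeTests.allParamSettings is a Map[String, Any] of estimator parameters; the maxDepth value and the ParamMap-based implementation are assumptions, not the original project's code:

  import org.apache.spark.ml.param.ParamMap

  // Assumed: the shared settings plus a moderately deep tree.
  private lazy val medDepthTreeSettings: Map[String, Any] =
    OptimizedTreeTests.allParamSettings ++ Map[String, Any]("maxDepth" -> 4)

  // Assumed: copy each (name, value) pair onto the estimator through its public Params API.
  private def setParams[E <: Estimator[_]](estimator: E, params: Map[String, Any]): E = {
    val paramMap = params.foldLeft(ParamMap.empty) { case (pm, (name, value)) =>
      pm.put(estimator.getParam(name), value)   // getParam returns Param[Any], so Any values fit
    }
    estimator.copy(paramMap).asInstanceOf[E]    // copy yields a new estimator with the params applied
  }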