org.apache.spark.mllib.recommendation.Rating Scala Example

Source File: RecommendationExample.scala From Spark-2.3.1 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
// $example off$

object RecommendationExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CollaborativeFilteringExample")
    val sc = new SparkContext(conf)
    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/als/test.data")
    val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
      Rating(user.toInt, item.toInt, rate.toDouble)
    })

    // Build the recommendation model using ALS
    val rank = 10
    val numIterations = 10
    val model = ALS.train(ratings, rank, numIterations, 0.01)

    // Evaluate the model on rating data
    val usersProducts = ratings.map { case Rating(user, product, rate) =>
      (user, product)
    }
    val predictions =
      model.predict(usersProducts).map { case Rating(user, product, rate) =>
        ((user, product), rate)
      }
    val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
      ((user, product), rate)
    }.join(predictions)
    val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
      val err = (r1 - r2)
      err * err
    }.mean()
    println(s"Mean Squared Error = $MSE")

    // Save and load model
    model.save(sc, "target/tmp/myCollaborativeFilter")
    val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: PythonMLLibAPISuite.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.regression.LabeledPoint

class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating

  }
}

Source File: MatrixFactorizationModelWrapper.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }
}

Source File: PythonMLLibAPISuite.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, SparseMatrix}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.recommendation.Rating

class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array[Double]()
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating

  }
}

Source File: ALStrainImplicit.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.examples.mllib
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating

    val userMap = userIndex.collectAsMap
    //广播userMap
    val broadcastUserMap = sc.broadcast(userMap)
    //广播songMap
    val broadcastSongMap = sc.broadcast(songMap)
    //将triplets数据转换为一个数组
    val tripArray = triplets.map(_.split("\\W+"))
    //导入Rating包
    import org.apache.spark.mllib.recommendation.Rating
    //将tripArray数组转换为评级对象RDD
    val ratings = tripArray.map { case Array(user, song, plays)=>
    val userId = broadcastUserMap.value.getOrElse(user, 0)
    val songId = broadcastUserMap.value.getOrElse(song, 0)
    Rating(userId, songId, plays.toDouble)
    }
    //导入ALS
    import org.apache.spark.mllib.recommendation.ALS
    //将Rank设置为10,迭代次数设为10,Rank模型中的潜在特征数    
    val model = ALS.trainImplicit(ratings, 10, 10)
    //从triplet中导出用户和歌曲元组
    val usersSongs = ratings.map( r => {
      println(r.user+"|||"+r.product)
      (r.user, r.product) 
    })
  

    //预测用户和歌曲
    val predictions = model.predict(usersSongs)
    predictions.foreach { x => println(x.user.toString()+"|||||"+x.rating.toString()+"======="+x.product.toString()) }
  }
}

Source File: ALSExample.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating

object ALSExample {
  def main(args: Array[String]) {
    

    //为每个用户进行推荐,推荐的结果可以以用户id为key,结果为value存入redis或者hbase中
    val users = data.map(_.split(",") match {
      case Array(user, product, rate) => (user)
    }).distinct().collect()
    //users: Array[String] = Array(4, 2, 3, 1)
    users.foreach(
      user => {
        //依次为用户推荐商品   
        var rs = model.recommendProducts(user.toInt, numIterations)
        var value = ""
        var key = 0

        //拼接推荐结果
        rs.foreach(r => {
          key = r.user
          value = value + r.product + ":" + r.rating + ","
        })
        println(key.toString + "   " + value)
      })

    //对预测结果按预测的评分排序
    predictions.collect.sortBy(_._2)
    //对预测结果按用户进行分组,然后合并推荐结果,这部分代码待修正
    predictions.map { case ((user, product), rate) => (user, (product, rate)) }.groupByKey.collect
    //格式化测试评分和实际评分的结果
    val formatedRatesAndPreds = ratesAndPreds.map {
      case ((user, product), (rate, pred)) => user + "," + product + "," + rate + "," + pred
    }
    formatedRatesAndPreds.collect()
  }
}

Source File: ALSDome.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating

        Rating(userId.toInt, itemId.toInt, rating.toDouble)
       }
   //绑定评分数据和个人评分数据
    val movieratings = ratings.union(pratings)
    //使用ALS建立模型,设定rank为5,迭代次数为10以及lambda为0.01
    val model = ALS.train(movieratings, 10, 10, 0.01)
     //在模型上选定一部电影预测我的评分,让我们从电影ID为195的<终结者>开始
    model.predict(sc.parallelize(Array((944,195)))).collect.foreach(println)
    //在模型上选定一部电影预测我的评分,让我们从电影ID为402<人鬼情未了>
    model.predict(sc.parallelize(Array((944,402)))).collect.foreach(println)
    //在模型上选定一部电影预测我的评分,让我们从电影ID为148<黑夜幽灵>
    model.predict(sc.parallelize(Array((944,402)))).collect.foreach(println)  
  }
}

Source File: MatrixFactorizationModelWrapper.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }
}

Source File: PythonMLLibAPISuite.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, SparseMatrix}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.recommendation.Rating

class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
   
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array[Double]()
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating

  }
}

Source File: MatrixFactorizationModelWrapper.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
}

Source File: MatrixFactorizationModelWrapper.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
}

Source File: PythonMLLibAPISuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.regression.LabeledPoint

class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating

  }
}

Source File: RecommendationExample.scala From BigDatalog with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkContext, SparkConf}
// $example on$
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
// $example off$

object RecommendationExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CollaborativeFilteringExample")
    val sc = new SparkContext(conf)
    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/als/test.data")
    val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
      Rating(user.toInt, item.toInt, rate.toDouble)
    })

    // Build the recommendation model using ALS
    val rank = 10
    val numIterations = 10
    val model = ALS.train(ratings, rank, numIterations, 0.01)

    // Evaluate the model on rating data
    val usersProducts = ratings.map { case Rating(user, product, rate) =>
      (user, product)
    }
    val predictions =
      model.predict(usersProducts).map { case Rating(user, product, rate) =>
        ((user, product), rate)
      }
    val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
      ((user, product), rate)
    }.join(predictions)
    val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
      val err = (r1 - r2)
      err * err
    }.mean()
    println("Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/myCollaborativeFilter")
    val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
    // $example off$
  }
}
// scalastyle:on println

Source File: MatrixFactorizationModelWrapper.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
}

Source File: PythonMLLibAPISuite.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, SparseMatrix}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.recommendation.Rating

class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array[Double]()
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating

  }
}

Source File: EvaluateResult.scala From learning-spark with Apache License 2.0

5 votes

package com.javachen.grab

import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


object EvaluateResult {
  def coverage(training: RDD[Rating],userRecommends:RDD[(Int, List[Int])])={
    userRecommends.flatMap(_._2).distinct().count.toDouble / training.map(_.product).distinct().count
  }

  def popularity(training: RDD[Rating],userRecommends:RDD[(Int, List[Int])])={
    var ret = 0.0
    var n=0
    val item_popularity=training.map{ case Rating(user, product, rate) =>
      (product,(user, rate))
    }.groupByKey(4).map{case (product,list)=>
      (product,list.size)
    }.collectAsMap()

    userRecommends.flatMap(_._2).collect().foreach { p =>
      ret = ret + math.log(1 + item_popularity.get(p).get)
      n = n + 1
    }

    ret/n
  }

  def recallAndPrecisionAndF1(training: RDD[Rating],userRecommends:RDD[(Int, List[Int])]):(Double, Double,Double) = {
    val usersProducts: RDD[(Int, Int)] = training.map { case Rating(user, product, rate) =>
      (user, product)
    }

    val groupData=userRecommends.join(usersProducts.groupByKey().map {case (k,v) => (k,v.toList)})

    val (hit, testNum, recNum) = groupData.map{ case (user, (mItems, tItems)) =>
      var count = 0
      // 计算准确率：推荐命中商品数/实际推荐商品数, topN为推荐上限值
      val precNum = mItems.length
      for (i <- 0 until precNum)
        if (tItems.contains(mItems(i)))
          count += 1
      (count, tItems.length, precNum) }.reduce( (t1, t2) => (t1._1 + t2._1, t1._2 + t2._2, t1._3 + t2._3) )

      val recall: Double = hit * 1.0 / testNum
      val precision: Double = hit * 1.0 / recNum
      val f1: Double = 2 * recall * precision / (recall + precision)

      println(s"$hit,$testNum,$recNum")
      (recall,precision,f1)
  }

  def recallAndPrecision(test:RDD[Rating],result:RDD[Rating]):Double = {
    val numHit: Long = result.intersection(test).count
    val recall: Double = numHit * 1.0 / test.count
    val precision: Double = numHit * 1.0 / result.count
    val f1: Double = 2 * recall * precision / (recall + precision)
    System.out.println("recall : " + recall + "\nprecision : " + precision + "\nf1 : " + f1)
    f1
  }
}

Source File: Recommender.scala From awesome-recommendation-engine with Apache License 2.0

5 votes

package example.utils

import example.model.AmazonRating
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.joda.time.{Seconds, DateTime}
import scala.io.Source
import scala.util.Random

class Recommender(@transient sc: SparkContext, ratingFile: String) extends Serializable {
  val NumRecommendations = 10
  val MinRecommendationsPerUser = 10
  val MaxRecommendationsPerUser = 20
  val MyUsername = "myself"
  val NumPartitions = 20

  @transient val random = new Random() with Serializable

  println("Using this ratingFile: " + ratingFile)
  // first create an RDD out of the rating file
  val rawTrainingRatings = sc.textFile(ratingFile).map {
    line =>
      val Array(userId, productId, scoreStr) = line.split(",")
      AmazonRating(userId, productId, scoreStr.toDouble)
  }

  // only keep users that have rated between MinRecommendationsPerUser and MaxRecommendationsPerUser products
  val trainingRatings = rawTrainingRatings.groupBy(_.userId)
                                          .filter(r => MinRecommendationsPerUser <= r._2.size  && r._2.size < MaxRecommendationsPerUser)
                                          .flatMap(_._2)
                                          .repartition(NumPartitions)
                                          .cache()

  println(s"Parsed $ratingFile. Kept ${trainingRatings.count()} ratings out of ${rawTrainingRatings.count()}")

  // create user and item dictionaries
  val userDict = new Dictionary(MyUsername +: trainingRatings.map(_.userId).distinct.collect)
  println("User Dictionary have " + userDict.size + " elements.")
  val productDict = new Dictionary(trainingRatings.map(_.productId).distinct.collect)
  println("Product Dictionary have " + productDict.size + " elements.")

  private def toSparkRating(amazonRating: AmazonRating) = {
    Rating(userDict.getIndex(amazonRating.userId),
      productDict.getIndex(amazonRating.productId),
      amazonRating.rating)
  }

  private def toAmazonRating(rating: Rating) = {
    AmazonRating(userDict.getWord(rating.user),
      productDict.getWord(rating.product),
      rating.rating
    )
  }

  // convert to Spark Ratings using the dictionaries
  val sparkRatings = trainingRatings.map(toSparkRating)

  def getRandomProductId = productDict.getWord(random.nextInt(productDict.size))

  def predict(ratings: Seq[AmazonRating]) = {
    // train model
    val myRatings = ratings.map(toSparkRating)
    val myRatingRDD = sc.parallelize(myRatings)

    val startAls = DateTime.now
    val model = ALS.train((sparkRatings ++ myRatingRDD).repartition(NumPartitions), 10, 20, 0.01)

    val myProducts = myRatings.map(_.product).toSet
    val candidates = sc.parallelize((0 until productDict.size).filterNot(myProducts.contains))

    // get ratings of all products not in my history ordered by rating (higher first) and only keep the first NumRecommendations
    val myUserId = userDict.getIndex(MyUsername)
    val recommendations = model.predict(candidates.map((myUserId, _))).collect
    val endAls = DateTime.now
    val result = recommendations.sortBy(-_.rating).take(NumRecommendations).map(toAmazonRating)
    val alsTime = Seconds.secondsBetween(startAls, endAls).getSeconds

    println(s"ALS Time: $alsTime seconds")
    result
  }
}

Source File: RatingDataGenerator.scala From Swallow with Apache License 2.0

5 votes

package com.intel.hibench.sparkbench.ml

import com.intel.hibench.sparkbench.common.IOCommon

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.mllib.random._
import org.apache.spark.rdd.{PairRDDFunctions, RDD}
import org.apache.spark.mllib.linalg.{Vectors, Vector}

object RatingDataGenerator {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RatingDataGeneration")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numUsers: Int = 100
    var numProducts: Int = 100
    var sparsity: Double = 0.05
    var implicitPrefs: Boolean = false
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    if (args.length == 5) {
      outputPath = args(0)
      numUsers = args(1).toInt
      numProducts = args(2).toInt
      sparsity = args(3).toDouble
      implicitPrefs = args(4).toBoolean

      println(s"Output Path: $outputPath")
      println(s"Num of Users: $numUsers")
      println(s"Num of Products: $numProducts")
      println(s"Sparsity: $sparsity")
      println(s"Implicit Prefs: $implicitPrefs")
    } else {
      System.err.println(
        s"Usage: $RatingDataGenerator <OUTPUT_PATH> <NUM_USERS> <NUM_PRODUCTS> <SPARSITY>  <IMPLICITPREFS>"
      )
      System.exit(1)
    }

    val rawData: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, numUsers, numProducts, numPartitions)
    val rng = new java.util.Random()
    val data = rawData.map{v =>
      val a = Array.fill[Double](v.size)(0.0)
      v.foreachActive{(i,vi) =>
         if(rng.nextDouble <= sparsity){
           a(i) = vi
         }
      }
      Vectors.dense(a).toSparse
   }


    data.saveAsObjectFile(outputPath)

    sc.stop()
  }
}

Source File: ScalaApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{ALS, Rating}

//import org.apache.spark.

    val predictedRating = model.predict(789, 123)
    println(predictedRating)
    val userId = 789
    val K = 10
    val topKRecs = model.recommendProducts(userId, K)
    println(topKRecs.mkString("\n"))

    val movies = sc.textFile(PATH + "/ml-100k/u.item")
    val titles = movies.map(line => line.split("\\|").take(2)).map(array => (array(0).toInt, array(1))).collectAsMap()
    titles(123)
    // res68: String = Frighteners, The (1996)
    val moviesForUser = ratings.keyBy(_.user).lookup(789)
    // moviesForUser: Seq[org.apache.spark.mllib.recommendation.Rating] = WrappedArray(Rating(789,1012,4.0), Rating(789,127,5.0), Rating(789,475,5.0), Rating(789,93,4.0), ...
    // ...
    println(moviesForUser.size)
    moviesForUser.sortBy(-_.rating).take(10).map(rating => (titles(rating.product), rating.rating)).foreach(println)
    topKRecs.map(rating => (titles(rating.product), rating.rating)).foreach(println)
    sc.stop()
    //bw.close()
  }

  class Util {
    def getDate(): String = {
      val today = Calendar.getInstance().getTime()
      // (2) create a date "formatter" (the date format we want)
      val formatter = new SimpleDateFormat("yyyy-MM-dd-hh.mm.ss")
   
      // (3) create a new String using the date format we want
      val folderName = formatter.format(today)
      return folderName
    }
  }

}

Source File: MatrixFactorizationModelWrapper.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
}

Source File: PythonMLLibAPISuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.regression.LabeledPoint

class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating

  }
}

Source File: ModelTrainer.scala From recommendersystem with Apache License 2.0

5 votes

package com.infosupport.recommendedcontent.core

import akka.actor.{Props, ActorLogging, Actor}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{Rating, ALS, MatrixFactorizationModel}

import com.datastax.spark.connector._


  private def trainModel() = {
    val table = context.system.settings.config.getString("cassandra.table")
    val keyspace = context.system.settings.config.getString("cassandra.keyspace")

    // Retrieve the ratings given by users from the database.
    // Map them to the rating structure needed by the Alternate Least Squares algorithm.
val ratings = sc.cassandraTable(keyspace, table).map(record => Rating(record.get[Int]("user_id"),
  record.get[Int]("item_id"), record.get[Double]("rating")))

// These settings control how well the predictions are going
// to fit the actual observations we loaded from Cassandra.
// Modify these to optimize the model!
val rank = 10
val iterations = 10
val lambda = 0.01

val model = ALS.train(ratings, rank, iterations, lambda)
    sender ! TrainingResult(model)

    context.stop(self)
  }
}

Source File: RecommendationModelReuse.scala From Scala-Machine-Learning-Projects with MIT License

5 votes

package com.packt.ScalaML.MovieRecommendation

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import scala.Tuple2
import org.apache.spark.rdd.RDD

object RecommendationModelReuse {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("JavaLDAExample")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/").
      getOrCreate()

    val ratigsFile = "data/ratings.csv"
    val ratingDF = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratigsFile)
    val selectedRatingsDF = ratingDF.select(ratingDF.col("userId"), ratingDF.col("movieId"), ratingDF.col("rating"), ratingDF.col("timestamp"))

    // Randomly split ratings RDD into training data RDD (75%) and test data RDD (25%)
    val splits = selectedRatingsDF.randomSplit(Array(0.75, 0.25), seed = 12345L)
    val testData = splits(1)

    val testRDD = testData.rdd.map(row => {
      val userId = row.getString(0)
      val movieId = row.getString(1)
      val ratings = row.getString(2)
      Rating(userId.toInt, movieId.toInt, ratings.toDouble)
    })

    //Load the workflow back
    val same_model = MatrixFactorizationModel.load(spark.sparkContext, "model/MovieRecomModel/")

    // Making Predictions. Get the top 6 movie predictions for user 668
    println("Rating:(UserID, MovieID, Rating)")
    println("----------------------------------")
    val topRecsForUser = same_model.recommendProducts(458, 10)
    for (rating <- topRecsForUser) {
      println(rating.toString())
    }
    println("----------------------------------")

    val rmseTest = MovieRecommendation.computeRmse(same_model, testRDD, true)
    println("Test RMSE: = " + rmseTest) //Less is better

    //Movie recommendation for a specific user. Get the top 6 movie predictions for user 668
    println("Recommendations: (MovieId => Rating)")
    println("----------------------------------")
    val recommendationsUser = same_model.recommendProducts(458, 10)
    recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println)
    println("----------------------------------")

    spark.stop()
  }
}

Source File: MovieRecommendation.scala From Scala-Machine-Learning-Projects with MIT License

5 votes

package com.packt.ScalaML.MovieRecommendation

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SQLImplicits
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import scala.Tuple2
import org.apache.spark.rdd.RDD

object MovieRecommendation {  
  //Compute the RMSE to evaluate the model. Less the RMSE better the model and it's prediction capability. 
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean): Double = {
    val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product)))
    val predictionsAndRatings = predictions.map { x => ((x.user, x.product), x.rating)
    }.join(data.map(x => ((x.user, x.product), x.rating))).values
    if (implicitPrefs) {
      println("(Prediction, Rating)")
      println(predictionsAndRatings.take(5).mkString("\n"))
    }
    math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean())
  }

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("JavaLDAExample")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/").
      getOrCreate()

    val ratigsFile = "data/ratings.csv"
    val df1 = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratigsFile)

    val ratingsDF = df1.select(df1.col("userId"), df1.col("movieId"), df1.col("rating"), df1.col("timestamp"))
    ratingsDF.show(false)

    val moviesFile = "data/movies.csv"
    val df2 = spark.read.format("com.databricks.spark.csv").option("header", "true").load(moviesFile)

    val moviesDF = df2.select(df2.col("movieId"), df2.col("title"), df2.col("genres"))
    moviesDF.show(false)

    ratingsDF.createOrReplaceTempView("ratings")
    moviesDF.createOrReplaceTempView("movies")

    

    var rmseTest = computeRmse(model, testRDD, true)
    println("Test RMSE: = " + rmseTest) //Less is better

    //Movie recommendation for a specific user. Get the top 6 movie predictions for user 668
    println("Recommendations: (MovieId => Rating)")
    println("----------------------------------")
    val recommendationsUser = model.recommendProducts(668, 6)
    recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println)
    println("----------------------------------")

    spark.stop()
  }
}

Source File: L9-12CollabFiltering.scala From prosparkstreaming with Apache License 2.0

5 votes

package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object CollabFilteringApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: CollabFilteringApp <appname> <batchInterval> <iPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, iPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val ratingStream = ssc.textFileStream(iPath).map(_.split(" ") match {
      case Array(subject, activity, freq) =>
        Rating(subject.toInt, activity.toInt, freq.toDouble)
    })

    val rank = 10
    val numIterations = 10
    val lambda = 0.01
    ratingStream.foreachRDD(ratingRDD => {
      val testTrain = ratingRDD.randomSplit(Array(0.3, 0.7))
      val model = ALS.train(testTrain(1), rank, numIterations, lambda)
      val test = testTrain(0).map {
        case Rating(subject, activity, freq) =>
          (subject, activity)
      }
      val prediction = model.predict(test)
      prediction.take(5).map(println)
    })

    ssc.start()
    ssc.awaitTermination()
  }

}

Source File: MovieLensDataPowerIterationClustering.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

package org.sparksamples

import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.apache.spark.{SparkConf, SparkContext}



object MovieLensDataPowerIterationClustering {
  val PATH= "../data/ml-100k"
  def main(args: Array[String]): Unit = {
    val spConfig = (new SparkConf).setMaster("local[1]").setAppName("SparkApp").
      set("spark.driver.allowMultipleContexts", "true")
    val sc = new SparkContext(spConfig)
    //val path = PATH + "../data/"
    //val rdd = sc.wholeTextFiles(path)
    val movies = sc.textFile(PATH + "/u.item")
    println(movies.first)
    val genres = sc.textFile(PATH + "/u.genre")
    genres.take(5).foreach(println)

    val genreMap = genres.filter(!_.isEmpty).map(line => line.split("\\|")).
      map(array => (array(1), array(0))).collectAsMap


    val titlesAndGenres = movies.map(_.split("\\|")).map { array =>
      val genres = array.toSeq.slice(5, array.size)
      val genresAssigned = genres.zipWithIndex.filter { case (g, idx)
      =>
        g == "1"
      }.map { case (g, idx) =>
        genreMap(idx.toString)
      }
      (array(0).toInt, (array(1), genresAssigned))
    }

    val rawData = sc.textFile(PATH + "/u.data")
    val rawRatings = rawData.map(_.split("\t").take(3))
    val ratings = rawRatings.map{ case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    ratings.cache
    val alsModel = ALS.train(ratings, 50, 10, 0.1)
    import org.apache.spark.mllib.linalg.Vectors
    val movieFactors = alsModel.productFeatures.map { case (id, factor) => (id, Vectors.dense(factor)) }
    val movieVectors = movieFactors.map(_._2)
    val userFactors = alsModel.userFeatures.map { case (id, factor) => (id, Vectors.dense(factor)) }
    val userVectors = userFactors.map(_._2)
    

    val numClusters = 5
    val numIterations = 10
    val numRuns = 3
    import org.apache.spark.mllib.clustering.PowerIterationClustering
    //val bKMeans = new PowerIterationClustering()()
    val piClustering = new PowerIterationClustering()
    piClustering.setMaxIterations(10)
    piClustering.setK(numClusters)


    println("done")


  }
}

Source File: MovieLensFPGrowthApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

import org.apache.spark.SparkContext
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.mllib.recommendation.Rating

import scala.collection.mutable.ListBuffer


    val rawRatings = rawData.map(_.split("\t").take(3))
    rawRatings.first()
    // 14/03/30 13:22:44 INFO SparkContext: Job finished: first at <console>:21, took 0.003703 s
    // res25: Array[String] = Array(196, 242, 3)

    val ratings = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    val ratingsFirst = ratings.first()
    println(ratingsFirst)

    val userId = 789
    val K = 10

    val movies = sc.textFile(PATH + "/ml-100k/u.item")
    val titles = movies.map(line => line.split("\\|").take(2)).map(array => (array(0).toInt, array(1))).collectAsMap()
    titles(123)

    var eRDD = sc.emptyRDD
    var z = Seq[String]()

    val l = ListBuffer()
    val aj = new Array[String](100)
    var i = 0
    for( a <- 801 to 900) {
      val moviesForUserX = ratings.keyBy(_.user).lookup(a)
      val moviesForUserX_10 = moviesForUserX.sortBy(-_.rating).take(10)
      val moviesForUserX_10_1 = moviesForUserX_10.map(r => r.product)
      var temp = ""
      for( x <- moviesForUserX_10_1){
        temp = temp + " " + x
        println(temp)

      }

      aj(i) = temp
      i += 1
    }
    z = aj
    val transaction2 = z.map(_.split(" "))

    val rddx = sc.parallelize(transaction2, 2).cache()

    val fpg = new FPGrowth()
    val model6 = fpg
      .setMinSupport(0.1)
      .setNumPartitions(1)
      .run(rddx)

    model6.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }
    sc.stop()
  }

}

Source File: RecommendationExample.scala From drizzle-spark with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
// $example off$

object RecommendationExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CollaborativeFilteringExample")
    val sc = new SparkContext(conf)
    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/als/test.data")
    val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
      Rating(user.toInt, item.toInt, rate.toDouble)
    })

    // Build the recommendation model using ALS
    val rank = 10
    val numIterations = 10
    val model = ALS.train(ratings, rank, numIterations, 0.01)

    // Evaluate the model on rating data
    val usersProducts = ratings.map { case Rating(user, product, rate) =>
      (user, product)
    }
    val predictions =
      model.predict(usersProducts).map { case Rating(user, product, rate) =>
        ((user, product), rate)
      }
    val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
      ((user, product), rate)
    }.join(predictions)
    val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
      val err = (r1 - r2)
      err * err
    }.mean()
    println("Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/myCollaborativeFilter")
    val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
    // $example off$
  }
}
// scalastyle:on println

Source File: MovieLensFPGrowthApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

package com.sparksample

import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.mllib.recommendation.Rating

import scala.collection.mutable.ListBuffer


    val rawRatings = rawData.map(_.split("\t").take(3))
    rawRatings.first()
    val ratings = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    val ratingsFirst = ratings.first()
    println(ratingsFirst)

    val movies = Util.getMovieData()
    val titles = movies.map(line => line.split("\\|").take(2)).map(array => (array(0).toInt, array(1))).collectAsMap()
    titles(123)

    var eRDD = sc.emptyRDD
    var z = Seq[String]()

    val l = ListBuffer()
    val aj = new Array[String](400)
    var i = 0
    for( a <- 501 to 900) {
      val moviesForUserX = ratings.keyBy(_.user).lookup(a)
      val moviesForUserX_10 = moviesForUserX.sortBy(-_.rating).take(10)
      val moviesForUserX_10_1 = moviesForUserX_10.map(r => r.product)
      var temp = ""
      for( x <- moviesForUserX_10_1){
        if(temp.equals(""))
          temp = x.toString
        else {
          temp =  temp + " " + x
        }
      }

      aj(i) = temp
      i += 1
    }
    z = aj

    val transaction = z.map(_.split(" "))
    val rddx = sc.parallelize(transaction, 2).cache()

    val fpg = new FPGrowth()
    val model = fpg
      .setMinSupport(0.1)
      .setNumPartitions(1)
      .run(rddx)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }
    sc.stop()
  }

}

Source File: RecommendationExample.scala From sparkoscope with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
// $example off$

object RecommendationExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CollaborativeFilteringExample")
    val sc = new SparkContext(conf)
    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/als/test.data")
    val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
      Rating(user.toInt, item.toInt, rate.toDouble)
    })

    // Build the recommendation model using ALS
    val rank = 10
    val numIterations = 10
    val model = ALS.train(ratings, rank, numIterations, 0.01)

    // Evaluate the model on rating data
    val usersProducts = ratings.map { case Rating(user, product, rate) =>
      (user, product)
    }
    val predictions =
      model.predict(usersProducts).map { case Rating(user, product, rate) =>
        ((user, product), rate)
      }
    val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
      ((user, product), rate)
    }.join(predictions)
    val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
      val err = (r1 - r2)
      err * err
    }.mean()
    println("Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/myCollaborativeFilter")
    val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
    // $example off$
  }
}
// scalastyle:on println

Source File: MatrixFactorizationModelWrapper.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
}

Source File: PythonMLLibAPISuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.regression.LabeledPoint

class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating

  }
}

Source File: RankingDataProvider.scala From spark-ranking-metrics with The Unlicense

5 votes

package com.github.jongwook

import org.apache.spark.SparkConf
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.sql.SparkSession
import org.scalatest._

object RankingDataProvider {

  
  def apply(ratings: Seq[Rating], k: Int = 100): (Seq[Rating], Seq[Rating]) = {

    val spark = SparkSession.builder().master(new SparkConf().get("spark.master", "local[8]")).getOrCreate()
    val sc = spark.sparkContext

    val Array(trainRatings, testRatings) = sc.parallelize(ratings).cache().randomSplit(Array(0.9, 0.1), 0)
    val model = ALS.trainImplicit(trainRatings, rank = 10, iterations = 2, lambda = 2, blocks = 100, alpha = 10)

    val testUsers = testRatings.map(_.user).collect().toSet
    val testUsersBroadcast = spark.sparkContext.broadcast(testUsers)
    val testUserFeatures = model.userFeatures.filter {
      case (user, feature) => testUsersBroadcast.value.contains(user)
    }.repartition(100).cache()

    val testModel = new MatrixFactorizationModel(model.rank, testUserFeatures, model.productFeatures.repartition(100).cache())

    val result = testModel.recommendProductsForUsers(k)

    val prediction = result.values.flatMap(ratings => ratings).collect()
    val groundTruth = testRatings.collect()

    (prediction, groundTruth)
  }
}

class RankingDataProvider extends FlatSpec with Matchers {
  "Ranking Data Provider" should "calculate the rankings" in {
    val ratings = MovieLensLoader.load()
    val (prediction, groundTruth) = RankingDataProvider(ratings)
    prediction.map(_.user).distinct.sorted should equal (groundTruth.map(_.user).distinct.sorted)
  }
}

Source File: MovieLensLoader.scala From spark-ranking-metrics with The Unlicense

5 votes

package com.github.jongwook

import org.apache.spark.mllib.recommendation.Rating
import org.scalatest._

import scala.io.Source

object MovieLensLoader {
  
  def load(): Seq[Rating] = {
    val input = getClass.getResource("u.data").openStream()
    try {
      Source.fromInputStream(input).getLines().toArray.map {
        _.split("\t") match {
          case Array(user, item, rating, timestamp) => Rating(user.toInt, item.toInt, rating.toDouble)
        }
      }
    } finally {
      input.close()
    }
  }
}

class MovieLensLoader extends FlatSpec with Matchers {
  "MovieLens Loader" should "load the ml-100k data" in {
    val data = MovieLensLoader.load()
    data.size should be (100000)
    data.map(_.rating).max should be (5.0)
    data.map(_.rating).min should be (1.0)
  }
}

Source File: RecommendationExample.scala From multi-tenancy-spark with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
// $example off$

object RecommendationExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CollaborativeFilteringExample")
    val sc = new SparkContext(conf)
    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/als/test.data")
    val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
      Rating(user.toInt, item.toInt, rate.toDouble)
    })

    // Build the recommendation model using ALS
    val rank = 10
    val numIterations = 10
    val model = ALS.train(ratings, rank, numIterations, 0.01)

    // Evaluate the model on rating data
    val usersProducts = ratings.map { case Rating(user, product, rate) =>
      (user, product)
    }
    val predictions =
      model.predict(usersProducts).map { case Rating(user, product, rate) =>
        ((user, product), rate)
      }
    val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
      ((user, product), rate)
    }.join(predictions)
    val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
      val err = (r1 - r2)
      err * err
    }.mean()
    println("Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/myCollaborativeFilter")
    val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
    // $example off$
  }
}
// scalastyle:on println

org.apache.spark.mllib.recommendation.Rating Scala Examples