org.apache.spark.mllib.recommendation.Rating Scala Examples
The following examples show how to use org.apache.spark.mllib.recommendation.Rating.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: RecommendationExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
// $example off$

/**
 * Collaborative-filtering walkthrough: parses "user,item,rate" lines, trains an ALS
 * model, prints the mean squared error measured on the training ratings, then saves
 * the model and loads it back.
 */
object RecommendationExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("CollaborativeFilteringExample"))
    // $example on$
    // Load and parse the data
    val ratings = sc.textFile("data/mllib/als/test.data").map { line =>
      // A line that does not split into exactly three fields raises a MatchError.
      val Array(user, item, rate) = line.split(',')
      Rating(user.toInt, item.toInt, rate.toDouble)
    }

    // Build the recommendation model using ALS
    val rank = 10
    val numIterations = 10
    val model = ALS.train(ratings, rank, numIterations, 0.01)

    // Evaluate the model on rating data
    val actualByPair = ratings.map(r => ((r.user, r.product), r.rating))
    val usersProducts = actualByPair.keys
    val predictedByPair = model.predict(usersProducts).map(p => ((p.user, p.product), p.rating))
    val ratesAndPreds = actualByPair.join(predictedByPair)
    val MSE = ratesAndPreds.values.map { case (actual, predicted) =>
      val err = actual - predicted
      err * err
    }.mean()
    println(s"Mean Squared Error = $MSE")

    // Save and load model
    val modelPath = "target/tmp/myCollaborativeFilter"
    model.save(sc, modelPath)
    val sameModel = MatrixFactorizationModel.load(sc, modelPath)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 2
Source File: PythonMLLibAPISuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.regression.LabeledPoint

/**
 * Round-trip tests for the Python SerDe: every value is written with `SerDe.dumps`,
 * read back with `SerDe.loads`, and compared with the original.
 */
class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    // `Array[Byte].toString` is the JVM identity string ("[B@..."), which never contains
    // "Rating", so searching it checked nothing. Decode the payload one char per byte
    // (ISO-8859-1 is lossless for arbitrary bytes) and count the real occurrences.
    val pickled = new String(bytes, java.nio.charset.StandardCharsets.ISO_8859_1)
    assert("Rating".r.findAllIn(pickled).length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating
  }
}
Example 3
Source File: MatrixFactorizationModelWrapper.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

/**
 * Re-exposes an existing [[MatrixFactorizationModel]] (same rank and feature RDDs)
 * through signatures built on `JavaRDD[Array[Any]]` / `RDD[Array[Any]]`, converting
 * with the `SerDe` helpers.
 */
private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  /** Converts the incoming rows with `SerDe.asTupleRDD` and delegates to the inherited `predict`. */
  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] = {
    val pairs = SerDe.asTupleRDD(userAndProducts.rdd)
    predict(pairs)
  }

  /** User factors, each wrapped with `Vectors.dense`, passed through `SerDe.fromTuple2RDD`. */
  def getUserFeatures: RDD[Array[Any]] = {
    val denseByUser = userFeatures.map { case (id, factors) => (id, Vectors.dense(factors)) }
    SerDe.fromTuple2RDD(denseByUser.asInstanceOf[RDD[(Any, Any)]])
  }

  /** Product factors, each wrapped with `Vectors.dense`, passed through `SerDe.fromTuple2RDD`. */
  def getProductFeatures: RDD[Array[Any]] = {
    val denseByProduct = productFeatures.map { case (id, factors) => (id, Vectors.dense(factors)) }
    SerDe.fromTuple2RDD(denseByProduct.asInstanceOf[RDD[(Any, Any)]])
  }
}
Example 4
Source File: PythonMLLibAPISuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, SparseMatrix}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.recommendation.Rating

/**
 * Round-trip tests for the Python SerDe: every value is written with `SerDe.dumps`,
 * read back with `SerDe.loads`, and compared with the original.
 */
class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array[Double]()
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    // `Array[Byte].toString` is the JVM identity string ("[B@..."), which never contains
    // "Rating", so searching it checked nothing. Decode the payload one char per byte
    // (ISO-8859-1 is lossless for arbitrary bytes) and count the real occurrences.
    val pickled = new String(bytes, java.nio.charset.StandardCharsets.ISO_8859_1)
    assert("Rating".r.findAllIn(pickled).length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating
  }
}
Example 5
Source File: ALStrainImplicit.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating

// NOTE(review): this excerpt is truncated. The enclosing object/main header and the
// definitions of `sc`, `triplets`, `userIndex` and `songMap` are not visible here; the
// two closing braces at the end belong to that missing header.

    val userMap = userIndex.collectAsMap
    // Broadcast userMap
    val broadcastUserMap = sc.broadcast(userMap)
    // Broadcast songMap
    val broadcastSongMap = sc.broadcast(songMap)
    // Split each triplet line into an array of fields
    val tripArray = triplets.map(_.split("\\W+"))
    // Import Rating
    import org.apache.spark.mllib.recommendation.Rating
    // Convert the field arrays into an RDD of Rating objects
    val ratings = tripArray.map { case Array(user, song, plays) =>
      val userId = broadcastUserMap.value.getOrElse(user, 0)
      // Fix: the song id was previously looked up in the *user* map, so songs resolved
      // to the default 0 (or to an unrelated user's id).
      // NOTE(review): assumes songMap maps song name -> Int id like userMap; confirm.
      val songId = broadcastSongMap.value.getOrElse(song, 0)
      Rating(userId, songId, plays.toDouble)
    }
    // Import ALS
    import org.apache.spark.mllib.recommendation.ALS
    // Rank is set to 10 and the iteration count to 10; rank is the number of latent
    // features in the model
    val model = ALS.trainImplicit(ratings, 10, 10)
    // Extract (user, song) tuples from the ratings
    val usersSongs = ratings.map( r => {
      println(r.user+"|||"+r.product)
      (r.user, r.product)
    })
    // Predict for the user/song pairs
    val predictions = model.predict(usersSongs)
    predictions.foreach { x =>
      println(x.user.toString()+"|||||"+x.rating.toString()+"======="+x.product.toString())
    }
  }
}
Example 6
Source File: ALSExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating

// NOTE(review): `data`, `model`, `numIterations`, `predictions` and `ratesAndPreds` are
// used below but their definitions are not visible in this excerpt.
object ALSExample {
  def main(args: Array[String]): Unit = {
    // Recommend for every user; the results could be stored in redis or hbase with the
    // user id as key and the recommendations as value
    val users = data.map { line =>
      val Array(user, _, _) = line.split(",")
      user
    }.distinct().collect()
    // users: Array[String] = Array(4, 2, 3, 1)
    users.foreach { user =>
      // Recommend products for each user in turn
      val recs = model.recommendProducts(user.toInt, numIterations)
      // Concatenate the recommendations; the key is the user of the last one (0 if none)
      val key = recs.lastOption.map(_.user).getOrElse(0)
      val value = recs.map(r => s"${r.product}:${r.rating},").mkString
      println(s"$key $value")
    }

    // Sort the predictions by predicted rating
    predictions.collect.sortBy { case (_, rate) => rate }
    // Group the predictions by user and merge the recommendations; this part of the
    // code still needs to be fixed
    predictions.map { case ((user, product), rate) => (user, (product, rate)) }.groupByKey.collect
    // Format the test ratings next to the actual ratings
    val formatedRatesAndPreds = ratesAndPreds.map { case ((user, product), (rate, pred)) =>
      s"$user,$product,$rate,$pred"
    }
    formatedRatesAndPreds.collect()
  }
}
Example 7
Source File: ALSDome.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating

// NOTE(review): this excerpt is truncated. It starts inside an expression whose opening
// is not visible, and the enclosing object/main header plus the definitions of `sc`,
// `ratings` and `pratings` are missing; the trailing braces close that missing code.

      Rating(userId.toInt, itemId.toInt, rating.toDouble)
    }
    // Combine the rating data with the personal rating data
    val movieratings = ratings.union(pratings)
    // Build the model with ALS: rank 10, 10 iterations, lambda 0.01
    // (the earlier comment said rank 5, which did not match the call)
    val model = ALS.train(movieratings, 10, 10, 0.01)
    // Pick a movie and predict my rating for it, starting with movie ID 195 <The Terminator>
    model.predict(sc.parallelize(Array((944,195)))).collect.foreach(println)
    // Pick a movie and predict my rating for it: movie ID 402 <Ghost>
    model.predict(sc.parallelize(Array((944,402)))).collect.foreach(println)
    // Pick a movie and predict my rating for it: movie ID 148 <The Ghost and the Darkness>
    // Fix: this call previously queried movie 402 a second time instead of 148.
    model.predict(sc.parallelize(Array((944,148)))).collect.foreach(println)
  }
}
Example 8
Source File: MatrixFactorizationModelWrapper.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

/**
 * Re-exposes an existing [[MatrixFactorizationModel]] (same rank and feature RDDs)
 * through signatures built on `JavaRDD[Array[Any]]` / `RDD[Array[Any]]`, converting
 * with the `SerDe` helpers.
 */
private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  /** Converts the incoming rows with `SerDe.asTupleRDD` and delegates to the inherited `predict`. */
  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] = {
    val pairs = SerDe.asTupleRDD(userAndProducts.rdd)
    predict(pairs)
  }

  /** User factors, each wrapped with `Vectors.dense`, passed through `SerDe.fromTuple2RDD`. */
  def getUserFeatures: RDD[Array[Any]] = {
    val denseByUser = userFeatures.map { case (id, factors) => (id, Vectors.dense(factors)) }
    SerDe.fromTuple2RDD(denseByUser.asInstanceOf[RDD[(Any, Any)]])
  }

  /** Product factors, each wrapped with `Vectors.dense`, passed through `SerDe.fromTuple2RDD`. */
  def getProductFeatures: RDD[Array[Any]] = {
    val denseByProduct = productFeatures.map { case (id, factors) => (id, Vectors.dense(factors)) }
    SerDe.fromTuple2RDD(denseByProduct.asInstanceOf[RDD[(Any, Any)]])
  }
}
Example 9
Source File: PythonMLLibAPISuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, SparseMatrix}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.recommendation.Rating

/**
 * Round-trip tests for the Python SerDe: every value is written with `SerDe.dumps`,
 * read back with `SerDe.loads`, and compared with the original.
 */
class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array[Double]()
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    // `Array[Byte].toString` is the JVM identity string ("[B@..."), which never contains
    // "Rating", so searching it checked nothing. Decode the payload one char per byte
    // (ISO-8859-1 is lossless for arbitrary bytes) and count the real occurrences.
    val pickled = new String(bytes, java.nio.charset.StandardCharsets.ISO_8859_1)
    assert("Rating".r.findAllIn(pickled).length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating
  }
}
Example 10
Source File: MatrixFactorizationModelWrapper.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

/**
 * Re-exposes an existing [[MatrixFactorizationModel]] (same rank and feature RDDs)
 * through signatures built on `JavaRDD[Array[Any]]` / `RDD[Array[Any]]`, converting
 * with the `SerDe` helpers.
 */
private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  /** Converts the incoming rows with `SerDe.asTupleRDD` and delegates to the inherited `predict`. */
  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] = {
    val pairs = SerDe.asTupleRDD(userAndProducts.rdd)
    predict(pairs)
  }

  /** User factors, each wrapped with `Vectors.dense`, passed through `SerDe.fromTuple2RDD`. */
  def getUserFeatures: RDD[Array[Any]] = {
    val denseByUser = userFeatures.map { case (id, factors) => (id, Vectors.dense(factors)) }
    SerDe.fromTuple2RDD(denseByUser.asInstanceOf[RDD[(Any, Any)]])
  }

  /** Product factors, each wrapped with `Vectors.dense`, passed through `SerDe.fromTuple2RDD`. */
  def getProductFeatures: RDD[Array[Any]] = {
    val denseByProduct = productFeatures.map { case (id, factors) => (id, Vectors.dense(factors)) }
    SerDe.fromTuple2RDD(denseByProduct.asInstanceOf[RDD[(Any, Any)]])
  }

  /** Result of `recommendProductsForUsers(num)`, passed through `SerDe.fromTuple2RDD`. */
  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    val perUser = recommendProductsForUsers(num)
    SerDe.fromTuple2RDD(perUser.asInstanceOf[RDD[(Any, Any)]])
  }

  /** Result of `recommendUsersForProducts(num)`, passed through `SerDe.fromTuple2RDD`. */
  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    val perProduct = recommendUsersForProducts(num)
    SerDe.fromTuple2RDD(perProduct.asInstanceOf[RDD[(Any, Any)]])
  }
}
Example 11
Source File: MatrixFactorizationModelWrapper.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

/**
 * Re-exposes an existing [[MatrixFactorizationModel]] (same rank and feature RDDs)
 * through signatures built on `JavaRDD[Array[Any]]` / `RDD[Array[Any]]`, converting
 * with the `SerDe` helpers.
 */
private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  /** Converts the incoming rows with `SerDe.asTupleRDD` and delegates to the inherited `predict`. */
  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] = {
    val pairs = SerDe.asTupleRDD(userAndProducts.rdd)
    predict(pairs)
  }

  /** User factors, each wrapped with `Vectors.dense`, passed through `SerDe.fromTuple2RDD`. */
  def getUserFeatures: RDD[Array[Any]] = {
    val denseByUser = userFeatures.map { case (id, factors) => (id, Vectors.dense(factors)) }
    SerDe.fromTuple2RDD(denseByUser.asInstanceOf[RDD[(Any, Any)]])
  }

  /** Product factors, each wrapped with `Vectors.dense`, passed through `SerDe.fromTuple2RDD`. */
  def getProductFeatures: RDD[Array[Any]] = {
    val denseByProduct = productFeatures.map { case (id, factors) => (id, Vectors.dense(factors)) }
    SerDe.fromTuple2RDD(denseByProduct.asInstanceOf[RDD[(Any, Any)]])
  }

  /** Result of `recommendProductsForUsers(num)`, passed through `SerDe.fromTuple2RDD`. */
  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    val perUser = recommendProductsForUsers(num)
    SerDe.fromTuple2RDD(perUser.asInstanceOf[RDD[(Any, Any)]])
  }

  /** Result of `recommendUsersForProducts(num)`, passed through `SerDe.fromTuple2RDD`. */
  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    val perProduct = recommendUsersForProducts(num)
    SerDe.fromTuple2RDD(perProduct.asInstanceOf[RDD[(Any, Any)]])
  }
}
Example 12
Source File: PythonMLLibAPISuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.regression.LabeledPoint

/**
 * Round-trip tests for the Python SerDe: every value is written with `SerDe.dumps`,
 * read back with `SerDe.loads`, and compared with the original.
 */
class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    // `Array[Byte].toString` is the JVM identity string ("[B@..."), which never contains
    // "Rating", so searching it checked nothing. Decode the payload one char per byte
    // (ISO-8859-1 is lossless for arbitrary bytes) and count the real occurrences.
    val pickled = new String(bytes, java.nio.charset.StandardCharsets.ISO_8859_1)
    assert("Rating".r.findAllIn(pickled).length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating
  }
}
Example 13
Source File: RecommendationExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkContext, SparkConf}
// $example on$
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
// $example off$

/**
 * Collaborative-filtering walkthrough: parses "user,item,rate" lines, trains an ALS
 * model, prints the mean squared error measured on the training ratings, then saves
 * the model and loads it back.
 */
object RecommendationExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CollaborativeFilteringExample")
    val sc = new SparkContext(conf)
    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/als/test.data")
    val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
      Rating(user.toInt, item.toInt, rate.toDouble)
    })

    // Build the recommendation model using ALS
    val rank = 10
    val numIterations = 10
    val model = ALS.train(ratings, rank, numIterations, 0.01)

    // Evaluate the model on rating data
    val usersProducts = ratings.map { case Rating(user, product, rate) =>
      (user, product)
    }
    val predictions =
      model.predict(usersProducts).map { case Rating(user, product, rate) =>
        ((user, product), rate)
      }
    val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
      ((user, product), rate)
    }.join(predictions)
    val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
      val err = (r1 - r2)
      err * err
    }.mean()
    println(s"Mean Squared Error = $MSE")

    // Save and load model
    model.save(sc, "target/tmp/myCollaborativeFilter")
    val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
    // $example off$

    // Fix: the context was never stopped, so its resources were not released when
    // main returned.
    sc.stop()
  }
}
// scalastyle:on println
Example 14
Source File: MatrixFactorizationModelWrapper.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

/**
 * Re-exposes an existing [[MatrixFactorizationModel]] (same rank and feature RDDs)
 * through signatures built on `JavaRDD[Array[Any]]` / `RDD[Array[Any]]`, converting
 * with the `SerDe` helpers.
 */
private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  /** Converts the incoming rows with `SerDe.asTupleRDD` and delegates to the inherited `predict`. */
  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] = {
    val pairs = SerDe.asTupleRDD(userAndProducts.rdd)
    predict(pairs)
  }

  /** User factors, each wrapped with `Vectors.dense`, passed through `SerDe.fromTuple2RDD`. */
  def getUserFeatures: RDD[Array[Any]] = {
    val denseByUser = userFeatures.map { case (id, factors) => (id, Vectors.dense(factors)) }
    SerDe.fromTuple2RDD(denseByUser.asInstanceOf[RDD[(Any, Any)]])
  }

  /** Product factors, each wrapped with `Vectors.dense`, passed through `SerDe.fromTuple2RDD`. */
  def getProductFeatures: RDD[Array[Any]] = {
    val denseByProduct = productFeatures.map { case (id, factors) => (id, Vectors.dense(factors)) }
    SerDe.fromTuple2RDD(denseByProduct.asInstanceOf[RDD[(Any, Any)]])
  }

  /** Result of `recommendProductsForUsers(num)`, passed through `SerDe.fromTuple2RDD`. */
  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    val perUser = recommendProductsForUsers(num)
    SerDe.fromTuple2RDD(perUser.asInstanceOf[RDD[(Any, Any)]])
  }

  /** Result of `recommendUsersForProducts(num)`, passed through `SerDe.fromTuple2RDD`. */
  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    val perProduct = recommendUsersForProducts(num)
    SerDe.fromTuple2RDD(perProduct.asInstanceOf[RDD[(Any, Any)]])
  }
}
Example 15
Source File: PythonMLLibAPISuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, SparseMatrix}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.recommendation.Rating

/**
 * Round-trip tests for the Python SerDe: every value is written with `SerDe.dumps`,
 * read back with `SerDe.loads`, and compared with the original.
 */
class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = SerDe.loads(SerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle labeled point") {
    val points = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    points.foreach { p =>
      val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint]
      assert(q.label === p.label)
      assert(q.features.getClass === p.features.getClass)
      assert(q.features === p.features)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array[Double]()
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }

  test("pickle rating") {
    val rat = new Rating(1, 2, 3.0)
    val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating]
    assert(rat == rat2)

    // Test name of class only occur once
    val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(rats)
    // `Array[Byte].toString` is the JVM identity string ("[B@..."), which never contains
    // "Rating", so searching it checked nothing. Decode the payload one char per byte
    // (ISO-8859-1 is lossless for arbitrary bytes) and count the real occurrences.
    val pickled = new String(bytes, java.nio.charset.StandardCharsets.ISO_8859_1)
    assert("Rating".r.findAllIn(pickled).length == 1)
    assert(bytes.length / 10 < 25) //  25 bytes per rating
  }
}
Example 16
Source File: EvaluateResult.scala From learning-spark with Apache License 2.0 | 5 votes |
package com.javachen.grab

import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

/**
 * Evaluation metrics for per-user recommendation lists (`(user, recommended product ids)`)
 * measured against a set of ratings.
 *
 * Every ratio below yields 0.0 when its denominator is zero instead of NaN/Infinity.
 */
object EvaluateResult {

  /** Division that yields 0.0 for a zero denominator. */
  private def ratio(numerator: Double, denominator: Double): Double =
    if (denominator == 0) 0.0 else numerator / denominator

  /**
   * Fraction of catalogue covered by the recommendations.
   *
   * @param training       ratings whose distinct products form the denominator
   * @param userRecommends recommended product ids per user
   * @return distinct recommended products divided by distinct products in `training`
   */
  def coverage(training: RDD[Rating], userRecommends: RDD[(Int, List[Int])]): Double = {
    val recommendedItems = userRecommends.flatMap(_._2).distinct().count()
    val trainingItems = training.map(_.product).distinct().count()
    ratio(recommendedItems.toDouble, trainingItems.toDouble)
  }

  /**
   * Mean of `log(1 + n)` over every recommended item, where `n` is the number of
   * ratings that item has in `training`.
   *
   * @param training       ratings used to count how often each product was rated
   * @param userRecommends recommended product ids per user
   * @return the mean log-popularity; 0.0 when nothing was recommended
   */
  def popularity(training: RDD[Rating], userRecommends: RDD[(Int, List[Int])]): Double = {
    // Count ratings per product with a combiner instead of materialising every
    // (user, rate) pair per product just to take its size.
    val itemPopularity = training.map(r => (r.product, 1)).reduceByKey(_ + _, 4).collectAsMap()
    val recommended = userRecommends.flatMap(_._2).collect()
    // A recommended product absent from `training` counts as popularity 0 rather than
    // failing on a missing key.
    val total = recommended.foldLeft(0.0) { (acc, product) =>
      acc + math.log(1 + itemPopularity.getOrElse(product, 0))
    }
    ratio(total, recommended.length.toDouble)
  }

  /**
   * Recall, precision and F1 of the recommendation lists against the products each
   * user rated in `training`. Only users present on both sides take part (inner join).
   *
   * @param training       ratings providing the products each user actually rated
   * @param userRecommends recommended product ids per user
   * @return `(recall, precision, f1)`
   */
  def recallAndPrecisionAndF1(
      training: RDD[Rating],
      userRecommends: RDD[(Int, List[Int])]): (Double, Double, Double) = {
    val usersProducts: RDD[(Int, Int)] = training.map(r => (r.user, r.product))
    val ratedByUser = usersProducts.groupByKey().map { case (user, products) => (user, products.toList) }
    val groupData = userRecommends.join(ratedByUser)

    // Per user: (hits, number of rated items, number of recommended items).
    // `fold` with a neutral element also works when the join is empty.
    val (hit, testNum, recNum) = groupData.map { case (_, (recommended, rated)) =>
      val ratedSet = rated.toSet
      (recommended.count(ratedSet.contains), rated.length, recommended.length)
    }.fold((0, 0, 0)) { (a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3) }

    val recall = ratio(hit.toDouble, testNum.toDouble)
    val precision = ratio(hit.toDouble, recNum.toDouble)
    val f1 = ratio(2 * recall * precision, recall + precision)

    println(s"$hit,$testNum,$recNum")
    (recall, precision, f1)
  }

  /**
   * Compares two rating sets by exact intersection, prints recall, precision and F1,
   * and returns the F1 value.
   *
   * @param test   reference ratings
   * @param result ratings to evaluate
   * @return the F1 score
   */
  def recallAndPrecision(test: RDD[Rating], result: RDD[Rating]): Double = {
    val numHit: Long = result.intersection(test).count
    val recall = ratio(numHit.toDouble, test.count.toDouble)
    val precision = ratio(numHit.toDouble, result.count.toDouble)
    val f1 = ratio(2 * recall * precision, recall + precision)
    System.out.println("recall : " + recall + "\nprecision : " + precision + "\nf1 : " + f1)
    f1
  }
}
Example 17
Source File: Recommender.scala From awesome-recommendation-engine with Apache License 2.0 | 5 votes |
package example.utils

import example.model.AmazonRating
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.joda.time.{Seconds, DateTime}

import scala.io.Source
import scala.util.Random

/**
 * Loads a CSV rating file (`userId,productId,score` per line), keeps only users
 * with a bounded number of ratings, and trains an ALS model on demand to
 * recommend products for the synthetic user `MyUsername`.
 *
 * Construction runs Spark jobs: the file is parsed, filtered and counted, and
 * the distinct user and product ids are collected into dictionaries.
 *
 * @param sc         Spark context used for all RDD work
 * @param ratingFile path of the rating file
 */
class Recommender(@transient sc: SparkContext, ratingFile: String) extends Serializable {
  val NumRecommendations = 10
  val MinRecommendationsPerUser = 10
  val MaxRecommendationsPerUser = 20
  val MyUsername = "myself"
  val NumPartitions = 20

  @transient val random = new Random() with Serializable

  println("Using this ratingFile: " + ratingFile)

  // Parse every line of the rating file into an AmazonRating.
  val rawTrainingRatings = sc.textFile(ratingFile).map { line =>
    line.split(",") match {
      case Array(userId, productId, scoreStr) => AmazonRating(userId, productId, scoreStr.toDouble)
    }
  }

  // Keep users whose rating count lies in [MinRecommendationsPerUser, MaxRecommendationsPerUser).
  val trainingRatings = rawTrainingRatings
    .groupBy(_.userId)
    .filter { case (_, ratingsOfUser) =>
      val howMany = ratingsOfUser.size
      howMany >= MinRecommendationsPerUser && howMany < MaxRecommendationsPerUser
    }
    .flatMap { case (_, ratingsOfUser) => ratingsOfUser }
    .repartition(NumPartitions)
    .cache()

  println(s"Parsed $ratingFile. Kept ${trainingRatings.count()} ratings out of ${rawTrainingRatings.count()}")

  // Dictionaries between string ids and integer indices; MyUsername is prepended to the user ids.
  val userDict = new Dictionary(MyUsername +: trainingRatings.map(_.userId).distinct.collect)
  println("User Dictionary have " + userDict.size + " elements.")
  val productDict = new Dictionary(trainingRatings.map(_.productId).distinct.collect)
  println("Product Dictionary have " + productDict.size + " elements.")

  /** Converts a string-keyed rating into an index-keyed Spark `Rating`. */
  private def toSparkRating(amazonRating: AmazonRating) = {
    val userIdx = userDict.getIndex(amazonRating.userId)
    val productIdx = productDict.getIndex(amazonRating.productId)
    Rating(userIdx, productIdx, amazonRating.rating)
  }

  /** Converts an index-keyed Spark `Rating` back into a string-keyed rating. */
  private def toAmazonRating(rating: Rating) = {
    val userId = userDict.getWord(rating.user)
    val productId = productDict.getWord(rating.product)
    AmazonRating(userId, productId, rating.rating)
  }

  // Training data expressed with dictionary indices.
  val sparkRatings = trainingRatings.map(toSparkRating)

  /** Picks a product id at a uniformly random dictionary index. */
  def getRandomProductId = productDict.getWord(random.nextInt(productDict.size))

  /**
   * Trains an ALS model on the stored ratings plus `ratings`, then returns the
   * `NumRecommendations` highest-scored products that do not occur in `ratings`.
   * Prints the elapsed time in whole seconds.
   *
   * @param ratings the caller's own ratings, converted through the dictionaries
   */
  def predict(ratings: Seq[AmazonRating]) = {
    val myRatings = ratings.map(toSparkRating)
    val myRatingRDD = sc.parallelize(myRatings)

    val startAls = DateTime.now
    val combined = (sparkRatings ++ myRatingRDD).repartition(NumPartitions)
    val model = ALS.train(combined, 10, 20, 0.01)

    // Candidates are all product indices that are absent from the caller's history.
    val alreadyRated = myRatings.map(_.product).toSet
    val unseen = (0 until productDict.size).filterNot(alreadyRated.contains)
    val candidates = sc.parallelize(unseen)

    val myUserId = userDict.getIndex(MyUsername)
    val recommendations = model.predict(candidates.map(productIdx => (myUserId, productIdx))).collect
    val endAls = DateTime.now

    // Highest predicted rating first.
    val result = recommendations
      .sortBy(r => -r.rating)
      .take(NumRecommendations)
      .map(toAmazonRating)

    val alsTime = Seconds.secondsBetween(startAls, endAls).getSeconds
    println(s"ALS Time: $alsTime seconds")
    result
  }
}
Example 18
Source File: RatingDataGenerator.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml

import com.intel.hibench.sparkbench.common.IOCommon
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.mllib.random._
import org.apache.spark.rdd.{PairRDDFunctions, RDD}
import org.apache.spark.mllib.linalg.{Vectors, Vector}

/**
 * Generates a random user x product matrix and stores it as an object file of
 * sparse vectors, one vector per user.
 *
 * Arguments: `<OUTPUT_PATH> <NUM_USERS> <NUM_PRODUCTS> <SPARSITY> <IMPLICITPREFS>`.
 * Each normally distributed entry is kept when a uniform draw is `<= SPARSITY`
 * and set to 0.0 otherwise. `IMPLICITPREFS` is parsed and echoed but does not
 * influence the generated data.
 */
object RatingDataGenerator {

  // Plain literal: the previous s"...$RatingDataGenerator..." printed the object's toString.
  private val Usage =
    "Usage: RatingDataGenerator <OUTPUT_PATH> <NUM_USERS> <NUM_PRODUCTS> <SPARSITY> <IMPLICITPREFS>"

  /**
   * Entry point.
   *
   * @param args the five positional arguments described on the object; any
   *             other argument count prints the usage text and exits with status 1
   */
  def main(args: Array[String]): Unit = {
    // Validate the arguments before a SparkContext is started.
    val (outputPath, numUsers, numProducts, sparsity, implicitPrefs) = args match {
      case Array(path, users, products, sparse, implicitFlag) =>
        (path, users.toInt, products.toInt, sparse.toDouble, implicitFlag.toBoolean)
      case _ =>
        System.err.println(Usage)
        sys.exit(1)
    }

    println(s"Output Path: $outputPath")
    println(s"Num of Users: $numUsers")
    println(s"Num of Products: $numProducts")
    println(s"Sparsity: $sparsity")
    println(s"Implicit Prefs: $implicitPrefs")

    val conf = new SparkConf().setAppName("RatingDataGeneration")
    val sc = new SparkContext(conf)
    try {
      val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
      val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
        .getOrElse((parallel / 2).toString).toInt

      val rawData: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, numUsers, numProducts, numPartitions)

      val data = rawData.mapPartitions { rows =>
        // One generator per partition. A driver-side Random captured by the closure is
        // serialized with its state, so every task would replay the same sequence.
        val rng = new java.util.Random()
        rows.map { v =>
          val kept = Array.fill[Double](v.size)(0.0)
          v.foreachActive { (i, vi) =>
            if (rng.nextDouble <= sparsity) kept(i) = vi
          }
          Vectors.dense(kept).toSparse
        }
      }

      data.saveAsObjectFile(outputPath)
    } finally {
      sc.stop()
    }
  }
}
Example 19
Source File: ScalaApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{ALS, Rating}
//import org.apache.spark.

    // NOTE(review): the enclosing object header and the beginning of `main` are not part of
    // this excerpt; `model`, `sc`, `ratings` and `PATH` are defined there.
    val predictedRating = model.predict(789, 123)
    println(predictedRating)

    val userId = 789
    val K = 10
    val topKRecs = model.recommendProducts(userId, K)
    println(topKRecs.mkString("\n"))

    // Map from the first '|'-separated field (parsed as Int) to the second field of u.item.
    val movies = sc.textFile(PATH + "/ml-100k/u.item")
    val titles = movies.map(line => line.split("\\|").take(2)).map(array => (array(0).toInt, array(1))).collectAsMap()
    titles(123)
    // res68: String = Frighteners, The (1996)

    val moviesForUser = ratings.keyBy(_.user).lookup(userId)
    // moviesForUser: Seq[org.apache.spark.mllib.recommendation.Rating] = WrappedArray(Rating(789,1012,4.0), Rating(789,127,5.0), Rating(789,475,5.0), Rating(789,93,4.0), ...
    // ...
    println(moviesForUser.size)
    // The user's ten best-rated movies, then the model's top-K recommendations, both as (title, rating).
    moviesForUser.sortBy(-_.rating).take(10).map(rating => (titles(rating.product), rating.rating)).foreach(println)
    topKRecs.map(rating => (titles(rating.product), rating.rating)).foreach(println)
    sc.stop()
    //bw.close()
  }

  /** Small date helper. */
  class Util {

    /**
     * Formats the current time as `yyyy-MM-dd-HH.mm.ss`.
     *
     * `HH` is the 24-hour clock. The previous `hh` pattern is the 12-hour clock,
     * which without an am/pm marker produced the same text twice a day.
     *
     * @return the formatted timestamp
     */
    def getDate(): String = {
      val today = Calendar.getInstance().getTime()
      // A fresh formatter per call: SimpleDateFormat instances are not thread-safe.
      val formatter = new SimpleDateFormat("yyyy-MM-dd-HH.mm.ss")
      formatter.format(today)
    }
  }
}
Example 20
Source File: MatrixFactorizationModelWrapper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

/**
 * Subclass of [[MatrixFactorizationModel]], built from an existing model's rank
 * and feature RDDs, that adds methods taking or returning `Array[Any]`-based RDDs.
 * All conversions are delegated to `SerDe`.
 *
 * @param model the model whose rank, user features and product features are reused
 */
private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  /** Predicts ratings for pairs supplied as a `JavaRDD[Array[Any]]`. */
  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] = {
    val pairs = SerDe.asTupleRDD(userAndProducts.rdd)
    predict(pairs)
  }

  /** User features as `(id, dense vector)` tuples converted by `SerDe.fromTuple2RDD`. */
  def getUserFeatures: RDD[Array[Any]] = {
    val asVectors = userFeatures.map(entry => (entry._1, Vectors.dense(entry._2)))
    SerDe.fromTuple2RDD(asVectors.asInstanceOf[RDD[(Any, Any)]])
  }

  /** Product features as `(id, dense vector)` tuples converted by `SerDe.fromTuple2RDD`. */
  def getProductFeatures: RDD[Array[Any]] = {
    val asVectors = productFeatures.map(entry => (entry._1, Vectors.dense(entry._2)))
    SerDe.fromTuple2RDD(asVectors.asInstanceOf[RDD[(Any, Any)]])
  }

  /** `recommendProductsForUsers(num)` converted by `SerDe.fromTuple2RDD`. */
  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    val perUser = recommendProductsForUsers(num)
    SerDe.fromTuple2RDD(perUser.asInstanceOf[RDD[(Any, Any)]])
  }

  /** `recommendUsersForProducts(num)` converted by `SerDe.fromTuple2RDD`. */
  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    val perProduct = recommendUsersForProducts(num)
    SerDe.fromTuple2RDD(perProduct.asInstanceOf[RDD[(Any, Any)]])
  }
}
Example 21
Source File: PythonMLLibAPISuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.regression.LabeledPoint

/**
 * Round-trip tests: each value is passed through `SerDe.dumps` and `SerDe.loads`
 * and compared with the original.
 */
class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    // Dense and sparse vectors, including zero-length ones.
    val samples = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    for (original <- samples) {
      val restored = SerDe.loads(SerDe.dumps(original))
      assert(restored.getClass === original.getClass)
      assert(restored === original)
    }
  }

  test("pickle labeled point") {
    val samples = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    for (original <- samples) {
      val restored = SerDe.loads(SerDe.dumps(original)).asInstanceOf[LabeledPoint]
      assert(restored.label === original.label)
      assert(restored.features.getClass === original.features.getClass)
      assert(restored.features === original.features)
    }
  }

  test("pickle double") {
    List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN).foreach { x =>
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // `equals` rather than `==`, because NaN == NaN is false.
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val dense = Matrices.dense(2, 3, values)
    val denseBack = SerDe.loads(SerDe.dumps(dense)).asInstanceOf[DenseMatrix]
    assert(dense === denseBack)

    // A 0 x 0 matrix must survive the round trip as well.
    val noValues = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, noValues)
    val emptyBack = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == emptyBack)

    val sparse = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val sparseBack = SerDe.loads(SerDe.dumps(sparse)).asInstanceOf[SparseMatrix]
    assert(sparse.toArray === sparseBack.toArray)

    // Same check with the transposed flag set.
    val transposed = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val transposedBack = SerDe.loads(SerDe.dumps(transposed)).asInstanceOf[SparseMatrix]
    assert(transposed.toArray === transposedBack.toArray)
  }

  test("pickle rating") {
    val single = new Rating(1, 2, 3.0)
    val singleBack = SerDe.loads(SerDe.dumps(single)).asInstanceOf[Rating]
    assert(single == singleBack)

    // The class name is meant to occur only once in the serialized batch.
    val batch = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(batch)
    // NOTE(review): if `bytes` is an Array[Byte], `toString` yields the JVM identity string
    // rather than the payload, so this check would not inspect the pickled content - confirm.
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) // 25 bytes per rating
  }
}
Example 22
Source File: ModelTrainer.scala From recommendersystem with Apache License 2.0 | 5 votes |
package com.infosupport.recommendedcontent.core

import akka.actor.{Props, ActorLogging, Actor}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{Rating, ALS, MatrixFactorizationModel}

import com.datastax.spark.connector._

  // NOTE(review): the enclosing actor class header is not part of this excerpt; `sc`,
  // `context`, `sender`, `self` and `TrainingResult` come from outside this method.

  /**
   * Reads ratings from the configured Cassandra table, trains an ALS model on
   * them, sends the model to `sender` as a `TrainingResult`, and stops this actor.
   */
  private def trainModel() = {
    val settings = context.system.settings.config
    val table = settings.getString("cassandra.table")
    val keyspace = settings.getString("cassandra.keyspace")

    // Turn each row (user_id, item_id, rating) into the Rating structure that
    // Alternating Least Squares consumes.
    val ratings = sc.cassandraTable(keyspace, table).map { record =>
      val user = record.get[Int]("user_id")
      val item = record.get[Int]("item_id")
      val score = record.get[Double]("rating")
      Rating(user, item, score)
    }

    // Hyper-parameters governing how closely predictions fit the loaded
    // observations; tune these to optimise the model.
    val rank = 10
    val iterations = 10
    val lambda = 0.01

    val model = ALS.train(ratings, rank, iterations, lambda)
    sender ! TrainingResult(model)

    context.stop(self)
  }
}
Example 23
Source File: RecommendationModelReuse.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.MovieRecommendation

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import scala.Tuple2
import org.apache.spark.rdd.RDD

/**
 * Loads a previously saved [[MatrixFactorizationModel]], prints recommendations
 * for one user, and reports the RMSE on a 25% split of `data/ratings.csv`.
 */
object RecommendationModelReuse {

  /** User whose recommendations are printed. */
  private val TargetUser = 458

  /** Number of recommendations requested for `TargetUser`. */
  private val NumRecommendations = 10

  /**
   * Entry point; `args` is not read.
   *
   * The Spark session is stopped even when a step fails.
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the application name "JavaLDAExample" looks copied from another
    // example; left unchanged because it is user-visible in the Spark UI.
    val spark: SparkSession = SparkSession
      .builder()
      .appName("JavaLDAExample")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .getOrCreate()

    try {
      val ratigsFile = "data/ratings.csv"
      val ratingDF = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratigsFile)
      val selectedRatingsDF = ratingDF.select(ratingDF.col("userId"), ratingDF.col("movieId"), ratingDF.col("rating"), ratingDF.col("timestamp"))

      // Randomly split the ratings into training (75%) and test (25%); only the test part is used.
      val splits = selectedRatingsDF.randomSplit(Array(0.75, 0.25), seed = 12345L)
      val testData = splits(1)

      // Columns are read as strings and converted here.
      val testRDD = testData.rdd.map { row =>
        val userId = row.getString(0)
        val movieId = row.getString(1)
        val ratings = row.getString(2)
        Rating(userId.toInt, movieId.toInt, ratings.toDouble)
      }

      // Load the saved model back.
      val same_model = MatrixFactorizationModel.load(spark.sparkContext, "model/MovieRecomModel/")

      // Top NumRecommendations predictions for TargetUser, computed once and printed
      // in both formats below.
      val topRecsForUser = same_model.recommendProducts(TargetUser, NumRecommendations)

      println("Rating:(UserID, MovieID, Rating)")
      println("----------------------------------")
      topRecsForUser.foreach(rating => println(rating.toString()))
      println("----------------------------------")

      val rmseTest = MovieRecommendation.computeRmse(same_model, testRDD, true)
      println("Test RMSE: = " + rmseTest) // lower is better

      println("Recommendations: (MovieId => Rating)")
      println("----------------------------------")
      topRecsForUser.map(rating => (rating.product, rating.rating)).foreach(println)
      println("----------------------------------")
    } finally {
      spark.stop()
    }
  }
}
Example 24
Source File: MovieRecommendation.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.MovieRecommendation

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SQLImplicits
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import scala.Tuple2
import org.apache.spark.rdd.RDD

/** Movie recommendation example: data loading plus an RMSE helper. */
object MovieRecommendation {

  /**
   * Root-mean-square error between the model's predictions and the ratings in
   * `data`, joined on `(user, product)`. A lower value means a better fit.
   *
   * @param model         model used to predict every `(user, product)` pair in `data`
   * @param data          observed ratings
   * @param implicitPrefs when true, the first five `(prediction, rating)` pairs are printed
   * @return the RMSE over all joined pairs
   */
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean): Double = {
    val userProductPairs = data.map(r => (r.user, r.product))
    val predictions: RDD[Rating] = model.predict(userProductPairs)

    val predictedByKey = predictions.map { case Rating(user, product, score) => ((user, product), score) }
    val observedByKey = data.map { case Rating(user, product, score) => ((user, product), score) }
    val predictionsAndRatings = predictedByKey.join(observedByKey).values

    if (implicitPrefs) {
      println("(Prediction, Rating)")
      println(predictionsAndRatings.take(5).mkString("\n"))
    }

    val squaredErrors = predictionsAndRatings.map { case (predicted, observed) =>
      val diff = predicted - observed
      diff * diff
    }
    math.sqrt(squaredErrors.mean())
  }

  /** Entry point; `args` is not read. */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("JavaLDAExample")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .getOrCreate()

    // Ratings: keep userId, movieId, rating and timestamp.
    val ratigsFile = "data/ratings.csv"
    val df1 = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratigsFile)
    val ratingsDF = df1.select(df1.col("userId"), df1.col("movieId"), df1.col("rating"), df1.col("timestamp"))
    ratingsDF.show(false)

    // Movies: keep movieId, title and genres.
    val moviesFile = "data/movies.csv"
    val df2 = spark.read.format("com.databricks.spark.csv").option("header", "true").load(moviesFile)
    val moviesDF = df2.select(df2.col("movieId"), df2.col("title"), df2.col("genres"))
    moviesDF.show(false)

    ratingsDF.createOrReplaceTempView("ratings")
    moviesDF.createOrReplaceTempView("movies")

    // NOTE(review): `model` and `testRDD` are not defined anywhere in this excerpt; the
    // statements that build them appear to have been cut from the listing.
    val rmseTest = computeRmse(model, testRDD, true)
    println("Test RMSE: = " + rmseTest) // lower is better

    // Top 6 movie predictions for user 668.
    println("Recommendations: (MovieId => Rating)")
    println("----------------------------------")
    val recommendationsUser = model.recommendProducts(668, 6)
    recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println)
    println("----------------------------------")

    spark.stop()
  }
}
Example 25
Source File: L9-12CollabFiltering.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

/**
 * Streaming collaborative filtering: for every micro-batch of
 * `subject activity freq` lines, trains an ALS model on about 70% of the batch
 * and prints up to five predictions for the remaining 30%.
 */
object CollabFilteringApp {

  /**
   * Entry point.
   *
   * @param args `<appname> <batchInterval> <iPath>`: the application name, the
   *             batch interval in seconds, and the directory monitored for new
   *             text files. Any other argument count prints usage and exits with status 1.
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      System.err.println(
        "Usage: CollabFilteringApp <appname> <batchInterval> <iPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, iPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    // Each line is "<subject> <activity> <freq>", separated by single spaces.
    val ratingStream = ssc.textFileStream(iPath).map(_.split(" ") match {
      case Array(subject, activity, freq) =>
        Rating(subject.toInt, activity.toInt, freq.toDouble)
    })

    val rank = 10
    val numIterations = 10
    val lambda = 0.01

    ratingStream.foreachRDD { ratingRDD =>
      val Array(test, train) = ratingRDD.randomSplit(Array(0.3, 0.7))
      // A batch with no new files (or a split that left nothing to train on) is skipped,
      // so ALS is never handed an empty rating set.
      if (!train.isEmpty()) {
        val model = ALS.train(train, rank, numIterations, lambda)
        val userProducts = test.map { case Rating(subject, activity, freq) => (subject, activity) }
        model.predict(userProducts).take(5).foreach(println)
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 26
Source File: MovieLensDataPowerIterationClustering.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples

import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Loads the MovieLens 100k files, trains an ALS model, derives factor vectors
 * for movies and users, and configures a `PowerIterationClustering` instance.
 * The clustering instance is only configured here; it is not run.
 */
object MovieLensDataPowerIterationClustering {
  val PATH= "../data/ml-100k"

  /** Entry point; `args` is not read. */
  def main(args: Array[String]): Unit = {
    val spConfig = (new SparkConf)
      .setMaster("local[1]")
      .setAppName("SparkApp")
      .set("spark.driver.allowMultipleContexts", "true")
    val sc = new SparkContext(spConfig)
    //val path = PATH + "../data/"
    //val rdd = sc.wholeTextFiles(path)

    val movies = sc.textFile(PATH + "/u.item")
    println(movies.first)

    val genres = sc.textFile(PATH + "/u.genre")
    genres.take(5).foreach(println)

    // Second '|'-separated field of u.genre -> first field.
    val genreMap = genres
      .filter(line => !line.isEmpty)
      .map(_.split("\\|"))
      .map(fields => (fields(1), fields(0)))
      .collectAsMap

    // (movie id, (title, names of the genres flagged "1" in the fields from index 5 on))
    val titlesAndGenres = movies.map(_.split("\\|")).map { fields =>
      val flags = fields.toSeq.slice(5, fields.size)
      val genresAssigned = flags.zipWithIndex.collect {
        case (flag, position) if flag == "1" => genreMap(position.toString)
      }
      (fields(0).toInt, (fields(1), genresAssigned))
    }

    // First three tab-separated fields of u.data: user, movie, rating.
    val rawData = sc.textFile(PATH + "/u.data")
    val rawRatings = rawData.map(_.split("\t").take(3))
    val ratings = rawRatings.map { case Array(user, movie, rating) =>
      Rating(user.toInt, movie.toInt, rating.toDouble)
    }
    ratings.cache

    val alsModel = ALS.train(ratings, 50, 10, 0.1)

    import org.apache.spark.mllib.linalg.Vectors
    val movieFactors = alsModel.productFeatures.map(entry => (entry._1, Vectors.dense(entry._2)))
    val movieVectors = movieFactors.map(_._2)
    val userFactors = alsModel.userFeatures.map(entry => (entry._1, Vectors.dense(entry._2)))
    val userVectors = userFactors.map(_._2)

    val numClusters = 5
    val numIterations = 10
    val numRuns = 3

    import org.apache.spark.mllib.clustering.PowerIterationClustering
    //val bKMeans = new PowerIterationClustering()()
    val piClustering = new PowerIterationClustering()
    piClustering.setMaxIterations(10)
    piClustering.setK(numClusters)
    println("done")
  }
}
Example 27
Source File: MovieLensFPGrowthApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.SparkContext
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.mllib.recommendation.Rating

import scala.collection.mutable.ListBuffer

    // NOTE(review): the enclosing object header and the beginning of `main` are not part of
    // this excerpt; `rawData`, `sc` and `PATH` are defined there.

    // First three tab-separated fields of each line: user, movie, rating.
    val rawRatings = rawData.map(_.split("\t").take(3))
    rawRatings.first()
    // 14/03/30 13:22:44 INFO SparkContext: Job finished: first at <console>:21, took 0.003703 s
    // res25: Array[String] = Array(196, 242, 3)
    val ratings = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    val ratingsFirst = ratings.first()
    println(ratingsFirst)

    val userId = 789
    val K = 10

    val movies = sc.textFile(PATH + "/ml-100k/u.item")
    val titles = movies.map(line => line.split("\\|").take(2)).map(array => (array(0).toInt, array(1))).collectAsMap()
    titles(123)

    // Keyed once, outside the loop; the keying does not depend on the user being looked up.
    val ratingsByUser = ratings.keyBy(_.user)

    // One transaction per user 801..900: the product ids of that user's ten best-rated movies.
    // The ids are kept as an array instead of being joined with a leading space and split
    // again, which used to put an empty-string item at the front of every transaction.
    val transaction2: Seq[Array[String]] = (801 to 900).map { user =>
      val productIds = ratingsByUser.lookup(user)
        .sortBy(r => -r.rating)
        .take(10)
        .map(_.product.toString)
      // Progressive debug trace, one line per added item, as before but without the leading space.
      productIds.indices.foreach(n => println(productIds.take(n + 1).mkString(" ")))
      productIds.toArray
    }

    val rddx = sc.parallelize(transaction2, 2).cache()

    // Mine the item sets contained in at least 10% of the transactions.
    val fpg = new FPGrowth()
    val model6 = fpg
      .setMinSupport(0.1)
      .setNumPartitions(1)
      .run(rddx)

    model6.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }
    sc.stop()
  }
}
Example 28
Source File: RecommendationExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
// $example off$

/**
 * Collaborative filtering example: trains an ALS model on `user,item,rate`
 * lines, prints the mean squared error on the training data, then saves and
 * reloads the model.
 */
object RecommendationExample {

  /** Entry point; `args` is not read. The SparkContext is stopped at the end. */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CollaborativeFilteringExample")
    val sc = new SparkContext(conf)
    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/als/test.data")
    val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
      Rating(user.toInt, item.toInt, rate.toDouble)
    })

    // Build the recommendation model using ALS
    val rank = 10
    val numIterations = 10
    val model = ALS.train(ratings, rank, numIterations, 0.01)

    // Evaluate the model on rating data
    val usersProducts = ratings.map { case Rating(user, product, rate) =>
      (user, product)
    }
    val predictions =
      model.predict(usersProducts).map { case Rating(user, product, rate) =>
        ((user, product), rate)
      }
    val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
      ((user, product), rate)
    }.join(predictions)
    val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
      val err = (r1 - r2)
      err * err
    }.mean()
    println("Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/myCollaborativeFilter")
    val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
    // $example off$

    // Release the context explicitly instead of leaving it to JVM shutdown.
    sc.stop()
  }
}
// scalastyle:on println
Example 29
Source File: MovieLensFPGrowthApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package com.sparksample

import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.mllib.recommendation.Rating

import scala.collection.mutable.ListBuffer

    // NOTE(review): the enclosing object header and the beginning of `main` are not part of
    // this excerpt; `rawData` and `sc` are defined there.

    // First three tab-separated fields of each line: user, movie, rating.
    val rawRatings = rawData.map(_.split("\t").take(3))
    rawRatings.first()
    val ratings = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    val ratingsFirst = ratings.first()
    println(ratingsFirst)

    val movies = Util.getMovieData()
    val titles = movies
      .map(line => line.split("\\|").take(2))
      .map(fields => (fields(0).toInt, fields(1)))
      .collectAsMap()
    titles(123)

    var eRDD = sc.emptyRDD
    var z = Seq[String]()
    val l = ListBuffer()

    // For users 501..900: the product ids of the ten best-rated movies, joined by single
    // spaces (an empty string when the user has no ratings).
    val aj: Array[String] = (501 to 900).map { user =>
      ratings.keyBy(_.user).lookup(user)
        .sortBy(r => -r.rating)
        .take(10)
        .map(r => r.product)
        .mkString(" ")
    }.toArray
    z = aj

    // Split each line back into its items and distribute over two partitions.
    val transaction = z.map(_.split(" "))
    val rddx = sc.parallelize(transaction, 2).cache()

    // Mine the item sets contained in at least 10% of the transactions.
    val fpg = new FPGrowth()
    val model = fpg
      .setMinSupport(0.1)
      .setNumPartitions(1)
      .run(rddx)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }
    sc.stop()
  }
}
Example 30
Source File: RecommendationExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
// $example off$

/**
 * Collaborative filtering example: trains an ALS model on `user,item,rate`
 * lines, prints the mean squared error on the training data, then saves and
 * reloads the model.
 */
object RecommendationExample {

  /** Entry point; `args` is not read. The SparkContext is stopped at the end. */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CollaborativeFilteringExample")
    val sc = new SparkContext(conf)
    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/als/test.data")
    val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
      Rating(user.toInt, item.toInt, rate.toDouble)
    })

    // Build the recommendation model using ALS
    val rank = 10
    val numIterations = 10
    val model = ALS.train(ratings, rank, numIterations, 0.01)

    // Evaluate the model on rating data
    val usersProducts = ratings.map { case Rating(user, product, rate) =>
      (user, product)
    }
    val predictions =
      model.predict(usersProducts).map { case Rating(user, product, rate) =>
        ((user, product), rate)
      }
    val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
      ((user, product), rate)
    }.join(predictions)
    val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
      val err = (r1 - r2)
      err * err
    }.mean()
    println("Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/myCollaborativeFilter")
    val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
    // $example off$

    // Release the context explicitly instead of leaving it to JVM shutdown.
    sc.stop()
  }
}
// scalastyle:on println
Example 31
Source File: MatrixFactorizationModelWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

/**
 * Subclass of [[MatrixFactorizationModel]], built from an existing model's rank
 * and feature RDDs, that adds methods taking or returning `Array[Any]`-based RDDs.
 * All conversions are delegated to `SerDe`.
 *
 * @param model the model whose rank, user features and product features are reused
 */
private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  /** Predicts ratings for pairs supplied as a `JavaRDD[Array[Any]]`. */
  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] = {
    val pairs = SerDe.asTupleRDD(userAndProducts.rdd)
    predict(pairs)
  }

  /** User features as `(id, dense vector)` tuples converted by `SerDe.fromTuple2RDD`. */
  def getUserFeatures: RDD[Array[Any]] = {
    val asVectors = userFeatures.map(entry => (entry._1, Vectors.dense(entry._2)))
    SerDe.fromTuple2RDD(asVectors.asInstanceOf[RDD[(Any, Any)]])
  }

  /** Product features as `(id, dense vector)` tuples converted by `SerDe.fromTuple2RDD`. */
  def getProductFeatures: RDD[Array[Any]] = {
    val asVectors = productFeatures.map(entry => (entry._1, Vectors.dense(entry._2)))
    SerDe.fromTuple2RDD(asVectors.asInstanceOf[RDD[(Any, Any)]])
  }

  /** `recommendProductsForUsers(num)` converted by `SerDe.fromTuple2RDD`. */
  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    val perUser = recommendProductsForUsers(num)
    SerDe.fromTuple2RDD(perUser.asInstanceOf[RDD[(Any, Any)]])
  }

  /** `recommendUsersForProducts(num)` converted by `SerDe.fromTuple2RDD`. */
  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    val perProduct = recommendUsersForProducts(num)
    SerDe.fromTuple2RDD(perProduct.asInstanceOf[RDD[(Any, Any)]])
  }
}
Example 32
Source File: PythonMLLibAPISuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.regression.LabeledPoint

/**
 * Round-trip tests: each value is passed through `SerDe.dumps` and `SerDe.loads`
 * and compared with the original.
 */
class PythonMLLibAPISuite extends SparkFunSuite {

  SerDe.initialize()

  test("pickle vector") {
    // Dense and sparse vectors, including zero-length ones.
    val samples = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    for (original <- samples) {
      val restored = SerDe.loads(SerDe.dumps(original))
      assert(restored.getClass === original.getClass)
      assert(restored === original)
    }
  }

  test("pickle labeled point") {
    val samples = Seq(
      LabeledPoint(0.0, Vectors.dense(Array.empty[Double])),
      LabeledPoint(1.0, Vectors.dense(0.0)),
      LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)),
      LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])),
      LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0))))
    for (original <- samples) {
      val restored = SerDe.loads(SerDe.dumps(original)).asInstanceOf[LabeledPoint]
      assert(restored.label === original.label)
      assert(restored.features.getClass === original.features.getClass)
      assert(restored.features === original.features)
    }
  }

  test("pickle double") {
    List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN).foreach { x =>
      val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // `equals` rather than `==`, because NaN == NaN is false.
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val dense = Matrices.dense(2, 3, values)
    val denseBack = SerDe.loads(SerDe.dumps(dense)).asInstanceOf[DenseMatrix]
    assert(dense === denseBack)

    // A 0 x 0 matrix must survive the round trip as well.
    val noValues = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, noValues)
    val emptyBack = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == emptyBack)

    val sparse = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val sparseBack = SerDe.loads(SerDe.dumps(sparse)).asInstanceOf[SparseMatrix]
    assert(sparse.toArray === sparseBack.toArray)

    // Same check with the transposed flag set.
    val transposed = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val transposedBack = SerDe.loads(SerDe.dumps(transposed)).asInstanceOf[SparseMatrix]
    assert(transposed.toArray === transposedBack.toArray)
  }

  test("pickle rating") {
    val single = new Rating(1, 2, 3.0)
    val singleBack = SerDe.loads(SerDe.dumps(single)).asInstanceOf[Rating]
    assert(single == singleBack)

    // The class name is meant to occur only once in the serialized batch.
    val batch = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray
    val bytes = SerDe.dumps(batch)
    // NOTE(review): if `bytes` is an Array[Byte], `toString` yields the JVM identity string
    // rather than the payload, so this check would not inspect the pickled content - confirm.
    assert(bytes.toString.split("Rating").length == 1)
    assert(bytes.length / 10 < 25) // 25 bytes per rating
  }
}
Example 33
Source File: RankingDataProvider.scala From spark-ranking-metrics with The Unlicense | 5 votes |
package com.github.jongwook

import org.apache.spark.SparkConf
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.sql.SparkSession
import org.scalatest._

/** Builds a (prediction, ground truth) pair of rating sequences from an implicit ALS model. */
object RankingDataProvider {

  /**
   * Splits `ratings` with weights 0.9 / 0.1 (seed 0), trains `ALS.trainImplicit` on the
   * first part, and asks the model for `k` recommended products for each user that appears
   * in the second part.
   *
   * @param ratings the full set of ratings to split
   * @param k       number of products passed to `recommendProductsForUsers`
   * @return the collected recommendations and the collected held-out ratings
   */
  def apply(ratings: Seq[Rating], k: Int = 100): (Seq[Rating], Seq[Rating]) = {
    val master = new SparkConf().get("spark.master", "local[8]")
    val session = SparkSession.builder().master(master).getOrCreate()
    val context = session.sparkContext

    val parts = context.parallelize(ratings).cache().randomSplit(Array(0.9, 0.1), 0)
    val Array(training, heldOut) = parts

    val model =
      ALS.trainImplicit(training, rank = 10, iterations = 2, lambda = 2, blocks = 100, alpha = 10)

    // Only users present in the held-out split are kept in the model used for recommending.
    val heldOutUserIds = heldOut.map(_.user).collect().toSet
    val heldOutUsers = context.broadcast(heldOutUserIds)
    val userFeatures = model.userFeatures
      .filter { case (userId, _) => heldOutUsers.value.contains(userId) }
      .repartition(100)
      .cache()
    val productFeatures = model.productFeatures.repartition(100).cache()
    val restrictedModel = new MatrixFactorizationModel(model.rank, userFeatures, productFeatures)

    val recommendations = restrictedModel.recommendProductsForUsers(k)
    val prediction = recommendations.values.flatMap(perUser => perUser).collect()
    val groundTruth = heldOut.collect()

    (prediction, groundTruth)
  }
}

class RankingDataProvider extends FlatSpec with Matchers {
  "Ranking Data Provider" should "calculate the rankings" in {
    val (predicted, actual) = RankingDataProvider(MovieLensLoader.load())
    // Every held-out user must receive recommendations, and nobody else.
    val predictedUsers = predicted.map(_.user).distinct.sorted
    val actualUsers = actual.map(_.user).distinct.sorted
    predictedUsers should equal (actualUsers)
  }
}
Example 34
Source File: MovieLensLoader.scala From spark-ranking-metrics with The Unlicense | 5 votes |
package com.github.jongwook

import org.apache.spark.mllib.recommendation.Rating
import org.scalatest._

import scala.io.Source

/** Loads the `u.data` classpath resource as a sequence of [[Rating]]s. */
object MovieLensLoader {

  /**
   * Reads every line of the `u.data` resource and converts it to a [[Rating]].
   *
   * @return one rating per input line, in file order
   * @throws java.io.FileNotFoundException if `u.data` cannot be found on the classpath
   * @throws IllegalArgumentException if a line does not consist of four tab-separated fields
   */
  def load(): Seq[Rating] = {
    // `getResource` returns null for a missing resource; turn that into a readable error
    // instead of letting a bare NullPointerException escape from `openStream()`.
    val resource = Option(getClass.getResource("u.data")).getOrElse {
      throw new java.io.FileNotFoundException(
        "Resource 'u.data' was not found on the classpath (looked up relative to MovieLensLoader)")
    }
    val input = resource.openStream()
    try {
      // `getLines()` is lazy, so the result is materialized before the stream is closed.
      Source.fromInputStream(input).getLines().map(parseLine).toVector
    } finally {
      input.close()
    }
  }

  /** Parses one `user<TAB>item<TAB>rating<TAB>timestamp` line; the timestamp is ignored. */
  private def parseLine(line: String): Rating = line.split("\t") match {
    case Array(user, item, rating, _) => Rating(user.toInt, item.toInt, rating.toDouble)
    case _ => throw new IllegalArgumentException(s"Malformed rating line: '$line'")
  }
}

class MovieLensLoader extends FlatSpec with Matchers {
  "MovieLens Loader" should "load the ml-100k data" in {
    val data = MovieLensLoader.load()
    data.size should be (100000)
    data.map(_.rating).max should be (5.0)
    data.map(_.rating).min should be (1.0)
  }
}
Example 35
Source File: RecommendationExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
// $example off$

/**
 * Collaborative filtering example: trains an ALS model on the ratings in
 * `data/mllib/als/test.data`, prints the mean squared error of its predictions on the
 * same ratings, then saves the model and loads it back.
 */
object RecommendationExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CollaborativeFilteringExample")
    val sc = new SparkContext(conf)
    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/als/test.data")
    val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
      Rating(user.toInt, item.toInt, rate.toDouble)
    })

    // Build the recommendation model using ALS
    val rank = 10
    val numIterations = 10
    val model = ALS.train(ratings, rank, numIterations, 0.01)

    // Evaluate the model on rating data
    val usersProducts = ratings.map { case Rating(user, product, rate) =>
      (user, product)
    }
    val predictions =
      model.predict(usersProducts).map { case Rating(user, product, rate) =>
        ((user, product), rate)
      }
    // Pair each actual rating with its prediction, keyed by (user, product).
    val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
      ((user, product), rate)
    }.join(predictions)
    val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
      val err = (r1 - r2)
      err * err
    }.mean()
    println(s"Mean Squared Error = $MSE")

    // Save and load model
    model.save(sc, "target/tmp/myCollaborativeFilter")
    val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
    // $example off$

    // Stop the SparkContext created above once the example has finished.
    sc.stop()
  }
}
// scalastyle:on println