org.apache.spark.mllib.fpm.FPGrowth Scala Examples
The following examples show how to use org.apache.spark.mllib.fpm.FPGrowth.
Each example notes the source file and the project it was taken from.
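Before the full examples, here is a minimal sketch of the core FPGrowth workflow: build the estimator, run it on an RDD[Array[String]] of transactions, then read off frequent itemsets and association rules. The SparkContext `sc` and the file name "transactions.txt" (one space-separated transaction per line) are assumptions for this sketch, not part of any example below.

  import org.apache.spark.mllib.fpm.FPGrowth
  import org.apache.spark.rdd.RDD

  // Assumes an existing SparkContext `sc` and a file with one space-separated
  // transaction per line (both are assumptions for this sketch).
  val transactions: RDD[Array[String]] = sc.textFile("transactions.txt").map(_.trim.split(' '))

  val model = new FPGrowth()
    .setMinSupport(0.2)    // itemsets must occur in at least 20% of transactions
    .setNumPartitions(10)  // parallelism of the FP-growth computation
    .run(transactions)

  // Frequent itemsets, plus the association rules derived from them (min confidence 0.8)
  model.freqItemsets.collect().foreach(i => println(i.items.mkString(",") + " -> " + i.freq))
  model.generateAssociationRules(0.8).collect().foreach(r =>
    println(r.antecedent.mkString(",") + " => " + r.consequent.mkString(",") + " @ " + r.confidence))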
Example 1
Source File: FPGrowthExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.fpm.FPGrowth

object FPGrowthExample {

  case class Params(
      input: String = null,
      minSupport: Double = 0.3,
      numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthExample") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"FPGrowthExample with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    sc.stop()
  }
}
// scalastyle:on println
Example 2
Source File: SimpleFPGrowth.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

// $example on$
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.rdd.RDD
// $example off$
import org.apache.spark.{SparkContext, SparkConf}

object SimpleFPGrowth {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SimpleFPGrowth")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/sample_fpgrowth.txt")

    val transactions: RDD[Array[String]] = data.map(s => s.trim.split(' '))

    val fpg = new FPGrowth()
      .setMinSupport(0.2)
      .setNumPartitions(10)
    val model = fpg.run(transactions)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    val minConfidence = 0.8
    model.generateAssociationRules(minConfidence).collect().foreach { rule =>
      println(
        rule.antecedent.mkString("[", ",", "]")
          + " => " + rule.consequent.mkString("[", ",", "]")
          + ", " + rule.confidence)
    }
    // $example off$
  }
}
// scalastyle:on println
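The SimpleFPGrowth variants all read data/mllib/sample_fpgrowth.txt, a file shipped with the Spark source tree, and expect one space-separated transaction per line. The transactions hard-coded in Examples 12 and 13 below appear to mirror that file, so the expected input looks roughly like this:

  r z h k p
  z y x w v u t s
  s x o n r
  x z y m t s q e
  z
  x z y r q t p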
Example 3
Source File: FPGrowthExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.{SparkConf, SparkContext}

object FPGrowthExample {

  case class Params(
      input: String = null,
      minSupport: Double = 0.3,
      numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthExample") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"FPGrowthExample with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    sc.stop()
  }
}
// scalastyle:on println
Example 4
Source File: SimpleFPGrowth.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.rdd.RDD
// $example off$

object SimpleFPGrowth {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SimpleFPGrowth")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/sample_fpgrowth.txt")

    val transactions: RDD[Array[String]] = data.map(s => s.trim.split(' '))

    val fpg = new FPGrowth()
      .setMinSupport(0.2)
      .setNumPartitions(10)
    val model = fpg.run(transactions)

    model.freqItemsets.collect().foreach { itemset =>
      println(s"${itemset.items.mkString("[", ",", "]")},${itemset.freq}")
    }

    val minConfidence = 0.8
    model.generateAssociationRules(minConfidence).collect().foreach { rule =>
      println(s"${rule.antecedent.mkString("[", ",", "]")}=> " +
        s"${rule.consequent.mkString("[", ",", "]")},${rule.confidence}")
    }
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 5
Source File: FPGrowthExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.fpm.FPGrowth

object FPGrowthExample {

  case class Params(
      input: String = null,
      minSupport: Double = 0.3,
      numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthExample") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"FPGrowthExample with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(s"${itemset.items.mkString("[", ",", "]")}, ${itemset.freq}")
    }

    sc.stop()
  }
}
// scalastyle:on println
Example 6
Source File: FPGrowthExample.scala From iolap with Apache License 2.0
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.{SparkConf, SparkContext}

object FPGrowthExample {

  case class Params(
      input: String = null,
      minSupport: Double = 0.3,
      numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthExample") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"FPGrowthExample with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    sc.stop()
  }
}
Example 7
Source File: SimpleFPGrowth.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.rdd.RDD
// $example off$

object SimpleFPGrowth {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SimpleFPGrowth")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/sample_fpgrowth.txt")

    val transactions: RDD[Array[String]] = data.map(s => s.trim.split(' '))

    val fpg = new FPGrowth()
      .setMinSupport(0.2)
      .setNumPartitions(10)
    val model = fpg.run(transactions)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    val minConfidence = 0.8
    model.generateAssociationRules(minConfidence).collect().foreach { rule =>
      println(
        rule.antecedent.mkString("[", ",", "]")
          + " => " + rule.consequent.mkString("[", ",", "]")
          + ", " + rule.confidence)
    }
    // $example off$
  }
}
// scalastyle:on println
Example 8
Source File: FPGrowthExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.fpm.FPGrowth

object FPGrowthExample {

  case class Params(
      input: String = null,
      minSupport: Double = 0.3,
      numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthExample") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"FPGrowthExample with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    sc.stop()
  }
}
// scalastyle:on println
Example 9
Source File: FPGAlgorithm.scala From pio-template-fpm with Apache License 2.0
package org.template.fpm

import org.apache.predictionio.controller.P2LAlgorithm
import org.apache.predictionio.controller.Params
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import grizzled.slf4j.Logger
import org.apache.spark.mllib.fpm.{FPGrowth, FPGrowthModel}

case class AlgorithmParams(
  val minSupport: Double,
  val minConfidence: Double,
  val numPartitions: Int
) extends Params

class FPGModel(
  val resultList: List[(String, Array[String], Double)]
) extends Serializable {}

class FPGAlgorithm(val ap: AlgorithmParams)
  extends P2LAlgorithm[PreparedData, FPGModel, Query, PredictedResult] {

  @transient lazy val logger = Logger[this.type]

  def train(sc: SparkContext, data: PreparedData): FPGModel = {
    println("Training FPM model.")
    val fpg = new FPGrowth().setMinSupport(ap.minSupport).setNumPartitions(ap.numPartitions)
    val model = fpg.run(data.transactions.cache)
    val res = model.generateAssociationRules(ap.minConfidence)
      .map(x => (x.antecedent.mkString(" "), x.consequent, x.confidence))
      .collect.toList
    new FPGModel(resultList = res)
  }

  def predict(model: FPGModel, query: Query): PredictedResult = {
    val qArr = query.items.toList.sorted.mkString(" ")
    val result = model.resultList
      .filter(x => { x._1 == qArr })
      .sortBy(_._3)
      .map(x => { new ConsequentItem(x._2, x._3) })
    PredictedResult(consequentItems = result.toArray)
  }
}
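FPGAlgorithm refers to several PredictionIO template types (PreparedData, Query, PredictedResult, ConsequentItem) defined elsewhere in the pio-template-fpm project and not shown here. A minimal sketch of shapes consistent with how the algorithm uses them follows; these are hypothetical reconstructions, not the template's actual definitions.

  package org.template.fpm

  import org.apache.spark.rdd.RDD

  // Hypothetical reconstructions -- the real template defines these in its own source files.
  class PreparedData(val transactions: RDD[Array[String]]) extends Serializable

  case class Query(items: Array[String]) extends Serializable

  class ConsequentItem(val items: Array[String], val confidence: Double) extends Serializable

  case class PredictedResult(consequentItems: Array[ConsequentItem]) extends Serializable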
Example 10
Source File: SimpleFPGrowth.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.rdd.RDD
// $example off$

object SimpleFPGrowth {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SimpleFPGrowth")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/sample_fpgrowth.txt")

    val transactions: RDD[Array[String]] = data.map(s => s.trim.split(' '))

    val fpg = new FPGrowth()
      .setMinSupport(0.2)
      .setNumPartitions(10)
    val model = fpg.run(transactions)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    val minConfidence = 0.8
    model.generateAssociationRules(minConfidence).collect().foreach { rule =>
      println(
        rule.antecedent.mkString("[", ",", "]")
          + " => " + rule.consequent.mkString("[", ",", "]")
          + ", " + rule.confidence)
    }
    // $example off$
  }
}
// scalastyle:on println
Example 11
Source File: FPGrowthExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.fpm.FPGrowth

object FPGrowthExample {

  case class Params(
      input: String = null,
      minSupport: Double = 0.3,
      numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthExample") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"FPGrowthExample with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    sc.stop()
  }
}
// scalastyle:on println
Example 12
Source File: SampleFPGrowthApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
import org.apache.spark.SparkContext
import org.apache.spark.mllib.fpm.FPGrowth

object SampleFPGrowthApp {
  def main(args: Array[String]) {
    val transactions = Seq(
      "r z h k p",
      "z y x w v u t s",
      "s x o n r",
      "x z y m t s q e",
      "z",
      "x z y r q t p")
      .map(_.split(" "))

    val sc = new SparkContext("local[2]", "Chapter 5 App")
    val rdd = sc.parallelize(transactions, 2).cache()

    val fpg = new FPGrowth()
    val model = fpg
      .setMinSupport(0.2)
      .setNumPartitions(1)
      .run(rdd)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }
  }
}
Example 13
Source File: SampleFPGrowthApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package com.sparksample

import org.apache.spark.mllib.fpm.FPGrowth

object SampleFPGrowthApp {
  def main(args: Array[String]) {
    val transactions = Seq(
      "r z h k p",
      "z y x w v u t s",
      "s x o n r",
      "x z y m t s q e",
      "z",
      "x z y r q t p")
      .map(_.split(" "))

    val sc = Util.sc
    val rdd = sc.parallelize(transactions, 2).cache()

    val fpg = new FPGrowth()
    val model6 = fpg
      .setMinSupport(0.2)
      .setNumPartitions(1)
      .run(rdd)

    model6.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }
  }
}
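Examples 13 and 14 rely on a shared Util object in the book's com.sparksample package that supplies the SparkContext and loads the MovieLens 100k data; its definition is not included in these listings. A rough sketch of the pieces these examples assume is shown below; the paths and method bodies are guesses, not the book's actual code, and only the master/app name are borrowed from Example 12.

  package com.sparksample

  import org.apache.spark.{SparkConf, SparkContext}
  import org.apache.spark.rdd.RDD

  // Hypothetical stand-in for the book's Util helper.
  object Util {
    val PATH = "./ml-100k" // assumed location of the MovieLens 100k dataset

    lazy val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("Chapter 5 App"))

    // u.data lines: userId \t movieId \t rating \t timestamp
    def getUserData(): RDD[String] = sc.textFile(PATH + "/u.data")

    // u.item lines: movieId | title | ...
    def getMovieData(): RDD[String] = sc.textFile(PATH + "/u.item")
  }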
Example 14
Source File: MovieLensFPGrowthApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package com.sparksample

import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.mllib.recommendation.Rating

import scala.collection.mutable.ListBuffer

object MovieLensFPGrowthApp {
  def main(args: Array[String]) {
    // The original listing starts mid-file: the object/main wrapper above and the two
    // definitions below are reconstructions. `Util` is the book's shared helper (see the
    // hypothetical sketch after Example 13); the exact loading code is assumed.
    val sc = Util.sc
    val rawData = Util.getUserData() // assumed helper returning the raw MovieLens ratings lines

    val rawRatings = rawData.map(_.split("\t").take(3))
    rawRatings.first()

    val ratings = rawRatings.map { case Array(user, movie, rating) =>
      Rating(user.toInt, movie.toInt, rating.toDouble)
    }
    val ratingsFirst = ratings.first()
    println(ratingsFirst)

    val movies = Util.getMovieData()
    val titles = movies.map(line => line.split("\\|").take(2))
      .map(array => (array(0).toInt, array(1))).collectAsMap()
    titles(123)

    var eRDD = sc.emptyRDD
    var z = Seq[String]()
    val l = ListBuffer()
    val aj = new Array[String](400)
    var i = 0

    // Build one "transaction" per user (users 501 to 900): the ids of that user's ten
    // highest-rated movies, concatenated into a space-separated string.
    for (a <- 501 to 900) {
      val moviesForUserX = ratings.keyBy(_.user).lookup(a)
      val moviesForUserX_10 = moviesForUserX.sortBy(-_.rating).take(10)
      val moviesForUserX_10_1 = moviesForUserX_10.map(r => r.product)
      var temp = ""
      for (x <- moviesForUserX_10_1) {
        if (temp.equals("")) temp = x.toString
        else { temp = temp + " " + x }
      }
      aj(i) = temp
      i += 1
    }
    z = aj

    val transaction = z.map(_.split(" "))
    val rddx = sc.parallelize(transaction, 2).cache()

    val fpg = new FPGrowth()
    val model = fpg
      .setMinSupport(0.1)
      .setNumPartitions(1)
      .run(rddx)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }
    sc.stop()
  }
}
Example 15
Source File: MovieLensFPGrowthApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
import org.apache.spark.SparkContext
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.mllib.recommendation.Rating

import scala.collection.mutable.ListBuffer

object MovieLensFPGrowthApp {
  def main(args: Array[String]) {
    // The original listing starts mid-file: the object/main wrapper above and the
    // definitions of PATH, sc and rawData below are reconstructions (assumed, not
    // taken from the book's source).
    val PATH = "./data"                                  // assumed location of the MovieLens download
    val sc = new SparkContext("local[2]", "MovieLensFPGrowthApp")
    val rawData = sc.textFile(PATH + "/ml-100k/u.data")  // assumed ratings file: user \t movie \t rating \t timestamp

    val rawRatings = rawData.map(_.split("\t").take(3))
    rawRatings.first()
    // 14/03/30 13:22:44 INFO SparkContext: Job finished: first at <console>:21, took 0.003703 s
    // res25: Array[String] = Array(196, 242, 3)

    val ratings = rawRatings.map { case Array(user, movie, rating) =>
      Rating(user.toInt, movie.toInt, rating.toDouble)
    }
    val ratingsFirst = ratings.first()
    println(ratingsFirst)

    val userId = 789
    val K = 10

    val movies = sc.textFile(PATH + "/ml-100k/u.item")
    val titles = movies.map(line => line.split("\\|").take(2))
      .map(array => (array(0).toInt, array(1))).collectAsMap()
    titles(123)

    var eRDD = sc.emptyRDD
    var z = Seq[String]()
    val l = ListBuffer()
    val aj = new Array[String](100)
    var i = 0

    // One "transaction" per user (users 801 to 900): that user's ten highest-rated movie ids.
    for (a <- 801 to 900) {
      val moviesForUserX = ratings.keyBy(_.user).lookup(a)
      val moviesForUserX_10 = moviesForUserX.sortBy(-_.rating).take(10)
      val moviesForUserX_10_1 = moviesForUserX_10.map(r => r.product)
      var temp = ""
      for (x <- moviesForUserX_10_1) {
        temp = temp + " " + x
        println(temp)
      }
      aj(i) = temp
      i += 1
    }
    z = aj

    val transaction2 = z.map(_.split(" "))
    val rddx = sc.parallelize(transaction2, 2).cache()

    val fpg = new FPGrowth()
    val model6 = fpg
      .setMinSupport(0.1)
      .setNumPartitions(1)
      .run(rddx)

    model6.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }
    sc.stop()
  }
}
Example 16
Source File: FPGrowthTestv6.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.{SparkConf, SparkContext}
import scopt.OptionParser

import scala.reflect.runtime.universe._

object FPGrowthTestv6 {

  case class Params(
      input: String = null,
      minSupport: Double = 0.3,
      numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthTestv6") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"FPGrowthTestv6 with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    sc.stop()
  }

  abstract class AbstractParams[T: TypeTag] {

    private def tag: TypeTag[T] = typeTag[T]

    override def toString: String = {
      val tpe = tag.tpe
      val allAccessors = tpe.declarations.collect {
        case m: MethodSymbol if m.isCaseAccessor => m
      }
      val mirror = runtimeMirror(getClass.getClassLoader)
      val instanceMirror = mirror.reflect(this)
      allAccessors.map { f =>
        val paramName = f.name.toString
        val fieldMirror = instanceMirror.reflectField(f)
        val paramValue = fieldMirror.get
        s" $paramName:\t$paramValue"
      }.mkString("{\n", ",\n", "\n}")
    }
  }
}
Example 17
Source File: L9-14FPMining.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object FPMiningApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: FPMiningApp <appname> <batchInterval> <iPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, iPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val minSupport = 0.4

    ssc.textFileStream(iPath)
      .map(r => r.split(" "))
      .foreachRDD(transactionRDD => {
        val fpg = new FPGrowth()
          .setMinSupport(minSupport)
        val model = fpg.run(transactionRDD)

        model.freqItemsets
          .collect()
          .foreach(itemset => println("Items: %s, Frequency: %s"
            .format(itemset.items.mkString(" "), itemset.freq)))
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 18
Source File: MSNBCPatternMining.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License
package com.github.maxpumperla.ml_spark.streaming

import org.apache.spark.mllib.fpm.{FPGrowth, PrefixSpan}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object MSNBCPatternMining extends App {

  val conf = new SparkConf()
    .setAppName("MSNBC.com data pattern mining")
    .setMaster("local[4]")
  val sc = new SparkContext(conf)

  val transactionTest = sc.parallelize(Array(Array("A", "B", "C"), Array("B", "C", "A")))
  val fp = new FPGrowth().setMinSupport(0.8).setNumPartitions(5)
  fp.run(transactionTest)

  val transactions: RDD[Array[Int]] = sc.textFile("./msnbc990928.seq") map { line =>
    line.split(" ").map(_.toInt)
  }

  // NOTE: Caching data is recommended
  val uniqueTransactions: RDD[Array[Int]] = transactions.map(_.distinct).cache()

  val fpGrowth = new FPGrowth().setMinSupport(0.01)
  val model = fpGrowth.run(uniqueTransactions)
  val count = uniqueTransactions.count()

  model.freqItemsets.collect().foreach { itemset =>
    if (itemset.items.length >= 3)
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq / count.toDouble)
  }

  val rules = model.generateAssociationRules(confidence = 0.4)
  rules.collect().foreach { rule =>
    println("[" + rule.antecedent.mkString(",") + "=>"
      + rule.consequent.mkString(",") + "]," + (100 * rule.confidence).round / 100.0)
  }

  val frontPageConseqRules = rules.filter(_.consequent.head == 1)
  frontPageConseqRules.count
  frontPageConseqRules.filter(_.antecedent.contains(2)).count
  rules.filter(_.antecedent.contains(7)).count

  val sequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache()

  val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15)
  val psModel = prefixSpan.run(sequences)

  psModel.freqSequences.map(fs => (fs.sequence.length, 1))
    .reduceByKey(_ + _)
    .sortByKey()
    .collect()
    .foreach(fs => println(s"${fs._1}: ${fs._2}"))

  psModel.freqSequences
    .map(fs => (fs.sequence.length, fs))
    .groupByKey()
    .map(group => group._2.reduce((f1, f2) => if (f1.freq > f2.freq) f1 else f2))
    .map(_.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"))
    .collect.foreach(println)

  psModel.freqSequences
    .map(fs => (fs.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"), 1))
    .reduceByKey(_ + _)
    .reduce((f1, f2) => if (f1._2 > f2._2) f1 else f2)

  psModel.freqSequences.reduce((f1, f2) => if (f1.freq > f2.freq) f1 else f2)

  psModel.freqSequences.filter(_.sequence.length == 1).map(_.sequence.toString).collect.foreach(println)

  psModel.freqSequences.collect().foreach { freqSequence =>
    println(
      freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")
        + ", " + freqSequence.freq)
  }
}
Example 19
Source File: SimpleFPGrowth.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.rdd.RDD
// $example off$

object SimpleFPGrowth {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SimpleFPGrowth")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/sample_fpgrowth.txt")

    val transactions: RDD[Array[String]] = data.map(s => s.trim.split(' '))

    val fpg = new FPGrowth()
      .setMinSupport(0.2)
      .setNumPartitions(10)
    val model = fpg.run(transactions)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    val minConfidence = 0.8
    model.generateAssociationRules(minConfidence).collect().foreach { rule =>
      println(
        rule.antecedent.mkString("[", ",", "]")
          + " => " + rule.consequent.mkString("[", ",", "]")
          + ", " + rule.confidence)
    }
    // $example off$
  }
}
// scalastyle:on println