org.apache.spark.mllib.linalg.Vector Scala Examples
The following examples show how to use org.apache.spark.mllib.linalg.Vector.
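Before the project examples, here is a minimal, self-contained sketch of the Vector API itself: it builds dense and sparse vectors with the Vectors factory and exercises a few common operations, and it needs no SparkContext. The object name VectorBasics is made up for illustration; everything it calls is part of org.apache.spark.mllib.linalg.

import org.apache.spark.mllib.linalg.{Vector, Vectors}

object VectorBasics {
  def main(args: Array[String]): Unit = {
    // Dense vector: every component is stored explicitly.
    val dense: Vector = Vectors.dense(1.0, 0.0, 3.0)

    // Sparse vector: size 3, with non-zero entries at indices 0 and 2.
    val sparse: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))

    // Element access, size, and conversion to a plain array.
    println(dense(2))                      // 3.0
    println(sparse.size)                   // 3
    println(sparse.toArray.mkString(", ")) // 1.0, 0.0, 3.0

    // Norms and squared distance are provided by the Vectors companion object.
    println(Vectors.norm(dense, 2.0))      // L2 norm
    println(Vectors.sqdist(dense, sparse)) // 0.0, since the two vectors hold the same values
  }
}

Both representations implement the same Vector trait, so the examples below can pass either form to the same MLlib APIs.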
Example 1
Source File: LDAExample.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.{LDA, DistributedLDAModel, LocalLDAModel}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import scopt.OptionParser

object LDAExample {

  case class Params(
      inputPath: String = null,
      outputPath: String = null,
      numTopics: Int = 10,
      maxIterations: Int = 10,
      optimizer: String = "online",
      maxResultSize: String = "1g")

  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("LDA") {
      head("LDA: an example app for LDA.")
      opt[String]("optimizer")
        .text(s"optimizer, default: ${defaultParams.optimizer}")
        .action((x, c) => c.copy(optimizer = x))
      opt[String]("maxResultSize")
        .text(s"max resultSize, default: ${defaultParams.maxResultSize}")
        .action((x, c) => c.copy(maxResultSize = x))
      opt[Int]("numTopics")
        .text(s"number of Topics, default: ${defaultParams.numTopics}")
        .action((x, c) => c.copy(numTopics = x))
      opt[Int]("maxIterations")
        .text(s"number of max iterations, default: ${defaultParams.maxIterations}")
        .action((x, c) => c.copy(maxIterations = x))
      arg[String]("<inputPath>")
        .required()
        .text("Input path")
        .action((x, c) => c.copy(inputPath = x))
      arg[String]("<outputPath>")
        .required()
        .text("Output path")
        .action((x, c) => c.copy(outputPath = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf()
      .setAppName(s"LDA Example with $params")
      .set("spark.driver.maxResultSize", params.maxResultSize)
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    val corpus: RDD[(Long, Vector)] = sc.objectFile(params.inputPath)

    // Cluster the documents into numTopics topics using LDA.
    val ldaModel = new LDA()
      .setK(params.numTopics)
      .setMaxIterations(params.maxIterations)
      .setOptimizer(params.optimizer)
      .run(corpus)

    // Save and load model.
    ldaModel.save(sc, params.outputPath)
    val savedModel = LocalLDAModel.load(sc, params.outputPath)

    sc.stop()
  }
}
Example 2
Source File: MultilayerPerceptronClassifierSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.classification.LogisticRegressionSuite._ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class MultilayerPerceptronClassifierSuite extends SparkFunSuite with MLlibTestSparkContext { test("XOR function learning as binary classification problem with two outputs.") { val dataFrame = sqlContext.createDataFrame(Seq( (Vectors.dense(0.0, 0.0), 0.0), (Vectors.dense(0.0, 1.0), 1.0), (Vectors.dense(1.0, 0.0), 1.0), (Vectors.dense(1.0, 1.0), 0.0)) ).toDF("features", "label") val layers = Array[Int](2, 5, 2) val trainer = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(1) .setSeed(11L) .setMaxIter(100) val model = trainer.fit(dataFrame) val result = model.transform(dataFrame) val predictionAndLabels = result.select("prediction", "label").collect() predictionAndLabels.foreach { case Row(p: Double, l: Double) => assert(p == l) } } // TODO: implement a more rigorous test test("3 class classification with 2 hidden layers") { val nPoints = 1000 // The following coefficients are taken from OneVsRestSuite.scala // they represent 3-class iris dataset val coefficients = Array( -0.57997, 0.912083, -0.371077, -0.819866, 2.688191, -0.16624, -0.84355, -0.048509, -0.301789, 4.170682) val xMean = Array(5.843, 3.057, 3.758, 1.199) val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) // the input seed is somewhat magic, to make this test pass val rdd = sc.parallelize(generateMultinomialLogisticInput( coefficients, xMean, xVariance, true, nPoints, 1), 2) val dataFrame = sqlContext.createDataFrame(rdd).toDF("label", "features") val numClasses = 3 val numIterations = 100 val layers = Array[Int](4, 5, 4, numClasses) val trainer = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(1) .setSeed(11L) // currently this seed is ignored .setMaxIter(numIterations) val model = trainer.fit(dataFrame) val numFeatures = dataFrame.select("features").first().getAs[Vector](0).size assert(model.numFeatures === numFeatures) val mlpPredictionAndLabels = model.transform(dataFrame).select("prediction", "label") .map { case Row(p: Double, l: Double) => (p, l) } // train multinomial logistic regression val lr = new LogisticRegressionWithLBFGS() .setIntercept(true) .setNumClasses(numClasses) lr.optimizer.setRegParam(0.0) .setNumIterations(numIterations) val lrModel = lr.run(rdd) val lrPredictionAndLabels = lrModel.predict(rdd.map(_.features)).zip(rdd.map(_.label)) // MLP's predictions should not differ a lot from LR's. val lrMetrics = new MulticlassMetrics(lrPredictionAndLabels) val mlpMetrics = new MulticlassMetrics(mlpPredictionAndLabels) assert(mlpMetrics.confusionMatrix ~== lrMetrics.confusionMatrix absTol 100) } }
Example 3
Source File: IDFSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors, Vector} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 4
Source File: HogHBaseCluster.scala From hogzilla with GNU General Public License v2.0
package org.hogzilla.hbase import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.Vector import org.apache.hadoop.hbase.client.Get import org.apache.hadoop.hbase.client.Delete import org.hogzilla.cluster.HogClusterMember object HogHBaseCluster { def formatClusterTitle(clusterCentroid: List[(Long,Double)], clusterIdx:Int):String = { val mainTitle = "Group "+clusterIdx.toString+" - "+ clusterCentroid .filter({case (port,rate) => rate > 4.999 }) .map({case (port,rate) => port.toString()+":"+"%.0f".format(rate)+"%" }).mkString(", ") val onePercentList= clusterCentroid .filter({case (port,rate) => .9999 < rate & rate < 5 }) if(onePercentList.size>0) { mainTitle+", "+ onePercentList.map({case (port,rate) => port.toString() }).mkString("(",", ",")"+"> 1%") }else { mainTitle } } def deleteCluster(clusterIdx:Int)= { val del = new Delete(Bytes.toBytes(clusterIdx.toString)) HogHBaseRDD.hogzilla_clusters.delete(del) } def deleteClusterMember(memberIP:String)= { val del = new Delete(Bytes.toBytes(memberIP)) HogHBaseRDD.hogzilla_cluster_members.delete(del) } def saveCluster(clusterIdx:Int, clusterCentroid:List[(Long,Double)], clusterSize: Long, members:Array[String]) = { val memberString = members.mkString(",") val put = new Put(Bytes.toBytes(clusterIdx.toString)) put.add(Bytes.toBytes("info"), Bytes.toBytes("title"), Bytes.toBytes(formatClusterTitle(clusterCentroid,clusterIdx))) put.add(Bytes.toBytes("info"), Bytes.toBytes("size"), Bytes.toBytes(clusterSize.toString)) put.add(Bytes.toBytes("info"), Bytes.toBytes("centroid"), Bytes.toBytes(clusterCentroid.mkString("[",",","]"))) put.add(Bytes.toBytes("info"), Bytes.toBytes("members"), Bytes.toBytes(memberString)) HogHBaseRDD.hogzilla_clusters.put(put) } def saveClusterMember(clusterMember:HogClusterMember) = { val put = new Put(Bytes.toBytes(clusterMember.memberIP.toString)) put.add(Bytes.toBytes("info"), Bytes.toBytes("title"), Bytes.toBytes(clusterMember.formatTitle)) put.add(Bytes.toBytes("cluster"),Bytes.toBytes("size"), Bytes.toBytes(clusterMember.clusterSize.toString)) put.add(Bytes.toBytes("cluster"),Bytes.toBytes("centroid"), Bytes.toBytes(clusterMember.centroid.mkString("[",",","]"))) put.add(Bytes.toBytes("cluster"),Bytes.toBytes("idx"), Bytes.toBytes(clusterMember.clusterIdx.toString)) put.add(Bytes.toBytes("cluster"),Bytes.toBytes("description"),Bytes.toBytes(formatClusterTitle(clusterMember.centroid,clusterMember.clusterIdx))) put.add(Bytes.toBytes("member"), Bytes.toBytes("ports"), Bytes.toBytes("TCP: "+clusterMember.ports.mkString(""," ",""))) put.add(Bytes.toBytes("member"), Bytes.toBytes("frequencies"),Bytes.toBytes("TCP: "+ clusterMember.frequency_vector .filter({case (port,freq) => clusterMember.ports.contains(port)}) .map({case (port,freq) => port.toString+"="+ "%.0f".format(freq)+"%" }) .mkString(""," ","") )) put.add(Bytes.toBytes("member"), Bytes.toBytes("ip"), Bytes.toBytes(clusterMember.memberIP)) put.add(Bytes.toBytes("member"), Bytes.toBytes("distance"), Bytes.toBytes("%.2f".format(clusterMember.distance))) HogHBaseRDD.hogzilla_cluster_members.put(put) } }
Example 5
Source File: Embedding.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.story.linalg import org.apache.spark.mllib.linalg.{Vector, Vectors} case object DenseEmbedding extends Embedding { def embed(v: Vector): Vector = Vectors.dense(v.toArray) } object Embedding { val IDENTITY_EMBEDDING = "IDENTITY" val DENSE_EMBEDDING = "DENSE" val LOW_DIMENSIONAL_RI = "LOW_DIMENSIONAL_RI" val MEDIUM_DIMENSIONAL_RI = "MEDIUM_DIMENSIONAL_RI" val HIGH_DIMENSIONAL_RI = "HIGH_DIMENSIONAL_RI" val lowDimension = 64 val mediumDimension = 256 val highDimension = 1024 val epsilon = 0.01 def apply(embeddingName: String): Embedding = { embeddingName match { case IDENTITY_EMBEDDING => IdentityEmbedding case DENSE_EMBEDDING => DenseEmbedding case LOW_DIMENSIONAL_RI => new RandomIndexEmbedding(lowDimension, epsilon, seed = 0) case MEDIUM_DIMENSIONAL_RI => new RandomIndexEmbedding(mediumDimension, epsilon, seed = 0) case HIGH_DIMENSIONAL_RI => new RandomIndexEmbedding(highDimension, epsilon, seed = 0) case _ => throw new RuntimeException(s"unknown embedding name $embeddingName") } } }
Example 6
Source File: DistanceFromCentroid.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter16.SparkTesting

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.{Vector, Vectors}

// Enclosing object restored from the file name; the original listing elides its declaration.
object DistanceFromCentroid {

  def calcDistance(sc: SparkContext, vPoints: RDD[Vector], centroid: Vector): Double = {
    // Broadcast the centroid to all partitions.
    val bcCentroid = sc.broadcast(centroid)
    // For each partition, calculate the sum of distances from the centroid to each of the
    // points in that partition, then sum up the partial sums from all the partitions.
    // EuclideanVectorSpace is a distance helper assumed to be defined elsewhere in this project.
    val accmDistance = vPoints.mapPartitions { points =>
      var sum = 0.0
      points.foreach { point =>
        sum += EuclideanVectorSpace.distance(point, bcCentroid.value)
      }
      Iterator(sum)
    }.reduce(_ + _)
    accmDistance
  }
}
Example 7
Source File: LogisticRegressionRecommender.scala From wordpress-posts-recommender with Apache License 2.0
package wordpressworkshop import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.param.ParamMap import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame case class LogisticRegressionRecommender(training: DataFrame) { val lr = new LogisticRegression() val paramMap = ParamMap(lr.maxIter -> 20) .put(lr.regParam -> 0.01) .put(lr.probabilityCol -> "probability") val model: LogisticRegressionModel = lr.fit(training, paramMap) def metrics(testData: DataFrame) = { val predictionAndLabels: RDD[(Double, Double)] = model.transform(testData).map(row => row.getAs[Vector]("probability")(1) -> row.getAs[Double]("label")) new BinaryClassificationMetrics(predictionAndLabels) } def likeScores(testData: DataFrame): RDD[(Long, Long, Double)] = model.transform(testData) .map(row => (row.getAs[Long]("userId"), row.getAs[Long]("postId"), row.getAs[Vector]("probability")(1))) }
Example 8
Source File: Utils.scala From awesome-recommendation-engine with Apache License 2.0
package com.databricks.apps.twitter_classifier import org.apache.commons.cli.{Options, ParseException, PosixParser} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.feature.HashingTF import twitter4j.auth.OAuthAuthorization import twitter4j.conf.ConfigurationBuilder object Utils { val numFeatures = 1000 val tf = new HashingTF(numFeatures) val CONSUMER_KEY = "consumerKey" val CONSUMER_SECRET = "consumerSecret" val ACCESS_TOKEN = "accessToken" val ACCESS_TOKEN_SECRET = "accessTokenSecret" val THE_OPTIONS = { val options = new Options() options.addOption(CONSUMER_KEY, true, "Twitter OAuth Consumer Key") options.addOption(CONSUMER_SECRET, true, "Twitter OAuth Consumer Secret") options.addOption(ACCESS_TOKEN, true, "Twitter OAuth Access Token") options.addOption(ACCESS_TOKEN_SECRET, true, "Twitter OAuth Access Token Secret") options } def parseCommandLineWithTwitterCredentials(args: Array[String]) = { val parser = new PosixParser try { val cl = parser.parse(THE_OPTIONS, args) //System.setProperty("twitter4j.oauth.consumerKey", cl.getOptionValue(CONSUMER_KEY)) //System.setProperty("twitter4j.oauth.consumerSecret", cl.getOptionValue(CONSUMER_SECRET)) //System.setProperty("twitter4j.oauth.accessToken", cl.getOptionValue(ACCESS_TOKEN)) //System.setProperty("twitter4j.oauth.accessTokenSecret", cl.getOptionValue(ACCESS_TOKEN_SECRET)) System.setProperty("twitter4j.oauth.consumerKey", "jREUiik4pE9bKcBUYr5xsV7jt") System.setProperty("twitter4j.oauth.consumerSecret", "LIUbDpJzgoJ8gz3w3OgQFGcMnMLyjPi9S3uBmtEdaLGzUBqkM9") System.setProperty("twitter4j.oauth.accessToken", "453844423-3P6XqQ8hXWY1K47gEL1LU9lRg9kcrzfEXDvVTMZM") System.setProperty("twitter4j.oauth.accessTokenSecret", "vrDBfnE1ya425mYIjM80OH8HmyYOQ3RUotk3t8gdFy6Yy") cl.getArgList.toArray } catch { case e: ParseException => System.err.println("Parsing failed. Reason: " + e.getMessage) System.exit(1) } } def getAuth = { Some(new OAuthAuthorization(new ConfigurationBuilder().build())) } def featurize(s: String): Vector = { tf.transform(s.sliding(2).toSeq) } object IntParam { def unapply(str: String): Option[Int] = { try { Some(str.toInt) } catch { case e: NumberFormatException => None } } } }
Example 9
Source File: Gradient.scala From zen with Apache License 2.0
package com.github.cloudml.zen.ml.optimization import org.apache.spark.annotation.DeveloperApi import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors} def compute( iter: Iterator[(Double, Vector)], weights: Vector, cumGradient: Vector): (Long, Double) = { var loss = 0D var count = 0L iter.foreach { t => loss += compute(t._2, t._1, weights, cumGradient) count += 1 } (count, loss) } }
Example 10
Source File: LDADataGenerator.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml

import com.intel.hibench.sparkbench.common.IOCommon

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

import scala.collection.mutable.{HashMap => MHashMap}

// Enclosing object restored from the file name; the original listing elides its declaration.
object LDADataGenerator {

  def generateLDARDD(
      sc: SparkContext,
      numDocs: Long,
      numVocab: Int,
      docLenMin: Int,
      docLenMax: Int,
      numParts: Int = 3,
      seed: Long = System.currentTimeMillis()): RDD[(Long, Vector)] = {
    val data = sc.parallelize(0L until numDocs, numParts).mapPartitionsWithIndex { (idx, part) =>
      val rng = new Random(seed ^ idx)
      part.map { case docIndex =>
        var currentSize = 0
        val entries = MHashMap[Int, Int]()
        val docLength = rng.nextInt(docLenMax - docLenMin + 1) + docLenMin
        while (currentSize < docLength) {
          val index = rng.nextInt(numVocab)
          entries(index) = entries.getOrElse(index, 0) + 1
          currentSize += 1
        }
        val iter = entries.toSeq.map(v => (v._1, v._2.toDouble))
        (docIndex, Vectors.sparse(numVocab, iter))
      }
    }
    data
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("LDADataGenerator")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numDocs: Long = 500L
    var numVocab: Int = 1000
    var docLenMin: Int = 50
    var docLenMax: Int = 10000

    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    if (args.length == 5) {
      outputPath = args(0)
      numDocs = args(1).toInt
      numVocab = args(2).toInt
      docLenMin = args(3).toInt
      docLenMax = args(4).toInt
      println(s"Output Path: $outputPath")
      println(s"Num of Documents: $numDocs")
      println(s"Vocabulary size: $numVocab")
    } else {
      System.err.println(
        "Usage: LDADataGenerator <OUTPUT_PATH> <NUM_DOCUMENTS> <VOCABULARY_SIZE> <DOC_LEN_MIN> <DOC_LEN_MAX>"
      )
      System.exit(1)
    }

    val data = generateLDARDD(sc, numDocs, numVocab, docLenMin, docLenMax, numPartitions)

    data.saveAsObjectFile(outputPath)

    sc.stop()
  }
}
Example 11
Source File: RatingDataGenerator.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml import com.intel.hibench.sparkbench.common.IOCommon import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.recommendation.Rating import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.mllib.random._ import org.apache.spark.rdd.{PairRDDFunctions, RDD} import org.apache.spark.mllib.linalg.{Vectors, Vector} object RatingDataGenerator { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("RatingDataGeneration") val sc = new SparkContext(conf) var outputPath = "" var numUsers: Int = 100 var numProducts: Int = 100 var sparsity: Double = 0.05 var implicitPrefs: Boolean = false val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism) val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism") .getOrElse((parallel / 2).toString).toInt if (args.length == 5) { outputPath = args(0) numUsers = args(1).toInt numProducts = args(2).toInt sparsity = args(3).toDouble implicitPrefs = args(4).toBoolean println(s"Output Path: $outputPath") println(s"Num of Users: $numUsers") println(s"Num of Products: $numProducts") println(s"Sparsity: $sparsity") println(s"Implicit Prefs: $implicitPrefs") } else { System.err.println( s"Usage: $RatingDataGenerator <OUTPUT_PATH> <NUM_USERS> <NUM_PRODUCTS> <SPARSITY> <IMPLICITPREFS>" ) System.exit(1) } val rawData: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, numUsers, numProducts, numPartitions) val rng = new java.util.Random() val data = rawData.map{v => val a = Array.fill[Double](v.size)(0.0) v.foreachActive{(i,vi) => if(rng.nextDouble <= sparsity){ a(i) = vi } } Vectors.dense(a).toSparse } data.saveAsObjectFile(outputPath) sc.stop() } }
Example 12
Source File: ProbabilisticClassifierSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} final class TestProbabilisticClassificationModel( override val uid: String, override val numFeatures: Int, override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, TestProbabilisticClassificationModel] { override def copy(extra: org.apache.spark.ml.param.ParamMap): this.type = defaultCopy(extra) override protected def predictRaw(input: Vector): Vector = { input } override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { rawPrediction } def friendlyPredict(input: Vector): Double = { predict(input) } } class ProbabilisticClassifierSuite extends SparkFunSuite { test("test thresholding") { val thresholds = Array(0.5, 0.2) val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(thresholds) assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 1.0))) === 1.0) assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 0.2))) === 0.0) } test("test thresholding not required") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 2.0))) === 1.0) } } object ProbabilisticClassifierSuite { val allParamSettings: Map[String, Any] = ClassifierSuite.allParamSettings ++ Map( "probabilityCol" -> "myProbability", "thresholds" -> Array(0.4, 0.6) ) }
Example 13
Source File: SVDDataGenerator.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml import com.intel.hibench.sparkbench.common.IOCommon import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{Vectors,Vector} import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.random.RandomRDDs object SVDDataGenerator { def generateDistributedRowMatrix( sc: SparkContext, m: Long, n: Int, numPartitions: Int, seed: Long = System.currentTimeMillis()): RDD[Vector] = { val data: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, m, n, numPartitions, seed) data } def main(args: Array[String]) { val conf = new SparkConf().setAppName("SVDDataGenerator") val sc = new SparkContext(conf) var outputPath = "" var numExamples: Int = 200000 var numFeatures: Int = 20 val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism) val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism") .getOrElse((parallel / 2).toString).toInt if (args.length == 3) { outputPath = args(0) numExamples = args(1).toInt numFeatures = args(2).toInt println(s"Output Path: $outputPath") println(s"Num of Examples: $numExamples") println(s"Num of Features: $numFeatures") } else { System.err.println( s"Usage: $SVDDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>" ) System.exit(1) } val data = generateDistributedRowMatrix(sc, numExamples, numFeatures, numPartitions) data.saveAsObjectFile(outputPath) sc.stop() } }
Example 14
Source File: SVDExample.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD import scopt.OptionParser object SVDExample { case class Params( numFeatures: Int = 0, numSingularValues: Int = 0, computeU: Boolean = true, maxResultSize: String = "1g", dataPath: String = null ) def main(args: Array[String]): Unit = { val defaultParams = Params() val parser = new OptionParser[Params]("SVD") { head("SVD: an example of SVD for matrix decomposition.") opt[Int]("numFeatures") .text(s"numFeatures, default: ${defaultParams.numFeatures}") .action((x,c) => c.copy(numFeatures = x)) opt[Int]("numSingularValues") .text(s"numSingularValues, default: ${defaultParams.numSingularValues}") .action((x,c) => c.copy(numSingularValues = x)) opt[Boolean]("computeU") .text(s"computeU, default: ${defaultParams.computeU}") .action((x,c) => c.copy(computeU = x)) opt[String]("maxResultSize") .text(s"maxResultSize, default: ${defaultParams.maxResultSize}") .action((x,c) => c.copy(maxResultSize = x)) arg[String]("<dataPath>") .required() .text("data path of SVD") .action((x,c) => c.copy(dataPath = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf() .setAppName(s"SVD with $params") .set("spark.driver.maxResultSize", params.maxResultSize) .set("spark.shuffle.compress", "false") .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec") .set("spark.smartCompress", "false") val sc = new SparkContext(conf) val dataPath = params.dataPath val numFeatures = params.numFeatures val numSingularValues = params.numSingularValues val computeU = params.computeU val data: RDD[Vector] = sc.objectFile(dataPath) val mat: RowMatrix = new RowMatrix(data) val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(numSingularValues, computeU) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. sc.stop() } }
Example 15
Source File: Featurizer.scala From lexrank-summarizer with MIT License
package io.github.karlhigley.lexrank import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.feature.{HashingTF, IDF} import org.apache.spark.mllib.linalg.{SparseVector, Vector} case class SentenceFeatures(id: Long, docId: String, features: SparseVector) class Featurizer(numStopwords: Int = 0) extends Serializable { private val hashingTF = new HashingTF() private val byIDF = Ordering[Double].on[(Int,Double)](_._2) def apply(tokens: RDD[SentenceTokens]) : RDD[SentenceFeatures] = { val idf = new IDF(minDocFreq = 2) val termFrequencies = tokens.map(t => { (t.id, t.docId, hashingTF.transform(t.tokens)) }) val idfModel = idf.fit(termFrequencies.map({ case (_, _, tf) => tf })) val stopwordIndices = identifyStopwords(idfModel.idf.toSparse, numStopwords) termFrequencies .map({ case (id, docId, tf) => val tfidf = idfModel.transform(tf).toSparse val features = removeStopwords(tfidf, stopwordIndices) SentenceFeatures(id, docId, features) }) .filter(_.features.indices.size > 0) } def indexOf(token: String): Int = { hashingTF.indexOf(token) } private def identifyStopwords(idf: SparseVector, numStopwords: Int) = { featureTuples(idf).sorted(byIDF).take(numStopwords).map(_._1) } private def removeStopwords(tf: SparseVector, stopwordIndices: Array[Int]) = { val (indices, values) = featureTuples(tf) .filter(p => !stopwordIndices.contains(p._1)) .unzip new SparseVector(tf.size, indices.toArray, values.toArray) } private def featureTuples(featureVector: SparseVector) = { featureVector.indices.zip(featureVector.values) } }
Example 16
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0
package com.cloudera.sa.taxi360.etl.machinelearning.kudu import com.cloudera.sa.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object MlLibOnKudu { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> " + "<kuduMaster> " + "<taxiTable> " + "<numOfCenters> " + "<numOfIterations> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val taxiTable = args(2) val numOfCenters = args(3).toInt val numOfIterations = args(4).toInt val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val sqlContext = new SQLContext(sc) val kuduOptions = Map( "kudu.table" -> taxiTable, "kudu.master" -> kuduMaster) sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load. registerTempTable("ny_taxi_trip_tmp") //Vector val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => { val taxiTrip = NyTaxiYellowTripBuilder.build(r) generateVectorOnly(taxiTrip) }) println("--Running KMeans") val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations) println(" > vector centers:") clusters.clusterCenters.foreach(v => println(" >> " + v)) println("--Running corr") val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson") println(" > corr: " + correlMatrix.toString) println("--Running colStats") val colStats = Statistics.colStats(vectorRDD) println(" > max: " + colStats.max) println(" > count: " + colStats.count) println(" > mean: " + colStats.mean) println(" > min: " + colStats.min) println(" > normL1: " + colStats.normL1) println(" > normL2: " + colStats.normL2) println(" > numNonZeros: " + colStats.numNonzeros) println(" > variance: " + colStats.variance) //Labeled Points }
Example 17
Source File: VectorDistance.scala From cosine-lsh-join-spark with MIT License
package com.soundcloud.lsh

import com.github.fommil.netlib.BLAS.{getInstance => blas}
import org.apache.spark.mllib.linalg.Vector

object Cosine extends VectorDistance {

  def apply(vecA: Vector, vecB: Vector): Double = {
    val v1 = vecA.toArray.map(_.toFloat)
    val v2 = vecB.toArray.map(_.toFloat)
    apply(v1, v2)
  }

  def apply(vecA: Array[Float], vecB: Array[Float]): Double = {
    val n = vecA.length
    val norm1 = blas.snrm2(n, vecA, 1)
    val norm2 = blas.snrm2(n, vecB, 1)
    if (norm1 == 0 || norm2 == 0) return 0.0
    blas.sdot(n, vecA, 1, vecB, 1) / norm1 / norm2
  }
}
Example 18
Source File: NearestNeighboursTest.scala From cosine-lsh-join-spark with MIT License
package com.soundcloud.lsh import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, MatrixEntry} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.scalatest.{FunSuite, Matchers} class NearestNeighboursTest extends FunSuite with SparkLocalContext with Matchers { def denseVector(input: Double*): Vector = { Vectors.dense(input.toArray) } test("nearest neighbours cosine") { val vecA = denseVector(1.0, 0.0) val vecB = denseVector(0.0, 1.0) val vecC = denseVector(-1.0, 0.0) val vecD = denseVector(1.0, 0.0) val rows = Seq( IndexedRow(0, vecA), IndexedRow(1, vecB), IndexedRow(2, vecC), IndexedRow(3, vecD) ) val indexedMatrix = new IndexedRowMatrix(sc.parallelize(rows)) val nearestNeighbour = new NearestNeighbours(Cosine, 0.0, 1.0) val got = nearestNeighbour.join(indexedMatrix) val expected = Seq( MatrixEntry(0, 1, 0.0), MatrixEntry(0, 3, 1.0), MatrixEntry(1, 2, 0.0), MatrixEntry(1, 3, 0.0) ) val gotEntries = got.entries.collect().toSeq gotEntries should be(expected) } }
Example 19
Source File: QueryHammingTest.scala From cosine-lsh-join-spark with MIT License
package com.soundcloud.lsh import com.soundcloud.TestHelper import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, MatrixEntry} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.scalatest.{FunSuite, Matchers} class QueryHammingTest extends FunSuite with SparkLocalContext with Matchers with TestHelper { def denseVector(input: Double*): Vector = { Vectors.dense(input.toArray) } val queryVectorA = denseVector(1.0, 1.0) val queryVectorB = denseVector(-1.0, 1.0) val catalogVectorA = denseVector(1.0, 1.0) val catalogVectorB = denseVector(-1.0, 1.0) val catalogVectorC = denseVector(-1.0, 0.5) val catalogVectorD = denseVector(1.0, 0.5) val queryRows = Seq( IndexedRow(0, queryVectorA), IndexedRow(1, queryVectorB) ) val catalogRows = Seq( IndexedRow(0, catalogVectorA), IndexedRow(1, catalogVectorB), IndexedRow(2, catalogVectorC), IndexedRow(3, catalogVectorD) ) val expected = Array( MatrixEntry(0, 0, Cosine(queryVectorA, catalogVectorA)), MatrixEntry(0, 3, Cosine(queryVectorA, catalogVectorD)), MatrixEntry(1, 1, Cosine(queryVectorB, catalogVectorB)), MatrixEntry(1, 2, Cosine(queryVectorB, catalogVectorC)) ) test("broadcast catalog") { val queryMatrix = new IndexedRowMatrix(sc.parallelize(queryRows)) val catalogMatrix = new IndexedRowMatrix(sc.parallelize(catalogRows)) val queryNearestNeighbour = new QueryHamming(0.1, 10000, 2, true) val got = queryNearestNeighbour.join(queryMatrix, catalogMatrix).entries.collect implicit val equality = new MatrixEquality(0.02) got.sortBy(t => (t.i, t.j)) should equal(expected) } test("broadcast query") { val queryMatrix = new IndexedRowMatrix(sc.parallelize(queryRows)) val catalogMatrix = new IndexedRowMatrix(sc.parallelize(catalogRows)) val queryNearestNeighbour = new QueryHamming(0.1, 10000, 2, false) val got = queryNearestNeighbour.join(queryMatrix, catalogMatrix).entries.collect implicit val equality = new MatrixEquality(0.02) got.sortBy(t => (t.i, t.j)) should equal(expected) } }
Example 20
Source File: QueryNearestNeighboursTest.scala From cosine-lsh-join-spark with MIT License
package com.soundcloud.lsh import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, MatrixEntry} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.scalatest.{FunSuite, Matchers} class QueryNearestNeighboursTest extends FunSuite with SparkLocalContext with Matchers { def denseVector(input: Double*): Vector = { Vectors.dense(input.toArray) } test("nearest neighbours cosine") { val queryVectorA = denseVector(1.0, 1.0) val queryVectorB = denseVector(-1.0, 1.0) val catalogVectorA = denseVector(1.0, 1.0) val catalogVectorB = denseVector(-1.0, 1.0) val catalogVectorC = denseVector(-1.0, 0.5) val catalogVectorD = denseVector(1.0, 0.5) val queryRows = Seq( IndexedRow(0, queryVectorA), IndexedRow(1, queryVectorB) ) val catalogRows = Seq( IndexedRow(0, catalogVectorA), IndexedRow(1, catalogVectorB), IndexedRow(2, catalogVectorC), IndexedRow(3, catalogVectorD) ) val queryMatrix = new IndexedRowMatrix(sc.parallelize(queryRows)) val catalogMatrix = new IndexedRowMatrix(sc.parallelize(catalogRows)) val queryNearestNeighbour = new QueryNearestNeighbours(Cosine, 0.4, 1.0, 1.0) val expected = Seq( MatrixEntry(0, 0, Cosine(queryVectorA, catalogVectorA)), MatrixEntry(0, 3, Cosine(queryVectorA, catalogVectorD)), MatrixEntry(1, 1, Cosine(queryVectorB, catalogVectorB)), MatrixEntry(1, 2, Cosine(queryVectorB, catalogVectorC)) ) val got = queryNearestNeighbour.join(queryMatrix, catalogMatrix).entries.collect got should be(expected) } }
Example 21
Source File: VectorDistanceTest.scala From cosine-lsh-join-spark with MIT License
package com.soundcloud.lsh import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.scalatest.{FunSuite, Matchers} class VectorDistanceTest extends FunSuite with SparkLocalContext with Matchers { def denseVector(input: Double*): Vector = { Vectors.dense(input.toArray) } test("cosine similarity") { val vecA = denseVector(1.0, 0.0) val vecB = denseVector(0.0, 1.0) val vecC = denseVector(-1.0, 0.0) val vecD = denseVector(1.0, 0.0) val perpendicular = Cosine(vecA, vecB) perpendicular should be(0.0) val opposite = Cosine(vecA, vecC) opposite should be(-1.0) val same = Cosine(vecA, vecD) same should be(1.0) } test("similarities") { val vec1 = Vectors.dense(1.0, 2.0, 3.0) val vec2 = Vectors.dense(1.0, 2.0, 4.0) val vec3 = Vectors.dense(7.0, 7.0, 9.0) Cosine(vec1, vec2) should be >= Cosine(vec1, vec3) Cosine(vec2, vec1) should be >= Cosine(vec2, vec3) Cosine(vec3, vec1) should be >= Cosine(vec3, vec2) } }
Example 22
Source File: LabeledPoint.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } }
Example 23
Source File: BisectingKMeansExample.scala From BigDatalog with Apache License 2.0
package org.apache.spark.examples.mllib

// scalastyle:off println
// $example on$
import org.apache.spark.mllib.clustering.BisectingKMeans
import org.apache.spark.mllib.linalg.{Vector, Vectors}
// $example off$
import org.apache.spark.{SparkConf, SparkContext}

object BisectingKMeansExample {

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample")
    val sc = new SparkContext(sparkConf)

    // $example on$
    // Loads and parses data
    def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble))
    val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache()

    // Clustering the data into 6 clusters by BisectingKMeans.
    val bkm = new BisectingKMeans().setK(6)
    val model = bkm.run(data)

    // Show the compute cost and the cluster centers
    println(s"Compute Cost: ${model.computeCost(data)}")
    model.clusterCenters.zipWithIndex.foreach { case (center, idx) =>
      println(s"Cluster Center ${idx}: ${center}")
    }
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 24
Source File: Normalizer.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.{DoubleParam, ParamValidators} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.types.DataType def setP(value: Double): this.type = set(p, value) override protected def createTransformFunc: Vector => Vector = { val normalizer = new feature.Normalizer($(p)) normalizer.transform } override protected def outputDataType: DataType = new VectorUDT() } @Since("1.6.0") object Normalizer extends DefaultParamsReadable[Normalizer] { @Since("1.6.0") override def load(path: String): Normalizer = super.load(path) }
Example 25
Source File: DCT.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.feature import edu.emory.mathcs.jtransforms.dct._ import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors} import org.apache.spark.sql.types.DataType def getInverse: Boolean = $(inverse) setDefault(inverse -> false) override protected def createTransformFunc: Vector => Vector = { vec => val result = vec.toArray val jTransformer = new DoubleDCT_1D(result.length) if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") } override protected def outputDataType: DataType = new VectorUDT } @Since("1.6.0") object DCT extends DefaultParamsReadable[DCT] { @Since("1.6.0") override def load(path: String): DCT = super.load(path) }
Example 26
Source File: BinaryClassificationEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("1.2.0") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol)) .map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
Example 27
Source File: LibSVMRelation.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.source.libsvm import com.google.common.base.Objects import org.apache.spark.Logging import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrameReader, DataFrame, Row, SQLContext} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{DoubleType, StructField, StructType} @Since("1.6.0") class DefaultSource extends RelationProvider with DataSourceRegister { @Since("1.6.0") override def shortName(): String = "libsvm" @Since("1.6.0") override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) : BaseRelation = { val path = parameters.getOrElse("path", throw new IllegalArgumentException("'path' must be specified")) val numFeatures = parameters.getOrElse("numFeatures", "-1").toInt val vectorType = parameters.getOrElse("vectorType", "sparse") new LibSVMRelation(path, numFeatures, vectorType)(sqlContext) } }
Example 28
Source File: Normalizer.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.size var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.size var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 29
Source File: Word2VecModelWrapper.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.api.python import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} import scala.collection.JavaConverters._ import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.feature.Word2VecModel import org.apache.spark.mllib.linalg.{Vector, Vectors} def transform(rdd: JavaRDD[String]): JavaRDD[Vector] = { rdd.rdd.map(model.transform) } def findSynonyms(word: String, num: Int): JList[Object] = { val vec = transform(word) findSynonyms(vec, num) } def findSynonyms(vector: Vector, num: Int): JList[Object] = { val result = model.findSynonyms(vector, num) val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } def getVectors: JMap[String, JList[Float]] = { model.getVectors.map({case (k, v) => (k, v.toList.asJava)}).asJava } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 30
Source File: PearsonCorrelation.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i +=1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i +=1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
Example 31
Source File: SpearmanCorrelation.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging import org.apache.spark.SparkContext._ import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Example 32
Source File: GLMClassificationModel.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.classification.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{Row, SQLContext} def loadData(sc: SparkContext, path: String, modelClass: String): Data = { val datapath = Loader.dataPath(path) val sqlContext = SQLContext.getOrCreate(sc) val dataRDD = sqlContext.read.parquet(datapath) val dataArray = dataRDD.select("weights", "intercept", "threshold").take(1) assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath") val data = dataArray(0) assert(data.size == 3, s"Unable to load $modelClass data from: $datapath") val (weights, intercept) = data match { case Row(weights: Vector, intercept: Double, _) => (weights, intercept) } val threshold = if (data.isNullAt(2)) { None } else { Some(data.getDouble(2)) } Data(weights, intercept, threshold) } } }
Example 33
Source File: SimpleTextClassificationPipeline.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples.ml import scala.beans.BeanInfo import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.{Row, SQLContext} @BeanInfo case class LabeledDocument(id: Long, text: String, label: Double) @BeanInfo case class Document(id: Long, text: String) object SimpleTextClassificationPipeline { def main(args: Array[String]) { val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ // Prepare training documents, which are labeled. val training = sc.parallelize(Seq( LabeledDocument(0L, "a b c d e spark", 1.0), LabeledDocument(1L, "b d", 0.0), LabeledDocument(2L, "spark f g h", 1.0), LabeledDocument(3L, "hadoop mapreduce", 0.0))) // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. val tokenizer = new Tokenizer() .setInputCol("text") .setOutputCol("words") val hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol) .setOutputCol("features") val lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.001) val pipeline = new Pipeline() .setStages(Array(tokenizer, hashingTF, lr)) // Fit the pipeline to training documents. val model = pipeline.fit(training.toDF()) // Prepare test documents, which are unlabeled. val test = sc.parallelize(Seq( Document(4L, "spark i j k"), Document(5L, "l m n"), Document(6L, "spark hadoop spark"), Document(7L, "apache hadoop"))) // Make predictions on test documents. model.transform(test.toDF()) .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => println(s"($id, $text) --> prob=$prob, prediction=$prediction") } sc.stop() } } // scalastyle:on println
Example 34
Source File: GLMRegressionModel.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.regression.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{DataFrame, Row, SQLContext} def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = { val datapath = Loader.dataPath(path) val sqlContext = SQLContext.getOrCreate(sc) val dataRDD = sqlContext.read.parquet(datapath) val dataArray = dataRDD.select("weights", "intercept").take(1) assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath") val data = dataArray(0) assert(data.size == 2, s"Unable to load $modelClass data from: $datapath") data match { case Row(weights: Vector, intercept: Double) => assert(weights.size == numFeatures, s"Expected $numFeatures features, but" + s" found ${weights.size} features when loading $modelClass weights from $datapath") Data(weights, intercept) } } } }
Example 35
Source File: OneHotEncoderSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { def stringIndexed(): DataFrame = { val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) val df = sqlContext.createDataFrame(data).toDF("id", "label") val indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex") .fit(df) indexer.transform(df) } test("params") { ParamsSuite.checkParams(new OneHotEncoder) } test("OneHotEncoder dropLast = false") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") .setDropLast(false) val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1), vec(2)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0), (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0)) assert(output === expected) } test("OneHotEncoder dropLast = true") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0), (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0)) assert(output === expected) } test("input column with ML attribute") { val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large") val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("size") .select(col("size").as("size", attr.toMetadata())) val encoder = new OneHotEncoder() .setInputCol("size") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("small").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("medium").withIndex(1)) } test("input column without ML attribute") { val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("index") val encoder = new OneHotEncoder() .setInputCol("index") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("0").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("1").withIndex(1)) } test("read/write") { val t = new OneHotEncoder() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setDropLast(false) testDefaultReadWrite(t) } }
Example 36
Source File: ChiSqSelectorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{Row, SQLContext} class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("Test Chi-Square selector") { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ val data = Seq( LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))) ) val preFilteredData = Seq( Vectors.dense(0.0), Vectors.dense(6.0), Vectors.dense(8.0), Vectors.dense(5.0) ) val df = sc.parallelize(data.zip(preFilteredData)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") val model = new ChiSqSelector() .setNumTopFeatures(1) .setFeaturesCol("data") .setLabelCol("label") .setOutputCol("filtered") model.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } } test("ChiSqSelector read/write") { val t = new ChiSqSelector() .setFeaturesCol("myFeaturesCol") .setLabelCol("myLabelCol") .setOutputCol("myOutputCol") .setNumTopFeatures(2) testDefaultReadWrite(t) } test("ChiSqSelectorModel read/write") { val oldModel = new feature.ChiSqSelectorModel(Array(1, 3)) val instance = new ChiSqSelectorModel("myChiSqSelectorModel", oldModel) val newInstance = testDefaultReadWrite(instance) assert(newInstance.selectedFeatures === instance.selectedFeatures) } }
Example 37
Source File: DCTSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} @BeanInfo case class DCTTestData(vec: Vector, wantedVec: Vector) class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("forward transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = false testDCT(data, inverse) } test("inverse transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = true testDCT(data, inverse) } test("read/write") { val t = new DCT() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setInverse(true) testDefaultReadWrite(t) } private def testDCT(data: Vector, inverse: Boolean): Unit = { val expectedResultBuffer = data.toArray.clone() if (inverse) { (new DoubleDCT_1D(data.size)).inverse(expectedResultBuffer, true) } else { (new DoubleDCT_1D(data.size)).forward(expectedResultBuffer, true) } val expectedResult = Vectors.dense(expectedResultBuffer) val dataset = sqlContext.createDataFrame(Seq( DCTTestData(data, expectedResult) )) val transformer = new DCT() .setInputCol("vec") .setOutputCol("resultVec") .setInverse(inverse) transformer.transform(dataset) .select("resultVec", "wantedVec") .collect() .foreach { case Row(resultVec: Vector, wantedVec: Vector) => assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) } } }
Example 38
Source File: MinMaxScalerSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SQLContext} class MinMaxScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("MinMaxScaler fit basic case") { val sqlContext = new SQLContext(sc) val data = Array( Vectors.dense(1, 0, Long.MinValue), Vectors.dense(2, 0, 0), Vectors.sparse(3, Array(0, 2), Array(3, Long.MaxValue)), Vectors.sparse(3, Array(0), Array(1.5))) val expected: Array[Vector] = Array( Vectors.dense(-5, 0, -5), Vectors.dense(0, 0, 0), Vectors.sparse(3, Array(0, 2), Array(5, 5)), Vectors.sparse(3, Array(0), Array(-2.5))) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("scaled") .setMin(-5) .setMax(5) val model = scaler.fit(df) model.transform(df).select("expected", "scaled").collect() .foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1.equals(vector2), "Transformed vector is different with expected.") } // copied model must have the same parent. MLTestingUtils.checkCopy(model) } test("MinMaxScaler arguments max must be larger than min") { withClue("arguments max must be larger than min") { intercept[IllegalArgumentException] { val scaler = new MinMaxScaler().setMin(10).setMax(0) scaler.validateParams() } intercept[IllegalArgumentException] { val scaler = new MinMaxScaler().setMin(0).setMax(0) scaler.validateParams() } } } test("MinMaxScaler read/write") { val t = new MinMaxScaler() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMax(1.0) .setMin(-1.0) testDefaultReadWrite(t) } test("MinMaxScalerModel read/write") { val instance = new MinMaxScalerModel( "myMinMaxScalerModel", Vectors.dense(-1.0, 0.0), Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMin(-1.0) .setMax(1.0) val newInstance = testDefaultReadWrite(instance) assert(newInstance.originalMin === instance.originalMin) assert(newInstance.originalMax === instance.originalMax) } }
Example 39
Source File: PolynomialExpansionSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.param.ParamsSuite import org.scalatest.exceptions.TestFailedException import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new PolynomialExpansion) } test("Polynomial expansion with default parameter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val twoDegreeExpansion: Array[Vector] = Array( Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)), Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29), Vectors.dense(new Array[Double](9)), Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0), Vectors.sparse(9, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } test("Polynomial expansion with setter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val threeDegreeExpansion: Array[Vector] = Array( Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8), Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)), Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17), Vectors.dense(new Array[Double](19)), Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8, -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0), Vectors.sparse(19, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } test("read/write") { val t = new PolynomialExpansion() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setDegree(3) testDefaultReadWrite(t) } }
Example 40
Source File: IDFSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class IDFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = { dataSet.map { case data: DenseVector => val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y } Vectors.dense(res) case data: SparseVector => val res = data.indices.zip(data.values).map { case (id, value) => (id, value * model(id)) } Vectors.sparse(data.size, res) } } test("params") { ParamsSuite.checkParams(new IDF) val model = new IDFModel("idf", new OldIDFModel(Vectors.dense(1.0))) ParamsSuite.checkParams(model) } test("compute IDF with default parameter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((numOfData + 1.0) / (x + 1.0)) }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("compute IDF with setter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0 }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .setMinDocFreq(1) .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("IDF read/write") { val t = new IDF() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMinDocFreq(5) testDefaultReadWrite(t) } test("IDFModel read/write") { val instance = new IDFModel("myIDFModel", new OldIDFModel(Vectors.dense(1.0, 2.0))) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) assert(newInstance.idf === instance.idf) } }
Example 41
Source File: HashingTFSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.util.Utils class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new HashingTF) } test("hashingTF") { val df = sqlContext.createDataFrame(Seq( (0, "a a b b c d".split(" ").toSeq) )).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) val output = hashingTF.transform(df) val attrGroup = AttributeGroup.fromStructField(output.schema("features")) require(attrGroup.numAttributes === Some(n)) val features = output.select("features").first().getAs[Vector](0) // Assume perfect hash on "a", "b", "c", and "d". def idx(any: Any): Int = Utils.nonNegativeMod(any.##, n) val expected = Vectors.sparse(n, Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0))) assert(features ~== expected absTol 1e-14) } test("read/write") { val t = new HashingTF() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setNumFeatures(10) testDefaultReadWrite(t) } }
Example 42
Source File: RandomForestSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.ml.classification.DecisionTreeClassificationModel import org.apache.spark.ml.impl.TreeTests import org.apache.spark.ml.tree.{ContinuousSplit, DecisionTreeModel, LeafNode, Node} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.tree.impurity.GiniCalculator import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.util.collection.OpenHashMap val leftImp = new GiniCalculator(Array(3.0, 2.0, 1.0)) val left = new LeafNode(0.0, leftImp.calculate(), leftImp) val rightImp = new GiniCalculator(Array(1.0, 2.0, 5.0)) val right = new LeafNode(2.0, rightImp.calculate(), rightImp) val parent = TreeTests.buildParentNode(left, right, new ContinuousSplit(0, 0.5)) val parentImp = parent.impurityStats val left2Imp = new GiniCalculator(Array(1.0, 6.0, 1.0)) val left2 = new LeafNode(0.0, left2Imp.calculate(), left2Imp) val grandParent = TreeTests.buildParentNode(left2, parent, new ContinuousSplit(1, 1.0)) val grandImp = grandParent.impurityStats // Test feature importance computed at different subtrees. def testNode(node: Node, expected: Map[Int, Double]): Unit = { val map = new OpenHashMap[Int, Double]() RandomForest.computeFeatureImportance(node, map) assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01) } // Leaf node testNode(left, Map.empty[Int, Double]) // Internal node with 2 leaf children val feature0importance = parentImp.calculate() * parentImp.count - (leftImp.calculate() * leftImp.count + rightImp.calculate() * rightImp.count) testNode(parent, Map(0 -> feature0importance)) // Full tree val feature1importance = grandImp.calculate() * grandImp.count - (left2Imp.calculate() * left2Imp.count + parentImp.calculate() * parentImp.count) testNode(grandParent, Map(0 -> feature0importance, 1 -> feature1importance)) // Forest consisting of (full tree) + (internal node with 2 leafs) val trees = Array(parent, grandParent).map { root => new DecisionTreeClassificationModel(root, numFeatures = 2, numClasses = 3) .asInstanceOf[DecisionTreeModel] } val importances: Vector = RandomForest.featureImportances(trees, 2) val tree2norm = feature0importance + feature1importance val expected = Vectors.dense((1.0 + feature0importance / tree2norm) / 2.0, (feature1importance / tree2norm) / 2.0) assert(importances ~== expected relTol 0.01) } test("normalizeMapValues") { val map = new OpenHashMap[Int, Double]() map(0) = 1.0 map(2) = 2.0 RandomForest.normalizeMapValues(map) val expected = Map(0 -> 1.0 / 3.0, 2 -> 2.0 / 3.0) assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01) } } private object RandomForestSuite { def mapToVec(map: Map[Int, Double]): Vector = { val size = (map.keys.toSeq :+ 0).max + 1 val (indices, values) = map.toSeq.sortBy(_._1).unzip Vectors.sparse(size, indices.toArray, values.toArray) } }
Example 43
Source File: SVDExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") } } // scalastyle:on println
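The example above requests all five singular values with computeU = true; when only the spectrum and the V factor are needed, U can be skipped. A minimal sketch (runnable in spark-shell, where sc is provided) keeping just the top two singular values:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val rows = sc.parallelize(Seq(
  Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0),
  Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
  Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)))

val mat = new RowMatrix(rows)
// computeU defaults to false, so only the singular values and V are materialized.
val svd = mat.computeSVD(2)
println(s"Top 2 singular values: ${svd.s}")
println(s"V factor (5 x 2):\n${svd.V}")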
Example 44
Source File: BisectingKMeansExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib // scalastyle:off println import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.BisectingKMeans import org.apache.spark.mllib.linalg.{Vector, Vectors} // $example off$ object BisectingKMeansExample { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample") val sc = new SparkContext(sparkConf) // $example on$ // Loads and parses data def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble)) val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache() // Clustering the data into 6 clusters by BisectingKMeans. val bkm = new BisectingKMeans().setK(6) val model = bkm.run(data) // Show the compute cost and the cluster centers println(s"Compute Cost: ${model.computeCost(data)}") model.clusterCenters.zipWithIndex.foreach { case (center, idx) => println(s"Cluster Center ${idx}: ${center}") } // $example off$ sc.stop() } } // scalastyle:on println
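Once trained, the BisectingKMeansModel can also assign new points to clusters. A small follow-on sketch, assuming the model and the data RDD from the example above are in scope:

import org.apache.spark.mllib.linalg.Vectors

// Assign a single unseen point to its nearest cluster center.
val clusterIndex: Int = model.predict(Vectors.dense(0.25, 0.25, 0.25))
println(s"Point assigned to cluster $clusterIndex")

// Or score a whole RDD of points at once.
val assignments = model.predict(data)
assignments.take(5).foreach(println)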
Example 45
Source File: HashingTF.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import java.lang.{Iterable => JavaIterable} import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.unsafe.hash.Murmur3_x86_32._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils private[spark] def murmur3Hash(term: Any): Int = { term match { case null => seed case b: Boolean => hashInt(if (b) 1 else 0, seed) case b: Byte => hashInt(b, seed) case s: Short => hashInt(s, seed) case i: Int => hashInt(i, seed) case l: Long => hashLong(l, seed) case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed) case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) case s: String => val utf8 = UTF8String.fromString(s) hashUnsafeBytes(utf8.getBaseObject, utf8.getBaseOffset, utf8.numBytes(), seed) case _ => throw new SparkException("HashingTF with murmur3 algorithm does not " + s"support type ${term.getClass.getCanonicalName} of input data.") } } }
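The excerpt above shows only the private hashing helper; end-to-end term-frequency extraction goes through HashingTF.transform. A minimal usage sketch of the public mllib API (runnable in spark-shell):

import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

val documents = sc.parallelize(Seq(
  "a a b c".split(" ").toSeq,
  "b c d".split(" ").toSeq))

// 2^10 buckets keeps the example output small; real jobs typically use 2^20.
val hashingTF = new HashingTF(1 << 10)
val tf: RDD[Vector] = hashingTF.transform(documents)
tf.collect().foreach(println)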
Example 46
Source File: Normalizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
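The transform above rescales a single vector to unit p-norm. A quick local check, no cluster needed (the no-arg constructor defaults to p = 2):

import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.linalg.Vectors

val l2 = new Normalizer()     // p = 2 by default
val l1 = new Normalizer(1.0)  // L1 normalization

// A 3-4 vector makes the L2 result easy to verify by hand.
println(l2.transform(Vectors.dense(3.0, 4.0)))  // [0.6, 0.8]
println(l1.transform(Vectors.dense(3.0, 4.0)))  // [3/7, 4/7]

// The zero vector is returned unchanged, matching the norm == 0.0 branch above.
println(l2.transform(Vectors.dense(0.0, 0.0)))  // [0.0, 0.0]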
Example 47
Source File: KMeansModel.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.clustering import scala.collection.JavaConverters._ import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.pmml.PMMLExportable import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} @Since("0.8.0") def computeCost(data: RDD[Vector]): Double = { val centersWithNorm = clusterCentersWithNorm val bcCentersWithNorm = data.context.broadcast(centersWithNorm) data.map(p => KMeans.pointCost(bcCentersWithNorm.value, new VectorWithNorm(p))).sum() } private def clusterCentersWithNorm: Iterable[VectorWithNorm] = clusterCenters.map(new VectorWithNorm(_)) @Since("1.4.0") override def save(sc: SparkContext, path: String): Unit = { KMeansModel.SaveLoadV1_0.save(sc, this, path) } override protected def formatVersion: String = "1.0" } @Since("1.4.0") object KMeansModel extends Loader[KMeansModel] { @Since("1.4.0") override def load(sc: SparkContext, path: String): KMeansModel = { KMeansModel.SaveLoadV1_0.load(sc, path) } private case class Cluster(id: Int, point: Vector) private object Cluster { def apply(r: Row): Cluster = { Cluster(r.getInt(0), r.getAs[Vector](1)) } } private[clustering] object SaveLoadV1_0 { private val thisFormatVersion = "1.0" private[clustering] val thisClassName = "org.apache.spark.mllib.clustering.KMeansModel" def save(sc: SparkContext, model: KMeansModel, path: String): Unit = { val spark = SparkSession.builder().sparkContext(sc).getOrCreate() val metadata = compact(render( ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ ("k" -> model.k))) sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path)) val dataRDD = sc.parallelize(model.clusterCenters.zipWithIndex).map { case (point, id) => Cluster(id, point) } spark.createDataFrame(dataRDD).write.parquet(Loader.dataPath(path)) } def load(sc: SparkContext, path: String): KMeansModel = { implicit val formats = DefaultFormats val spark = SparkSession.builder().sparkContext(sc).getOrCreate() val (className, formatVersion, metadata) = Loader.loadMetadata(sc, path) assert(className == thisClassName) assert(formatVersion == thisFormatVersion) val k = (metadata \ "k").extract[Int] val centroids = spark.read.parquet(Loader.dataPath(path)) Loader.checkSchema[Cluster](centroids.schema) val localCentroids = centroids.rdd.map(Cluster.apply).collect() assert(k == localCentroids.length) new KMeansModel(localCentroids.sortBy(_.id).map(_.point)) } } }
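The save/load machinery above persists the metadata as JSON and the cluster centers as Parquet; from the caller's side it is just a round trip. A short sketch (spark-shell; the output path is a hypothetical placeholder):

import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

val points = sc.parallelize(Seq(
  Vectors.dense(0.0, 0.0), Vectors.dense(0.1, 0.1),
  Vectors.dense(9.0, 9.0), Vectors.dense(9.1, 9.1)))

// k = 2, maxIterations = 20
val model = KMeans.train(points, 2, 20)
println(s"WSSSE: ${model.computeCost(points)}")

// Round trip through the SaveLoadV1_0 format shown above.
model.save(sc, "/tmp/kmeans-model-demo")  // hypothetical path
val reloaded = KMeansModel.load(sc, "/tmp/kmeans-model-demo")
println(reloaded.clusterCenters.mkString(", "))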
Example 48
Source File: GaussianMixtureModelWrapper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python

import scala.collection.JavaConverters

import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.GaussianMixtureModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

  val gaussians: Array[Byte] = {
    val modelGaussians = model.gaussians.map { gaussian =>
      Array[Any](gaussian.mu, gaussian.sigma)
    }
    SerDe.dumps(JavaConverters.seqAsJavaListConverter(modelGaussians).asJava)
  }

  def predictSoft(point: Vector): Vector = {
    Vectors.dense(model.predictSoft(point))
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
}
Example 49
Source File: Word2VecModelWrapper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import java.util.{List => JList, Map => JMap} import scala.collection.JavaConverters._ import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.feature.Word2VecModel import org.apache.spark.mllib.linalg.{Vector, Vectors} def findSynonyms(vector: Vector, num: Int): JList[Object] = { prepareResult(model.findSynonyms(vector, num)) } private def prepareResult(result: Array[(String, Double)]) = { val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } def getVectors: JMap[String, JList[Float]] = { model.getVectors.map { case (k, v) => (k, v.toList.asJava) }.asJava } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 50
Source File: MultivariateGaussian.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution import breeze.linalg.{diag, eigSym, max, DenseMatrix => DBM, DenseVector => DBV, Vector => BV} import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} import org.apache.spark.mllib.util.MLUtils private def calculateCovarianceConstants: (DBM[Double], Double) = { val eigSym.EigSym(d, u) = eigSym(sigma.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t // For numerical stability, values are considered to be non-zero only if they exceed tol. // This prevents any inverted value from exceeding (eps * n * max(d))^-1 val tol = MLUtils.EPSILON * max(d) * d.length try { // log(pseudo-determinant) is sum of the logs of all non-zero singular values val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum // calculate the root-pseudo-inverse of the diagonal matrix of singular values // by inverting the square root of all non-zero values val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray)) (pinvS * u.t, -0.5 * (mu.size * math.log(2.0 * math.Pi) + logPseudoDetSigma)) } catch { case uex: UnsupportedOperationException => throw new IllegalArgumentException("Covariance matrix has no non-zero singular values") } } }
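calculateCovarianceConstants is private; the public surface is just the constructor and the density functions. A small local sketch evaluating a 2-D Gaussian:

import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.stat.distribution.MultivariateGaussian

val mu = Vectors.dense(0.0, 0.0)
// Column-major 2x2 covariance: unit variances, 0.5 correlation.
val sigma = Matrices.dense(2, 2, Array(1.0, 0.5, 0.5, 1.0))

val gaussian = new MultivariateGaussian(mu, sigma)
println(gaussian.pdf(Vectors.dense(0.0, 0.0)))     // density at the mean
println(gaussian.logpdf(Vectors.dense(1.0, -1.0))) // log-density elsewhere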
Example 51
Source File: PearsonCorrelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i +=1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i +=1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
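Both correlation implementations (this one and the Spearman variant in the next example) are normally reached through the Statistics facade rather than called directly. A minimal sketch (spark-shell):

import org.apache.spark.mllib.linalg.{Matrix, Vectors}
import org.apache.spark.mllib.stat.Statistics

val observations = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 3.0),
  Vectors.dense(2.0, 20.0, 1.0),
  Vectors.dense(3.0, 30.0, 2.0)))

// "pearson" is the default method; "spearman" selects the rank-based variant.
val pearson: Matrix = Statistics.corr(observations, "pearson")
val spearman: Matrix = Statistics.corr(observations, "spearman")
println(s"Pearson:\n$pearson")
println(s"Spearman:\n$spearman")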
Example 52
Source File: SpearmanCorrelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Example 53
Source File: Updater.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization import scala.math._ import breeze.linalg.{axpy => brzAxpy, norm => brzNorm, Vector => BV} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.mllib.linalg.{Vector, Vectors} @DeveloperApi class SquaredL2Updater extends Updater { override def compute( weightsOld: Vector, gradient: Vector, stepSize: Double, iter: Int, regParam: Double): (Vector, Double) = { // add up both updates from the gradient of the loss (= step) as well as // the gradient of the regularizer (= regParam * weightsOld) // w' = w - thisIterStepSize * (gradient + regParam * w) // w' = (1 - thisIterStepSize * regParam) * w - thisIterStepSize * gradient val thisIterStepSize = stepSize / math.sqrt(iter) val brzWeights: BV[Double] = weightsOld.asBreeze.toDenseVector brzWeights :*= (1.0 - thisIterStepSize * regParam) brzAxpy(-thisIterStepSize, gradient.asBreeze, brzWeights) val norm = brzNorm(brzWeights, 2.0) (Vectors.fromBreeze(brzWeights), 0.5 * regParam * norm * norm) } }
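The update rule spelled out in the comment can be checked directly on a single step, no RDDs involved. A small sketch; the second element of the returned tuple is the regularization value 0.5 * regParam * ||w'||^2:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.SquaredL2Updater

val updater  = new SquaredL2Updater()
val weights  = Vectors.dense(1.0, -2.0)
val gradient = Vectors.dense(0.5, 0.5)

// Step size 1.0 at iteration 1, regParam 0.1:
// w' = (1 - 1.0 * 0.1) * w - 1.0 * gradient = (0.4, -2.3)
val (newWeights, regValue) = updater.compute(weights, gradient, 1.0, 1, 0.1)
println(newWeights)  // [0.4, -2.3]
println(regValue)    // 0.5 * 0.1 * (0.4^2 + 2.3^2)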
Example 54
Source File: GLMClassificationModel.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.classification.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{Row, SparkSession} def loadData(sc: SparkContext, path: String, modelClass: String): Data = { val dataPath = Loader.dataPath(path) val spark = SparkSession.builder().sparkContext(sc).getOrCreate() val dataRDD = spark.read.parquet(dataPath) val dataArray = dataRDD.select("weights", "intercept", "threshold").take(1) assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath") val data = dataArray(0) assert(data.size == 3, s"Unable to load $modelClass data from: $dataPath") val (weights, intercept) = data match { case Row(weights: Vector, intercept: Double, _) => (weights, intercept) } val threshold = if (data.isNullAt(2)) { None } else { Some(data.getDouble(2)) } Data(weights, intercept, threshold) } } }
Example 55
Source File: LabeledPoint.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = { LabeledPoint(point.label, Vectors.fromML(point.features)) } }
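Both string formats accepted by parse can be exercised locally. A quick sketch:

import org.apache.spark.mllib.regression.LabeledPoint

// Current format: (label,[v0,v1,...]), with a sparse (size,[indices],[values]) form.
val dense  = LabeledPoint.parse("(1.0,[1.0,0.0,3.0])")
val sparse = LabeledPoint.parse("(0.0,(3,[2],[7.5]))")

// Legacy pre-1.0 format: "label, space-separated features".
val legacy = LabeledPoint.parse("1.0, 1.0 0.0 3.0")

println(dense)   // (1.0,[1.0,0.0,3.0])
println(sparse)  // (0.0,(3,[2],[7.5]))
println(legacy)  // (1.0,[1.0,0.0,3.0])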
Example 56
Source File: GLMRegressionModel.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{Row, SparkSession} def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = { val dataPath = Loader.dataPath(path) val spark = SparkSession.builder().sparkContext(sc).getOrCreate() val dataRDD = spark.read.parquet(dataPath) val dataArray = dataRDD.select("weights", "intercept").take(1) assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath") val data = dataArray(0) assert(data.size == 2, s"Unable to load $modelClass data from: $dataPath") data match { case Row(weights: Vector, intercept: Double) => assert(weights.size == numFeatures, s"Expected $numFeatures features, but" + s" found ${weights.size} features when loading $modelClass weights from $dataPath") Data(weights, intercept) } } } }
Example 57
Source File: IDFSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
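Outside of a test, the same fit/transform flow is only a few lines. A minimal sketch (spark-shell), chaining HashingTF and IDF the way a tf-idf pipeline is usually built:

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

val documents = sc.parallelize(Seq(
  "spark mllib tf idf".split(" ").toSeq,
  "spark spark vector".split(" ").toSeq))

val tf: RDD[Vector] = new HashingTF(1 << 12).transform(documents)
tf.cache()  // IDF.fit makes a full pass over the data

// Terms appearing in fewer than minDocFreq documents get an IDF of 0,
// the behaviour exercised by the filtering test above.
val idfModel = new IDF(minDocFreq = 2).fit(tf)
val tfidf: RDD[Vector] = idfModel.transform(tf)
tfidf.collect().foreach(println)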
Example 58
Source File: PCASuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } }
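The wrapper being tested is just PCA(k).fit followed by transform. A compact usage sketch (spark-shell):

import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors

val vectors = sc.parallelize(Seq(
  Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
  Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0),
  Vectors.dense(1.0, 1.0, 7.0, 0.0, 5.0)))

// Project the 5-dimensional input onto its top 2 principal components.
val pca = new PCA(2).fit(vectors)
val projected = pca.transform(vectors)  // RDD[Vector] of length-2 vectors
projected.collect().foreach(println)

// Single vectors can be projected as well.
println(pca.transform(Vectors.dense(3.0, 0.0, 1.0, 5.0, 6.0)))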
Example 59
Source File: DLEstimatorBase.scala From BigDL with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol, HasPredictionCol} import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.rdd.RDD import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row} abstract class DLEstimatorBase[Learner <: DLEstimatorBase[Learner, M], M <: DLTransformerBase[M]] extends Estimator[M] with HasLabelCol { protected def internalFit(dataFrame: DataFrame): M override def fit(dataFrame: DataFrame): M = { transformSchema(dataFrame.schema, logging = true) internalFit(dataFrame) } override def copy(extra: ParamMap): Learner = defaultCopy(extra) }
Example 60
Source File: TestMPSLinearProgramSolver.scala From spark-lp with Apache License 2.0 | 5 votes |
// Imports restored so the example compiles on its own; the spark-lp package paths
// below are assumptions inferred from the accompanying spark-lp test suites.
import java.io.File

import com.joptimizer.optimizers.LPStandardConverter
import com.joptimizer.util.MPSParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.optimization.lp.LP
import org.apache.spark.mllib.optimization.lp.VectorSpace.{DMatrix, DVector}

object TestMPSLinearProgramSolver {

  def main(args: Array[String]) {

    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("TestMPSLinearProgramSolver")

    val sc = new SparkContext(conf)

    // Parse the provided MPS file.
    val parser = new MPSParser()
    val mpsFile = new File(args(0))
    parser.parse(mpsFile)

    // Convert the parsed linear program to standard form.
    val converter = new LPStandardConverter(true)
    converter.toStandardForm(parser.getC, parser.getG, parser.getH,
      parser.getA, parser.getB, parser.getLb, parser.getUb)

    // Convert the parameters of the linear program to spark lp compatible formats.
    val numPartitions = 2
    val c: DVector = sc.parallelize(converter.getStandardC.toArray, numPartitions)
      .glom.map(new DenseVector(_))
    val B: DMatrix = sc.parallelize(converter.getStandardA.toArray.transpose.map(
      Vectors.dense(_).toSparse: Vector), numPartitions)
    val b = new DenseVector(converter.getStandardB.toArray)

    println("Start solving ... ")
    val (optimalVal, optimalX) = LP.solve(c, B, b, sc = sc)
    println("optimalVal: " + optimalVal)
    //println("optimalX: " + optimalX.collectElements.mkString(", "))

    sc.stop()
  }
}
Example 61
Source File: package.scala From spark-lp with Apache License 2.0 | 5 votes |
implicit object DVectorSpace extends VectorSpace[DVector] { override def combine(alpha: Double, a: DVector, beta: Double, b: DVector): DVector = if (alpha == 1.0 && beta == 1.0) { a.zip(b).map { case (aPart, bPart) => { BLAS.axpy(1.0, aPart, bPart) // bPart += aPart bPart } } } else { a.zip(b).map { case (aPart, bPart) => // NOTE A DenseVector result is assumed here (not sparse safe). DenseVectorSpace.combine(alpha, aPart, beta, bPart).toDense } } override def dot(a: DVector, b: DVector): Double = a.dot(b) override def entrywiseProd(a: DVector, b: DVector): DVector = { a.zip(b).map { case (aPart, bPart) => DenseVectorSpace.entrywiseProd(aPart, bPart).toDense } } override def entrywiseNegDiv(a: DVector, b: DVector): DVector = { a.zip(b).map { case (aPart, bPart) => DenseVectorSpace.entrywiseNegDiv(aPart, bPart) } } override def sum(a: DVector): Double = a.aggregate(0.0)( seqOp = (acc: Double, v: DenseVector) => acc + v.values.sum, combOp = (acc1: Double, acc2: Double) => acc1 + acc2 ) override def min(a: DVector): Double = a.aggregate(Double.PositiveInfinity)( (mi, x) => Math.min(mi, x.values.min), Math.min ) override def max(a: DVector): Double = a.aggregate(Double.NegativeInfinity)( (ma, x) => Math.max(ma, x.values.max), Math.max ) override def cache(a: DVector): Unit = if (a.getStorageLevel == StorageLevel.NONE) { a.cache() } } }
Example 62
Source File: SpLinopMatrix.scala From spark-lp with Apache License 2.0 | 5 votes |
override def apply(mat: DMatrix): DMatrix = {
    dvector.zipPartitions(mat)((vectorPartition, matPartition) =>
      vectorPartition.next().values.toIterator.checkedZip(matPartition.toIterator).map {
        case (a: Double, x: Vector) =>
          val xc = x.copy
          BLAS.scal(a, xc)
          xc
      }
    )
  }
}
Example 63
Source File: InitializeSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp import org.scalatest.FunSuite import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.mllib.optimization.lp.VectorSpace._ import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, _} import org.apache.spark.mllib.optimization.tfocs.VectorSpace.{DMatrix, DVector} class InitializeSuite extends FunSuite with MLlibTestSparkContext { val numPartitions = 2 val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0) val BArray = Array( Array(12.0, 16.0, 30.0, 1.0, 0.0), Array(24.0, 16.0, 12.0, 0.0, 1.0), Array(-1.0, 0.0, 0.0, 0.0, 0.0), Array(0.0, -1.0, 0.0, 0.0, 0.0), Array(0.0, 0.0, -1.0, 0.0, 0.0), Array(0.0, 0.0, 0.0, 1.0, 0.0), Array(0.0, 0.0, 0.0, 0.0, 1.0)) val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0) lazy val c: DVector = sc.parallelize(cArray, numPartitions).glom.map(new DenseVector(_)) lazy val rows: DMatrix = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_)) lazy val b: DenseVector = new DenseVector(bArray) val cBrz = new BDV[Double](cArray) val BBrz = new BDM[Double](7, 5, BArray.flatMap(x => x), offset = 0, majorStride = 5, isTranspose = true) val bBrz = new BDV[Double](bArray) // (BT * B) ^(-1) val BTBInv = inv(BBrz.t * BBrz) // xTilda = B * BTBInv * b val xTilda: BDV[Double] = BBrz * (BTBInv * bBrz) // lambdaTilda = BTBInv * (B^T * c) val lambdaTilda: BDV[Double] = BTBInv * (BBrz.t * cBrz) // sTilda = c - B * lambdaTilda val sTilda = cBrz - BBrz * lambdaTilda val deltax = Math.max(1.5 * max(xTilda), 0) val deltas = Math.max(1.5 * max(sTilda), 0) val xHat = xTilda :+ deltax val sHat = sTilda :+ deltas val deltaxHat: Double = 0.5 * (xHat.t * sHat) / sum(sHat) val deltasHat: Double = 0.5 * (xHat.t * sHat) / sum(xHat) // x = xHat + deltaxHat * e val expectedx: BDV[Double] = xHat :+ deltaxHat // val expectedLambda = lambdaTilda val expecteds: BDV[Double] = sHat :+ deltasHat test("Initialize.init is implemented properly") { val result = Initialize.init(c, rows, b) //println(LP.solve(c, rows, b, 1e-4, 1).collect()) assert(Vectors.dense(expectedx.toArray) ~= Vectors.dense(result._1.flatMap(_.toArray).collect()) relTol 1e-6, "Initialize.init x0 is not computed correctly.") assert(Vectors.dense(lambdaTilda.toArray) ~= Vectors.dense(result._2.toArray) relTol 1e-6, "Initialize.init lambda0 is not computed correctly.") assert(Vectors.dense(expecteds.toArray) ~= Vectors.dense(result._3.flatMap(_.toArray).collect()) relTol 1e-6, "Initialize.init s0 should return the correct answer.") } }
Example 64
Source File: LPSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp import org.scalatest.FunSuite import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.mllib.optimization.lp.VectorSpace._ import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace class LPSuite extends FunSuite with MLlibTestSparkContext { val numPartitions = 2 val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0) val BArray = Array( Array(12.0, 16.0, 30.0, 1.0, 0.0), Array(24.0, 16.0, 12.0, 0.0, 1.0), Array(-1.0, 0.0, 0.0, 0.0, 0.0), Array(0.0, -1.0, 0.0, 0.0, 0.0), Array(0.0, 0.0, -1.0, 0.0, 0.0), Array(0.0, 0.0, 0.0, 1.0, 0.0), Array(0.0, 0.0, 0.0, 0.0, 1.0)) val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0) lazy val c = sc.parallelize(cArray, numPartitions).glom.map(new DenseVector(_)) lazy val rows = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_)) lazy val b = new DenseVector(bArray) test("LP solve is implemented properly") { val (v, x) = LP.solve(c, rows, b, sc=sc) // solution obtained from scipy.optimize.linprog and octave glgk lpsolver with fun_val = 12.083 val expectedSol = Vectors.dense( Array(1.66666667, 5.83333333, 40.0, 0.0, 0.0, 13.33333333, 9.16666667)) val xx = Vectors.dense(x.flatMap(_.toArray).collect()) println(s"$xx") println("optimal min value: " + v) assert(xx ~== expectedSol absTol 1e-6, "LP.solve x should return the correct answer.") } }
Example 65
Source File: BisectingKMeansModel.scala From bisecting-kmeans with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.bisectingkmeans import breeze.linalg.{Vector => BV, norm => breezeNorm} import org.apache.spark.Logging import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() this.node.toLinkageMatrix.foreach {x => val row = new java.util.ArrayList[java.lang.Double]() row.add(x._1.toDouble) row.add(x._2.toDouble) row.add(x._3.toDouble) row.add(x._4.toDouble) javaList.add(row) } javaList } }
Example 66
Source File: TestMPSLinearProgram.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.examples import java.io.File import com.joptimizer.optimizers.LPStandardConverter import com.joptimizer.util.MPSParser import org.apache.spark.mllib.linalg.{ DenseVector, Vector, Vectors } import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.SolverSLP import org.apache.spark.{ SparkConf, SparkContext } object TestMPSLinearProgram { def main(args: Array[String]) { val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestMPSLinearProgram") val sc = new SparkContext(sparkConf) // Parse the provided MPS file. val parser = new MPSParser() var mpsFile = new File(args(0)) parser.parse(mpsFile) // Convert the parsed linear program to standard form. val converter = new LPStandardConverter(true) converter.toStandardForm(parser.getC, parser.getG, parser.getH, parser.getA, parser.getB, parser.getLb, parser.getUb) // Convert the parameters of the linear program to spark tfocs compatible formats. val c = sc.parallelize(converter.getStandardC.toArray).glom.map(new DenseVector(_)) val A = sc.parallelize(converter.getStandardA.toArray.transpose.map( Vectors.dense(_).toSparse: Vector)) val b = new DenseVector(converter.getStandardB.toArray) val n = converter.getStandardN val mu = 1e-2 // Solve the linear program using SolverSLP, finding the optimal x vector 'optimalX'. val (optimalX, _) = SolverSLP.run(c, A, b, mu) println("optimalX: " + optimalX.collectElements.mkString(", ")) sc.stop() } }
Example 67
Source File: LocalLDAModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.clustering import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.{DataUtils, ParamUtils} import org.apache.spark.ml.clustering.{LocalLDAModel => SparkLocalLDA} import org.apache.spark.mllib.clustering.{LocalLDAModel => OldSparkLocalLDA} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} import org.apache.spark.sql.SparkSession import DataUtils._ import scala.reflect.runtime.universe class LocalLDAModel(override val sparkTransformer: SparkLocalLDA) extends LocalTransformer[SparkLocalLDA] { lazy val oldModel: OldSparkLocalLDA = { val mirror = universe.runtimeMirror(sparkTransformer.getClass.getClassLoader) val parentTerm = universe.typeOf[SparkLocalLDA].decl(universe.TermName("oldLocalModel")).asTerm mirror.reflect(sparkTransformer).reflectField(parentTerm).get.asInstanceOf[OldSparkLocalLDA] } override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getFeaturesCol) match { case Some(column) => val newData = column.data.mapToMlLibVectors.map(oldModel.topicDistribution(_).toList) localData.withColumn( LocalDataColumn( sparkTransformer.getTopicDistributionCol, newData ) ) case None => localData } } } object LocalLDAModel extends SimpleModelLoader[SparkLocalLDA] with TypedTransformerConverter[SparkLocalLDA] { override def build(metadata: Metadata, data: LocalData): SparkLocalLDA = { val topics = DataUtils.constructMatrix( data.column("topicsMatrix").get.data.head.asInstanceOf[Map[String, Any]] ) val gammaShape = data.column("gammaShape").get.data.head.asInstanceOf[java.lang.Double] val topicConcentration = data.column("topicConcentration").get.data.head.asInstanceOf[java.lang.Double] val docConcentration = DataUtils.constructVector( data.column("docConcentration").get.data.head.asInstanceOf[Map[String, Any]] ) val vocabSize = data.column("vocabSize").get.data.head.asInstanceOf[java.lang.Integer] val oldLdaCtor = classOf[OldSparkLocalLDA].getDeclaredConstructor( classOf[Matrix], classOf[Vector], classOf[Double], classOf[Double] ) val oldLDA = oldLdaCtor.newInstance( Matrices.fromML(topics), Vectors.fromML(docConcentration), topicConcentration, gammaShape ) val ldaCtor = classOf[SparkLocalLDA].getDeclaredConstructor( classOf[String], classOf[Int], classOf[OldSparkLocalLDA], classOf[SparkSession] ) val lda = ldaCtor.newInstance(metadata.uid, vocabSize, oldLDA, null) ParamUtils.set(lda, lda.optimizer, metadata) ParamUtils.set(lda, lda.keepLastCheckpoint, metadata) ParamUtils.set(lda, lda.seed, metadata) ParamUtils.set(lda, lda.featuresCol, metadata) ParamUtils.set(lda, lda.learningDecay, metadata) ParamUtils.set(lda, lda.checkpointInterval, metadata) ParamUtils.set(lda, lda.learningOffset, metadata) ParamUtils.set(lda, lda.maxIter, metadata) ParamUtils.set(lda, lda.k, metadata) lda } override implicit def toLocal(sparkTransformer: SparkLocalLDA): LocalTransformer[SparkLocalLDA] = new LocalLDAModel(sparkTransformer) }
Example 68
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.etl.machinelearning.kudu import com.hadooparchitecturebook.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object MlLibOnKudu { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> " + "<kuduMaster> " + "<taxiTable> " + "<numOfCenters> " + "<numOfIterations> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val taxiTable = args(2) val numOfCenters = args(3).toInt val numOfIterations = args(4).toInt val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val sqlContext = new SQLContext(sc) val kuduOptions = Map( "kudu.table" -> taxiTable, "kudu.master" -> kuduMaster) sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load. registerTempTable("ny_taxi_trip_tmp") //Vector val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => { val taxiTrip = NyTaxiYellowTripBuilder.build(r) generateVectorOnly(taxiTrip) }) println("--Running KMeans") val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations) println(" > vector centers:") clusters.clusterCenters.foreach(v => println(" >> " + v)) println("--Running corr") val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson") println(" > corr: " + correlMatrix.toString) println("--Running colStats") val colStats = Statistics.colStats(vectorRDD) println(" > max: " + colStats.max) println(" > count: " + colStats.count) println(" > mean: " + colStats.mean) println(" > min: " + colStats.min) println(" > normL1: " + colStats.normL1) println(" > normL2: " + colStats.normL2) println(" > numNonZeros: " + colStats.numNonzeros) println(" > variance: " + colStats.variance) //Labeled Points }
Example 69
Source File: Util.scala From spark-twitter-sentiment with Apache License 2.0 | 5 votes |
package com.dhruv import org.apache.commons.cli.{Options, ParseException, PosixParser} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.feature.HashingTF import twitter4j.auth.OAuthAuthorization import twitter4j.conf.ConfigurationBuilder object Utils { val numFeatures = 1000 val tf = new HashingTF(numFeatures) val CONSUMER_KEY = "consumerKey" val CONSUMER_SECRET = "consumerSecret" val ACCESS_TOKEN = "accessToken" val ACCESS_TOKEN_SECRET = "accessTokenSecret" val THE_OPTIONS = { val options = new Options() options.addOption(CONSUMER_KEY, true, "Twitter OAuth Consumer Key") options.addOption(CONSUMER_SECRET, true, "Twitter OAuth Consumer Secret") options.addOption(ACCESS_TOKEN, true, "Twitter OAuth Access Token") options.addOption(ACCESS_TOKEN_SECRET, true, "Twitter OAuth Access Token Secret") options } def parseCommandLineWithTwitterCredentials(args: Array[String]) = { val parser = new PosixParser try { val cl = parser.parse(THE_OPTIONS, args) System.setProperty("twitter4j.oauth.consumerKey", cl.getOptionValue(CONSUMER_KEY)) System.setProperty("twitter4j.oauth.consumerSecret", cl.getOptionValue(CONSUMER_SECRET)) System.setProperty("twitter4j.oauth.accessToken", cl.getOptionValue(ACCESS_TOKEN)) System.setProperty("twitter4j.oauth.accessTokenSecret", cl.getOptionValue(ACCESS_TOKEN_SECRET)) cl.getArgList.toArray } catch { case e: ParseException => System.err.println("Parsing failed. Reason: " + e.getMessage) System.exit(1) } } def getAuth = { Some(new OAuthAuthorization(new ConfigurationBuilder().build())) } def featurize(s: String): Vector = { tf.transform(s.sliding(2).toSeq) } object IntParam { def unapply(str: String): Option[Int] = { try { Some(str.toInt) } catch { case e: NumberFormatException => None } } } }
Example 70
Source File: Tabulizer.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.packtpub.mmlwspark.utils import org.apache.spark.mllib.linalg.Vector def table(vector: Vector, cols: Int, format: String = "%.3f"): String = table(vector.toArray.map(format.format(_)), cols, None) def table(list: Seq[Any], cols: Int, header: Option[Seq[String]]): String = table(tblize(header.map(_ ++ list).getOrElse(list), cols), header.isDefined) def table(cells: Seq[Seq[Any]], header: Boolean): String = { val colSizes = cells .map(_.map(v => if (v != null) v.toString.length else 1)) .reduce((v1, v2) => v1.zip(v2).map { case (v1, v2) => if (v1 > v2) v1 else v2 }) val rowSeparator = colSizes.map("-" * _).mkString("+", "+", "+") def valueFormatter(v: Any, size: Int): String = ("%" + size + "s").format(if (v != null) v else "-") val rows = cells .map(row => row.zip(colSizes) .map { case (v, size) => valueFormatter(v, size) }.mkString("|", "|", "|")) if (header) s""" #$rowSeparator #${rows.head} #$rowSeparator #${rows.tail.mkString("\n")} #$rowSeparator """.stripMargin('#') else s""" #$rowSeparator #${rows.mkString("\n")} #$rowSeparator """.stripMargin('#') } def tblize(list: Seq[Product], horizontal: Boolean, cols: Int): Seq[Seq[Any]] = { val arity = list.head.productArity tblize(list.flatMap(_.productIterator.toList), cols = arity * cols) } def tblize(list: Seq[Any], cols: Int = 4): Seq[Seq[Any]] = { val nrow = list.length / cols + (if (list.length % cols == 0) 0 else 1) list.sliding(cols, cols) .map(s => if (s.length == cols || s.length == list.length) s else s.padTo(cols, null)) .foldLeft(Seq[Seq[Any]]()) { case (a, s) => a ++ Seq(s) } } }
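A quick usage sketch of the vector overload above, assuming these helpers are grouped in the Tabulizer object named in the header:

import org.apache.spark.mllib.linalg.Vectors

val v = Vectors.dense(0.123456, 1.5, 2.25, 3.0, 4.75, 5.5)
// Renders the six values as a 3-column text table using the default "%.3f" format
println(Tabulizer.table(v, cols = 3))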
Example 71
Source File: X2PHelper.scala From spark-tsne with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib import breeze.linalg._ import breeze.numerics._ import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLUtils object X2PHelper { case class VectorWithNorm(vector: Vector, norm: Double) def fastSquaredDistance(v1: VectorWithNorm, v2: VectorWithNorm): Double = { MLUtils.fastSquaredDistance(v1.vector, v1.norm, v2.vector, v2.norm) } def Hbeta(D: DenseVector[Double], beta: Double = 1.0) : (Double, DenseVector[Double]) = { val P: DenseVector[Double] = exp(- D * beta) val sumP = sum(P) if(sumP == 0) { (0.0, DenseVector.zeros(D.size)) }else { val H = log(sumP) + (beta * sum(D :* P) / sumP) (H, P / sumP) } } }
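Hbeta converts a row of squared distances into a probability distribution at a given precision beta and returns that distribution together with its entropy. A small local sketch:

import breeze.linalg.DenseVector

// Squared distances from one point to three neighbours
val d = DenseVector(1.0, 4.0, 9.0)
val (h, p) = X2PHelper.Hbeta(d, beta = 0.5)
println(s"entropy = $h")
println(s"probabilities = $p")   // sums to 1.0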
Example 72
Source File: BugDemonstrationTest.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} class BugDemonstrationTest extends FunSuite with Matchers with BeforeAndAfterAll { private var sparkSession : SparkSession = _ override def beforeAll(): Unit = { super.beforeAll() sparkSession = SparkSession.builder().appName("BugTests").master("local[2]").getOrCreate() } override def afterAll(): Unit = { super.afterAll() sparkSession.stop() } test("This demonstrates a bug was fixed in tsne-spark 2.1") { val sc = sparkSession.sparkContext val observations = sc.parallelize( Seq( Vectors.dense(1.0, 10.0, 100.0), Vectors.dense(2.0, 20.0, 200.0), Vectors.dense(3.0, 30.0, 300.0) ) ) // Compute column summary statistics. val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) val expectedMean = Vectors.dense(2.0,20.0,200.0) val resultMean = summary.mean assertEqualEnough(resultMean, expectedMean) val expectedVariance = Vectors.dense(1.0,100.0,10000.0) assertEqualEnough(summary.variance, expectedVariance) val expectedNumNonZeros = Vectors.dense(3.0, 3.0, 3.0) assertEqualEnough(summary.numNonzeros, expectedNumNonZeros) } private def assertEqualEnough(sample: Vector, expected: Vector): Unit = { expected.toArray.zipWithIndex.foreach{ case(d: Double, i: Int) => sample(i) should be (d +- 1E-12) } } }
Example 73
Source File: VLBFGS1.scala From spark-vl-bfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import java.util.Random import scala.language.implicitConversions import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.optim.VectorFreeLBFGS.{Oracle, VectorSpace} import org.apache.spark.ml.optim.VectorRDDFunctions._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.storage.StorageLevel private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = { data.cartesian(dx).map { case (points, x) => val g = Vectors.zeros(x.size) points.foreach { case LabeledPoint(b, a) => val err = BLAS.dot(a, x) - b BLAS.axpy(err, a, g) } g }.treeSum() } def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]") val sc = new SparkContext(conf) sc.setCheckpointDir("/tmp/checkpoint") val n = 1000 val p = 100 val random = new Random(0L) val xExact = Vectors.dense(Array.fill(p)(random.nextDouble())) val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) => val random = new Random(100 + idx) part.map { v => val target = BLAS.dot(v, xExact) + 0.1 * random.nextGaussian() LabeledPoint(target, v) } }.glom() .cache() val x = solve(data).first() println(s"x_exact = $xExact") println(s"x_vlbfgs = $x") sc.stop() } }
Example 74
Source File: HivemallUtils.scala From hivemall-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, DataFrame, Row, UserDefinedFunction} object HivemallUtils { // # of maximum dimensions for feature vectors val maxDims = 100000000 def funcVectorizer(dense: Boolean = false, dims: Int = maxDims) : UserDefinedFunction = { udf(funcVectorizerImpl(dense, dims)) } private def funcVectorizerImpl(dense: Boolean, dims: Int) : Seq[String] => Vector = { if (dense) { // Dense features i: Seq[String] => { val features = new Array[Double](dims) i.map { ft => val s = ft.split(":").ensuring(_.size == 2) features(s(0).toInt) = s(1).toDouble } Vectors.dense(features) } } else { // Sparse features i: Seq[String] => { val features = i.map { ft => // val s = ft.split(":").ensuring(_.size == 2) val s = ft.split(":") (s(0).toInt, s(1).toDouble) } Vectors.sparse(dims, features) } } } }
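A usage sketch for funcVectorizer: it turns a column of "index:value" strings into an MLlib Vector column. This assumes a SQLContext named sqlContext is in scope and that the feature strings were produced upstream (for example by Hivemall feature functions).

import org.apache.spark.sql.functions.col

val df = sqlContext.createDataFrame(Seq(
  Tuple1(Seq("1:1.0", "3:0.5")),
  Tuple1(Seq("2:2.0"))
)).toDF("features")

// Sparse 10-dimensional vectors built from the "index:value" strings
val toVector = HivemallUtils.funcVectorizer(dense = false, dims = 10)
df.select(toVector(col("features")).as("vec")).show(false)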
Example 75
Source File: AggregatedICPClassifier.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.liblinear import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.ICPClassifierModel import org.apache.commons.lang.NotImplementedException import org.apache.spark.broadcast.Broadcast import org.apache.spark.SparkContext object AggregatedICPClassifier { def load(path: String, sc: SparkContext) = { val icps = sc.textFile(path) .map(ICPClassifierModel.deserialize(_, LibLinAlgDeserializer)) new AggregatedICPClassifier(icps) } } class AggregatedICPClassifier( private val icps: RDD[ICPClassifierModel[LibLinAlg]]) extends ICPClassifierModel[LibLinAlg] { val cachedICPs = icps.cache override def mondrianPv(features: Vector) = { cachedICPs .flatMap { icp => icp.mondrianPv(features) .zipWithIndex } .collect //we expect to aggregate up to 100 ICPs .groupBy(_._2) .toArray .sortBy(_._1) .map { case (index, seq) => val sortedSeq = seq.map(_._1).toArray.sorted val n = sortedSeq.length val median = if (n % 2 == 0) { (sortedSeq(n / 2 - 1) + sortedSeq(n / 2)) / 2 } else { sortedSeq(n / 2) } median } } def save(path: String, coalesce: Int = 0) = { var serialICPs = cachedICPs.map(_.toString) if (coalesce > 0) { serialICPs = serialICPs.coalesce(coalesce) } serialICPs.saveAsTextFile(path) } }
Example 76
Source File: LibLinAlg.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.liblinear import org.apache.spark.mllib.classification.SVMModel import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import de.bwaldvogel.liblinear.Feature import de.bwaldvogel.liblinear.FeatureNode import de.bwaldvogel.liblinear.Linear import de.bwaldvogel.liblinear.Parameter import de.bwaldvogel.liblinear.Problem import de.bwaldvogel.liblinear.SolverType import se.uu.farmbio.cp.UnderlyingAlgorithm import se.uu.farmbio.cp.Deserializer object LibLinAlg { private def vectorToFeatures(v: Vector) = { val indices = v.toSparse.indices val values = v.toSparse.values indices .zip(values) .sortBy { case (i, v) => i } .map { case (i, v) => new FeatureNode(i + 1, v) .asInstanceOf[Feature] } } private def train( input: Array[LabeledPoint], solverType: SolverType, c: Double, tol: Double) = { //configure problem val problem = new Problem problem.l = input.length problem.n = input(0).features.size problem.x = input.map { p => vectorToFeatures(p.features) } problem.y = input.map(_.label + 1.0) problem.bias = -1.0 //train val parameter = new Parameter(solverType, c, tol) val libLinModel = Linear.train(problem, parameter) //convert to Spark SVMModel val weights = libLinModel.getFeatureWeights val intercept = libLinModel.getBias val svmModel = new SVMModel(Vectors.dense(weights).toSparse, intercept) svmModel.clearThreshold svmModel } } object LibLinAlgDeserializer extends Deserializer[LibLinAlg] { override def deserialize(alg: String) = { val splitted = alg.split(",", 2) val intercept = splitted(0) val weights = splitted(1) val model = new SVMModel(Vectors.parse(weights).toSparse, intercept.toDouble) model.clearThreshold() new LibLinAlg(model) } } class LibLinAlg( val svmModel: SVMModel) extends UnderlyingAlgorithm( (features: Vector) => svmModel.predict(features)) { def this( training: Array[LabeledPoint], solverType: SolverType, regParam: Double, tol: Double) = { this(LibLinAlg.train(training, solverType, regParam, tol)) } override def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { score } else { -score } } override def toString = { this.svmModel.intercept + "," + this.svmModel.weights.toString } }
Example 77
package se.uu.farmbio.cp.alg import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.mllib.tree.loss.LogLoss import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.UnderlyingAlgorithm //Define a GBTs UnderlyingAlgorithm private object GBT { def trainingProcedure( input: RDD[LabeledPoint], numIterations: Int): (Vector => Double) = { //Configuration val boostingStrategy = BoostingStrategy.defaultParams("Regression") boostingStrategy.numIterations = numIterations boostingStrategy.treeStrategy.maxDepth = 5 boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]() boostingStrategy.loss = LogLoss //Training val remappedInput = input.map(x => new LabeledPoint((x.label * 2) - 1, x.features)) val model = new GradientBoostedTrees(boostingStrategy) .run(input = remappedInput) model.predict } } class GBT( private val input: RDD[LabeledPoint], private val numIterations: Int) extends UnderlyingAlgorithm( GBT.trainingProcedure(input,numIterations)) { override def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { -score } else { score } } }
Example 78
Source File: LogisticRegression.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.alg import org.apache.spark.mllib.classification.LogisticRegressionModel import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.optimization.LBFGS import org.apache.spark.mllib.optimization.LogisticGradient import org.apache.spark.mllib.optimization.SquaredL2Updater import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.UnderlyingAlgorithm //Define a LogisticRegression UnderlyingAlgorithm private object LogisticRegression { def trainingProcedure( input: RDD[LabeledPoint], maxNumItearations: Int, regParam: Double, numCorrections: Int, convergenceTol: Double): (Vector => Double) = { //Train Logistic Regression with LBFGS val numFeatures = input.take(1)(0).features.size val training = input.map(x => (x.label, MLUtils.appendBias(x.features))).cache() val initialWeightsWithIntercept = Vectors.dense(new Array[Double](numFeatures + 1)) val (weightsWithIntercept, _) = LBFGS.runLBFGS( training, new LogisticGradient(), new SquaredL2Updater(), numCorrections, convergenceTol, maxNumItearations, regParam, initialWeightsWithIntercept) //Create the model using the weights val model = new LogisticRegressionModel( Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)), weightsWithIntercept(weightsWithIntercept.size - 1)) //Return raw score predictor model.clearThreshold() model.predict } } class LogisticRegression( private val input: RDD[LabeledPoint], private val maxNumItearations: Int = 100, private val regParam: Double = 0.1, private val numCorrections: Int = 10, private val convergenceTol: Double = 1e-4) extends UnderlyingAlgorithm( LogisticRegression.trainingProcedure( input, maxNumItearations, regParam, numCorrections, convergenceTol)) { override def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { 1-score } else { score } } }
Example 79
package org.dizhang.seqspark.stat import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} import org.apache.spark.mllib.feature.{PCA => SPCA} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.dizhang.seqspark.ds.{DenseCounter, Genotype, SparseCounter} import org.dizhang.seqspark.util.General._ import org.dizhang.seqspark.worker.Data import org.slf4j.LoggerFactory } def pc(n: Int): BDM[Double] = { val model = new SPCA(n) val data = this.prepare if (data.isEmpty()) { new BDM[Double](0, 0) } else { val res = model.fit(data).pc.values new BDM(res.length/n, n, res) } } }
Example 80
Source File: Util.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql._ import org.apache.spark.sql.types.{StringType, StructField, StructType} object Util { val PATH = "/home/ubuntu/work/spark-2.0.0-bin-hadoop2.7/" val DATA_PATH= "../../../data/ml-100k" val PATH_MOVIES = DATA_PATH + "/u.item" def reduceDimension2(x: Vector) : String= { var i = 0 var l = x.toArray.size var l_2 = l/2.toInt var x_ = 0.0 var y_ = 0.0 for(i <- 0 until l_2) { x_ += x(i).toDouble } for(i <- (l_2 + 1) until l) { y_ += x(i).toDouble } var t = x_ + "," + y_ return t } def getMovieDataDF(spark : SparkSession) : DataFrame = { //1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995) // |0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0 val customSchema = StructType(Array( StructField("id", StringType, true), StructField("name", StringType, true), StructField("date", StringType, true), StructField("url", StringType, true))); val movieDf = spark.read.format("com.databricks.spark.csv") .option("delimiter", "|").schema(customSchema) .load(PATH_MOVIES) return movieDf } }
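For reference, reduceDimension2 above collapses a vector into an "x,y" string by summing indices 0 until l/2 into x and indices l/2+1 until l into y (index l/2 itself is skipped by the second loop). A small sketch:

import org.apache.spark.mllib.linalg.Vectors

// 1.0 + 2.0 = 3.0 goes into x; index 2 is skipped; 4.0 goes into y
println(Util.reduceDimension2(Vectors.dense(1.0, 2.0, 3.0, 4.0)))   // prints "3.0,4.0"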
Example 81
Source File: LDATextExample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.lda import scala.collection.mutable import org.apache.spark.mllib.clustering.LDA import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext object LDATextExample { val PATH = "/home/ubuntu/work/spark-src/spark/" val sc = new SparkContext("local[2]", "First Spark App") def main(args: Array[String]): Unit = { // Load documents from text files, 1 document per file val corpus: RDD[String] = sc.wholeTextFiles(PATH + "docs/*.md").map(_._2) // Split each document into a sequence of terms (words) val tokenized: RDD[Seq[String]] = corpus.map(_.toLowerCase.split("\\s")).map(_.filter(_.length > 3). filter(_.forall(java.lang.Character.isLetter))) // Choose the vocabulary. // termCounts: Sorted list of (term, termCount) pairs val termCounts: Array[(String, Long)] = tokenized.flatMap(_.map(_ -> 1L)).reduceByKey(_ + _).collect().sortBy(-_._2) // vocabArray: Chosen vocab (removing common terms) val numStopwords = 20 val vocabArray: Array[String] = termCounts.takeRight(termCounts.size - numStopwords).map(_._1) // vocab: Map term -> term index val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap // Convert documents into term count vectors val documents: RDD[(Long, Vector)] = tokenized.zipWithIndex.map { case (tokens, id) => val counts = new mutable.HashMap[Int, Double]() tokens.foreach { term => if (vocab.contains(term)) { val idx = vocab(term) counts(idx) = counts.getOrElse(idx, 0.0) + 1.0 } } (id, Vectors.sparse(vocab.size, counts.toSeq)) } // Set LDA parameters val numTopics = 10 val lda = new LDA().setK(numTopics).setMaxIterations(10) val ldaModel = lda.run(documents) //val avgLogLikelihood = ldaModel. / documents.count() // Print topics, showing top-weighted 10 terms for each topic. val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 10) topicIndices.foreach { case (terms, termWeights) => println("TOPIC:") terms.zip(termWeights).foreach { case (term, weight) => println(s"${vocabArray(term.toInt)}\t$weight") } println() } } }
Example 82
Source File: SparkSVDExampleOne.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package linalg.svd import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition, Vector, Vectors} object SparkSVDExampleOne { def main(args: Array[String]) { val denseData = Seq( Vectors.dense(0.0, 1.0, 2.0, 1.0, 5.0, 3.3, 2.1), Vectors.dense(3.0, 4.0, 5.0, 3.1, 4.5, 5.1, 3.3), Vectors.dense(6.0, 7.0, 8.0, 2.1, 6.0, 6.7, 6.8), Vectors.dense(9.0, 0.0, 1.0, 3.4, 4.3, 1.0, 1.0) ) val spConfig = (new SparkConf).setMaster("local").setAppName("SparkSVDDemo") val sc = new SparkContext(spConfig) val mat: RowMatrix = new RowMatrix(sc.parallelize(denseData, 2)) // Compute the top 20 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(7, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. println("U:" + U) println("s:" + s) println("V:" + V) sc.stop() } }
Example 83
Source File: PerCoordinateUpdater.scala From spark-fm with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import org.apache.spark.mllib.linalg.Vector abstract class PerCoordinateUpdater extends Serializable { def compute( weightsOld: Vector, gradient: Vector, alpha: Double, beta: Double, l1: Double, l2: Double, n: Vector, z: Vector): (Vector, Double, Vector, Vector) }
Example 84
Source File: SVDExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") } } // scalastyle:on println
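As a follow-on sketch, the factors can be multiplied back together to check the decomposition, A ≈ U * diag(s) * V^T. This could be appended inside main after the factors are computed.

import org.apache.spark.mllib.linalg.Matrices

// Reassemble an approximation of the original matrix from the three factors
val approx = U.multiply(Matrices.diag(s)).multiply(V.transpose)
println("U * diag(s) * V^T:")
approx.rows.collect().foreach(println)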
Example 85
Source File: BisectingKMeansExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib // scalastyle:off println import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.BisectingKMeans import org.apache.spark.mllib.linalg.{Vector, Vectors} // $example off$ object BisectingKMeansExample { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample") val sc = new SparkContext(sparkConf) // $example on$ // Loads and parses data def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble)) val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache() // Clustering the data into 6 clusters by BisectingKMeans. val bkm = new BisectingKMeans().setK(6) val model = bkm.run(data) // Show the compute cost and the cluster centers println(s"Compute Cost: ${model.computeCost(data)}") model.clusterCenters.zipWithIndex.foreach { case (center, idx) => println(s"Cluster Center ${idx}: ${center}") } // $example off$ sc.stop() } } // scalastyle:on println
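Once trained, the model can also assign new points to clusters. A short sketch with a made-up 3-dimensional point (matching the dimensionality of kmeans_data.txt), which could be appended before sc.stop():

val newPoint = Vectors.dense(0.25, 0.25, 0.25)
println(s"New point assigned to cluster ${model.predict(newPoint)}")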
Example 86
Source File: Normalizer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
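A standalone usage sketch of the Normalizer shown above, using the public constructor (p defaults to 2):

import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.linalg.Vectors

val v = Vectors.dense(3.0, 0.0, 4.0)
val l2 = new Normalizer()                          // p = 2
val lInf = new Normalizer(Double.PositiveInfinity) // p = infinity

println(l2.transform(v))    // [0.6, 0.0, 0.8]  (divided by the L2 norm, 5.0)
println(lInf.transform(v))  // [0.75, 0.0, 1.0] (divided by the max absolute value, 4.0)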
Example 87
Source File: GaussianMixtureModelWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import scala.collection.JavaConverters import org.apache.spark.SparkContext import org.apache.spark.mllib.clustering.GaussianMixtureModel import org.apache.spark.mllib.linalg.{Vector, Vectors} val gaussians: Array[Byte] = { val modelGaussians = model.gaussians.map { gaussian => Array[Any](gaussian.mu, gaussian.sigma) } SerDe.dumps(JavaConverters.seqAsJavaListConverter(modelGaussians).asJava) } def predictSoft(point: Vector): Vector = { Vectors.dense(model.predictSoft(point)) } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 88
Source File: Word2VecModelWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import java.util.{List => JList, Map => JMap} import scala.collection.JavaConverters._ import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.feature.Word2VecModel import org.apache.spark.mllib.linalg.{Vector, Vectors} def findSynonyms(vector: Vector, num: Int): JList[Object] = { prepareResult(model.findSynonyms(vector, num)) } private def prepareResult(result: Array[(String, Double)]) = { val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } def getVectors: JMap[String, JList[Float]] = { model.getVectors.map { case (k, v) => (k, v.toList.asJava) }.asJava } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 89
Source File: PearsonCorrelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i +=1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i +=1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
Example 90
Source File: SpearmanCorrelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Example 91
Source File: GLMClassificationModel.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.classification.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{Row, SparkSession} def loadData(sc: SparkContext, path: String, modelClass: String): Data = { val dataPath = Loader.dataPath(path) val spark = SparkSession.builder().sparkContext(sc).getOrCreate() val dataRDD = spark.read.parquet(dataPath) val dataArray = dataRDD.select("weights", "intercept", "threshold").take(1) assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath") val data = dataArray(0) assert(data.size == 3, s"Unable to load $modelClass data from: $dataPath") val (weights, intercept) = data match { case Row(weights: Vector, intercept: Double, _) => (weights, intercept) } val threshold = if (data.isNullAt(2)) { None } else { Some(data.getDouble(2)) } Data(weights, intercept, threshold) } } }
Example 92
Source File: LabeledPoint.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = { LabeledPoint(point.label, Vectors.fromML(point.features)) } }
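Both input formats handled by parse above can be exercised directly; a small sketch:

import org.apache.spark.mllib.regression.LabeledPoint

// Format written by LabeledPoint.toString: "(label,features)"
val sparse = LabeledPoint.parse("(1.0,(3,[0,2],[1.5,2.5]))")
// Legacy dense format used before v1.0: "label,f1 f2 f3"
val dense = LabeledPoint.parse("0.0,1.0 2.0 3.0")
println(sparse)
println(dense)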
Example 93
Source File: GLMRegressionModel.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{Row, SparkSession} def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = { val dataPath = Loader.dataPath(path) val spark = SparkSession.builder().sparkContext(sc).getOrCreate() val dataRDD = spark.read.parquet(dataPath) val dataArray = dataRDD.select("weights", "intercept").take(1) assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath") val data = dataArray(0) assert(data.size == 2, s"Unable to load $modelClass data from: $dataPath") data match { case Row(weights: Vector, intercept: Double) => assert(weights.size == numFeatures, s"Expected $numFeatures features, but" + s" found ${weights.size} features when loading $modelClass weights from $dataPath") Data(weights, intercept) } } } }
Example 94
Source File: IDFSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 95
Source File: PCASuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } }
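The same feature transformer can also project onto fewer components than the full rank; a sketch that would fit inside the test body, where dataRDD is defined:

// Keep only the top 2 principal components
val pca2 = new PCA(2).fit(dataRDD)
pca2.transform(dataRDD).collect().foreach(println)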
Example 96
Source File: TimeSeriesSmallModelRegressionMetrics.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.ml.linalg.Vectors import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.stat.{MultivariateOnlineSummarizer, MultivariateStatisticalSummary} class TimeSeriesSmallModelRegressionMetrics( idPredictionsAndObservations: Array[(Double, Double)] ) { private lazy val summary: MultivariateStatisticalSummary = idPredictionsAndObservations.map { case (observation, prediction) => Vectors.dense(observation, observation - prediction) }.aggregate(new MultivariateOnlineSummarizer())( (summary, current) => summary.add(org.apache.spark.mllib.linalg.Vectors.fromML(current)), (sum1, sum2) => sum1.merge(sum2) ) private lazy val SSerr = math.pow(summary.normL2(1), 2) private lazy val SStot = summary.variance(0) * (summary.count - 1) private lazy val SSreg = { val yMean = summary.mean(0) idPredictionsAndObservations.map { case (prediction, observation) => math.pow(prediction - yMean, 2) }.sum } def explainedVariance = SSreg / summary.count def meanAbsoluteError = summary.normL1(1) / summary.count def meanSquaredError = SSerr / summary.count def rootMeanSquaredPercentageError = math.sqrt(idPredictionsAndObservations.map { case (observation, prediction) => if (observation == 0) { 0 } else { Math.pow((observation - prediction) / observation, 2) } }.sum / summary.count) def rootMeanSquaredError = math.sqrt(meanSquaredError) def r2 = 1 - (SSerr / SStot) }
Example 97
Source File: UberHoltWintersModel.scala From uberdata with Apache License 2.0 | 5 votes |
package com.cloudera.sparkts.models import org.apache.spark.mllib.linalg.Vector class UberHoltWintersModel(override val period: Int, override val alpha: Double, override val beta: Double, override val gamma: Double, override val modelType: String = "additive") extends HoltWintersModel(modelType,period, alpha, beta, gamma) { lazy val params = Map( "HoltWintersAlpha" -> alpha.toString, "HoltWintersBeta" -> beta.toString, "HoltWintersGamma" -> gamma.toString ) } object UberHoltWintersModel { def fitModelWithBOBYQA(ts: Vector, m: Int, modelType: String = "additive"): UberHoltWintersModel = { val model = HoltWinters.fitModelWithBOBYQA(ts, m, modelType) new UberHoltWintersModel( m, model.alpha, model.beta, model.gamma ) } }
Example 98
Source File: UberArimaModel.scala From uberdata with Apache License 2.0 | 5 votes |
package com.cloudera.sparkts.models import org.apache.spark.mllib.linalg.Vector class UberArimaModel(override val p: scala.Int, override val d: scala.Int, override val q: scala.Int, override val coefficients: scala.Array[scala.Double], override val hasIntercept: scala.Boolean = true) extends ARIMAModel(p, q, d, coefficients, hasIntercept) { lazy val params = Map("ArimaP" -> p.toString, "ArimaD" -> d.toString, "ArimaQ" -> q.toString) } object UberArimaModel { def fitModel(p: Int, d: Int, q: Int, ts: Vector, includeIntercept: Boolean = true, method: String = "css-cgd", userInitParams: Array[Double] = null): UberArimaModel = { val model = ARIMA.fitModel(p, d, q, ts, includeIntercept, method, userInitParams) new UberArimaModel(p, d, q, model.coefficients, model.hasIntercept) } }
Example 99
Source File: RandomProjection.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors.linalg import java.util.Random import breeze.stats.distributions.CauchyDistribution import org.apache.spark.mllib.linalg.{ DenseMatrix, Matrices } import org.apache.spark.mllib.linalg.{ DenseVector, Vector } def generateGaussian(originalDim: Int, projectedDim: Int, random: Random): RandomProjection = { val localMatrix = DenseMatrix.randn(projectedDim, originalDim, random) new RandomProjection(localMatrix) } def generateCauchy(originalDim: Int, projectedDim: Int, random: Random): RandomProjection = { def randc(numRows: Int, numCols: Int): DenseMatrix = { require( numRows.toLong * numCols <= Int.MaxValue, s"$numRows x $numCols dense matrix is too large to allocate" ) val cauchyDistribution = new CauchyDistribution(0, 1) new DenseMatrix(numRows, numCols, cauchyDistribution.drawMany(numRows * numCols)) } val localMatrix = randc(projectedDim, originalDim) new RandomProjection(localMatrix) } }
Example 100
Source File: ProbabilisticClassifierConfig.scala From pu4spark with Apache License 2.0 | 5 votes |
package ru.ispras.pu4spark import org.apache.spark.ml.classification._ import org.apache.spark.mllib.linalg.Vector sealed trait ProbabilisticClassifierConfig case class LogisticRegressionConfig(maxIter: Int = 100, regParam: Double = 1.0e-8, elasticNetParam: Double = 0.0) extends ProbabilisticClassifierConfig { def build(): ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel] = { new LogisticRegression() .setLabelCol(ProbabilisticClassifierConfig.labelName).setFeaturesCol(ProbabilisticClassifierConfig.featuresName) .setMaxIter(maxIter).setRegParam(regParam).setElasticNetParam(elasticNetParam) } } case class RandomForestConfig(numTrees: Int = 512) extends ProbabilisticClassifierConfig { def build(): ProbabilisticClassifier[Vector, RandomForestClassifier, RandomForestClassificationModel] = { new RandomForestClassifier() .setLabelCol(ProbabilisticClassifierConfig.labelName).setFeaturesCol(ProbabilisticClassifierConfig.featuresName) .setNumTrees(numTrees) } } object ProbabilisticClassifierConfig { val labelName = "label" val featuresName = "indexedFeatures" val subclasses = List(classOf[LogisticRegressionConfig], classOf[RandomForestConfig]) }
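A short sketch of how one of these configs would be materialized into a Spark ML estimator; trainingDf is a hypothetical DataFrame prepared elsewhere that carries the "label" and "indexedFeatures" columns named above.

// Build the estimator from its configuration
val rf = RandomForestConfig(numTrees = 128).build()
// rf.fit(trainingDf)   // trainingDf is assumed to exist with the expected columns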
Example 101
Source File: Test_example_CNN.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package tests

import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.storage.StorageLevel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.{ Vector, Vectors }
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import breeze.linalg.{ Matrix => BM, CSCMatrix => BSM, DenseMatrix => BDM, Vector => BV, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy, svd => brzSvd, max => Bmax, min => Bmin, sum => Bsum }
import scala.collection.mutable.ArrayBuffer
import CNN.CNN

object Test_example_CNN {
  def main(args: Array[String]) {
    // 1. Set up the Spark context
    val conf = new SparkConf().setAppName("CNNtest")
    val sc = new SparkContext(conf)
    // 2. Load the data: each line holds a 10-element one-hot label followed by a 28x28 image
    Logger.getRootLogger.setLevel(Level.WARN)
    val data_path = "/deeplearn/train_d3.txt"
    val examples = sc.textFile(data_path).cache()
    val train_d1 = examples.map { line =>
      val f1 = line.split("\t")
      val f = f1.map(f => f.toDouble)
      val y = f.slice(0, 10)
      val x = f.slice(10, f.length)
      (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0)
    }
    val train_d = train_d1.map(f => (f._1, f._2))
    // 3. Set the training options and build the CNN model
    // opts: training options passed to CNNtrain (values carried over from the original example)
    val opts = Array(50.0, 1.0, 0.0)
    train_d.cache
    val numExamples = train_d.count()
    println(s"numExamples = $numExamples.")
    val CNNmodel = new CNN().
      setMapsize(new BDM(1, 2, Array(28.0, 28.0))).
      setTypes(Array("i", "c", "s", "c", "s")).
      setLayer(5).
      setOnum(10).
      setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)).
      setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)).
      setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)).
      setAlpha(1.0).
      CNNtrain(train_d, opts)
    // 4. Evaluate the model
    val CNNforecast = CNNmodel.predict(train_d)
    val CNNerror = CNNmodel.Loss(CNNforecast)
    println(s"NNerror = $CNNerror.")
    val printf1 = CNNforecast.map(f => (f.label.data, f.predict_label.data)).take(200)
    println("Predicted values:")
    for (i <- 0 until printf1.length) {
      val outi = printf1(i)._2.mkString("\t")
      println(outi)
    }
  }
}
Example 102
Source File: KMeanTest.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector} import scala.util.Random //spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9 //guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15 object ScalableKMeanTest { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}") val sc = new SparkContext(conf) val k = args(0).toInt val dimension = args(1).toInt val recordNum = args(2).toInt val sparsity = args(3).toDouble val iterations = args(4).toInt val means = args(5) val parNumber = args(6).toInt val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => { val ran = new Random() val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray val vec: Vector = new SparseVector(dimension, indexArr, valueArr) vec }).cache() println(args.mkString(", ")) println(data.count() + " records generated") val st = System.nanoTime() val model = if(means == "my") { println("running scalable kmeans") val model = new ScalableKMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } else { println("running mllib kmeans") val model = new KMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } println((System.nanoTime() - st) / 1e9 + " seconds cost") println("final clusters: " + model.clusterCenters.length) println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) sc.stop() } }
Example 103
Source File: lda-script.scala From practical-data-science-with-hadoop-and-spark with Apache License 2.0 | 5 votes |
import collection.JavaConversions._ import scala.collection.mutable import opennlp.tools.tokenize.SimpleTokenizer import opennlp.tools.stemmer.PorterStemmer import org.apache.spark.rdd._ import org.apache.spark.mllib.clustering.{OnlineLDAOptimizer, DistributedLDAModel, LDA} import org.apache.spark.mllib.linalg.{Vector, SparseVector, Vectors} import org.apache.spark.mllib.feature.IDF // add openNLP jar to the Spark Context sc.addJar("opennlp-tools-1.6.0.jar") // Load documents from text files, 1 element (text string) per file val corpus = sc.wholeTextFiles("ohsumed/C*", 20).map(x => x._2) // read stop words from file val stopwordFile = "stop-words.txt" val st_words = sc.textFile(stopwordFile).collect() .flatMap(_.stripMargin.split("\\s+")).map(_.toLowerCase).toSet val stopwords = sc.broadcast(st_words) val minWordLength = 3 val tokenized: RDD[(Long, Array[String])] = corpus.zipWithIndex().map { case (text,id) => val tokenizer = SimpleTokenizer.INSTANCE val stemmer = new PorterStemmer() val tokens = tokenizer.tokenize(text) val words = tokens.filter(w => (w.length >= minWordLength) && (!stopwords.value.contains(w))) .map(w => stemmer.stem(w)) id -> words }.filter(_._2.length > 0) tokenized.cache() val numDocs = tokenized.count() val wordCounts: RDD[(String, Long)] = tokenized.flatMap { case (_, tokens) => tokens.map(_ -> 1L) }.reduceByKey(_ + _) wordCounts.cache() val fullVocabSize = wordCounts.count() val vSize = 10000 val (vocab: Map[String, Int], selectedTokenCount: Long) = { val sortedWC: Array[(String,Long)] = {wordCounts.sortBy(_._2, ascending=false) .take(vSize)} (sortedWC.map(_._1).zipWithIndex.toMap, sortedWC.map(_._2).sum) } val documents = tokenized.map { case (id, tokens) => // Filter tokens by vocabulary, and create word count vector representation of document. val wc = new mutable.HashMap[Int, Int]() tokens.foreach { term => if (vocab.contains(term)) { val termIndex = vocab(term) wc(termIndex) = wc.getOrElse(termIndex, 0) + 1 } } val indices = wc.keys.toArray.sorted val values = indices.map(i => wc(i).toDouble) val sb = Vectors.sparse(vocab.size, indices, values) (id, sb) } val vocabArray = new Array[String](vocab.size) vocab.foreach { case (term, i) => vocabArray(i) = term } val tf = documents.map { case (id, vec) => vec }.cache() val idfVals = new IDF().fit(tf).idf.toArray val tfidfDocs: RDD[(Long, Vector)] = documents.map { case (id, vec) => val indices = vec.asInstanceOf[SparseVector].indices val counts = new mutable.HashMap[Int, Double]() for (idx <- indices) { counts(idx) = vec(idx) * idfVals(idx) } (id, Vectors.sparse(vocab.size, counts.toSeq)) } val numTopics = 5 val numIterations = 50 val lda = new LDA().setK(numTopics).setMaxIterations(numIterations).setOptimizer("online") val ldaModel = lda.run(tfidfDocs) val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 5) topicIndices.foreach { case (terms, termWeights) => println("TOPIC:") terms.zip(termWeights).foreach { case (term, weight) => println(s"${vocabArray(term.toInt)}\t$weight") } println() }
Example 104
Source File: Autoregression.scala From spark-timeseries with Apache License 2.0 | 5 votes |
package com.cloudera.sparkts.models import com.cloudera.sparkts.Lag import com.cloudera.sparkts.MatrixUtil.{matToRowArrs, toBreeze} import org.apache.commons.math3.random.RandomGenerator import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression import org.apache.spark.mllib.linalg.{DenseVector, Vector} object Autoregression { def fitModel(ts: Vector, maxLag: Int, noIntercept: Boolean = false): ARModel = { // This is loosely based off of the implementation in statsmodels: // https://github.com/statsmodels/statsmodels/blob/master/statsmodels/tsa/ar_model.py // Make left hand side val Y = toBreeze(ts)(maxLag until ts.size) // Make lagged right hand side val X = Lag.lagMatTrimBoth(ts, maxLag) val regression = new OLSMultipleLinearRegression() regression.setNoIntercept(noIntercept) // drop intercept in regression regression.newSampleData(Y.toArray, matToRowArrs(X)) val params = regression.estimateRegressionParameters() val (c, coeffs) = if (noIntercept) (0.0, params) else (params.head, params.tail) new ARModel(c, coeffs) } } class ARModel(val c: Double, val coefficients: Array[Double]) extends TimeSeriesModel { def this(c: Double, coef: Double) = this(c, Array(coef)) def removeTimeDependentEffects( ts: Vector, destTs: Vector = null): Vector = { val dest = if (destTs == null) new Array[Double](ts.size) else destTs.toArray var i = 0 while (i < ts.size) { dest(i) = ts(i) - c var j = 0 while (j < coefficients.length && i - j - 1 >= 0) { dest(i) -= ts(i - j - 1) * coefficients(j) j += 1 } i += 1 } new DenseVector(dest) } def addTimeDependentEffects(ts: Vector, destTs: Vector): Vector = { val dest = if (destTs == null) new Array[Double](ts.size) else destTs.toArray var i = 0 while (i < ts.size) { dest(i) = c + ts(i) var j = 0 while (j < coefficients.length && i - j - 1 >= 0) { dest(i) += dest(i - j - 1) * coefficients(j) j += 1 } i += 1 } new DenseVector(dest) } def sample(n: Int, rand: RandomGenerator): Vector = { val vec = new DenseVector(Array.fill[Double](n)(rand.nextGaussian())) addTimeDependentEffects(vec, vec) } }
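A quick end-to-end sketch of the AR model above: fit an AR(1) to a short series, strip out the autoregressive structure, then re-apply it.

import org.apache.spark.mllib.linalg.DenseVector

val series = new DenseVector(Array(1.0, 1.2, 1.5, 1.9, 2.4, 3.0, 3.7, 4.5))
val ar = Autoregression.fitModel(series, maxLag = 1)
println(s"intercept = ${ar.c}, coefficients = ${ar.coefficients.mkString(", ")}")

val residuals = ar.removeTimeDependentEffects(series)
val rebuilt = ar.addTimeDependentEffects(residuals, null)
println(residuals)
println(rebuilt)   // reconstructs the original series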
Example 105
Source File: PythonConnector.scala From spark-timeseries with Apache License 2.0 | 5 votes |
package com.cloudera.sparkts import java.nio.ByteBuffer import java.time._ import scala.collection.mutable.ArrayBuffer import org.apache.spark.mllib.linalg.{DenseVector, Vector} import org.apache.spark.api.java.function.{PairFunction, Function} import PythonConnector._ private object PythonConnector { val INT_SIZE = 4 val DOUBLE_SIZE = 8 val LONG_SIZE = 8 def putVector(buf: ByteBuffer, vec: Vector): Unit = { buf.putInt(vec.size) var i = 0 while (i < vec.size) { buf.putDouble(vec(i)) i += 1 } } def arrayListToSeq(list: java.util.ArrayList[Any]): Seq[Any] = { // implement with ArrayBuffer var result = ArrayBuffer[Any]() if (list != null) { result = ArrayBuffer[Any](list.toArray: _*) } result } } private class BytesToKeyAndSeries extends PairFunction[Array[Byte], String, Vector] { override def call(arr: Array[Byte]): (String, Vector) = { val buf = ByteBuffer.wrap(arr) val keySize = buf.getInt() val keyBytes = new Array[Byte](keySize) buf.get(keyBytes) val seriesSize = buf.getInt() val series = new Array[Double](seriesSize) var i = 0 while (i < seriesSize) { series(i) = buf.getDouble() i += 1 } (new String(keyBytes, "UTF8"), new DenseVector(series)) } } private class KeyAndSeriesToBytes extends Function[(String, Vector), Array[Byte]] { override def call(keyVec: (String, Vector)): Array[Byte] = { val keyBytes = keyVec._1.getBytes("UTF-8") val vec = keyVec._2 val arr = new Array[Byte](INT_SIZE + keyBytes.length + INT_SIZE + DOUBLE_SIZE * vec.size) val buf = ByteBuffer.wrap(arr) buf.putInt(keyBytes.length) buf.put(keyBytes) putVector(buf, vec) arr } } private class InstantToBytes extends Function[(ZonedDateTime, Vector), Array[Byte]] { override def call(instant: (ZonedDateTime, Vector)): Array[Byte] = { val arr = new Array[Byte](LONG_SIZE + INT_SIZE + DOUBLE_SIZE * instant._2.size) val buf = ByteBuffer.wrap(arr) buf.putLong(TimeSeriesUtils.zonedDateTimeToLong(instant._1)) putVector(buf, instant._2) arr } }
Example 106
Source File: Example.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.{SparkConf, SparkContext} import breeze.linalg.{DenseMatrix => BDM, _} object Example { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/train.format", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => (arr(784), Vector2Tensor(Vectors.dense(arr.slice(0, 784))))) val topology = new CNNTopology topology.addLayer(CNNLayer.buildConvolutionLayer(1, 6, new Scale(5, 5))) topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvolutionLayer(6, 12, new Scale(5, 5))) topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvolutionLayer(12, 12, new Scale(4, 4))) val cnn: CNN = new CNN(topology).setMaxIterations(5).setMiniBatchSize(16) val start = System.nanoTime() cnn.trainOneByOne(data) println("Training time: " + (System.nanoTime() - start) / 1e9) val right = data.map(record =>{ val result = cnn.predict(record._2) if(result == record._1) 1 else 0 }).sum() println(s"Predicting precision: $right " + right.toDouble/(data.count())) // val testData = sc.textFile("dataset/mnist/mnist_test.csv", 8) // .map(line => line.split(",")).map(arr => arr.map(_.toDouble)) // .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if(v > 200) 1.0 else 0))))) val rightM = data.map(record =>{ val result = cnn.predict(record._2) if(result == record._1) 1 else 0 }).sum() println(s"Mnist Full Predicting precision: $rightM " + rightM.toDouble/(data.count())) } def Vector2Tensor(record: Vector): Array[BDM[Double]] = { val mapSize = new Scale(28, 28) val m = new BDM[Double](mapSize.x, mapSize.y) var i: Int = 0 while (i < mapSize.x) { var j: Int = 0 while (j < mapSize.y) { m(i, j) = record(mapSize.x * i + j) j += 1 } i += 1 } Array(m) } }
Example 107
Source File: Driver.scala From mCNN with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import org.apache.log4j.{Logger, Level} import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.{SparkContext, SparkConf} object CNNDriver { def main(args: Array[String]) { val myLayers = new Array[Layer](8) myLayers(0) = new ConvolutionalLayer(1, 6, kernelSize = new MapSize(5, 5), inputMapSize = new MapSize(28, 28)) myLayers(1) = new FunctionalLayer(new SigmoidFunction()) myLayers(2) = new MeanPoolingLayer(new MapSize(2, 2), new MapSize(24, 24)) myLayers(3) = new ConvolutionalLayer(6, 12, new MapSize(5, 5), new MapSize(12, 12)) myLayers(4) = new FunctionalLayer(new SigmoidFunction()) myLayers(5) = new MeanPoolingLayer(new MapSize(2, 2), new MapSize(8, 8)) myLayers(6) = new ConvolutionalLayer(12, 12, new MapSize(4, 4), new MapSize(4, 4)) myLayers(7) = new FunctionalLayer(new SigmoidFunction()) val topology = FeedForwardTopology(myLayers) Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/train.format", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => { val target = new Array[Double](12) target(arr(784).toInt) = 1 val in = Vector2BDM(Vectors.dense(arr.slice(0, 784))) (Vectors.fromBreeze(in.toDenseVector), Vectors.dense(target)) }).cache() val feedForwardTrainer = new FeedForwardTrainer(topology, 784, 12) feedForwardTrainer.setStackSize(4) // CNN does not benefit from the stacked data // .LBFGSOptimizer.setNumIterations(20) .SGDOptimizer .setMiniBatchFraction(0.002) .setConvergenceTol(0) .setNumIterations(1000) .setUpdater(new CNNUpdater(0.85)) for(iter <- 1 to 1000){ val start = System.nanoTime() val mlpModel = feedForwardTrainer.train(data) feedForwardTrainer.setWeights(mlpModel.weights()) println(s"Training time $iter: " + (System.nanoTime() - start) / 1e9) // predict val right = data.filter(v => mlpModel.predict(v._1).argmax == v._2.argmax).count() val precision = right.toDouble / data.count() println(s"right: $right, count: ${data.count()}, precision: $precision") } } def Vector2BDM(record: Vector): BDM[Double] = { val mapSize = new MapSize(28, 28) val m = new BDM[Double](mapSize.x, mapSize.y) var i: Int = 0 while (i < mapSize.x) { var j: Int = 0 while (j < mapSize.y) { m(i, j) = record(mapSize.x * i + j) j += 1 } i += 1 } m } }
Example 108
Source File: CNNUpdater.scala From mCNN with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import breeze.linalg.{*, DenseMatrix => BDM, DenseVector => BDV, Vector => BV, axpy => Baxpy, sum => Bsum} import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.optimization.Updater private[ann] class CNNUpdater(alpha: Double) extends Updater { override def compute( weightsOld: Vector, gradient: Vector, stepSize: Double, iter: Int, regParam: Double): (Vector, Double) = { val thisIterStepSize = stepSize val brzWeights: BV[Double] = weightsOld.toBreeze.toDenseVector Baxpy(-thisIterStepSize, gradient.toBreeze * alpha, brzWeights) (Vectors.fromBreeze(brzWeights), 0) } }
Example 109
Source File: SVDExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") } } // scalastyle:on println
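The SVD above can also be used for dimensionality reduction: multiplying the original RowMatrix by the V factor projects every row Vector onto the top singular vectors. The following is a minimal sketch of that idea, not part of the original example; it assumes a SparkContext named sc is already in scope.

import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix

// Reuse the same three rows as the example above.
val rows = sc.parallelize(Seq(
  Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
  Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
  Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)), 2)
val mat = new RowMatrix(rows)

// Keep only the top 2 singular vectors and project every row onto them.
val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(2, computeU = false)
val projected: RowMatrix = mat.multiply(svd.V) // each row is now a 2-dimensional Vector
projected.rows.collect().foreach(println)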
Example 110
Source File: BisectingKMeansExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib // scalastyle:off println import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.BisectingKMeans import org.apache.spark.mllib.linalg.{Vector, Vectors} // $example off$ object BisectingKMeansExample { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample") val sc = new SparkContext(sparkConf) // $example on$ // Loads and parses data def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble)) val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache() // Clustering the data into 6 clusters by BisectingKMeans. val bkm = new BisectingKMeans().setK(6) val model = bkm.run(data) // Show the compute cost and the cluster centers println(s"Compute Cost: ${model.computeCost(data)}") model.clusterCenters.zipWithIndex.foreach { case (center, idx) => println(s"Cluster Center ${idx}: ${center}") } // $example off$ sc.stop() } } // scalastyle:on println
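A trained BisectingKMeansModel can also score individual Vectors. Below is a short sketch, assuming the model and SparkContext from the example above are still in scope; the query point is made up.

import org.apache.spark.mllib.linalg.Vectors

// Assign a single (hypothetical) point to a cluster and look up the matching center.
val point = Vectors.dense(0.1, 0.1, 0.1)
val clusterIndex: Int = model.predict(point)
println(s"Point $point belongs to cluster $clusterIndex, center = ${model.clusterCenters(clusterIndex)}")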
Example 111
Source File: Normalizer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
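The transform method above is the heart of mllib's Normalizer; user code just constructs it with a p-norm and applies it to a Vector or an RDD[Vector]. A minimal sketch on local vectors only (no SparkContext needed):

import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.linalg.Vectors

val v = Vectors.dense(3.0, -4.0, 0.0)

val l2 = new Normalizer()        // default p = 2
val l1 = new Normalizer(p = 1.0)

println(l2.transform(v)) // [0.6,-0.8,0.0], i.e. divided by the L2 norm 5.0
println(l1.transform(v)) // roughly [0.4286,-0.5714,0.0], i.e. divided by the L1 norm 7.0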
Example 112
Source File: GaussianMixtureModelWrapper.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import scala.collection.JavaConverters import org.apache.spark.SparkContext import org.apache.spark.mllib.clustering.GaussianMixtureModel import org.apache.spark.mllib.linalg.{Vector, Vectors} val gaussians: Array[Byte] = { val modelGaussians = model.gaussians.map { gaussian => Array[Any](gaussian.mu, gaussian.sigma) } SerDe.dumps(JavaConverters.seqAsJavaListConverter(modelGaussians).asJava) } def predictSoft(point: Vector): Vector = { Vectors.dense(model.predictSoft(point)) } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
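The wrapper above only exposes an already trained GaussianMixtureModel to Python; on the Scala side the same predictSoft call is available directly. A small sketch with made-up one-dimensional data, assuming a SparkContext named sc:

import org.apache.spark.mllib.clustering.GaussianMixture
import org.apache.spark.mllib.linalg.Vectors

val points = sc.parallelize(Seq(
  Vectors.dense(-5.0), Vectors.dense(-4.8), Vectors.dense(-5.2),
  Vectors.dense(5.0), Vectors.dense(4.9), Vectors.dense(5.1)))

val gmm = new GaussianMixture().setK(2).setSeed(42L).run(points)

// predictSoft returns the membership probability for each of the k Gaussians.
val soft: Array[Double] = gmm.predictSoft(Vectors.dense(4.95))
println(soft.mkString(", "))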
Example 113
Source File: Word2VecModelWrapper.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import java.util.{List => JList, Map => JMap} import scala.collection.JavaConverters._ import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.feature.Word2VecModel import org.apache.spark.mllib.linalg.{Vector, Vectors} def findSynonyms(vector: Vector, num: Int): JList[Object] = { prepareResult(model.findSynonyms(vector, num)) } private def prepareResult(result: Array[(String, Double)]) = { val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } def getVectors: JMap[String, JList[Float]] = { model.getVectors.map { case (k, v) => (k, v.toList.asJava) }.asJava } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
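Note that findSynonyms here is queried with an mllib Vector rather than a word, which allows arbitrary points in the embedding space (for example an averaged document vector). A sketch of the underlying Scala API with a toy corpus, assuming a SparkContext named sc:

import org.apache.spark.mllib.feature.Word2Vec
import org.apache.spark.mllib.linalg.Vector

val sentences = sc.parallelize(Seq(
  "spark is a fast engine", "spark runs on a cluster", "mllib is the ml library of spark"
)).map(_.split(" ").toSeq)

val model = new Word2Vec().setVectorSize(10).setMinCount(1).setSeed(42L).fit(sentences)

val sparkVec: Vector = model.transform("spark") // the embedding of one word, as a Vector
val synonyms = model.findSynonyms(sparkVec, 2)  // query by Vector rather than by word
synonyms.foreach { case (word, similarity) => println(s"$word: $similarity") }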
Example 114
Source File: PearsonCorrelation.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i +=1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i +=1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
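computeCorrelationMatrixFromCovariance is internal; user code normally goes through Statistics.corr, which accepts either two RDD[Double] series or an RDD[Vector] whose rows are observations (pass "spearman" instead of "pearson" to switch method). A minimal sketch with made-up numbers, assuming a SparkContext named sc:

import org.apache.spark.mllib.linalg.{Matrix, Vectors}
import org.apache.spark.mllib.stat.Statistics

// Correlation between two series.
val x = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0))
val y = sc.parallelize(Seq(2.0, 4.0, 6.1, 7.9))
println(Statistics.corr(x, y, "pearson"))

// Correlation matrix over the columns of a Vector-valued RDD (each row is an observation).
val observations = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 199.0),
  Vectors.dense(3.0, 33.0, 310.0)))
val corrMatrix: Matrix = Statistics.corr(observations, "pearson")
println(corrMatrix)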
Example 115
Source File: SpearmanCorrelation.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Example 116
Source File: GLMClassificationModel.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.classification.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{Row, SparkSession} def loadData(sc: SparkContext, path: String, modelClass: String): Data = { val dataPath = Loader.dataPath(path) val spark = SparkSession.builder().sparkContext(sc).getOrCreate() val dataRDD = spark.read.parquet(dataPath) val dataArray = dataRDD.select("weights", "intercept", "threshold").take(1) assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath") val data = dataArray(0) assert(data.size == 3, s"Unable to load $modelClass data from: $dataPath") val (weights, intercept) = data match { case Row(weights: Vector, intercept: Double, _) => (weights, intercept) } val threshold = if (data.isNullAt(2)) { None } else { Some(data.getDouble(2)) } Data(weights, intercept, threshold) } } }
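GLMClassificationModel.SaveLoadV1_0 is the persistence helper behind the public save/load methods of the mllib linear classifiers; the weights column it reads back is an mllib Vector. A sketch of the public round trip, assuming a SparkContext sc, an RDD[LabeledPoint] named training, and a writable (hypothetical) path:

import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithLBFGS}

val model = new LogisticRegressionWithLBFGS().setNumClasses(2).run(training)

// save() writes metadata plus a Parquet file holding (weights: Vector, intercept, threshold).
model.save(sc, "/tmp/lr-model")                                  // hypothetical output path
val restored = LogisticRegressionModel.load(sc, "/tmp/lr-model")

println(restored.weights)   // the weights come back as an mllib Vector
println(restored.intercept)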
Example 117
Source File: LabeledPoint.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = { LabeledPoint(point.label, Vectors.fromML(point.features)) } }
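parse accepts both the "(label,features)" format produced by LabeledPoint.toString and the older comma/space-delimited dense format. A small sketch that runs without a SparkContext:

import org.apache.spark.mllib.regression.LabeledPoint

// Current format: "(label,features)" where the features print like an mllib Vector.
val p1 = LabeledPoint.parse("(1.0,[0.5,0.0,2.0])")

// Pre-1.0 dense format: "label, v0 v1 v2".
val p2 = LabeledPoint.parse("1.0, 0.5 0.0 2.0")

println(p1.label + " " + p1.features)
println(p1 == p2) // true: both strings parse to the same point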
Example 118
Source File: GLMRegressionModel.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{Row, SparkSession} def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = { val dataPath = Loader.dataPath(path) val spark = SparkSession.builder().sparkContext(sc).getOrCreate() val dataRDD = spark.read.parquet(dataPath) val dataArray = dataRDD.select("weights", "intercept").take(1) assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath") val data = dataArray(0) assert(data.size == 2, s"Unable to load $modelClass data from: $dataPath") data match { case Row(weights: Vector, intercept: Double) => assert(weights.size == numFeatures, s"Expected $numFeatures features, but" + s" found ${weights.size} features when loading $modelClass weights from $dataPath") Data(weights, intercept) } } } }
Example 119
Source File: IDFSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
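Outside of the test, the IDF flow is simply fit on an RDD of term-frequency Vectors and then applied either to an RDD or to individual local Vectors. A minimal sketch, assuming a SparkContext named sc:

import org.apache.spark.mllib.feature.IDF
import org.apache.spark.mllib.linalg.{Vector, Vectors}

val tf = sc.parallelize(Seq(
  Vectors.sparse(4, Array(1, 3), Array(1.0, 2.0)),
  Vectors.dense(0.0, 1.0, 2.0, 3.0),
  Vectors.sparse(4, Array(1), Array(1.0))))

val model = new IDF(minDocFreq = 2).fit(tf) // ignore terms seen in fewer than 2 documents

val tfidfRdd = model.transform(tf)          // RDD[Vector]
val tfidfOne: Vector = model.transform(Vectors.dense(0.0, 1.0, 1.0, 0.0)) // local Vector
println(model.idf)
println(tfidfOne)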
Example 120
Source File: PCASuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } }
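The PCA wrapper exercised above can also project a single Vector once fitted. A short sketch with made-up rows, assuming a SparkContext named sc:

import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.{Vector, Vectors}

val rows = sc.parallelize(Seq(
  Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0),
  Vectors.dense(2.0, 1.0, 0.0, 4.0, 6.0),
  Vectors.dense(0.0, 3.0, 3.0, 1.0, 5.0)))

val pca = new PCA(2).fit(rows) // keep the top 2 principal components

val projectedRdd = pca.transform(rows)   // RDD[Vector] of 2-dimensional vectors
val projectedOne: Vector = pca.transform(Vectors.dense(1.0, 1.0, 1.0, 1.0, 1.0))
println(projectedOne)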
Example 121
Source File: MllibHelper.scala From twitter-stream-ml with GNU General Public License v3.0 | 5 votes |
package com.giorgioinf.twtml.spark import java.text.Normalizer import org.apache.spark.Logging import org.apache.spark.mllib.feature.HashingTF import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import scala.math.BigDecimal import twitter4j.Status object MllibHelper extends Logging { val numNumberFeatures = 4 var numRetweetBegin = 100 var numRetweetEnd = 1000 var numTextFeatures = 1000 var hashText = new HashingTF(numTextFeatures) var numFeatures = numTextFeatures + numNumberFeatures var numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray def reset(conf:ConfArguments) { numRetweetBegin = conf.numRetweetBegin numRetweetEnd = conf.numRetweetEnd numTextFeatures = conf.numTextFeatures var hashText = new HashingTF(numTextFeatures) var numFeatures = numTextFeatures + numNumberFeatures var numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray log.debug(s"retweet range: ($numRetweetBegin - $numRetweetEnd), numTextFeatures: $numTextFeatures") } def featurizeText(statuses: Status): SparseVector = { val text = statuses.getRetweetedStatus .getText .toLowerCase // Separate accents from characters and then remove non-unicode // characters val noAccentText = Normalizer .normalize(text, Normalizer.Form.NFD) .replaceAll("\\p{M}", "") // bigrams hashText.transform(text.sliding(2).toSeq) .asInstanceOf[SparseVector] } def featurizeNumbers(statuses: Status): Vector = { val user = statuses.getRetweetedStatus.getUser val created = statuses.getRetweetedStatus.getCreatedAt val timeLeft = (System.currentTimeMillis - created.getTime) Vectors.dense( user.getFollowersCount * Math.pow(10, -12), user.getFavouritesCount * Math.pow(10, -12), user.getFriendsCount * Math.pow(10, -12), timeLeft * Math.pow(10, -14) //retweeted.getURLEntities.length, //retweeted.getUserMentionEntities.length ) } def featurize(statuses: Status): LabeledPoint = { val textFeatures = featurizeText(statuses) val numberFeatures = featurizeNumbers(statuses) val features = Vectors.sparse( numFeatures, textFeatures.indices ++ numberFeatureIndices, textFeatures.values ++ numberFeatures.toArray ) LabeledPoint( statuses.getRetweetedStatus.getRetweetCount.toDouble, features ) } def retweetInterval(statuses: Status, start:Long, end:Long):Boolean = { val n = statuses.getRetweetedStatus.getRetweetCount (n >= start && n <= end) } def filtrate(statuses: Status): Boolean = { ( statuses.isRetweet && //statuses.getLang == "en" && retweetInterval(statuses, numRetweetBegin, numRetweetEnd) ) } }
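featurize above concatenates hashed bigram text features with a few dense numeric features by building one SparseVector whose trailing indices are reserved for the numbers. The sketch below isolates that concatenation pattern; the sizes and numeric values are invented for illustration and no SparkContext is needed.

import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}

val numTextFeatures = 1000
val numberFeatures = Array(0.3, 0.7) // e.g. rescaled follower and friend counts (made up)
val numFeatures = numTextFeatures + numberFeatures.length

val hashText = new HashingTF(numTextFeatures)
val textFeatures = hashText.transform("spark vectors".sliding(2).toSeq).asInstanceOf[SparseVector]

// Hashed text features keep their indices; numeric features occupy the last slots.
val numberIndices = (numTextFeatures until numFeatures).toArray
val combined: Vector = Vectors.sparse(
  numFeatures,
  textFeatures.indices ++ numberIndices,
  textFeatures.values ++ numberFeatures)
println(combined)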
Example 122
Source File: SimpleTextClassificationPipeline.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.ml import scala.beans.BeanInfo import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.{Row, SQLContext} @BeanInfo case class LabeledDocument(id: Long, text: String, label: Double) @BeanInfo case class Document(id: Long, text: String) object SimpleTextClassificationPipeline { def main(args: Array[String]) { val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ // Prepare training documents, which are labeled. val training = sc.parallelize(Seq( LabeledDocument(0L, "a b c d e spark", 1.0), LabeledDocument(1L, "b d", 0.0), LabeledDocument(2L, "spark f g h", 1.0), LabeledDocument(3L, "hadoop mapreduce", 0.0))) // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. val tokenizer = new Tokenizer() .setInputCol("text") .setOutputCol("words") val hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol) .setOutputCol("features") val lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.001) val pipeline = new Pipeline() .setStages(Array(tokenizer, hashingTF, lr)) // Fit the pipeline to training documents. val model = pipeline.fit(training.toDF()) // Prepare test documents, which are unlabeled. val test = sc.parallelize(Seq( Document(4L, "spark i j k"), Document(5L, "l m n"), Document(6L, "spark hadoop spark"), Document(7L, "apache hadoop"))) // Make predictions on test documents. model.transform(test.toDF()) .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => println(s"($id, $text) --> prob=$prob, prediction=$prediction") } sc.stop() } }
Example 123
Source File: DatasetExample.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import java.io.File import com.google.common.io.Files import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext, DataFrame} object DatasetExample { case class Params( input: String = "data/mllib/sample_libsvm_data.txt", dataFormat: String = "libsvm") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DatasetExample") { head("Dataset: an example app using DataFrame as a Dataset for ML.") opt[String]("input") .text(s"input path to dataset") .action((x, c) => c.copy(input = x)) opt[String]("dataFormat") .text("data format: libsvm (default), dense (deprecated in Spark v1.1)") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams).map { params => run(params) }.getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"DatasetExample with $params") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ // for implicit conversions // Load input data val origData: RDD[LabeledPoint] = params.dataFormat match { case "dense" => MLUtils.loadLabeledPoints(sc, params.input) case "libsvm" => MLUtils.loadLibSVMFile(sc, params.input) } println(s"Loaded ${origData.count()} instances from file: ${params.input}") // Convert input data to DataFrame explicitly. val df: DataFrame = origData.toDF() println(s"Inferred schema:\n${df.schema.prettyJson}") println(s"Converted to DataFrame with ${df.count()} records") // Select columns val labelsDf: DataFrame = df.select("label") val labels: RDD[Double] = labelsDf.map { case Row(v: Double) => v } val numLabels = labels.count() val meanLabel = labels.fold(0.0)(_ + _) / numLabels println(s"Selected label column with average value $meanLabel") val featuresDf: DataFrame = df.select("features") val features: RDD[Vector] = featuresDf.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(feat), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") val tmpDir = Files.createTempDir() tmpDir.deleteOnExit() val outputDir = new File(tmpDir, "dataset").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) println(s"Loading Parquet file with UDT from $outputDir.") val newDataset = sqlContext.read.parquet(outputDir) println(s"Schema from Parquet: ${newDataset.schema.prettyJson}") val newFeatures = newDataset.select("features").map { case Row(v: Vector) => v } val newFeaturesSummary = newFeatures.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(feat), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${newFeaturesSummary.mean.toString}") sc.stop() } }
Example 124
Source File: IDF.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StructType def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) val idf = udf { vec: Vector => idfModel.transform(vec) } dataset.withColumn($(outputCol), idf(col($(inputCol)))) } override def transformSchema(schema: StructType): StructType = { validateAndTransformSchema(schema) } override def copy(extra: ParamMap): IDFModel = { val copied = new IDFModel(uid, idfModel) copyValues(copied, extra) } }
Example 125
Source File: BinaryClassificationEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.DoubleType def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol)) .map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() case other => throw new IllegalArgumentException(s"Does not support metric $other.") } metrics.unpersist() metric } override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) }
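Under the hood the evaluator takes element 1 of each rawPrediction Vector and hands (score, label) pairs to BinaryClassificationMetrics, which can also be used directly. A minimal sketch with made-up scores, assuming a SparkContext named sc:

import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

// (score, label) pairs; the score is typically rawPrediction(1) or a probability.
val scoreAndLabels = sc.parallelize(Seq(
  (0.9, 1.0), (0.8, 1.0), (0.4, 0.0), (0.3, 1.0), (0.2, 0.0), (0.1, 0.0)))

val metrics = new BinaryClassificationMetrics(scoreAndLabels)
println(s"areaUnderROC = ${metrics.areaUnderROC()}")
println(s"areaUnderPR  = ${metrics.areaUnderPR()}")
metrics.unpersist()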
Example 126
Source File: Normalizer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.size var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.size var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 127
Source File: PearsonCorrelation.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i +=1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i +=1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
Example 128
Source File: SpearmanCorrelation.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging import org.apache.spark.SparkContext._ import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Example 129
Source File: GLMClassificationModel.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.classification.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{Row, SQLContext} def loadData(sc: SparkContext, path: String, modelClass: String): Data = { val datapath = Loader.dataPath(path) val sqlContext = new SQLContext(sc) val dataRDD = sqlContext.read.parquet(datapath) val dataArray = dataRDD.select("weights", "intercept", "threshold").take(1) assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath") val data = dataArray(0) assert(data.size == 3, s"Unable to load $modelClass data from: $datapath") val (weights, intercept) = data match { case Row(weights: Vector, intercept: Double, _) => (weights, intercept) } val threshold = if (data.isNullAt(2)) { None } else { Some(data.getDouble(2)) } Data(weights, intercept, threshold) } } }
Example 130
Source File: LabeledPoint.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } }
Example 131
Source File: GLMRegressionModel.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{DataFrame, Row, SQLContext} def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = { val datapath = Loader.dataPath(path) val sqlContext = new SQLContext(sc) val dataRDD = sqlContext.read.parquet(datapath) val dataArray = dataRDD.select("weights", "intercept").take(1) assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath") val data = dataArray(0) assert(data.size == 2, s"Unable to load $modelClass data from: $datapath") data match { case Row(weights: Vector, intercept: Double) => assert(weights.size == numFeatures, s"Expected $numFeatures features, but" + s" found ${weights.size} features when loading $modelClass weights from $datapath") Data(weights, intercept) } } } }
Example 132
Source File: OneHotEncoderSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext { def stringIndexed(): DataFrame = { val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) val df = sqlContext.createDataFrame(data).toDF("id", "label") val indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex") .fit(df) indexer.transform(df) } test("params") { ParamsSuite.checkParams(new OneHotEncoder) } test("OneHotEncoder dropLast = false") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") .setDropLast(false) val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1), vec(2)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0), (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0)) assert(output === expected) } test("OneHotEncoder dropLast = true") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0), (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0)) assert(output === expected) } test("input column with ML attribute") { val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large") val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("size") .select(col("size").as("size", attr.toMetadata())) val encoder = new OneHotEncoder() .setInputCol("size") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("size_is_small").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("size_is_medium").withIndex(1)) } test("input column without ML attribute") { val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("index") val encoder = new OneHotEncoder() .setInputCol("index") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("index_is_0").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("index_is_1").withIndex(1)) } }
Example 133
Source File: Word2VecSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.mllib.feature.{Word2VecModel => OldWord2VecModel} class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new Word2Vec) val model = new Word2VecModel("w2v", new OldWord2VecModel(Map("a" -> Array(0.0f)))) ParamsSuite.checkParams(model) } test("Word2Vec") { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val sentence = "a b " * 100 + "a c " * 10 val numOfWords = sentence.split(" ").size val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" ")) val codes = Map( "a" -> Array(-0.2811822295188904, -0.6356269121170044, -0.3020961284637451), "b" -> Array(1.0309048891067505, -1.29472815990448, 0.22276712954044342), "c" -> Array(-0.08456747233867645, 0.5137411952018738, 0.11731560528278351) ) val expected = doc.map { sentence => Vectors.dense(sentence.map(codes.apply).reduce((word1, word2) => word1.zip(word2).map { case (v1, v2) => v1 + v2 } ).map(_ / numOfWords)) } val docDF = doc.zip(expected).toDF("text", "expected") val model = new Word2Vec() .setVectorSize(3) .setInputCol("text") .setOutputCol("result") .setSeed(42L) .fit(docDF) model.transform(docDF).select("result", "expected").collect().foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1 ~== vector2 absTol 1E-5, "Transformed vector is different with expected.") } } }
Example 134
Source File: VectorAssemblerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row import org.apache.spark.sql.functions.col class VectorAssemblerSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new VectorAssembler) } test("assemble") { import org.apache.spark.ml.feature.VectorAssembler.assemble assert(assemble(0.0) === Vectors.sparse(1, Array.empty, Array.empty)) assert(assemble(0.0, 1.0) === Vectors.sparse(2, Array(1), Array(1.0))) val dv = Vectors.dense(2.0, 0.0) assert(assemble(0.0, dv, 1.0) === Vectors.sparse(4, Array(1, 3), Array(2.0, 1.0))) val sv = Vectors.sparse(2, Array(0, 1), Array(3.0, 4.0)) assert(assemble(0.0, dv, 1.0, sv) === Vectors.sparse(6, Array(1, 3, 4, 5), Array(2.0, 1.0, 3.0, 4.0))) for (v <- Seq(1, "a", null)) { intercept[SparkException](assemble(v)) intercept[SparkException](assemble(1.0, v)) } } test("assemble should compress vectors") { import org.apache.spark.ml.feature.VectorAssembler.assemble val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0)) assert(v1.isInstanceOf[SparseVector]) val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0))) assert(v2.isInstanceOf[DenseVector]) } test("VectorAssembler") { val df = sqlContext.createDataFrame(Seq( (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L) )).toDF("id", "x", "y", "name", "z", "n") val assembler = new VectorAssembler() .setInputCols(Array("x", "y", "z", "n")) .setOutputCol("features") assembler.transform(df).select("features").collect().foreach { case Row(v: Vector) => assert(v === Vectors.sparse(6, Array(1, 2, 4, 5), Array(1.0, 2.0, 3.0, 10.0))) } } test("ML attributes") { val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", "safari") val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0) val user = new AttributeGroup("user", Array( NominalAttribute.defaultAttr.withName("gender").withValues("male", "female"), NumericAttribute.defaultAttr.withName("salary"))) val row = (1.0, 0.5, 1, Vectors.dense(1.0, 1000.0), Vectors.sparse(2, Array(1), Array(2.0))) val df = sqlContext.createDataFrame(Seq(row)).toDF("browser", "hour", "count", "user", "ad") .select( col("browser").as("browser", browser.toMetadata()), col("hour").as("hour", hour.toMetadata()), col("count"), // "count" is an integer column without ML attribute col("user").as("user", user.toMetadata()), col("ad")) // "ad" is a vector column without ML attribute val assembler = new VectorAssembler() .setInputCols(Array("browser", "hour", "count", "user", "ad")) .setOutputCol("features") val output = assembler.transform(df) val schema = output.schema val features = AttributeGroup.fromStructField(schema("features")) assert(features.size === 7) val browserOut = features.getAttr(0) assert(browserOut === browser.withIndex(0).withName("browser")) val hourOut = features.getAttr(1) assert(hourOut === hour.withIndex(1).withName("hour")) val countOut = features.getAttr(2) assert(countOut === NumericAttribute.defaultAttr.withName("count").withIndex(2)) val userGenderOut = features.getAttr(3) assert(userGenderOut === user.getAttr("gender").withName("user_gender").withIndex(3)) val userSalaryOut = features.getAttr(4) 
assert(userSalaryOut === user.getAttr("salary").withName("user_salary").withIndex(4)) assert(features.getAttr(5) === NumericAttribute.defaultAttr.withIndex(5)) assert(features.getAttr(6) === NumericAttribute.defaultAttr.withIndex(6)) } }
Example 135
Source File: PolynomialExpansionSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.param.ParamsSuite import org.scalatest.exceptions.TestFailedException import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new PolynomialExpansion) } test("Polynomial expansion with default parameter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val twoDegreeExpansion: Array[Vector] = Array( Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)), Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29), Vectors.dense(new Array[Double](9)), Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0), Vectors.sparse(9, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } test("Polynomial expansion with setter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val threeDegreeExpansion: Array[Vector] = Array( Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8), Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)), Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17), Vectors.dense(new Array[Double](19)), Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8, -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0), Vectors.sparse(19, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } }
Example 136
Source File: IDFSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { def scaleDataWithIDF(dataSet: Array[Vector], model: Vector): Array[Vector] = { dataSet.map { case data: DenseVector => val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y } Vectors.dense(res) case data: SparseVector => val res = data.indices.zip(data.values).map { case (id, value) => (id, value * model(id)) } Vectors.sparse(data.size, res) } } test("params") { ParamsSuite.checkParams(new IDF) val model = new IDFModel("idf", new OldIDFModel(Vectors.dense(1.0))) ParamsSuite.checkParams(model) } test("compute IDF with default parameter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((numOfData + 1.0) / (x + 1.0)) }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("compute IDF with setter") { val numOfFeatures = 4 val data = Array( Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(numOfFeatures, Array(1), Array(1.0)) ) val numOfData = data.size val idf = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0 }) val expected = scaleDataWithIDF(data, idf) val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected") val idfModel = new IDF() .setInputCol("features") .setOutputCol("idfValue") .setMinDocFreq(1) .fit(df) idfModel.transform(df).select("idfValue", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } }
Example 137
Source File: NormalizerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext { @transient var data: Array[Vector] = _ @transient var dataFrame: DataFrame = _ @transient var normalizer: Normalizer = _ @transient var l1Normalized: Array[Vector] = _ @transient var l2Normalized: Array[Vector] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))), Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))), Vectors.sparse(3, Seq()) ) l1Normalized = Array( Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.12765957, -0.23404255, -0.63829787), Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))), Vectors.dense(0.625, 0.07894737, 0.29605263), Vectors.sparse(3, Seq()) ) l2Normalized = Array( Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.184549876, -0.3383414, -0.922749378), Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))), Vectors.dense(0.897906166, 0.113419726, 0.42532397), Vectors.sparse(3, Seq()) ) val sqlContext = new SQLContext(sc) dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(NormalizerSuite.FeatureData)) normalizer = new Normalizer() .setInputCol("features") .setOutputCol("normalized_features") } def collectResult(result: DataFrame): Array[Vector] = { result.select("normalized_features").collect().map { case Row(features: Vector) => features } } def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = { assert((lhs, rhs).zipped.forall { case (v1: DenseVector, v2: DenseVector) => true case (v1: SparseVector, v2: SparseVector) => true case _ => false }, "The vector type should be preserved after normalization.") } def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = { assert((lhs, rhs).zipped.forall { (vector1, vector2) => vector1 ~== vector2 absTol 1E-5 }, "The vector value is not correct after normalization.") } test("Normalization with default parameter") { val result = collectResult(normalizer.transform(dataFrame)) assertTypeOfVector(data, result) assertValues(result, l2Normalized) } test("Normalization with setter") { normalizer.setP(1) val result = collectResult(normalizer.transform(dataFrame)) assertTypeOfVector(data, result) assertValues(result, l1Normalized) } } private object NormalizerSuite { case class FeatureData(features: Vector) }
Example 138
Source File: HashingTFSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.util.Utils class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new HashingTF) } test("hashingTF") { val df = sqlContext.createDataFrame(Seq( (0, "a a b b c d".split(" ").toSeq) )).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) val output = hashingTF.transform(df) val attrGroup = AttributeGroup.fromStructField(output.schema("features")) require(attrGroup.numAttributes === Some(n)) val features = output.select("features").first().getAs[Vector](0) // Assume perfect hash on "a", "b", "c", and "d". def idx(any: Any): Int = Utils.nonNegativeMod(any.##, n) val expected = Vectors.sparse(n, Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0))) assert(features ~== expected absTol 1e-14) } }
Example 139
Source File: IDFSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors, Vector} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 140
Source File: SparkXGBoostClassifierSuite.scala From sparkxgboost with Apache License 2.0 | 5 votes |
package rotationsymmetry.sxgboost import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.VectorIndexer import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql.functions.udf import org.scalatest.FunSuite import rotationsymmetry.sxgboost.loss.LogisticLoss import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext class SparkXGBoostClassifierSuite extends FunSuite with TestData with MLlibTestSparkContext { test("test with simple data") { val rawdata = Seq( LabeledPoint(0, Vectors.dense(0.0, 0.0)), LabeledPoint(0, Vectors.dense(0.0, 0.0)), LabeledPoint(1, Vectors.dense(0.0, 0.0)), LabeledPoint(1, Vectors.dense(1.0, 0.0)), LabeledPoint(1, Vectors.dense(1.0, 0.0)), LabeledPoint(0, Vectors.dense(1.0, 0.0)), LabeledPoint(1, Vectors.dense(0.0, 1.0)), LabeledPoint(1, Vectors.dense(0.0, 1.0)), LabeledPoint(0, Vectors.dense(0.0, 1.0)), LabeledPoint(0, Vectors.dense(1.0, 1.0)), LabeledPoint(0, Vectors.dense(1.0, 1.0)), LabeledPoint(1, Vectors.dense(1.0, 1.0)) ) val data = sqlContext.createDataFrame(sc.parallelize(rawdata, 2)) val truthUDF = udf { feature: Vector => if (feature(0) == feature(1)) 0.0 else 1.0 } val dataWithTruth = data.withColumn("truth", truthUDF(data("features"))) val featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") .setMaxCategories(2) .fit(data) val sparkXGBoostClassifier = new SparkXGBoostClassifier(new LogisticLoss) .setFeaturesCol("indexedFeatures") .setMaxDepth(2) .setNumTrees(1) val sparkXGBoostPipeline = new Pipeline() .setStages(Array(featureIndexer, sparkXGBoostClassifier)) val sXGBoostModel = sparkXGBoostPipeline.fit(data) val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("truth") .setPredictionCol("prediction") .setMetricName("precision") val precision = evaluator.evaluate(sXGBoostModel.transform(dataWithTruth)) assert(precision === 1.0) } }
Example 141
Source File: BasicStatistics.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{Matrix, Vector}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD

object BasicStatistics {
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = null

    val seriesX: RDD[Double] = null // a series
    // must have the same number of partitions and cardinality as seriesX
    val seriesY: RDD[Double] = null

    // Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
    println("pearson:" + correlation)

    // Note that each Vector is a row and not a column.
    val data: RDD[Vector] = null

    // Calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    val correlMatrix: Matrix = Statistics.corr(data, "pearson")
    println("correlMatrix:" + correlMatrix.toString())
  }
}
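The example above imports MultivariateStatisticalSummary but never exercises it. The short sketch below is not part of the original source file; it assumes an already-created SparkContext named sc and uses made-up values to show how that summary API is typically used.

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}

// Three made-up observations, one vector per row.
val observations = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(3.0, 30.0, 300.0)))

// Column-wise summary statistics.
val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
println(summary.mean)        // mean of each column
println(summary.variance)    // variance of each column
println(summary.numNonzeros) // number of non-zero entries per column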
Example 142
Source File: LogisticRegressionWithLBFGSExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object LogisticRegressionWithLBFGSExample {
  def main(args: Array[String]): Unit = {
    // SparkContext setup is not shown in the original snippet; a minimal one is assumed here.
    val conf = new SparkConf().setAppName("LogisticRegressionWithLBFGSExample")
    val sc = new SparkContext(conf)

    // A LabeledPoint is a local vector, dense or sparse, with an associated label.
    val points = Array(
      LabeledPoint(0.0, Vectors.dense(0.245)), LabeledPoint(0.0, Vectors.dense(0.247)),
      LabeledPoint(1.0, Vectors.dense(0.285)), LabeledPoint(1.0, Vectors.dense(0.299)),
      LabeledPoint(1.0, Vectors.dense(0.327)), LabeledPoint(1.0, Vectors.dense(0.347)),
      LabeledPoint(0.0, Vectors.dense(0.356)), LabeledPoint(1.0, Vectors.dense(0.36)),
      LabeledPoint(0.0, Vectors.dense(0.363)), LabeledPoint(1.0, Vectors.dense(0.364)),
      LabeledPoint(0.0, Vectors.dense(0.398)), LabeledPoint(1.0, Vectors.dense(0.4)),
      LabeledPoint(0.0, Vectors.dense(0.409)), LabeledPoint(1.0, Vectors.dense(0.421)),
      LabeledPoint(0.0, Vectors.dense(0.432)), LabeledPoint(1.0, Vectors.dense(0.473)),
      LabeledPoint(1.0, Vectors.dense(0.509)), LabeledPoint(1.0, Vectors.dense(0.529)),
      LabeledPoint(0.0, Vectors.dense(0.561)), LabeledPoint(0.0, Vectors.dense(0.569)),
      LabeledPoint(1.0, Vectors.dense(0.594)), LabeledPoint(1.0, Vectors.dense(0.638)),
      LabeledPoint(1.0, Vectors.dense(0.656)), LabeledPoint(1.0, Vectors.dense(0.816)),
      LabeledPoint(1.0, Vectors.dense(0.853)), LabeledPoint(1.0, Vectors.dense(0.938)),
      LabeledPoint(1.0, Vectors.dense(1.036)), LabeledPoint(1.0, Vectors.dense(1.045)))

    // Create an RDD from the data above
    val spiderRDD = sc.parallelize(points)

    // Train a model on the data (fitting an intercept is meaningful when all predictors can be 0).
    // Logistic regression optimized with L-BFGS; it also supports multiclass classification.
    val lr = new LogisticRegressionWithLBFGS().setIntercept(true)
    val model = lr.run(spiderRDD)

    // Predict the outcome for a spider of size 0.938
    val predict = model.predict(Vectors.dense(0.938))
  }
}
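As a hedged follow-up (not in the original file), one straightforward way to sanity-check the fitted model is to compare its predictions against the training labels; this assumes model and spiderRDD from the example are still in scope.

val predictionAndLabels = spiderRDD.map(p => (model.predict(p.features), p.label))
val trainingAccuracy =
  predictionAndLabels.filter { case (prediction, label) => prediction == label }.count.toDouble /
    spiderRDD.count
println(s"Training accuracy: $trainingAccuracy")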
Example 143
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StructType def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) val idf = udf { vec: Vector => idfModel.transform(vec) } dataset.withColumn($(outputCol), idf(col($(inputCol)))) } override def transformSchema(schema: StructType): StructType = { validateAndTransformSchema(schema) } override def copy(extra: ParamMap): IDFModel = { val copied = new IDFModel(uid, idfModel) copyValues(copied, extra).setParent(parent) } }
Example 144
package org.apache.spark.ml.feature import edu.emory.mathcs.jtransforms.dct._ import org.apache.spark.annotation.Experimental import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors} import org.apache.spark.sql.types.DataType def getInverse: Boolean = $(inverse) setDefault(inverse -> false) override protected def createTransformFunc: Vector => Vector = { vec => val result = vec.toArray val jTransformer = new DoubleDCT_1D(result.length) if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") } override protected def outputDataType: DataType = new VectorUDT }
Example 145
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{StructField, StructType} override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) val pcaOp = udf { pcaModel.transform _ } dataset.withColumn($(outputCol), pcaOp(col($(inputCol)))) } override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[VectorUDT], s"Input column ${$(inputCol)} must be a vector column") require(!schema.fieldNames.contains($(outputCol)), s"Output column ${$(outputCol)} already exists.") val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) StructType(outputFields) } override def copy(extra: ParamMap): PCAModel = { val copied = new PCAModel(uid, pcaModel) copyValues(copied, extra).setParent(parent) } }
Example 146
Source File: BinaryClassificationEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

  def setLabelCol(value: String): this.type = set(labelCol, value)

  // Area under the ROC curve
  setDefault(metricName -> "areaUnderROC")

  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol))
      .map { case Row(rawPrediction: Vector, label: Double) =>
        (rawPrediction(1), label)
      }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val metric = $(metricName) match {
      // An area under ROC of 1.0 corresponds to a perfect classifier
      case "areaUnderROC" => metrics.areaUnderROC()
      // Area under the precision-recall curve
      case "areaUnderPR" => metrics.areaUnderPR()
    }
    metrics.unpersist()
    metric
  }

  override def isLargerBetter: Boolean = $(metricName) match {
    case "areaUnderROC" => true // 1.0 is a perfect classifier, 0.5 is random performance
    case "areaUnderPR" => true  // precision and recall
  }

  override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}
Example 147
Source File: Normalizer.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.size var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.size var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 148
Source File: ChiSqSelector.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import scala.collection.mutable.ArrayBuilder import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { val indices = Statistics.chiSqTest(data) .zipWithIndex.sortBy { case (res, _) => -res.statistic } .take(numTopFeatures) .map { case (_, indices) => indices } .sorted new ChiSqSelectorModel(indices) } }
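A minimal usage sketch for the fit method above; the labeled data is invented and an existing SparkContext sc is assumed. The returned ChiSqSelectorModel can then filter each feature vector down to the selected indices.

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// Tiny, made-up dataset with three (categorical-valued) features.
val trainingData = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(8.0, 0.0, 0.0)),
  LabeledPoint(1.0, Vectors.dense(0.0, 9.0, 6.0)),
  LabeledPoint(1.0, Vectors.dense(0.0, 9.0, 8.0))))

// Keep the two features with the highest chi-squared statistic.
val selector = new ChiSqSelector(2)
val model = selector.fit(trainingData)
val filtered = trainingData.map(lp => LabeledPoint(lp.label, model.transform(lp.features)))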
Example 149
Source File: GaussianMixtureModelWrapper.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import java.util.{List => JList} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.{Vector, Vectors, Matrix} import org.apache.spark.mllib.clustering.GaussianMixtureModel val gaussians: JList[Object] = { val modelGaussians = model.gaussians var i = 0 var mu = ArrayBuffer.empty[Vector] var sigma = ArrayBuffer.empty[Matrix] while (i < k) { mu += modelGaussians(i).mu sigma += modelGaussians(i).sigma i += 1 } List(mu.toArray, sigma.toArray).map(_.asInstanceOf[Object]).asJava } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 150
Source File: Word2VecModelWrapper.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} import scala.collection.JavaConverters._ import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.feature.Word2VecModel import org.apache.spark.mllib.linalg.{Vector, Vectors} def transform(rdd: JavaRDD[String]): JavaRDD[Vector] = { rdd.rdd.map(model.transform) } def findSynonyms(word: String, num: Int): JList[Object] = { val vec = transform(word) findSynonyms(vec, num) } def findSynonyms(vector: Vector, num: Int): JList[Object] = { val result = model.findSynonyms(vector, num) val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } def getVectors: JMap[String, JList[Float]] = { model.getVectors.map({case (k, v) => (k, v.toList.asJava)}).asJava } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 151
Source File: PearsonCorrelation.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i +=1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i +=1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
Example 152
Source File: SpearmanCorrelation.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging import org.apache.spark.SparkContext._ import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Example 153
Source File: GLMClassificationModel.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.classification.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{Row, SQLContext} def loadData(sc: SparkContext, path: String, modelClass: String): Data = { val datapath = Loader.dataPath(path) val sqlContext = SQLContext.getOrCreate(sc) val dataRDD = sqlContext.read.parquet(datapath) val dataArray = dataRDD.select("weights", "intercept", "threshold").take(1) assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath") val data = dataArray(0) assert(data.size == 3, s"Unable to load $modelClass data from: $datapath") val (weights, intercept) = data match { case Row(weights: Vector, intercept: Double, _) => (weights, intercept) } val threshold = if (data.isNullAt(2)) { None } else { Some(data.getDouble(2)) } Data(weights, intercept, threshold) } } }
Example 154
Source File: LabeledPoint.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } }
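A small illustration of the two string formats that parse above accepts; the inputs are hypothetical, not taken from the original file.

// Current "(label,features)" format, with a dense feature vector.
val p1 = LabeledPoint.parse("(1.0,[1.0,0.0,3.0])")

// Dense format used before v1.0: label, then space-separated feature values.
val p2 = LabeledPoint.parse("0.0, 2.0 5.0 1.0")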
Example 155
Source File: GLMRegressionModel.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{DataFrame, Row, SQLContext} def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = { val datapath = Loader.dataPath(path) val sqlContext = SQLContext.getOrCreate(sc) val dataRDD = sqlContext.read.parquet(datapath) val dataArray = dataRDD.select("weights", "intercept").take(1) assert(dataArray.size == 1, s"Unable to load $modelClass data from: $datapath") val data = dataArray(0) assert(data.size == 2, s"Unable to load $modelClass data from: $datapath") data match { case Row(weights: Vector, intercept: Double) => assert(weights.size == numFeatures, s"Expected $numFeatures features, but" + s" found ${weights.size} features when loading $modelClass weights from $datapath") Data(weights, intercept) } } } }
Example 156
Source File: VectorSlicerSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Row, SQLContext} class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") {//参数 val slicer = new VectorSlicer ParamsSuite.checkParams(slicer) //指数 assert(slicer.getIndices.length === 0) //名称 assert(slicer.getNames.length === 0) withClue("VectorSlicer should not have any features selected by default") { intercept[IllegalArgumentException] { slicer.validateParams() } } } test("feature validity checks") {//特征有效性检查 import VectorSlicer._ //如果给定的特征索引是有效的,返回true assert(validIndices(Array(0, 1, 8, 2))) assert(validIndices(Array.empty[Int])) assert(!validIndices(Array(-1))) assert(!validIndices(Array(1, 2, 1))) //如果给定的特征名称有效,返回true assert(validNames(Array("a", "b"))) assert(validNames(Array.empty[String])) assert(!validNames(Array("", "b"))) assert(!validNames(Array("a", "b", "a"))) } test("Test vector slicer") {//测试向量机 val sqlContext = new SQLContext(sc) val data = Array( Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3), Vectors.sparse(5, Seq()) ) // Expected after selecting indices 1, 4 //预计在选择指数1,4 val expected = Array( Vectors.sparse(2, Seq((0, 2.3))), Vectors.dense(2.3, 1.0), Vectors.dense(0.0, 0.0), Vectors.dense(-1.1, 3.3), Vectors.sparse(2, Seq()) ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName) val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]]) val resultAttrs = Array("f1", "f4").map(defaultAttr.withName) val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]]) val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) } val df = sqlContext.createDataFrame(rdd, StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField()))) //VectorSlicer是一个转换器输入特征向量,输出原始特征向量子集. val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") def validateResults(df: DataFrame): Unit = { df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 === vec2) } val resultMetadata = AttributeGroup.fromStructField(df.schema("result")) val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected")) assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) => assert(a === b) } } vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty) validateResults(vectorSlicer.transform(df))//transform主要是用来把 一个 DataFrame 转换成另一个 DataFrame vectorSlicer.setIndices(Array(1)).setNames(Array("f4")) //transform主要是用来把 一个 DataFrame 转换成另一个 DataFrame validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4")) //transform主要是用来把 一个 DataFrame 转换成另一个 DataFrame validateResults(vectorSlicer.transform(df)) } }
Example 157
Source File: DCTSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class DCTTestData(vec: Vector, wantedVec: Vector)

class DCTSuite extends SparkFunSuite with MLlibTestSparkContext {

  // The forward discrete cosine transform should match the jTransforms result
  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }

  // The inverse discrete cosine transform should match the jTransforms result
  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      (new DoubleDCT_1D(data.size)).inverse(expectedResultBuffer, true)
    } else {
      (new DoubleDCT_1D(data.size)).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = sqlContext.createDataFrame(Seq(
      DCTTestData(data, expectedResult)
    ))

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)

    // transform() turns the input DataFrame into another DataFrame
    transformer.transform(dataset)
      .select("resultVec", "wantedVec")
      .collect()
      .foreach { case Row(resultVec: Vector, wantedVec: Vector) =>
        assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
      }
  }
}
Example 158
Source File: MinMaxScalerSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.MLTestingUtils
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SQLContext}

        println(vector1 + "|||" + vector2)
        assert(vector1.equals(vector2), "Transformed vector is different with expected.")
      }
    }

    // copied model must have the same parent.
    MLTestingUtils.checkCopy(model)
  }

  // MinMaxScaler rescales every feature vector linearly into the user-specified [min, max] range
  test("MinMaxScaler arguments max must be larger than min") {
    withClue("arguments max must be larger than min") {
      intercept[IllegalArgumentException] {
        val scaler = new MinMaxScaler().setMin(10).setMax(0)
        scaler.validateParams()
      }
      intercept[IllegalArgumentException] {
        val scaler = new MinMaxScaler().setMin(0).setMax(0)
        scaler.validateParams()
      }
    }
  }
}
Example 159
Source File: PolynomialExpansionSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.param.ParamsSuite import org.scalatest.exceptions.TestFailedException import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class PolynomialExpansionSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") {//参数 ParamsSuite.checkParams(new PolynomialExpansion) } test("Polynomial expansion with default parameter") {//带有默认参数的多项式展开 val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val twoDegreeExpansion: Array[Vector] = Array( Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 2.3, -4.6, 5.29)), Vectors.dense(-2.0, 4.0, 2.3, -4.6, 5.29), Vectors.dense(new Array[Double](9)), Vectors.dense(0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0), Vectors.sparse(9, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(twoDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") //transform()方法将DataFrame转化为另外一个DataFrame的算法 polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } //多项式展开设置 test("Polynomial expansion with setter") { val data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq()) ) val threeDegreeExpansion: Array[Vector] = Array( Vectors.sparse(19, Array(0, 1, 2, 3, 4, 5, 6, 7, 8), Array(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)), Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17), Vectors.dense(new Array[Double](19)), Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331, -3.0, -1.8, -1.08, 3.3, 1.98, -3.63, 9.0, 5.4, -9.9, -27.0), Vectors.sparse(19, Array.empty, Array.empty)) val df = sqlContext.createDataFrame(data.zip(threeDegreeExpansion)).toDF("features", "expected") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) //transform()方法将DataFrame转化为另外一个DataFrame的算法 polynomialExpansion.transform(df).select("polyFeatures", "expected").collect().foreach { case Row(expanded: DenseVector, expected: DenseVector) => assert(expanded ~== expected absTol 1e-1) case Row(expanded: SparseVector, expected: SparseVector) => assert(expanded ~== expected absTol 1e-1) case _ => throw new TestFailedException("Unmatched data types after polynomial expansion", 0) } } }
Example 160
Source File: IDFSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }

  // compute IDF with a user-supplied minimum document frequency
  test("compute IDF with setter") {
    val numOfFeatures = 4
    val data = Array(
      Vectors.sparse(numOfFeatures, Array(1, 3), Array(1.0, 2.0)),
      Vectors.dense(0.0, 1.0, 2.0, 3.0),
      Vectors.sparse(numOfFeatures, Array(1), Array(1.0))
    )
    val numOfData = data.size
    val idf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((numOfData + 1.0) / (x + 1.0)) else 0
    })
    val expected = scaleDataWithIDF(data, idf)

    val df = sqlContext.createDataFrame(data.zip(expected)).toDF("features", "expected")

    // fit() produces a Transformer (an IDFModel) from the DataFrame
    val idfModel = new IDF()
      .setInputCol("features")
      .setOutputCol("idfValue")
      .setMinDocFreq(1)
      .fit(df)

    // transform() turns the input DataFrame into another DataFrame
    idfModel.transform(df).select("idfValue", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
    }
  }
}
Example 161
Source File: NormalizerSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext { @transient var data: Array[Vector] = _ @transient var dataFrame: DataFrame = _ @transient var normalizer: Normalizer = _ @transient var l1Normalized: Array[Vector] = _ @transient var l2Normalized: Array[Vector] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))), Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))), Vectors.sparse(3, Seq()) ) l1Normalized = Array( Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.12765957, -0.23404255, -0.63829787), Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))), Vectors.dense(0.625, 0.07894737, 0.29605263), Vectors.sparse(3, Seq()) ) l2Normalized = Array( Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.184549876, -0.3383414, -0.922749378), Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))), Vectors.dense(0.897906166, 0.113419726, 0.42532397), Vectors.sparse(3, Seq()) ) val sqlContext = new SQLContext(sc) dataFrame = sqlContext.createDataFrame(sc.parallelize(data, 2).map(NormalizerSuite.FeatureData)) normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized_features") } //收集的结果 def collectResult(result: DataFrame): Array[Vector] = { result.select("normalized_features").collect().map { case Row(features: Vector) => features } } //向量的断言类型 def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = { assert((lhs, rhs).zipped.forall { case (v1: DenseVector, v2: DenseVector) => true case (v1: SparseVector, v2: SparseVector) => true case _ => false }, "The vector type should be preserved after normalization.") } //断言值 def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = { assert((lhs, rhs).zipped.forall { (vector1, vector2) => vector1 ~== vector2 absTol 1E-5 }, "The vector value is not correct after normalization.") } test("Normalization with default parameter") {//默认参数的正常化 //transform()方法将DataFrame转化为另外一个DataFrame的算法 normalizer.transform(dataFrame).show() val result = collectResult(normalizer.transform(dataFrame)) assertTypeOfVector(data, result) assertValues(result, l2Normalized) } test("Normalization with setter") {//规范化设置 normalizer.setP(1) //transform()方法将DataFrame转化为另外一个DataFrame的算法 normalizer.transform(dataFrame).show() val result = collectResult(normalizer.transform(dataFrame)) assertTypeOfVector(data, result) assertValues(result, l1Normalized) } } private object NormalizerSuite { case class FeatureData(features: Vector) }
Example 162
Source File: ProbabilisticClassifierSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}

final class TestProbabilisticClassificationModel(
    override val uid: String,
    override val numClasses: Int)
  extends ProbabilisticClassificationModel[Vector, TestProbabilisticClassificationModel] {

  override def copy(extra: org.apache.spark.ml.param.ParamMap): this.type = defaultCopy(extra)

  override protected def predictRaw(input: Vector): Vector = {
    input
  }

  override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = {
    rawPrediction
  }

  def friendlyPredict(input: Vector): Double = {
    predict(input)
  }
}

// Test suite for probabilistic classifiers
class ProbabilisticClassifierSuite extends SparkFunSuite {

  test("test thresholding") {
    val thresholds = Array(0.5, 0.2)
    // Thresholds for binary classification lie in [0, 1]: if the estimated probability of
    // class 1 exceeds its threshold, predict 1, otherwise predict 0.
    val testModel = new TestProbabilisticClassificationModel("myuid", 2).setThresholds(thresholds)
    assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 1.0))) === 1.0)
    assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 0.2))) === 0.0)
  }

  test("test thresholding not required") {
    val testModel = new TestProbabilisticClassificationModel("myuid", 2)
    assert(testModel.friendlyPredict(Vectors.dense(Array(1.0, 2.0))) === 1.0)
  }
}
Example 163
Source File: FeatureExtraction.scala From meetup-stream with Apache License 2.0 | 5 votes |
package transformations import scala.io.Source import core._ import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.spark.broadcast.Broadcast object FeatureExtraction { val localDictionary=Source .fromURL(getClass.getResource("/wordsEn.txt")) .getLines .zipWithIndex .toMap def breakToWords(description: String)={ val wordSelector="""[^\<\>\/]\b([a-zA-Z\d]{4,})\b""".r (wordSelector findAllIn description).map{_.trim.toLowerCase()} } def eventToVector(dictionary: Map[String, Int], description: String): Option[Vector]={ def popularWords(words: Iterator[String])={ val initialWordCounts=collection.mutable.Map[String, Int]() val wordCounts=words. foldLeft(initialWordCounts){ case(wordCounts, word)=> wordCounts+Tuple2(word,wordCounts.getOrElse(word,0)+1) } val wordsIndexes=wordCounts .flatMap{ case(word, count)=>dictionary.get(word).map{index=>(index,count.toDouble)} } val topWords=wordsIndexes.toSeq.sortBy(-1*_._2).take(10) topWords } val wordsIterator = breakToWords(description) val topWords=popularWords(wordsIterator) if (topWords.size==10) Some(Vectors.sparse(dictionary.size,topWords)) else None } }
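A hedged usage sketch showing how the helpers above combine; the event description below is invented. The description is tokenized, the ten most frequent dictionary words are looked up, and a sparse vector is produced only when ten such words are found.

val description =
  "Join us for an evening of talks about streaming analytics, machine learning, " +
  "data pipelines, clustering, recommendation engines and more with fellow engineers."

FeatureExtraction.eventToVector(FeatureExtraction.localDictionary, description) match {
  case Some(vector) => println(s"Feature vector with ${vector.numNonzeros} active terms")
  case None         => println("Fewer than 10 dictionary words found; event skipped")
}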
Example 164
Source File: SVDExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val rows = sc.parallelize(data) val mat: RowMatrix = new RowMatrix(rows) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") sc.stop() } } // scalastyle:on println
Example 165
Source File: BisectingKMeansExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib // scalastyle:off println import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.BisectingKMeans import org.apache.spark.mllib.linalg.{Vector, Vectors} // $example off$ object BisectingKMeansExample { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample") val sc = new SparkContext(sparkConf) // $example on$ // Loads and parses data def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble)) val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache() // Clustering the data into 6 clusters by BisectingKMeans. val bkm = new BisectingKMeans().setK(6) val model = bkm.run(data) // Show the compute cost and the cluster centers println(s"Compute Cost: ${model.computeCost(data)}") model.clusterCenters.zipWithIndex.foreach { case (center, idx) => println(s"Cluster Center ${idx}: ${center}") } // $example off$ sc.stop() } } // scalastyle:on println
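A short follow-on sketch, not part of the original example: once the model above is trained (and before the SparkContext is stopped), new points can be assigned to clusters. The coordinates are made up to match the three-dimensional points in kmeans_data.txt.

// Assign a new 3-dimensional point to one of the 6 clusters.
val cluster = model.predict(Vectors.dense(0.25, 0.25, 0.25))
println(s"New point belongs to cluster $cluster")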
Example 166
Source File: Normalizer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
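A minimal usage sketch for the transform shown above (the values are invented): the same dense vector normalized with the L1 norm and with the default L2 norm.

import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.linalg.Vectors

val v = Vectors.dense(3.0, 4.0)
println(new Normalizer(1.0).transform(v)) // L1: [0.428..., 0.571...]
println(new Normalizer().transform(v))    // L2 (default): [0.6, 0.8]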
Example 167
Source File: GaussianMixtureModelWrapper.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import scala.collection.JavaConverters import org.apache.spark.SparkContext import org.apache.spark.mllib.clustering.GaussianMixtureModel import org.apache.spark.mllib.linalg.{Vector, Vectors} val gaussians: Array[Byte] = { val modelGaussians = model.gaussians.map { gaussian => Array[Any](gaussian.mu, gaussian.sigma) } SerDe.dumps(JavaConverters.seqAsJavaListConverter(modelGaussians).asJava) } def predictSoft(point: Vector): Vector = { Vectors.dense(model.predictSoft(point)) } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 168
Source File: Word2VecModelWrapper.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import java.util.{List => JList, Map => JMap} import scala.collection.JavaConverters._ import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.feature.Word2VecModel import org.apache.spark.mllib.linalg.{Vector, Vectors} def findSynonyms(vector: Vector, num: Int): JList[Object] = { prepareResult(model.findSynonyms(vector, num)) } private def prepareResult(result: Array[(String, Double)]) = { val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } def getVectors: JMap[String, JList[Float]] = { model.getVectors.map { case (k, v) => (k, v.toList.asJava) }.asJava } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 169
Source File: PearsonCorrelation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i +=1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i +=1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
Example 170
Source File: SpearmanCorrelation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 val cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Example 171
Source File: GLMClassificationModel.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.classification.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{Row, SparkSession} def loadData(sc: SparkContext, path: String, modelClass: String): Data = { val dataPath = Loader.dataPath(path) val spark = SparkSession.builder().sparkContext(sc).getOrCreate() val dataRDD = spark.read.parquet(dataPath) val dataArray = dataRDD.select("weights", "intercept", "threshold").take(1) assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath") val data = dataArray(0) assert(data.size == 3, s"Unable to load $modelClass data from: $dataPath") val (weights, intercept) = data match { case Row(weights: Vector, intercept: Double, _) => (weights, intercept) } val threshold = if (data.isNullAt(2)) { None } else { Some(data.getDouble(2)) } Data(weights, intercept, threshold) } } }
Example 172
Source File: LabeledPoint.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.SparkException import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.NumericParser @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = { LabeledPoint(point.label, Vectors.fromML(point.features)) } }
Example 173
Source File: GLMRegressionModel.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression.impl import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader import org.apache.spark.sql.{Row, SparkSession} def loadData(sc: SparkContext, path: String, modelClass: String, numFeatures: Int): Data = { val dataPath = Loader.dataPath(path) val spark = SparkSession.builder().sparkContext(sc).getOrCreate() val dataRDD = spark.read.parquet(dataPath) val dataArray = dataRDD.select("weights", "intercept").take(1) assert(dataArray.length == 1, s"Unable to load $modelClass data from: $dataPath") val data = dataArray(0) assert(data.size == 2, s"Unable to load $modelClass data from: $dataPath") data match { case Row(weights: Vector, intercept: Double) => assert(weights.size == numFeatures, s"Expected $numFeatures features, but" + s" found ${weights.size} features when loading $modelClass weights from $dataPath") Data(weights, intercept) } } } }
Example 174
Source File: IDFSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 175
Source File: PCASuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } test("memory cost computation") { assert(PCAUtil.memoryCost(10, 100) < Int.MaxValue) // check overflowing assert(PCAUtil.memoryCost(40000, 60000) > Int.MaxValue) } }
Example 176
Source File: FeaturesParser.scala From spark-anomaly-detection with MIT License | 5 votes |
package com.micvog.ml import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object FeaturesParser{ def parseFeatures(rawdata: RDD[String]): RDD[Vector] = { val rdd: RDD[Array[Double]] = rawdata.map(_.split(",").map(_.toDouble)) val vectors: RDD[Vector] = rdd.map(arrDouble => Vectors.dense(arrDouble)) vectors } def parseFeaturesWithLabel(cvData: RDD[String]): RDD[LabeledPoint] = { val rdd: RDD[Array[Double]] = cvData.map(_.split(",").map(_.toDouble)) val labeledPoints = rdd.map(arrDouble => new LabeledPoint(arrDouble(0), Vectors.dense(arrDouble.slice(1, arrDouble.length)))) labeledPoints } }
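A hedged usage sketch for the two parsers above; the inline CSV rows are assumptions rather than project data, and an existing SparkContext sc is assumed.

// Unlabeled rows: every comma-separated value becomes a feature.
val featureRows = sc.parallelize(Seq("1.0,2.0", "3.5,4.5"))
val features = FeaturesParser.parseFeatures(featureRows)

// Labeled rows: the first value is the label, the rest are features.
val labeledRows = sc.parallelize(Seq("1,0.5,0.7", "0,1.5,2.7"))
val labeled = FeaturesParser.parseFeaturesWithLabel(labeledRows)

features.collect().foreach(println)
labeled.collect().foreach(println)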
Example 177
Source File: AnomalyDetection$Test.scala From spark-anomaly-detection with MIT License | 5 votes |
package com.micvog.ml import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.scalactic.Equality import org.scalatest.{FlatSpec, FunSuite, Matchers} class AnomalyDetection$Test extends FlatSpec with Matchers with SharedSparkContext { { val point = Vectors.dense(Array(14.8593411857427, 14.9006647394062)) val means = Vectors.dense(Array(14.1122257839456, 14.9977105081362)) val variances = Vectors.dense(Array(1.83263141349452, 1.70974533082878)) "probFunction" should "return correct product value" in { val p = AnomalyDetection.probFunction(point, means, variances) assert(p === 0.0769984879544 +- 0.0001) } "predict" should "predict the anomaly" in { assert(!AnomalyDetection.predict(point, means, variances, 0.05)) } "predict" should "predict non anomaly" in { assert(AnomalyDetection.predict(point, means, variances, 0.08)) } } private def vectorequality() = { new Equality[Vector] { def areEqual(a: Vector, b: Any): Boolean = b match { case v: Vector => v.toArray.zip(a.toArray).map(pair => pair._1 === pair._2 +- 0.001).reduce((a, b) => a && b) case _ => false } } } def trainModel(): AnomalyDetectionModel = { val trainingExamplesFilePath = "./src/test/resources/training.csv" val trainingData = sc.textFile(trainingExamplesFilePath, 2).cache() val trainingRdd = FeaturesParser.parseFeatures(trainingData) new AnomalyDetection().run(trainingRdd) } "run" should "return model with correct mean and variance" in { val model: AnomalyDetectionModel = trainModel() //use scalactic's more relaxing equality implicit val vectorEq = vectorequality() assert(model.means === Vectors.dense(Array(79.9843751617201, 5.13662727300755))) assert(model.variances === Vectors.dense(Array(356.44539323536225, 3.79818173645375))) } "optimize" should "calculate epsilon and F1 score" in { val cvFilePath = "./src/test/resources/cross_val.csv" val cvData = sc.textFile(cvFilePath, 2).cache() val cvPointsRdd: RDD[LabeledPoint] = FeaturesParser.parseFeaturesWithLabel(cvData) val model = trainModel() val optimalModel = new AnomalyDetection().optimize(cvPointsRdd, model) assert(optimalModel.epsilon === 3.382218E-4 +- 0.0000000001) } }
Example 178
import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer} import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import breeze.linalg._ import breeze.plot._ import org.jfree.chart.axis.NumberTickUnit object ROC extends App { val conf = new SparkConf().setAppName("ROC") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext._ import sqlContext.implicits._ val transformedTest = sqlContext.read.parquet("transformedTest.parquet") val labelScores = transformedTest.select("probability", "label").map { case Row(probability:Vector, label:Double) => (probability(1), label) } val bm = new BinaryClassificationMetrics(labelScores, 300) val roc = bm.roc.collect roc.foreach { println } val falsePositives = roc.map { _._1 } val truePositives = roc.map { _._2 } val f = Figure() val p = f.subplot(0) p += plot(falsePositives, truePositives) p.xlabel = "false positives" p.ylabel = "true positives" p.xlim = (0.0, 0.1) p.xaxis.setTickUnit(new NumberTickUnit(0.01)) p.yaxis.setTickUnit(new NumberTickUnit(0.1)) f.refresh f.saveas("roc.png") }
Example 179
Source File: LogisticRegressionDemo.scala From s4ds with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.SaveMode

case class LabelledDocument(fileName: String, text: String, category: String)

object LogisticRegressionDemo extends App {

  val conf = new SparkConf().setAppName("LrTest")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  import sqlContext._
  import sqlContext.implicits._

  // Load the raw spam and ham corpora and tag each document with its category.
  val spamText = sc.wholeTextFiles("spam/*")
  val hamText = sc.wholeTextFiles("ham/*")

  val spamDocuments = spamText.map {
    case (fileName, text) => LabelledDocument(fileName, text, "spam")
  }
  val hamDocuments = hamText.map {
    case (fileName, text) => LabelledDocument(fileName, text, "ham")
  }

  val documentsDF = spamDocuments.union(hamDocuments).toDF
  documentsDF.persist
  val Array(trainDF, testDF) = documentsDF.randomSplit(Array(0.7, 0.3))

  // Pipeline: index the category into a numeric label, tokenize, hash to term
  // frequencies, then fit a logistic regression model.
  val indexer = new StringIndexer().setInputCol("category").setOutputCol("label")
  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
  val hasher = new HashingTF().setInputCol("words").setOutputCol("features")
  val lr = new LogisticRegression().setMaxIter(50).setRegParam(0.0)

  val pipeline = new Pipeline().setStages(Array(indexer, tokenizer, hasher, lr))
  val model = pipeline.fit(trainDF)

  val transformedTrain = model.transform(trainDF)
  transformedTrain.persist
  val transformedTest = model.transform(testDF)
  transformedTest.persist

  println("in sample misclassified:", transformedTrain.filter($"prediction" !== $"label").count,
    " / ", transformedTrain.count)
  println("out sample misclassified:", transformedTest.filter($"prediction" !== $"label").count,
    " / ", transformedTest.count)

  // Persist the scored DataFrames so that downstream examples (e.g. ROC above) can reuse them.
  transformedTrain.select("fileName", "label", "prediction", "probability")
    .write.mode(SaveMode.Overwrite).parquet("transformedTrain.parquet")
  transformedTest.select("fileName", "label", "prediction", "probability")
    .write.mode(SaveMode.Overwrite).parquet("transformedTest.parquet")
}
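The example imports ParamGridBuilder and CrossValidator but never uses them. A minimal sketch of how the same pipeline could be tuned with cross-validation, reusing the pipeline, hasher, lr, trainDF and testDF values defined above (the grid values and fold count are illustrative assumptions, not taken from the original source):

// Grid over the feature hash size and the regularisation strength.
val paramGrid = new ParamGridBuilder()
  .addGrid(hasher.numFeatures, Array(1 << 16, 1 << 18))
  .addGrid(lr.regParam, Array(0.0, 0.01, 0.1))
  .build()

val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(new BinaryClassificationEvaluator()) // defaults to areaUnderROC
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(3)

// Fit on the training split and score the held-out split with the best model found.
val cvModel = cv.fit(trainDF)
val tunedTest = cvModel.transform(testDF)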
Example 180
Source File: DataFrameExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

import java.io.File

import com.google.common.io.Files
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

object DataFrameExample {

  case class Params(input: String = "data/mllib/sample_libsvm_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DataFrameExample") {
      head("DataFrameExample: an example app using DataFrame for ML.")
      opt[String]("input")
        .text(s"input path to dataframe")
        .action((x, c) => c.copy(input = x))
      checkConfig { params =>
        success
      }
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DataFrameExample with $params")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Load input data
    println(s"Loading LIBSVM file with UDT from ${params.input}.")
    val df: DataFrame = sqlContext.read.format("libsvm").load(params.input).cache()
    println("Schema from LIBSVM:")
    df.printSchema()
    println(s"Loaded training data as a DataFrame with ${df.count()} records.")

    // Show statistical summary of labels.
    val labelSummary = df.describe("label")
    labelSummary.show()

    // Convert features column to an RDD of vectors.
    val features = df.select("features").map { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    // Save the records in a parquet file.
    val tmpDir = Files.createTempDir()
    tmpDir.deleteOnExit()
    val outputDir = new File(tmpDir, "dataframe").toString
    println(s"Saving to $outputDir as Parquet file.")
    df.write.parquet(outputDir)

    // Load the records back.
    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDF = sqlContext.read.parquet(outputDir)
    println(s"Schema from Parquet:")
    newDF.printSchema()

    sc.stop()
  }
}
// scalastyle:on println
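The summarizer used above exposes more than the mean. A short sketch of the other per-feature statistics it can report, reusing the featureSummary value computed inside run:

// MultivariateOnlineSummarizer aggregates column-wise statistics in a single pass.
println(s"Feature variances: ${featureSummary.variance}")
println(s"Non-zero counts:   ${featureSummary.numNonzeros}")
println(s"Column maxima:     ${featureSummary.max}")
println(s"Column minima:     ${featureSummary.min}")
println(s"Rows summarised:   ${featureSummary.count}")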