org.apache.spark.mllib.util.MLUtils Scala Examples
The following examples show how to use org.apache.spark.mllib.util.MLUtils.
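Most of the examples below share the same skeleton: load a LIBSVM-format file into an RDD[LabeledPoint] with MLUtils.loadLibSVMFile, optionally split it, then hand it to an algorithm. The following minimal sketch is not taken from any of the projects below; the app name and data path are placeholders.

package examples

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils

object MLUtilsQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("MLUtilsQuickStart"))

    // Load labeled points from a LIBSVM-format text file
    // (path assumes Spark's bundled sample data).
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

    // Each record is an org.apache.spark.mllib.regression.LabeledPoint.
    val first = data.first()
    println(s"label = ${first.label}, numFeatures = ${first.features.size}")

    // A common next step: split into training and test sets.
    val Array(training, test) = data.randomSplit(Array(0.8, 0.2), seed = 42L)
    println(s"training = ${training.count()}, test = ${test.count()}")

    sc.stop()
  }
}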
Example 1
Source File: MNISTBenchmark.scala From spark-knn with Apache License 2.0 | 6 votes |
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable

object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val ns = if (args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if (args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if (args.length >= 3) args(2).toInt else 10
    val models = if (args.length >= 4) args(3).split(',') else Array("tree", "naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    // read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset = MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() // force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if (models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if (models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }

    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
}
Example 2
Source File: GradientBoostedTreeRegressorExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    predictions.select("prediction", "label", "features").show(5)

    // Select (prediction, true label) and compute test error.
    val evaluator = new RegressionEvaluator()
      .setLabelCol("label")            // label column name
      .setPredictionCol("prediction")  // prediction column name
      .setMetricName("rmse")           // RMSE (root mean squared error) measures how dispersed the samples are
    val rmse = evaluator.evaluate(predictions)
    println("Root Mean Squared Error (RMSE) on test data = " + rmse)

    val gbtModel = model.stages(1).asInstanceOf[GBTRegressionModel]
    println("Learned regression GBT model:\n" + gbtModel.toDebugString)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 3
Source File: MultivariateSummarizer.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Summarize labels
    val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(Vectors.dense(lp.label)),
      (sum1, sum2) => sum1.merge(sum2))

    // Summarize features
    val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(lp.features),
      (sum1, sum2) => sum1.merge(sum2))

    println()
    println(s"Summary statistics")
    println(s"\tLabel\tFeatures")
    println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}")
    println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}")
    println(
      s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}")
    println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}")
    println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}")
    println()

    sc.stop()
  }
}
// scalastyle:on println
Example 4
Source File: SVMWithSGDExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.SparkConf

object SVMWithSGDExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SVMWithSGDExample").setMaster("local[4]")
    val sc = new SparkContext(conf)
    // Load the data as an RDD
    val svmData = MLUtils.loadLibSVMFile(sc, "../data/mllib/sample_libsvm_data.txt")
    // Count the number of records
    svmData.count
    // Split the dataset in half: one half for training, the other for testing
    val trainingAndTest = svmData.randomSplit(Array(0.5, 0.5))
    // Assign the training and test data
    val trainingData = trainingAndTest(0)
    val testData = trainingAndTest(1)
    // Train the algorithm and build the model over 100 iterations (SGD = stochastic gradient descent)
    val model = SVMWithSGD.train(trainingData, 100)
    // Use the model to predict labels; here, predict the label of the first point in the test data
    val label = model.predict(testData.first.features)
    // Create tuples whose first element is the predicted label and second element is the actual label
    val predictionsAndLabels = testData.map(r => (model.predict(r.features), r.label))
    // Count how many records have a predicted label that does not match the actual label
    predictionsAndLabels.filter(p => p._1 != p._2).count
  }
}
Example 5
Source File: RandomForestClassifierExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.tree.configuration.Strategy

    // Load the data
    val data = MLUtils.loadLibSVMFile(sc, "../data/mllib/rf_libsvm_data.txt")
    // Randomly split the data into two parts: one for training, one for testing
    val splits = data.randomSplit(Array(0.7, 0.3))
    // Training and test datasets
    val (trainingData, testData) = (splits(0), splits(1))
    // Create a classification tree strategy (random forests also support regression)
    val treeStrategy = Strategy.defaultStrategy("Classification")
    // Train the model
    val model = RandomForest.trainClassifier(trainingData, treeStrategy, numTrees = 3,
      featureSubsetStrategy = "auto", seed = 12345)
    // Evaluate the model on test instances and compute the test error
    val testErr = testData.map { point =>
      // Predict
      val prediction = model.predict(point.features)
      if (point.label != prediction) 1.0 else 0.0 // 1.0 when misclassified
    }.mean() // average
    // Inspect the model
    println("Test Error = " + testErr)
    println("Learned Random Forest:\n" + model.toDebugString)
  }
}
Example 6
Source File: SparseNaiveBayes.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because they will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()
    // numTraining = 81, numTest = 19.
    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest
    // Test accuracy = 1.0.
    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println
Example 7
Source File: LogisticRegressionWithLBFGSDeom.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

    // Logistic regression with the loss optimized via L-BFGS (a quasi-Newton method);
    // supports multiclass classification
    val modelBFGS = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training)
    // Compute raw scores on the test set.
    val predictionAndLabels = test.map {
      // A LabeledPoint is a local vector (dense or sparse) associated with a label
      case LabeledPoint(label, features) =>
        val prediction = modelBFGS.predict(features)
        (prediction, label)
    }
    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)
  }
}
Example 8
Source File: SVMWithSGDDemo.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

    // Logistic regression with the loss optimized via L-BFGS (a quasi-Newton method);
    // supports multiclass classification
    val modelBFGS = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training)
    // Compute raw scores on the test set.
    val predictionAndLabels = test.map {
      // A LabeledPoint is a local vector (dense or sparse) associated with a label
      case LabeledPoint(label, features) =>
        val prediction = model.predict(features)
        (prediction, label)
    }
    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)
  }
}
Example 9
Source File: GradientBoostedTreesExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Algo}
import org.apache.spark.mllib.util.MLUtils

    // Load the data
    val data = MLUtils.loadLibSVMFile(sc, "../data/mllib/rf_libsvm_data.txt")
    // Randomly split the data into two parts: one for training, one for testing
    val splits = data.randomSplit(Array(0.7, 0.3))
    // Training and test datasets
    val (trainingData, testData) = (splits(0), splits(1))
    // Create a classification boosting strategy and set the number of iterations to 3
    // (boosting also supports regression)
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = 3
    // Gradient-boosted decision trees: combine multiple trees to reduce noise and avoid overfitting
    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)
    // Evaluate the model on test instances and compute the test error
    val testErr = testData.map { point =>
      // Predict
      val prediction = model.predict(point.features)
      if (point.label != prediction) 1.0 else 0.0 // 1.0 when misclassified
    }.mean() // average
    // Inspect the model
    println("Test Error = " + testErr)
    println("Learned GBT model:\n" + model.toDebugString)
  }
}
Example 10
Source File: RandomForestRegressorExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
// $example off$
import org.apache.spark.sql.Row
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    predictions.select("prediction", "label", "features").show(5)

    // Select (prediction, true label) and compute test error.
    val evaluator = new RegressionEvaluator()
      .setLabelCol("label")
      // Name of the column holding the predictions; the default is "prediction"
      .setPredictionCol("prediction")
      // RMSE (root mean squared error) measures how dispersed the samples are
      .setMetricName("rmse")
    val rmse = evaluator.evaluate(predictions)
    // Root Mean Squared Error (RMSE) on test data = 0.09854713827168428
    println("Root Mean Squared Error (RMSE) on test data = " + rmse)

    val rfModel = model.stages(1).asInstanceOf[RandomForestRegressionModel]
    println("Learned regression forest model:\n" + rfModel.toDebugString)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 11
Source File: NaiveBayesExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}
// $example off$

    predictions.show(5)

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")           // label column name
      .setPredictionCol("prediction") // prediction column name
      .setMetricName("precision")     // precision
    // Accuracy: 1.0
    val accuracy = evaluator.evaluate(predictions)
    println("Accuracy: " + accuracy)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 12
Source File: LinearRegressionWithElasticNetExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.regression.LinearRegression
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    trainingSummary.residuals.show()
    // RMSE (root mean squared error) measures how dispersed the samples are
    // RMSE: 10.189126225286143
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    // R2, also called the coefficient of determination, evaluates how well the model fits the data
    // r2: 0.02285205756871944
    println(s"r2: ${trainingSummary.r2}")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 13
Source File: NormalizerExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.util.MLUtils
// $example off$

object NormalizerExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("NormalizerExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

    val normalizer1 = new Normalizer()
    val normalizer2 = new Normalizer(p = Double.PositiveInfinity)

    // Each sample in data1 will be normalized using $L^2$ norm.
    val data1 = data.map(x => (x.label, normalizer1.transform(x.features)))

    // Each sample in data2 will be normalized using $L^\infty$ norm.
    val data2 = data.map(x => (x.label, normalizer2.transform(x.features)))
    // $example off$

    println("data1: ")
    data1.foreach(x => println(x))

    println("data2: ")
    data2.foreach(x => println(x))

    sc.stop()
  }
}
// scalastyle:on println
Example 14
Source File: MultilayerPerceptronClassifierExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
// $example off$
import org.apache.spark.sql.Row
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    result.show(5)
    val predictionAndLabels = result.select("prediction", "label")
    // Multiclass evaluation
    val evaluator = new MulticlassClassificationEvaluator()
      .setMetricName("precision")
    // Accuracy: 0.9636363636363636
    println("Accuracy: " + evaluator.evaluate(predictionAndLabels))
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 15
Source File: MultivariateSummarizer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkConf, SparkContext} spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Summarize labels val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(Vectors.dense(lp.label)), (sum1, sum2) => sum1.merge(sum2)) // Summarize features val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(lp.features), (sum1, sum2) => sum1.merge(sum2)) println() println(s"Summary statistics") println(s"\tLabel\tFeatures") println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}") println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}") println( s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}") println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}") println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}") println() sc.stop() } }
Example 16
Source File: DatasetExample.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import java.io.File import com.google.common.io.Files import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext, DataFrame} object DatasetExample { case class Params( input: String = "data/mllib/sample_libsvm_data.txt", dataFormat: String = "libsvm") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DatasetExample") { head("Dataset: an example app using DataFrame as a Dataset for ML.") opt[String]("input") .text(s"input path to dataset") .action((x, c) => c.copy(input = x)) opt[String]("dataFormat") .text("data format: libsvm (default), dense (deprecated in Spark v1.1)") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams).map { params => run(params) }.getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"DatasetExample with $params") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ // for implicit conversions // Load input data val origData: RDD[LabeledPoint] = params.dataFormat match { case "dense" => MLUtils.loadLabeledPoints(sc, params.input) case "libsvm" => MLUtils.loadLibSVMFile(sc, params.input) } println(s"Loaded ${origData.count()} instances from file: ${params.input}") // Convert input data to DataFrame explicitly. val df: DataFrame = origData.toDF() println(s"Inferred schema:\n${df.schema.prettyJson}") println(s"Converted to DataFrame with ${df.count()} records") // Select columns val labelsDf: DataFrame = df.select("label") val labels: RDD[Double] = labelsDf.map { case Row(v: Double) => v } val numLabels = labels.count() val meanLabel = labels.fold(0.0)(_ + _) / numLabels println(s"Selected label column with average value $meanLabel") val featuresDf: DataFrame = df.select("features") val features: RDD[Vector] = featuresDf.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(feat), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") val tmpDir = Files.createTempDir() tmpDir.deleteOnExit() val outputDir = new File(tmpDir, "dataset").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) println(s"Loading Parquet file with UDT from $outputDir.") val newDataset = sqlContext.read.parquet(outputDir) println(s"Schema from Parquet: ${newDataset.schema.prettyJson}") val newFeatures = newDataset.select("features").map { case Row(v: Vector) => v } val newFeaturesSummary = newFeatures.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(feat), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${newFeaturesSummary.mean.toString}") sc.stop() } }
Example 17
Source File: SparseNaiveBayes.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams).map { params => run(params) }.getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } }
Example 18
Source File: LinearRegression.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.regression.LinearRegressionWithSGD import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1Updater} spark-examples-*.jar \ | data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"LinearRegression with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0).cache() val test = splits(1).cache() val numTraining = training.count() val numTest = test.count() println(s"Training: $numTraining, test: $numTest.") examples.unpersist(blocking = false) val updater = params.regType match { case NONE => new SimpleUpdater() case L1 => new L1Updater() case L2 => new SquaredL2Updater() } val algorithm = new LinearRegressionWithSGD() algorithm.optimizer .setNumIterations(params.numIterations) .setStepSize(params.stepSize) .setUpdater(updater) .setRegParam(params.regParam) val model = algorithm.run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val loss = predictionAndLabel.map { case (p, l) => val err = p - l err * err }.reduce(_ + _) val rmse = math.sqrt(loss / numTest) println(s"Test RMSE = $rmse.") sc.stop() } }
Example 19
Source File: Correlations.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.mllib.stat.Statistics import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkConf, SparkContext} spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"Correlations with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Calculate label -- feature correlations val labelRDD = examples.map(_.label) val numFeatures = examples.take(1)(0).features.size val corrType = "pearson" println() println(s"Correlation ($corrType) between label and each feature") println(s"Feature\tCorrelation") var feature = 0 while (feature < numFeatures) { val featureRDD = examples.map(_.features(feature)) val corr = Statistics.corr(labelRDD, featureRDD) println(s"$feature\t$corr") feature += 1 } println() sc.stop() } }
Example 20
Source File: LogLoss.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.util.MLUtils

  @Since("1.2.0")
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  override private[spark] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
}
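The log1pExp helper used above is internal to Spark, but the trick it relies on is the standard numerically stable form of log(1 + e^x): for large positive x, exp(x) overflows, so the computation is rearranged. A standalone sketch of the same idea follows; the object and function names here are illustrative, not Spark APIs.

object StableLogLoss {
  // Numerically stable log(1 + exp(x)): avoids overflow for large positive x.
  def log1pExp(x: Double): Double =
    if (x > 0) x + math.log1p(math.exp(-x)) else math.log1p(math.exp(x))

  // Log loss on the margin, following the shape of LogLoss.computeError above.
  def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    2.0 * log1pExp(-margin)
  }

  def main(args: Array[String]): Unit = {
    // The naive form overflows to Infinity; the stable form does not.
    println(math.log(1.0 + math.exp(1000.0))) // Infinity
    println(log1pExp(1000.0))                 // 1000.0
  }
}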
Example 21
Source File: MultivariateSummarizer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Summarize labels val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(Vectors.dense(lp.label)), (sum1, sum2) => sum1.merge(sum2)) // Summarize features val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(lp.features), (sum1, sum2) => sum1.merge(sum2)) println() println(s"Summary statistics") println(s"\tLabel\tFeatures") println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}") println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}") println( s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}") println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}") println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}") println() sc.stop() } } // scalastyle:on println
Example 22
Source File: StandardScalerExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
// $example off$

object StandardScalerExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("StandardScalerExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

    val scaler1 = new StandardScaler().fit(data.map(x => x.features))
    val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features))
    // scaler3 is an identical model to scaler2, and will produce identical transformations
    val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean)

    // data1 will be unit variance.
    val data1 = data.map(x => (x.label, scaler1.transform(x.features)))

    // data2 will be unit variance and zero mean.
    val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray))))
    // $example off$

    println("data1: ")
    data1.foreach(x => println(x))

    println("data2: ")
    data2.foreach(x => println(x))

    sc.stop()
  }
}
// scalastyle:on println
Example 23
Source File: SparseNaiveBayes.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
Example 24
Source File: Correlations.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"Correlations with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Calculate label -- feature correlations val labelRDD = examples.map(_.label) val numFeatures = examples.take(1)(0).features.size val corrType = "pearson" println() println(s"Correlation ($corrType) between label and each feature") println(s"Feature\tCorrelation") var feature = 0 while (feature < numFeatures) { val featureRDD = examples.map(_.features(feature)) val corr = Statistics.corr(labelRDD, featureRDD) println(s"$feature\t$corr") feature += 1 } println() sc.stop() } } // scalastyle:on println
Example 25
Source File: LibSVMRelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm

import com.google.common.base.Objects

import org.apache.spark.Logging
import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrameReader, DataFrame, Row, SQLContext}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

@Since("1.6.0")
class DefaultSource extends RelationProvider with DataSourceRegister {

  @Since("1.6.0")
  override def shortName(): String = "libsvm"

  @Since("1.6.0")
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String])
    : BaseRelation = {
    val path = parameters.getOrElse("path",
      throw new IllegalArgumentException("'path' must be specified"))
    val numFeatures = parameters.getOrElse("numFeatures", "-1").toInt
    val vectorType = parameters.getOrElse("vectorType", "sparse")
    new LibSVMRelation(path, numFeatures, vectorType)(sqlContext)
  }
}
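Because DefaultSource registers the short name "libsvm", the same files that MLUtils.loadLibSVMFile reads as an RDD can also be loaded through the generic DataFrame reader. A minimal usage sketch against a Spark 1.6-era SQLContext; the app name and data path are placeholders, not taken from the source above.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object LibSVMReadExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("LibSVMReadExample"))
    val sqlContext = new SQLContext(sc)

    // The "libsvm" short name resolves to the DefaultSource above;
    // the result is a DataFrame with "label" and "features" columns.
    val df = sqlContext.read
      .format("libsvm")
      .load("data/mllib/sample_libsvm_data.txt")

    df.printSchema()
    df.show(5)

    sc.stop()
  }
}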
Example 26
Source File: PerceptronClassifier.scala From Scalaprof with GNU General Public License v2.0 | 5 votes |
package edu.neu.coe.scala.spark.nn

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.Row

object PerceptronClassifier extends App {

  val conf = new SparkConf().setAppName("spam")
  val sc = new SparkContext(conf)
  val sqlContext = new org.apache.spark.sql.SQLContext(sc)
  val sparkHome = "/Applications/spark-1.5.1-bin-hadoop2.6/"
  val trainingFile = "data/mllib/sample_multiclass_classification_data.txt"

  // this is used to implicitly convert an RDD to a DataFrame.
  import sqlContext.implicits._

  // Load training data
  val data = MLUtils.loadLibSVMFile(sc, s"$sparkHome$trainingFile").toDF()
  // Split the data into train and test
  val splits = data.randomSplit(Array(0.6, 0.4), seed = 1234L)
  val train = splits(0)
  val test = splits(1)
  // specify layers for the neural network:
  // input layer of size 4 (features), two intermediate of size 5 and 4 and output of size 3 (classes)
  val layers = Array[Int](4, 5, 4, 3)
  // create the trainer and set its parameters
  val trainer = new MultilayerPerceptronClassifier()
    .setLayers(layers)
    .setBlockSize(128)
    .setSeed(1234L)
    .setMaxIter(100)
  // train the model
  val model = trainer.fit(train)
  // compute precision on the test set
  val result = model.transform(test)
  val predictionAndLabels = result.select("prediction", "label")
  predictionAndLabels.show
  val evaluator = new MulticlassClassificationEvaluator()
    .setMetricName("precision")
  println("Precision:" + evaluator.evaluate(predictionAndLabels))
}
Example 27
Source File: MyBinaryClassification.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter4 import org.apache.spark.sql.SparkSession import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils object MyBinaryClassification { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .master("local[*]") .appName("myBinaryClassification") .config("spark.sql.warehouse.dir", ".") .getOrCreate() // Load training data in LIBSVM format //https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html val data = MLUtils.loadLibSVMFile(spark.sparkContext, "../data/sparkml2/chapter4/myBinaryClassificationData.txt") // Split data into training (60%) and test (40%) val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L) training.cache() // Run training algorithm to build the model val model = new LogisticRegressionWithLBFGS() .setNumClasses(2) .run(training) // Clear the prediction threshold so the model will return probabilities model.clearThreshold // Compute raw scores on the test set val predictionAndLabels = test.map { case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } // Instantiate metrics object val metrics = new BinaryClassificationMetrics(predictionAndLabels) // Precision by threshold val precision = metrics.precisionByThreshold precision.foreach { case (t, p) => println(s"Threshold: $t, Precision: $p") } // Recall by threshold val recall = metrics.recallByThreshold recall.foreach { case (t, r) => println(s"Threshold: $t, Recall: $r") } val PRC = metrics.pr val f1Score = metrics.fMeasureByThreshold f1Score.foreach { case (t, f) => println(s"Threshold: $t, F-score: $f, Beta = 1") } val beta = 0.5 val fScore = metrics.fMeasureByThreshold(beta) f1Score.foreach { case (t, f) => println(s"Threshold: $t, F-score: $f, Beta = 0.5") } val auPRC = metrics.areaUnderPR println("Area under precision-recall curve = " + auPRC) val thresholds = precision.map(_._1) val roc = metrics.roc val auROC = metrics.areaUnderROC println("Area under ROC = " + auROC) spark.stop() } }
Example 28
Source File: RandomForestClassification.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import com.intel.hibench.sparkbench.common.IOCommon import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.tree.RandomForest import org.apache.spark.mllib.tree.model.RandomForestModel import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint import scopt.OptionParser object RandomForestClassification { case class Params( inputPath: String = null, numTrees: Int = 3, numClasses: Int = 2, featureSubsetStrategy: String = "auto", impurity: String = "gini", maxDepth: Int = 4, maxBins: Int = 32) def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("RF") { head("RF: an example app.") opt[Int]("numTrees") .text(s"numTrees, default: ${defaultParams.numTrees}") .action((x, c) => c.copy(numTrees = x)) opt[Int]("numClasses") .text(s"numClasses, default: ${defaultParams.numClasses}") .action((x, c) => c.copy(numClasses = x)) opt[Int]("maxDepth") .text(s"maxDepth, default: ${defaultParams.maxDepth}") .action((x, c) => c.copy(maxDepth = x)) opt[Int]("maxBins") .text(s"maxBins, default: ${defaultParams.maxBins}") .action((x, c) => c.copy(maxBins = x)) opt[String]("featureSubsetStrategy") .text(s"featureSubsetStrategy, default: ${defaultParams.featureSubsetStrategy}") .action((x, c) => c.copy(featureSubsetStrategy = x)) opt[String]("impurity") .text(s"impurity (smoothing constant), default: ${defaultParams.impurity}") .action((x, c) => c.copy(impurity = x)) arg[String]("<inputPath>") .required() .text("Input path of dataset") .action((x, c) => c.copy(inputPath = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"RFC with $params") .set("spark.shuffle.compress", "false") .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec") .set("spark.smartCompress", "false") val sc = new SparkContext(conf) // $example on$ // Load and parse the data file. val data: RDD[LabeledPoint] = sc.objectFile(params.inputPath) // Split the data into training and test sets (30% held out for testing) val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) // Train a RandomForest model. // Empty categoricalFeaturesInfo indicates all features are continuous. val categoricalFeaturesInfo = Map[Int, Int]() val model = RandomForest.trainClassifier(trainingData, params.numClasses, categoricalFeaturesInfo, params.numTrees, params.featureSubsetStrategy, params.impurity, params.maxDepth, params.maxBins) // Evaluate model on test instances and compute test error val labelAndPreds = testData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() println("Test Error = " + testErr) sc.stop() } }
Example 29
Source File: GradientBoostingTree.scala From Swallow with Apache License 2.0 | 5 votes |
// scalastyle:off println package com.intel.hibench.sparkbench.ml import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint object GradientBoostingTree { def main(args: Array[String]): Unit = { var inputPath = "" var numIterations: Int = 3 val numClasses: Int = 2 val maxDepth: Int = 5 if (args.length == 2) { inputPath = args(0) numIterations = args(1).toInt } val conf = new SparkConf() .setAppName("GradientBoostingTree") val sc = new SparkContext(conf) // Load and parse the data file. //val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val data: RDD[LabeledPoint] = sc.objectFile(inputPath) // Split the data into training and test sets (30% held out for testing) val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) // Train a GradientBoostedTrees model. // The defaultParams for Classification use LogLoss by default. val boostingStrategy = BoostingStrategy.defaultParams("Classification") boostingStrategy.numIterations = numIterations boostingStrategy.treeStrategy.numClasses = numClasses boostingStrategy.treeStrategy.maxDepth = maxDepth // Empty categoricalFeaturesInfo indicates all features are continuous. boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]() val model = GradientBoostedTrees.train(trainingData, boostingStrategy) // Evaluate model on test instances and compute test error val labelAndPreds = testData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() println("Test Error = " + testErr) sc.stop() } }
Example 30
Source File: BinaryClassification.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.examples.ml import com.github.cloudml.zen.ml.regression.LogisticRegression import org.apache.spark.graphx2.GraphXUtils import org.apache.spark.mllib.classification.LogisticRegressionModel import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkConf, SparkContext} import scopt.OptionParser object BinaryClassification { case class Params( input: String = null, out: String = null, numIterations: Int = 200, stepSize: Double = 1.0, l1: Double = 1e-2, epsilon: Double = 1e-4, useAdaGrad: Boolean = false, kryo: Boolean = false) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("BinaryClassification") { head("BinaryClassification: an example app for LogisticRegression.") opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[Double]("epsilon") .text(s"epsilon (smoothing constant) for MIS, default: ${defaultParams.epsilon}") .action((x, c) => c.copy(epsilon = x)) opt[Unit]("kryo") .text("use Kryo serialization") .action((_, c) => c.copy(kryo = true)) opt[Double]("stepSize") .text(s"stepSize, default: ${defaultParams.stepSize}") .action((x, c) => c.copy(stepSize = x)) opt[Double]("l1") .text(s"L1 Regularization, default: ${defaultParams.l1} (auto)") .action((x, c) => c.copy(l1 = x)) opt[Unit]("adagrad") .text("use AdaGrad") .action((_, c) => c.copy(useAdaGrad = true)) arg[String]("<input>") .required() .text("input paths (binary labeled data in the LIBSVM format)") .action((x, c) => c.copy(input = x)) arg[String]("<out>") .required() .text("out paths (model)") .action((x, c) => c.copy(out = x)) note( """ |For example, the following command runs this app on a synthetic dataset: | | bin/spark-submit --class com.github.cloudml.zen.examples.ml.LogisticRegression \ | examples/target/scala-*/zen-examples-*.jar \ | --numIterations 200 --lambda 1.0 --kryo \ | data/mllib/kdda.txt | data/mllib/lr_model.txt """.stripMargin) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { System.exit(1) } } def run(params: Params): Unit = { val Params(input, out, numIterations, stepSize, l1, epsilon, useAdaGrad, useKryo) = params val conf = new SparkConf().setAppName(s"LogisticRegression with $params") if (useKryo) { GraphXUtils.registerKryoClasses(conf) // conf.set("spark.kryoserializer.buffer.mb", "8") } val sc = new SparkContext(conf) val dataSet = MLUtils.loadLibSVMFile(sc, input).zipWithUniqueId().map(_.swap).cache() val model = LogisticRegression.trainMIS(dataSet, numIterations, stepSize, l1, epsilon, useAdaGrad) val lm = new LogisticRegressionModel(model.weights, model.intercept, model.weights.size, 2) lm.save(sc, out) sc.stop() } }
Example 31
Source File: LogisticRegressionSuite.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.regression import com.github.cloudml.zen.ml.util._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.scalatest.{Matchers, FunSuite} import com.github.cloudml.zen.ml.util.SparkUtils._ class LogisticRegressionSuite extends FunSuite with SharedSparkContext with Matchers { test("LogisticRegression MIS") { val zenHome = sys.props.getOrElse("zen.test.home", fail("zen.test.home is not set!")) val dataSetFile = classOf[LogisticRegressionSuite].getClassLoader().getResource("binary_classification_data.txt").toString() val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile) val max = dataSet.map(_.features.activeValuesIterator.map(_.abs).sum + 1L).max val maxIter = 10 val stepSize = 1 / (2 * max) val trainDataSet = dataSet.zipWithUniqueId().map { case (LabeledPoint(label, features), id) => val newLabel = if (label > 0.0) 1.0 else -1.0 (id, LabeledPoint(newLabel, features)) } val lr = new LogisticRegressionMIS(trainDataSet, stepSize) val pps = new Array[Double](maxIter) var i = 0 val startedAt = System.currentTimeMillis() while (i < maxIter) { lr.run(1) val q = lr.forward(i) pps(i) = lr.loss(q) i += 1 } println((System.currentTimeMillis() - startedAt) / 1e3) pps.foreach(println) val ppsDiff = pps.init.zip(pps.tail).map { case (lhs, rhs) => lhs - rhs } assert(ppsDiff.count(_ > 0).toDouble / ppsDiff.size > 0.05) assert(pps.head - pps.last > 0) } test("LogisticRegression SGD") { val zenHome = sys.props.getOrElse("zen.test.home", fail("zen.test.home is not set!")) val dataSetFile = classOf[LogisticRegressionSuite].getClassLoader().getResource("binary_classification_data.txt").toString() val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile) val maxIter = 10 val stepSize = 1 val trainDataSet = dataSet.zipWithIndex().map { case (LabeledPoint(label, features), id) => val newLabel = if (label > 0.0) 1.0 else 0 (id, LabeledPoint(newLabel, features)) } val lr = new LogisticRegressionSGD(trainDataSet, stepSize) val pps = new Array[Double](maxIter) var i = 0 val startedAt = System.currentTimeMillis() while (i < maxIter) { lr.run(1) val margin = lr.forward(i) pps(i) = lr.loss(margin) i += 1 } println((System.currentTimeMillis() - startedAt) / 1e3) pps.foreach(println) val ppsDiff = pps.init.zip(pps.tail).map { case (lhs, rhs) => lhs - rhs } assert(ppsDiff.count(_ > 0).toDouble / ppsDiff.size > 0.05) assert(pps.head - pps.last > 0) } }
Example 32
Source File: MLPSuite.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.neuralNetwork import com.github.cloudml.zen.ml.util.{Utils, SparkUtils, MnistDatasetSuite} import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.linalg.{Vector => SV} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.scalatest.{FunSuite, Matchers} class MLPSuite extends FunSuite with MnistDatasetSuite with Matchers { ignore("MLP") { val (data, numVisible) = mnistTrainDataset(5000) val topology = Array(numVisible, 500, 10) val nn = MLP.train(data, 20, 1000, topology, fraction = 0.02, learningRate = 0.1, weightCost = 0.0) // val nn = MLP.runLBFGS(data, topology, 100, 4000, 1e-5, 0.001) // MLP.runSGD(data, nn, 37, 6000, 0.1, 0.5, 0.0) val (dataTest, _) = mnistTrainDataset(10000, 5000) println("Error: " + MLP.error(dataTest, nn, 100)) } ignore("binary classification") { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val dataSetFile = s"$sparkHome/data/a5a" val checkpoint = s"$sparkHome/target/tmp" sc.setCheckpointDir(checkpoint) val data = MLUtils.loadLibSVMFile(sc, dataSetFile).map { case LabeledPoint(label, features) => val y = BDV.zeros[Double](2) y := 0.04 / y.length y(if (label > 0) 0 else 1) += 0.96 (features, SparkUtils.fromBreeze(y)) }.persist() val trainSet = data.filter(_._1.hashCode().abs % 5 == 3).persist() val testSet = data.filter(_._1.hashCode().abs % 5 != 3).persist() val numVisible = trainSet.first()._1.size val topology = Array(numVisible, 30, 2) var nn = MLP.train(trainSet, 100, 1000, topology, fraction = 0.02, learningRate = 0.05, weightCost = 0.0) val modelPath = s"$checkpoint/model" nn.save(sc, modelPath) nn = MLP.load(sc, modelPath) val scoreAndLabels = testSet.map { case (features, label) => val out = nn.predict(SparkUtils.toBreeze(features).toDenseVector.asDenseMatrix.t) // Utils.random.nextInt(2).toDouble (out(0, 0), if (label(0) > 0.5) 1.0 else 0.0) }.persist() scoreAndLabels.repartition(1).map(t => s"${t._1}\t${t._2}"). saveAsTextFile(s"$checkpoint/mlp/${System.currentTimeMillis()}") val testAccuracy = new BinaryClassificationMetrics(scoreAndLabels).areaUnderROC() println(f"Test AUC = $testAccuracy%1.6f") } }
Example 33
Source File: MVMSuite.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.recommendation import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, sum => brzSum} import com.github.cloudml.zen.ml.util._ import com.google.common.io.Files import org.apache.spark.mllib.linalg.{DenseVector => SDV, SparseVector => SSV, Vector => SV} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.scalatest.{FunSuite, Matchers} class MVMSuite extends FunSuite with SharedSparkContext with Matchers { test("binary classification") { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val dataSetFile = classOf[MVMSuite].getClassLoader().getResource("binary_classification_data.txt").toString() val checkpoint = s"$sparkHome/target/tmp" sc.setCheckpointDir(checkpoint) val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile).zipWithIndex().map { case (LabeledPoint(label, features), id) => val newLabel = if (label > 0.0) 1.0 else 0.0 (id, LabeledPoint(newLabel, features)) } val stepSize = 0.1 val regParam = 1e-2 val l2 = (regParam, regParam, regParam) val rank = 20 val useAdaGrad = true val trainSet = dataSet.cache() val fm = new FMClassification(trainSet, stepSize, l2, rank, useAdaGrad) val maxIter = 10 val pps = new Array[Double](maxIter) var i = 0 val startedAt = System.currentTimeMillis() while (i < maxIter) { fm.run(1) pps(i) = fm.saveModel().loss(trainSet) i += 1 } println((System.currentTimeMillis() - startedAt) / 1e3) pps.foreach(println) val ppsDiff = pps.init.zip(pps.tail).map { case (lhs, rhs) => lhs - rhs } assert(ppsDiff.count(_ < 0).toDouble / ppsDiff.size > 0.05) val fmModel = fm.saveModel() val tempDir = Files.createTempDir() tempDir.deleteOnExit() val path = tempDir.toURI.toString fmModel.save(sc, path) val sameModel = FMModel.load(sc, path) assert(sameModel.k === fmModel.k) assert(sameModel.classification === fmModel.classification) assert(sameModel.factors.sortByKey().map(_._2).collect() === fmModel.factors.sortByKey().map(_._2).collect()) } ignore("url_combined classification") { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val dataSetFile = classOf[MVMSuite].getClassLoader().getResource("binary_classification_data.txt").toString() val checkpointDir = s"$sparkHome/target/tmp" sc.setCheckpointDir(checkpointDir) val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile).zipWithIndex().map { case (LabeledPoint(label, features), id) => val newLabel = if (label > 0.0) 1.0 else 0.0 (id, LabeledPoint(newLabel, features)) }.cache() val numFeatures = dataSet.first()._2.features.size val stepSize = 0.1 val numIterations = 500 val regParam = 1e-3 val rank = 20 val views = Array(20, numFeatures / 2, numFeatures).map(_.toLong) val useAdaGrad = true val useWeightedLambda = true val miniBatchFraction = 1 val Array(trainSet, testSet) = dataSet.randomSplit(Array(0.8, 0.2)) trainSet.cache() testSet.cache() val fm = new MVMClassification(trainSet, stepSize, views, regParam, 0.0, rank, useAdaGrad, useWeightedLambda, miniBatchFraction) fm.run(numIterations) val model = fm.saveModel() println(f"Test loss: ${model.loss(testSet.cache())}%1.4f") } }
Example 34
Source File: PCA_LinearRegression_Demo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter11.SparkMachineLearning import org.apache.spark.sql.SparkSession import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.regression.LinearRegressionWithSGD object PCAExample2 { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/") .appName(s"OneVsRestExample") .getOrCreate() val data = MLUtils.loadLibSVMFile(spark.sparkContext, "data/mnist.bz2") val df = spark.read.format("libsvm").load("C:/Exp/mnist.bz2") df.show(20) val featureSize = data.first().features.size println("Feature Size: " + featureSize) val splits = data.randomSplit(Array(0.75, 0.25), seed = 12345L) val (training, test) = (splits(0), splits(1)) val pca = new PCA(featureSize/2).fit(data.map(_.features)) val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) val numIterations = 20 val stepSize = 0.0001 val model = LinearRegressionWithSGD.train(training, numIterations, stepSize) val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations, stepSize) val valuesAndPreds = test.map { point => val score = model.predict(point.features) (score, point.label) } val valuesAndPreds_pca = test_pca.map { point => val score = model_pca.predict(point.features) (score, point.label) } val MSE = valuesAndPreds.map { case (v, p) => math.pow(v - p, 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow(v - p, 2) }.mean() println("Mean Squared Error = " + MSE) println("PCA Mean Squared Error = " + MSE_pca) println("Model coefficients:"+ model.toString()) println("Model with PCA coefficients:"+ model_pca.toString()) spark.stop() } }
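The demo above fits mllib.feature.PCA once and reuses the model to project both splits; a focused sketch of just that projection step (the input path and the number of components are assumptions):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.util.MLUtils

object PCAProjectionSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("PCAProjectionSketch").setMaster("local[*]"))
    val data = MLUtils.loadLibSVMFile(sc, "data/sample_libsvm_data.txt")
    // fit a PCA model that keeps the top 20 principal components
    val pca = new PCA(20).fit(data.map(_.features))
    // project every LabeledPoint onto the reduced feature space, keeping the label unchanged
    val projected = data.map(p => p.copy(features = pca.transform(p.features)))
    println("reduced dimension: " + projected.first().features.size)
    sc.stop()
  }
}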
Example 35
Source File: RandomForestDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter11.SparkMachineLearning

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.evaluation.MulticlassMetrics

object RandomForestDemo {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAExample")//.setMaster("local[*]")
    val sc = new SparkContext(conf)

    val filePath = args(0)
    val data = MLUtils.loadLibSVMFile(sc, filePath)
    val splits = data.randomSplit(Array(0.75, 0.25), seed = 12345L)
    val training = splits(0).cache()
    val test = splits(1)

    // Train a RandomForest model; an empty categoricalFeaturesInfo indicates that all features are continuous.
    val numClasses = 10
    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 50 // Use more in practice.
    val featureSubsetStrategy = "auto" // Let the algorithm choose.
    val impurity = "gini"
    val maxDepth = 30
    val maxBins = 32

    val model = RandomForest.trainClassifier(training, numClasses, categoricalFeaturesInfo,
      numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

    // Evaluate the model on test instances and compute the test error
    val labelAndPreds = test.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val metrics = new MulticlassMetrics(labelAndPreds)

    // Confusion matrix
    println("Confusion matrix:")
    println(metrics.confusionMatrix)

    // Overall statistics
    val accuracy = metrics.accuracy
    println("Summary Statistics")
    println(s"Accuracy = $accuracy")

    // Precision by label
    val labels = metrics.labels
    labels.foreach { l => println(s"Precision($l) = " + metrics.precision(l)) }
    // Recall by label
    labels.foreach { l => println(s"Recall($l) = " + metrics.recall(l)) }
    // False positive rate by label
    labels.foreach { l => println(s"FPR($l) = " + metrics.falsePositiveRate(l)) }
    // F-measure by label
    labels.foreach { l => println(s"F1-Score($l) = " + metrics.fMeasure(l)) }

    // Weighted stats
    println(s"Weighted precision: ${metrics.weightedPrecision}")
    println(s"Weighted recall: ${metrics.weightedRecall}")
    println(s"Weighted F1 score: ${metrics.weightedFMeasure}")
    println(s"Weighted false positive rate: ${metrics.weightedFalsePositiveRate}")

    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / test.count()
    println("Accuracy = " + (1 - testErr) * 100 + " %")
    //println("Learned classification forest model:\n" + model.toDebugString)
  }
}
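The demo stops at printing metrics; if the trained forest needs to be reused, it can be persisted and reloaded with the model's save/load support. A hedged sketch (data path, output path, and tree parameters are assumptions for illustration):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils

object RandomForestPersistSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("RandomForestPersistSketch").setMaster("local[*]"))
    val data = MLUtils.loadLibSVMFile(sc, "data/sample_libsvm_data.txt")
    val model = RandomForest.trainClassifier(data, numClasses = 2,
      categoricalFeaturesInfo = Map[Int, Int](), numTrees = 10,
      featureSubsetStrategy = "auto", impurity = "gini", maxDepth = 5, maxBins = 32)
    // persist the ensemble and read it back; load returns an equivalent model
    model.save(sc, "target/tmp/myRandomForestModel")
    val sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestModel")
    println(sameModel.toDebugString.take(200))
    sc.stop()
  }
}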
Example 36
Source File: LogLoss.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.mllib.util.MLUtils

@Since("1.2.0")
object LogLoss extends Loss {

  @Since("1.2.0")
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  override private[mllib] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
}
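computeError above leans on MLUtils.log1pExp to avoid overflow when the margin is a large negative number; that helper is not part of MLUtils's public API (it is package-private), so a user-level sketch of the same trick looks like this (the object name is an assumption):

object StableLogLoss {
  // log(1 + exp(x)) overflows for x above roughly 709.78 if computed naively;
  // rewriting it as x + log1p(exp(-x)) for positive x keeps intermediate values small
  // (the same idea as the package-private MLUtils.log1pExp used above).
  def log1pExp(x: Double): Double =
    if (x > 0) x + math.log1p(math.exp(-x)) else math.log1p(math.exp(x))

  // the loss used above: 2 * log(1 + exp(-margin)) with margin = 2 * label * prediction
  def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    2.0 * log1pExp(-margin)
  }

  def main(args: Array[String]): Unit = {
    println(computeError(prediction = 1000.0, label = -1.0)) // finite instead of Infinity
  }
}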
Example 37
Source File: LogLoss.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.mllib.util.MLUtils

@Since("1.2.0")
object LogLoss extends Loss {

  @Since("1.2.0")
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  override private[mllib] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
}
Example 38
Source File: MultivariateSummarizer.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkConf, SparkContext} spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Summarize labels val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(Vectors.dense(lp.label)), (sum1, sum2) => sum1.merge(sum2)) // Summarize features val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(lp.features), (sum1, sum2) => sum1.merge(sum2)) println() println(s"Summary statistics") println(s"\tLabel\tFeatures") println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}") println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}") println( s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}") println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}") println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}") println() sc.stop() } } // scalastyle:on println
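The label and feature summaries above are built by folding a MultivariateOnlineSummarizer over the RDD; for the feature side alone, Statistics.colStats gives the same kind of column summary in a single call. A minimal sketch (the input path is an assumption):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.util.MLUtils

object ColStatsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ColStatsSketch").setMaster("local[*]"))
    val examples = MLUtils.loadLibSVMFile(sc, "data/sample_libsvm_data.txt")
    // column-wise summary (mean, variance, non-zero counts, min, max) of the feature vectors
    val summary = Statistics.colStats(examples.map(_.features))
    println("mean of first feature: " + summary.mean(0))
    println("non-zeros per column (first 10): " + summary.numNonzeros.toArray.take(10).mkString(", "))
    sc.stop()
  }
}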
Example 39
Source File: SparseNaiveBayes.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams).map { params => run(params) }.getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
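SparseNaiveBayes passes explicit numFeatures and minPartitions hints to loadLibSVMFile; in isolation the four-argument overload looks like this (the path and values are assumptions for illustration):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils

object LoadWithHintsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("LoadWithHintsSketch").setMaster("local[*]"))
    // the third argument fixes the vector dimension up front (pass -1 to let the loader infer it);
    // the fourth controls how many partitions the input file is read into
    val examples = MLUtils.loadLibSVMFile(sc, "data/sample_libsvm_data.txt",
      692, sc.defaultMinPartitions)
    println(s"partitions = ${examples.getNumPartitions}, dim = ${examples.first().features.size}")
    sc.stop()
  }
}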
Example 40
Source File: LinearRegression.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.regression.LinearRegressionWithSGD import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1Updater} spark-examples-*.jar \ | data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"LinearRegression with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0).cache() val test = splits(1).cache() val numTraining = training.count() val numTest = test.count() println(s"Training: $numTraining, test: $numTest.") examples.unpersist(blocking = false) val updater = params.regType match { case NONE => new SimpleUpdater() case L1 => new L1Updater() case L2 => new SquaredL2Updater() } val algorithm = new LinearRegressionWithSGD() algorithm.optimizer .setNumIterations(params.numIterations) .setStepSize(params.stepSize) .setUpdater(updater) .setRegParam(params.regParam) val model = algorithm.run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val loss = predictionAndLabel.map { case (p, l) => val err = p - l err * err }.reduce(_ + _) val rmse = math.sqrt(loss / numTest) println(s"Test RMSE = $rmse.") sc.stop() } } // scalastyle:on println
Example 41
Source File: Correlations.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.mllib.stat.Statistics import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkConf, SparkContext} spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"Correlations with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Calculate label -- feature correlations val labelRDD = examples.map(_.label) val numFeatures = examples.take(1)(0).features.size val corrType = "pearson" println() println(s"Correlation ($corrType) between label and each feature") println(s"Feature\tCorrelation") var feature = 0 while (feature < numFeatures) { val featureRDD = examples.map(_.features(feature)) val corr = Statistics.corr(labelRDD, featureRDD) println(s"$feature\t$corr") feature += 1 } println() sc.stop() } } // scalastyle:on println
Example 42
Source File: MultivariateSummarizer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Summarize labels val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(Vectors.dense(lp.label)), (sum1, sum2) => sum1.merge(sum2)) // Summarize features val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(lp.features), (sum1, sum2) => sum1.merge(sum2)) println() println(s"Summary statistics") println(s"\tLabel\tFeatures") println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}") println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}") println( s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}") println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}") println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}") println() sc.stop() } } // scalastyle:on println
Example 43
Source File: StandardScalerExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils // $example off$ object StandardScalerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("StandardScalerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 44
Source File: SparseNaiveBayes.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
Example 45
Source File: Correlations.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"Correlations with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Calculate label -- feature correlations val labelRDD = examples.map(_.label) val numFeatures = examples.take(1)(0).features.size val corrType = "pearson" println() println(s"Correlation ($corrType) between label and each feature") println(s"Feature\tCorrelation") var feature = 0 while (feature < numFeatures) { val featureRDD = examples.map(_.features(feature)) val corr = Statistics.corr(labelRDD, featureRDD) println(s"$feature\t$corr") feature += 1 } println() sc.stop() } } // scalastyle:on println
Example 46
Source File: NaiveBayesExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel} import org.apache.spark.mllib.util.MLUtils // $example off$ object NaiveBayesExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("NaiveBayesExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data file. val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") // Split data into training (60%) and test (40%). val Array(training, test) = data.randomSplit(Array(0.6, 0.4)) val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial") val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() // Save and load model model.save(sc, "target/tmp/myNaiveBayesModel") val sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel") // $example off$ sc.stop() } } // scalastyle:on println
Example 47
Source File: NaiveBayesExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel} import org.apache.spark.mllib.util.MLUtils // $example off$ object NaiveBayesExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("NaiveBayesExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data file. val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") // Split data into training (60%) and test (40%). val Array(training, test) = data.randomSplit(Array(0.6, 0.4)) val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial") val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() // Save and load model model.save(sc, "target/tmp/myNaiveBayesModel") val sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel") // $example off$ } } // scalastyle:on println
Example 48
Source File: MultivariateGaussian.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution import breeze.linalg.{diag, eigSym, max, DenseMatrix => DBM, DenseVector => DBV, Vector => BV} import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} import org.apache.spark.mllib.util.MLUtils private def calculateCovarianceConstants: (DBM[Double], Double) = { val eigSym.EigSym(d, u) = eigSym(sigma.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t // For numerical stability, values are considered to be non-zero only if they exceed tol. // This prevents any inverted value from exceeding (eps * n * max(d))^-1 val tol = MLUtils.EPSILON * max(d) * d.length try { // log(pseudo-determinant) is sum of the logs of all non-zero singular values val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum // calculate the root-pseudo-inverse of the diagonal matrix of singular values // by inverting the square root of all non-zero values val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray)) (pinvS * u.t, -0.5 * (mu.size * math.log(2.0 * math.Pi) + logPseudoDetSigma)) } catch { case uex: UnsupportedOperationException => throw new IllegalArgumentException("Covariance matrix has no non-zero singular values") } } }
Example 49
Source File: AggregatedICPs.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.examples import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.util.MLUtils import scopt.OptionParser import se.uu.farmbio.cp.AggregatedICPClassifier import se.uu.farmbio.cp.BinaryClassificationICPMetrics import se.uu.farmbio.cp.ICP import se.uu.farmbio.cp.alg.GBT object AggregatedICPs { case class Params( input: String = null, calibrationSize: Int = 0, numIterations: Int = 0, numOfICPs: Int = 0, master: String = null) def run(params: Params) { //Init Spark val conf = new SparkConf() .setAppName("AggregatedICPs") if (params.master != null) { conf.setMaster(params.master) } val sc = new SparkContext(conf) //Load and split data val Array(training, test) = MLUtils.loadLibSVMFile(sc, params.input) .randomSplit(Array(0.8, 0.2)) //Train icps val icps = (1 to params.numOfICPs).map { _ => val (calibration, properTraining) = ICP.calibrationSplit(training, params.calibrationSize) //Train ICP val gbt = new GBT(properTraining.cache, params.numIterations) ICP.trainClassifier(gbt, numClasses = 2, calibration) } //Aggregate ICPs and perform tests val icp = new AggregatedICPClassifier(icps) val mondrianPvAndLabels = test.map { p => (icp.mondrianPv(p.features), p.label) } val metrics = new BinaryClassificationICPMetrics(mondrianPvAndLabels) println(metrics) } def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("AggregatedICPs") { head("AggregatedICPs: an example of aggregated ICPs") opt[Int]("calibrationSize") .required() .text(s"size of calibration set (for each class)") .action((x, c) => c.copy(calibrationSize = x)) opt[Int]("numIterations") .required() .text(s"number of GBT iterations") .action((x, c) => c.copy(numIterations = x)) opt[Int]("numOfICPs") .required() .text(s"number of ICPs to train") .action((x, c) => c.copy(numOfICPs = x)) opt[String]("master") .text("spark master") .action((x, c) => c.copy(master = x)) arg[String]("<input>") .required() .text("input paths to labeled examples in LIBSVM format") .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } }
Example 50
Source File: GBTICPClassifier.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.examples import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.util.MLUtils import scopt.OptionParser import se.uu.farmbio.cp.BinaryClassificationICPMetrics import se.uu.farmbio.cp.ICP import se.uu.farmbio.cp.alg.GBT object GBTICPClassifier { case class Params( input: String = null, calibrationSize: Int = 0, numIterations: Int = 0, master: String = null) def run(params: Params) { //Init Spark val conf = new SparkConf() .setAppName("GBTICPClassifier") if (params.master != null) { conf.setMaster(params.master) } val sc = new SparkContext(conf) //Load and split data val Array(training, test) = MLUtils.loadLibSVMFile(sc, params.input) .randomSplit(Array(0.8, 0.2)) val (calibration, properTraining) = ICP.calibrationSplit(training, params.calibrationSize) //Train ICP val t0 = System.currentTimeMillis val gbt = new GBT(properTraining.cache, params.numIterations) val icp = ICP.trainClassifier(gbt, numClasses = 2, calibration) val t1 = System.currentTimeMillis //Compute and print metrics val mondrianPvAndLabels = test.map { p => (icp.mondrianPv(p.features), p.label) } val metrics = new BinaryClassificationICPMetrics(mondrianPvAndLabels) println(metrics) println(s"training took: ${t1-t0} millisec.") sc.stop } def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("GBTICPClassifier") { head("GBTICPClassifier: an example of Gradient Boosted Trees ICP classification.") opt[Int]("calibrationSize") .required() .text(s"size of calibration set (for each class)") .action((x, c) => c.copy(calibrationSize = x)) opt[Int]("numIterations") .required() .text(s"number of GBT iterations") .action((x, c) => c.copy(numIterations = x)) opt[String]("master") .text("spark master") .action((x, c) => c.copy(master = x)) arg[String]("<input>") .required() .text("input paths to labeled examples in LIBSVM format") .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } }
Example 51
Source File: Splitter.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.examples import scopt.OptionParser import org.apache.spark.mllib.util.MLUtils import org.apache.spark.SparkConf import org.apache.spark.SparkContext object Splitter { case class Params( input: String = null, ratio: Double = 0.0, master: String = null) def run(params: Params) { //Init Spark val conf = new SparkConf() .setAppName("Splitter") if (params.master != null) { conf.setMaster(params.master) } val sc = new SparkContext(conf) //Load and split data val Array(training, test) = MLUtils.loadLibSVMFile(sc, params.input) .randomSplit(Array(params.ratio, 1.0-params.ratio)) MLUtils.saveAsLibSVMFile(training, params.input+s".${params.ratio}.svm") val roundRatio = BigDecimal(1.0-params.ratio) .setScale(1, BigDecimal.RoundingMode.HALF_UP) MLUtils.saveAsLibSVMFile(test, params.input+s".$roundRatio.svm") sc.stop } def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("Splitter") { head("Splitter: randomly split data into training and test.") opt[Double]("ratio") .required() .text("split ratio") .action((x, c) => c.copy(ratio = x)) opt[String]("master") .text("spark master") .action((x, c) => c.copy(master = x)) arg[String]("<input>") .required() .text("input path to labeled examples in LIBSVM format") .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } }
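Splitter writes a single train/test split back out with saveAsLibSVMFile; when several folds are wanted instead, MLUtils.kFold builds the (training, validation) pairs directly. A small sketch (path, fold count, and seed are assumptions):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils

object KFoldSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("KFoldSketch").setMaster("local[*]"))
    val data = MLUtils.loadLibSVMFile(sc, "data/sample_libsvm_data.txt")
    // kFold(rdd, numFolds, seed) returns an array of (training, validation) RDD pairs, one per fold
    val folds = MLUtils.kFold(data, 3, 42)
    folds.zipWithIndex.foreach { case ((training, validation), i) =>
      println(s"fold $i: train = ${training.count()}, validation = ${validation.count()}")
    }
    sc.stop()
  }
}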
Example 52
Source File: ICPRunningTime.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.examples import org.apache.spark.SparkConf import se.uu.farmbio.cp.BinaryClassificationICPMetrics import org.apache.spark.SparkContext import se.uu.farmbio.cp.ICP import org.apache.spark.mllib.util.MLUtils import scopt.OptionParser import se.uu.farmbio.cp.alg.GBT object ICPRunningTime { case class Params( input: String = null, calibrationSize: Int = 0, numIterations: Int = 0, inputFraction: Double = 0.0, master: String = null) def run(params: Params) { //Init Spark val conf = new SparkConf() .setAppName("ICPRunningTime") if (params.master != null) { conf.setMaster(params.master) } val sc = new SparkContext(conf) //Load and split data val training = MLUtils.loadLibSVMFile(sc, params.input) .sample(withReplacement = false, fraction = params.inputFraction) val (calibration, properTraining) = ICP.calibrationSplit(training, params.calibrationSize) //Train ICP val t0 = System.currentTimeMillis val gbt = new GBT(properTraining.cache, params.numIterations) val icp = ICP.trainClassifier(gbt, numClasses = 2, calibration) val t1 = System.currentTimeMillis println(s"training took: ${t1 - t0} millisec.") sc.stop } def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("ICPRunningTime") { head("ICPRunningTime: it measures ICP with GBT training time for a fraction of the input.") opt[Int]("calibrationSize") .required() .text(s"size of calibration set (for each class)") .action((x, c) => c.copy(calibrationSize = x)) opt[Int]("numIterations") .required() .text(s"number of GBT iterations") .action((x, c) => c.copy(numIterations = x)) opt[Double]("inputFraction") .required() .text(s"input fraction to use for measuring training time") .action((x, c) => c.copy(inputFraction = x)) opt[String]("master") .text("spark master") .action((x, c) => c.copy(master = x)) arg[String]("<input>") .required() .text("input paths to labeled examples in LIBSVM format") .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } }
Example 53
Source File: LibLinearTraining.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.examples import scopt.OptionParser import org.apache.spark.SparkConf import org.apache.spark.SparkContext import se.uu.farmbio.cp.liblinear.LIBLINEAR import org.apache.spark.mllib.util.MLUtils import org.apache.spark.Logging object LibLinearTraining extends Logging { case class Params( trainInputPath: String = null, outputPath: String = null, calibrRatio: Double = 0.2, numberOfCPs: Int = 100, nofOutFiles: Int = 0, dfsBlockSize: String = "8M", master: String = null) def main(args: Array[String]) = { val defaultParams = Params() val parser = new OptionParser[Params]("PubChemTraining") { head("LibLinearTraining: LIBINEAR training procedure") opt[Double]("calibrRatio") .text("fraction of calibration examples") .action((x, c) => c.copy(calibrRatio = x)) opt[Int]("numberOfCPs") .text("number of CPs to train") .action((x, c) => c.copy(numberOfCPs = x)) opt[String]("master") .text("spark master") .action((x, c) => c.copy(master = x)) opt[Int]("nofOutFiles") .text("Number of output files. " + "It can be equal to the parallelism level at most " + "(defualt: as much as the parallelism level)") .action((x, c) => c.copy(nofOutFiles = x)) opt[String]("dfsBlockSize") .text("It tunes the Hadoop dfs.block.size property (default:8M)") .action((x, c) => c.copy(dfsBlockSize = x)) arg[String]("<input>") .required() .text("input path to training examples in LIBSVM format") .action((x, c) => c.copy(trainInputPath = x)) arg[String]("<output>") .required() .text("output path to save CPs") .action((x, c) => c.copy(outputPath = x)) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { sys.exit(1) } } def run(params: Params) { //Init Spark val conf = new SparkConf() .setAppName("LibLinearTraining") if (params.master != null) { conf.setMaster(params.master) } val sc = new SparkContext(conf) //Set and log dfs.block.size sc.hadoopConfiguration.set("dfs.block.size", params.dfsBlockSize) val blockSize = sc.hadoopConfiguration.get("dfs.block.size") logInfo(s"dfs.block.size = $blockSize") //Load data //This example assumes the training set to be relatively small //the model data generated will be big instead. val input = MLUtils.loadLibSVMFile(sc, params.trainInputPath) val trainingData = input.collect //Train the CPs val modelData = LIBLINEAR.trainAggregatedICPClassifier( sc, trainingData, params.calibrRatio, params.numberOfCPs) //Save the model in a distributed fashion modelData.save(params.outputPath, params.nofOutFiles) //Stop Spark sc.stop } }
Example 54
Source File: MNISTCrossValidation.scala From spark-knn with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.ml.knn.examples import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.KNNClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.PCA import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.log4j object MNISTCrossValidation { val logger = log4j.Logger.getLogger(getClass) def main(args: Array[String]) { val spark = SparkSession.builder().getOrCreate() val sc = spark.sparkContext import spark.implicits._ //read in raw label and features val dataset = MLUtils.loadLibSVMFile(sc, "data/mnist/mnist.bz2") .toDF() //.limit(10000) //split traning and testing val Array(train, test) = dataset.randomSplit(Array(0.7, 0.3), seed = 1234L).map(_.cache()) //create PCA matrix to reduce feature dimensions val pca = new PCA() .setInputCol("features") .setK(50) .setOutputCol("pcaFeatures") val knn = new KNNClassifier() .setTopTreeSize(50) .setFeaturesCol("pcaFeatures") .setPredictionCol("prediction") .setK(1) val pipeline = new Pipeline() .setStages(Array(pca, knn)) val paramGrid = new ParamGridBuilder() // .addGrid(knn.k, 1 to 20) .addGrid(pca.k, 10 to 100 by 10) .build() val cv = new CrossValidator() .setEstimator(pipeline) .setEvaluator(new MulticlassClassificationEvaluator) .setEstimatorParamMaps(paramGrid) .setNumFolds(5) val cvModel = cv.fit(train) val insample = validate(cvModel.transform(train)) val outofsample = validate(cvModel.transform(test)) //reference accuracy: in-sample 95% out-of-sample 94% logger.info(s"In-sample: $insample, Out-of-sample: $outofsample") logger.info(s"Cross-validated: ${cvModel.avgMetrics.toSeq}") } private[this] def validate(results: DataFrame): Double = { results .selectExpr("SUM(CASE WHEN label = prediction THEN 1.0 ELSE 0.0 END) / COUNT(1)") .collect() .head .getDecimal(0) .doubleValue() } }
Example 55
Source File: MNIST.scala From spark-knn with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.ml.knn.examples import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.KNNClassifier import org.apache.spark.ml.feature.PCA import org.apache.spark.mllib.util.MLUtils import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.log4j object MNIST { val logger = log4j.Logger.getLogger(getClass) def main(args: Array[String]) { val spark = SparkSession.builder().getOrCreate() val sc = spark.sparkContext import spark.implicits._ //read in raw label and features val rawDataset = MLUtils.loadLibSVMFile(sc, "data/mnist/mnist.bz2") .toDF() // convert "features" from mllib.linalg.Vector to ml.linalg.Vector val dataset = MLUtils.convertVectorColumnsToML(rawDataset) //split training and testing val Array(train, test) = dataset .randomSplit(Array(0.7, 0.3), seed = 1234L) .map(_.cache()) //create PCA matrix to reduce feature dimensions val pca = new PCA() .setInputCol("features") .setK(50) .setOutputCol("pcaFeatures") val knn = new KNNClassifier() .setTopTreeSize(dataset.count().toInt / 500) .setFeaturesCol("pcaFeatures") .setPredictionCol("predicted") .setK(1) val pipeline = new Pipeline() .setStages(Array(pca, knn)) .fit(train) val insample = validate(pipeline.transform(train)) val outofsample = validate(pipeline.transform(test)) //reference accuracy: in-sample 95% out-of-sample 94% logger.info(s"In-sample: $insample, Out-of-sample: $outofsample") } private[this] def validate(results: DataFrame): Double = { results .selectExpr("SUM(CASE WHEN label = predicted THEN 1.0 ELSE 0.0 END) / COUNT(1)") .collect() .head .getDecimal(0) .doubleValue() } }
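The MNIST example above converts the legacy mllib vector column with MLUtils.convertVectorColumnsToML before handing the DataFrame to a spark.ml pipeline; an isolated sketch of that conversion (the input path is an assumption):

import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession

object ConvertVectorsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ConvertVectorsSketch").getOrCreate()
    import spark.implicits._

    // loadLibSVMFile yields mllib.linalg vectors; toDF produces a column of that legacy type
    val rawDF = MLUtils.loadLibSVMFile(spark.sparkContext, "data/sample_libsvm_data.txt").toDF()
    // rewrite the "features" column as ml.linalg vectors, the type spark.ml estimators expect
    val mlDF = MLUtils.convertVectorColumnsToML(rawDF, "features")
    mlDF.printSchema()
    spark.stop()
  }
}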
Example 56
Source File: KNNUtils.scala From spark-knn with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.knn

import org.apache.spark.ml.{linalg => newlinalg}
import org.apache.spark.mllib.{linalg => oldlinalg}
import org.apache.spark.mllib.util.MLUtils

object KNNUtils {

  import oldlinalg.VectorImplicits._

  def fastSquaredDistance(
      v1: newlinalg.Vector,
      norm1: Double,
      v2: newlinalg.Vector,
      norm2: Double,
      precision: Double = 1e-6): Double = {
    MLUtils.fastSquaredDistance(v1, norm1, v2, norm2, precision)
  }
}
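MLUtils.fastSquaredDistance is package-private, which is why the wrapper above has to be declared inside the org.apache.spark.mllib.knn package; calling it from application code then looks like this (the vectors are made up for illustration):

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.knn.KNNUtils

object FastDistanceSketch {
  def main(args: Array[String]): Unit = {
    val v1 = Vectors.dense(1.0, 2.0, 3.0)
    val v2 = Vectors.dense(1.5, 2.0, 2.0)
    // precomputed L2 norms let the helper use the faster norm/dot-product formula
    // whenever it stays within the requested precision
    val d = KNNUtils.fastSquaredDistance(v1, Vectors.norm(v1, 2.0), v2, Vectors.norm(v2, 2.0))
    println(s"squared distance = $d")
  }
}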
Example 57
Source File: spark.scala From ann-benchmark with Apache License 2.0 | 5 votes |
import org.apache.log4j._
Logger.getRootLogger.setLevel(Level.OFF)
import org.apache.spark.mllib.ann.{FeedForwardTrainer, FeedForwardTopology}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.classification.ANNClassifier

// maximum number of worker nodes in cluster
val numNodes = 5
// batch size, ~10K is good for GPU
val batchSize = 1000
// number of iterations to run
val numIterations = 5

val train = MLUtils.loadLibSVMFile(sc, "/mnist.scale")
val topology = FeedForwardTopology.multiLayerPerceptron(Array[Int](780, 2500, 2000, 1500, 1000, 500, 10), false)
val trainer = new FeedForwardTrainer(topology, 780, 10).setBatchSize(batchSize)
trainer.SGDOptimizer.setNumIterations(numIterations).setMiniBatchFraction(1.0).setStepSize(0.03)

// parallelize the data for N nodes, persist, run X iterations, and print the average time per run
for (i <- 1 to numNodes) {
  val dataPartitions = sc.parallelize(1 to i, i)
  val sample = train.sample(true, 1.0 / i, 11L).collect
  val parallelData = dataPartitions.flatMap(x => sample)
  parallelData.persist
  parallelData.count
  val t = System.nanoTime()
  val model = new ANNClassifier(trainer).train(parallelData)
  println(i + "\t" + batchSize + "\t" + (System.nanoTime() - t) / (numIterations * 1e9))
}
Example 58
Source File: spark-latest.scala From ann-benchmark with Apache License 2.0 | 5 votes |
import org.apache.log4j._ Logger.getRootLogger.setLevel(Level.OFF) import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.ml.classification.MultilayerPerceptronClassifier // maximum number of worker nodes in cluster val numNodes = 5 // batch size, ~10K is good for GPU val batchSize = 1000 // number of iterations to run val numIterations = 5 val train = MLUtils.loadLibSVMFile(sc, "file:///data/mnist/mnist.scale") //val layers = Array[Int](780, 2500, 2000, 1500, 1000, 500, 10) val layers = Array[Int](780, 10) val trainer = new MultilayerPerceptronClassifier().setLayers(layers).setBlockSize(1000).setSeed(1234L).setMaxIter(1) for (i <- 1 to numNodes) { val dataPartitions = sc.parallelize(1 to i, i) val sample = train.sample(true, 1.0 / i, 11L).collect val parallelData = sqlContext.createDataFrame(dataPartitions.flatMap(x => sample)) parallelData.persist parallelData.count val t = System.nanoTime() val model = trainer.fit(parallelData) println(i + "\t" + batchSize + "\t" + (System.nanoTime() - t) / (numIterations * 1e9)) parallelData.unpersist() }
Example 59
Source File: X2PHelper.scala From spark-tsne with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib

import breeze.linalg._
import breeze.numerics._
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLUtils

object X2PHelper {

  case class VectorWithNorm(vector: Vector, norm: Double)

  def fastSquaredDistance(v1: VectorWithNorm, v2: VectorWithNorm): Double = {
    MLUtils.fastSquaredDistance(v1.vector, v1.norm, v2.vector, v2.norm)
  }

  def Hbeta(D: DenseVector[Double], beta: Double = 1.0): (Double, DenseVector[Double]) = {
    val P: DenseVector[Double] = exp(- D * beta)
    val sumP = sum(P)
    if (sumP == 0) {
      (0.0, DenseVector.zeros(D.size))
    } else {
      val H = log(sumP) + (beta * sum(D :* P) / sumP)
      (H, P / sumP)
    }
  }
}
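Hbeta above turns a row of squared distances into an entropy value and a normalised probability vector, the inner step of the perplexity calibration used by t-SNE's X2P routine; a tiny usage sketch with made-up distances:

import breeze.linalg.{sum, DenseVector}
import org.apache.spark.mllib.X2PHelper

object HbetaSketch {
  def main(args: Array[String]): Unit = {
    // squared distances from one point to three neighbours
    val d = DenseVector(0.5, 1.0, 4.0)
    val (h, p) = X2PHelper.Hbeta(d, beta = 1.0)
    println(s"entropy H = $h")
    println(s"probabilities = $p (sum = ${sum(p)})")
  }
}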
Example 60
Source File: SVM.scala From spark-cp with Apache License 2.0
package se.uu.farmbio.cp.alg import org.apache.spark.mllib.classification.SVMModel import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.optimization.HingeGradient import org.apache.spark.mllib.optimization.LBFGS import org.apache.spark.mllib.optimization.SquaredL2Updater import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.UnderlyingAlgorithm //Define a SVMs UnderlyingAlgorithm private object SVM { def trainingProcedure( input: RDD[LabeledPoint], maxNumItearations: Int, regParam: Double, numCorrections: Int, convergenceTol: Double) = { //Train SVM with LBFGS val numFeatures = input.take(1)(0).features.size val training = input.map(x => (x.label, MLUtils.appendBias(x.features))).cache() val initialWeightsWithIntercept = Vectors.dense(new Array[Double](numFeatures + 1)) val (weightsWithIntercept, _) = LBFGS.runLBFGS( training, new HingeGradient(), new SquaredL2Updater(), numCorrections, convergenceTol, maxNumItearations, regParam, initialWeightsWithIntercept) //Create the model using the weights val model = new SVMModel( Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)), weightsWithIntercept(weightsWithIntercept.size - 1)) //Return raw score predictor model.clearThreshold() model } } class SVM(val model: SVMModel) extends UnderlyingAlgorithm(model.predict) { def this( input: RDD[LabeledPoint], maxNumItearations: Int = 100, regParam: Double = 0.1, numCorrections: Int = 10, convergenceTol: Double = 1e-4) = { this(SVM.trainingProcedure( input, maxNumItearations, regParam, numCorrections, convergenceTol)) } def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { -score } else { score } } }
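The LBFGS-based training above appends a bias term to every feature vector with MLUtils.appendBias so the intercept can be learned as one extra weight; in isolation (the vector values are made up):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils

object AppendBiasSketch {
  def main(args: Array[String]): Unit = {
    val features = Vectors.dense(0.2, -1.3, 4.0)
    // appendBias adds a constant 1.0 as the last component, so a weight learned for it acts as the intercept
    val withBias = MLUtils.appendBias(features)
    println(withBias) // [0.2,-1.3,4.0,1.0]
  }
}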
Example 61
Source File: LogLoss.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.util.MLUtils

@Since("1.2.0")
object LogLoss extends Loss {

  @Since("1.2.0")
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  override private[spark] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
}
Example 62
Source File: IDF.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path import org.apache.spark.annotation.Since import org.apache.spark.ml._ import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StructType @Since("2.0.0") def idf: Vector = idfModel.idf.asML @Since("1.6.0") override def write: MLWriter = new IDFModelWriter(this) } @Since("1.6.0") object IDFModel extends MLReadable[IDFModel] { private[IDFModel] class IDFModelWriter(instance: IDFModel) extends MLWriter { private case class Data(idf: Vector) override protected def saveImpl(path: String): Unit = { DefaultParamsWriter.saveMetadata(instance, path, sc) val data = Data(instance.idf) val dataPath = new Path(path, "data").toString sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) } } private class IDFModelReader extends MLReader[IDFModel] { private val className = classOf[IDFModel].getName override def load(path: String): IDFModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath) val Row(idf: Vector) = MLUtils.convertVectorColumnsToML(data, "idf") .select("idf") .head() val model = new IDFModel(metadata.uid, new feature.IDFModel(OldVectors.fromML(idf))) DefaultParamsReader.getAndSetParams(model, metadata) model } } @Since("1.6.0") override def read: MLReader[IDFModel] = new IDFModelReader @Since("1.6.0") override def load(path: String): IDFModel = super.load(path) }
Example 63
Source File: SampledRDDs.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SampledRDDs with $params") val sc = new SparkContext(conf) val fraction = 0.1 // fraction of data to sample val examples = MLUtils.loadLibSVMFile(sc, params.input) val numExamples = examples.count() if (numExamples == 0) { throw new RuntimeException("Error: Data file had no samples to load.") } println(s"Loaded data with $numExamples examples from file: ${params.input}") // Example: RDD.sample() and RDD.takeSample() val expectedSampleSize = (numExamples * fraction).toInt println(s"Sampling RDD using fraction $fraction. Expected sample size = $expectedSampleSize.") val sampledRDD = examples.sample(withReplacement = true, fraction = fraction) println(s" RDD.sample(): sample has ${sampledRDD.count()} examples") val sampledArray = examples.takeSample(withReplacement = true, num = expectedSampleSize) println(s" RDD.takeSample(): sample has ${sampledArray.length} examples") println() // Example: RDD.sampleByKey() and RDD.sampleByKeyExact() val keyedRDD = examples.map { lp => (lp.label.toInt, lp.features) } println(s" Keyed data using label (Int) as key ==> Orig") // Count examples per label in original data. val keyCounts = keyedRDD.countByKey() // Subsample, and count examples per label in sampled data. (approximate) val fractions = keyCounts.keys.map((_, fraction)).toMap val sampledByKeyRDD = keyedRDD.sampleByKey(withReplacement = true, fractions = fractions) val keyCountsB = sampledByKeyRDD.countByKey() val sizeB = keyCountsB.values.sum println(s" Sampled $sizeB examples using approximate stratified sampling (by label)." + " ==> Approx Sample") // Subsample, and count examples per label in sampled data. (approximate) val sampledByKeyRDDExact = keyedRDD.sampleByKeyExact(withReplacement = true, fractions = fractions) val keyCountsBExact = sampledByKeyRDDExact.countByKey() val sizeBExact = keyCountsBExact.values.sum println(s" Sampled $sizeBExact examples using exact stratified sampling (by label)." + " ==> Exact Sample") // Compare samples println(s" \tFractions of examples with key") println(s"Key\tOrig\tApprox Sample\tExact Sample") keyCounts.keys.toSeq.sorted.foreach { key => val origFrac = keyCounts(key) / numExamples.toDouble val approxFrac = if (sizeB != 0) { keyCountsB.getOrElse(key, 0L) / sizeB.toDouble } else { 0 } val exactFrac = if (sizeBExact != 0) { keyCountsBExact.getOrElse(key, 0L) / sizeBExact.toDouble } else { 0 } println(s"$key\t$origFrac\t$approxFrac\t$exactFrac") } sc.stop() } } // scalastyle:on println
Example 64
Source File: MultivariateSummarizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Summarize labels val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(Vectors.dense(lp.label)), (sum1, sum2) => sum1.merge(sum2)) // Summarize features val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(lp.features), (sum1, sum2) => sum1.merge(sum2)) println() println(s"Summary statistics") println(s"\tLabel\tFeatures") println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}") println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}") println( s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}") println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}") println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}") println() sc.stop() } } // scalastyle:on println
Example 65
Source File: StandardScalerExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils // $example off$ object StandardScalerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("StandardScalerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 66
Source File: SparseNaiveBayes.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
Example 67
Source File: BinaryClassification.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, SVMWithSGD} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.optimization.{L1Updater, SquaredL2Updater} import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --algorithm LR --regType L2 --regParam 1.0 \ | data/mllib/sample_binary_classification_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"BinaryClassification with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0).cache() val test = splits(1).cache() val numTraining = training.count() val numTest = test.count() println(s"Training: $numTraining, test: $numTest.") examples.unpersist(blocking = false) val updater = params.regType match { case L1 => new L1Updater() case L2 => new SquaredL2Updater() } val model = params.algorithm match { case LR => val algorithm = new LogisticRegressionWithLBFGS() algorithm.optimizer .setNumIterations(params.numIterations) .setUpdater(updater) .setRegParam(params.regParam) algorithm.run(training).clearThreshold() case SVM => val algorithm = new SVMWithSGD() algorithm.optimizer .setNumIterations(params.numIterations) .setStepSize(params.stepSize) .setUpdater(updater) .setRegParam(params.regParam) algorithm.run(training).clearThreshold() } val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val metrics = new BinaryClassificationMetrics(predictionAndLabel) println(s"Test areaUnderPR = ${metrics.areaUnderPR()}.") println(s"Test areaUnderROC = ${metrics.areaUnderROC()}.") sc.stop() } } // scalastyle:on println
Example 68
Source File: LinearRegression.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater} import org.apache.spark.mllib.regression.LinearRegressionWithSGD import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"LinearRegression with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0).cache() val test = splits(1).cache() val numTraining = training.count() val numTest = test.count() println(s"Training: $numTraining, test: $numTest.") examples.unpersist(blocking = false) val updater = params.regType match { case NONE => new SimpleUpdater() case L1 => new L1Updater() case L2 => new SquaredL2Updater() } val algorithm = new LinearRegressionWithSGD() algorithm.optimizer .setNumIterations(params.numIterations) .setStepSize(params.stepSize) .setUpdater(updater) .setRegParam(params.regParam) val model = algorithm.run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val loss = predictionAndLabel.map { case (p, l) => val err = p - l err * err }.reduce(_ + _) val rmse = math.sqrt(loss / numTest) println(s"Test RMSE = $rmse.") sc.stop() } } // scalastyle:on println
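For comparison, mllib also ships a static LinearRegressionWithSGD.train convenience method that skips the explicit optimizer configuration shown above; it behaves roughly like the regType = NONE branch because no updater is set. A short hedged sketch, reusing the training split from the example.

// Roughly equivalent to the NONE / SimpleUpdater branch above: no regularization.
val quickModel = LinearRegressionWithSGD.train(training, 100, 1.0) // numIterations, stepSize
println(s"weights = ${quickModel.weights}, intercept = ${quickModel.intercept}")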
Example 69
Source File: Correlations.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"Correlations with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Calculate label -- feature correlations val labelRDD = examples.map(_.label) val numFeatures = examples.take(1)(0).features.size val corrType = "pearson" println() println(s"Correlation ($corrType) between label and each feature") println(s"Feature\tCorrelation") var feature = 0 while (feature < numFeatures) { val featureRDD = examples.map(_.features(feature)) val corr = Statistics.corr(labelRDD, featureRDD) println(s"$feature\t$corr") feature += 1 } println() sc.stop() } } // scalastyle:on println
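The while loop above computes one label-to-feature correlation at a time. Statistics.corr can also produce the full feature correlation matrix in a single call and accepts "spearman" in addition to the default "pearson"; a brief hedged sketch, reusing the examples RDD loaded above.

// Not part of the original example: full correlation matrix over all features.
val featureVectors = examples.map(_.features)
val pearsonMatrix = Statistics.corr(featureVectors)              // defaults to "pearson"
val spearmanMatrix = Statistics.corr(featureVectors, "spearman") // rank correlation
println(pearsonMatrix)
println(spearmanMatrix)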
Example 70
Source File: NaiveBayesExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel} import org.apache.spark.mllib.util.MLUtils // $example off$ object NaiveBayesExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("NaiveBayesExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data file. val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") // Split data into training (60%) and test (40%). val Array(training, test) = data.randomSplit(Array(0.6, 0.4)) val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial") val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() // Save and load model model.save(sc, "target/tmp/myNaiveBayesModel") val sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel") // $example off$ } } // scalastyle:on println
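The example computes an accuracy and reloads the saved model but prints neither. A small hedged addition that surfaces both, reusing accuracy, test and sameModel from the snippet above.

// Not part of the original example: report accuracy and sanity-check the reloaded model.
println(s"Test accuracy = $accuracy")
val reloadedAccuracy = 1.0 * test.map(p => (sameModel.predict(p.features), p.label))
  .filter(x => x._1 == x._2).count() / test.count()
println(s"Reloaded model accuracy = $reloadedAccuracy")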
Example 71
Source File: MultivariateSummarizer.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Summarize labels val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(Vectors.dense(lp.label)), (sum1, sum2) => sum1.merge(sum2)) // Summarize features val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(lp.features), (sum1, sum2) => sum1.merge(sum2)) println() println(s"Summary statistics") println(s"\tLabel\tFeatures") println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}") println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}") println( s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}") println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}") println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}") println() sc.stop() } } // scalastyle:on println
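The two aggregate calls above assemble MultivariateOnlineSummarizer instances by hand, which is useful when label and features are summarized in the same pass. For the feature summary alone, Statistics.colStats returns the same column statistics in a single call; a hedged sketch reusing the examples RDD.

import org.apache.spark.mllib.stat.Statistics

// Alternative feature summary via Statistics.colStats.
val featureSummaryAlt = Statistics.colStats(examples.map(_.features))
println(s"mean: ${featureSummaryAlt.mean}")
println(s"variance: ${featureSummaryAlt.variance}")
println(s"nnz: ${featureSummaryAlt.numNonzeros}")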
Example 72
Source File: NormalizerExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.Normalizer import org.apache.spark.mllib.util.MLUtils // $example off$ object NormalizerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("NormalizerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val normalizer1 = new Normalizer() val normalizer2 = new Normalizer(p = Double.PositiveInfinity) // Each sample in data1 will be normalized using $L^2$ norm. val data1 = data.map(x => (x.label, normalizer1.transform(x.features))) // Each sample in data2 will be normalized using $L^\infty$ norm. val data2 = data.map(x => (x.label, normalizer2.transform(x.features))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
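Normalizer accepts any p >= 1, not only the L2 default and the L-infinity variant used above. A hedged sketch showing L1 normalization on the same data RDD.

// p = 1.0 rescales each feature vector so its absolute values sum to 1.
val normalizerL1 = new Normalizer(p = 1.0)
val dataL1 = data.map(x => (x.label, normalizerL1.transform(x.features)))
dataL1.take(1).foreach(println)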
Example 73
Source File: CPWithLR.scala From spark-tutorial with Apache License 2.0 | 5 votes |
package se.uu.farmbio.tutorial import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.util.MLUtils import se.uu.farmbio.cp.BinaryClassificationICPMetrics import se.uu.farmbio.cp.ICP import se.uu.farmbio.cp.alg.LogisticRegression object CPWithLR { def main(args: Array[String]) = { //Start the Spark context val conf = new SparkConf() .setAppName("CPWithLR") .setMaster("local[*]") val sc = new SparkContext(conf) //Load examples val data = MLUtils.loadLibSVMFile(sc, "pubchem.svm") //Split the data. We leave 20% of the examples out to compute the efficiency later on. val splits = data.randomSplit(Array(0.8, 0.2), seed = 11L) val training = splits(0) val test = splits(1) //Split in training and calibration set val (calibration, properTraining) = ICP.splitCalibrationAndTraining( training, //sample 32 calibration examples for each class (balanced calibration set) numOfCalibSamples=32, bothClasses=true) val conformalPred = ICP.trainClassifier( new LogisticRegression( properTraining.cache(), //why do we cache only the proper training set? regParam=0.0, //no regularization (for LBFGS) maxNumItearations=30), //maximum 30 iterations (for LBFGS) numClasses = 2, //we have only two classes: toxic and non-toxic calibration) //provide the calibration examples //Compute the p-values val pvAndLabels = test.map { testExample => //for each test example val pvForEachClass = conformalPred.mondrianPv(testExample.features) //compute p-value for each class val trueLabel = testExample.label //keep track of the true label to compute the efficiency later on (pvForEachClass, trueLabel) } //BinaryClassificationICPMetrics class computes some metrics which include efficiency val metrics = new BinaryClassificationICPMetrics( pvAndLabels, significances=Array(0.1,0.15,0.2,0.25) //specify for which significances the metrics will be computed ) //Print the metrics println(metrics) //Stop the Spark context sc.stop } }
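A conformal predictor turns the per-class p-values into prediction regions: at significance eps, every class whose p-value exceeds eps stays in the region. The snippet only prints aggregate metrics, so here is a hedged sketch that derives the regions directly from pvAndLabels, assuming mondrianPv returns one p-value per class, indexed by the class label.

// Illustrative only: prediction regions at significance 0.2, built from the p-values above.
val significance = 0.2
val regions = pvAndLabels.map { case (pvForEachClass, trueLabel) =>
  val region = pvForEachClass.zipWithIndex
    .collect { case (pv, classIndex) if pv > significance => classIndex }
    .toSet
  (region, trueLabel)
}
regions.take(5).foreach { case (region, label) =>
  println(s"true label: $label, predicted set: $region")
}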
Example 74
Source File: LogisticRegressionExample.scala From spark-tutorial with Apache License 2.0 | 5 votes |
package se.uu.farmbio.tutorial import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.util.MLUtils object LogisticRegressionExample { def main(args: Array[String]) = { //Start the Spark context val conf = new SparkConf() .setAppName("LogisticRegression") .setMaster("local[*]") val sc = new SparkContext(conf) //Load pubchem.svm val data = MLUtils.loadLibSVMFile(sc, "pubchem.svm") //Split the data in training and test val splits = data.randomSplit(Array(0.8, 0.2), seed = 11L) val training = splits(0).cache() val test = splits(1) //Train the model using Logistic Regression with LBFGS val lbfgs = new LogisticRegressionWithLBFGS() val model = lbfgs.run(training) model.clearThreshold() //Compute the probability to be in the positive class for each of the test examples val probAndLabels = test.map { testExample => val probability = model.predict(testExample.features) (probability, testExample.label) } //Compute the area under the ROC curve using the Spark's BinaryClassificationMetrics class val metrics = new BinaryClassificationMetrics(probAndLabels) val auROC = metrics.areaUnderROC() println("Area under ROC = " + auROC) //print the area under the ROC //Stop the Spark context sc.stop } }
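Because the threshold is cleared, predict returns probabilities rather than class labels, which is what the ROC computation needs. If a plain accuracy number is also wanted, the probabilities can be thresholded at 0.5; a small hedged sketch building on probAndLabels and test from the snippet above.

// Not part of the original tutorial step: threshold at 0.5 to get an accuracy figure.
val accuracyAt05 = probAndLabels
  .map { case (prob, label) => (if (prob >= 0.5) 1.0 else 0.0, label) }
  .filter { case (pred, label) => pred == label }
  .count().toDouble / test.count()
println(s"Accuracy at threshold 0.5 = $accuracyAt05")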
Example 75
Source File: CPWithSVM.scala From spark-tutorial with Apache License 2.0 | 5 votes |
package se.uu.farmbio.tutorial import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.util.MLUtils import se.uu.farmbio.cp.ICP import se.uu.farmbio.cp.alg.SVM import se.uu.farmbio.cp.BinaryClassificationICPMetrics object CPWithSVM { def main(args: Array[String]) = { //Start the Spark context val conf = new SparkConf() .setAppName("CPWithSVM") .setMaster("local[*]") val sc = new SparkContext(conf) //Load examples val data = MLUtils.loadLibSVMFile(sc, "pubchem.svm") //Split the data. We leave 20% of the examples out to compute the efficiency later on. val splits = data.randomSplit(Array(0.8, 0.2), seed = 11L) val training = splits(0) val test = splits(1) val pvAndLabels = test.map { testExample => //for each test example val pvForEachClass = conformalPred.mondrianPv(testExample.features) //compute p-value for each class val trueLabel = testExample.label //keep track of the true label to compute the efficiency later on (pvForEachClass, trueLabel) } //BinaryClassificationICPMetrics class computes some metrics which include efficiency val metrics = new BinaryClassificationICPMetrics( pvAndLabels, significances=Array(0.1,0.15,0.2,0.25) //specify for which significances the metrics will be computed ) //Print the metrics println(metrics) //Stop the Spark context sc.stop } }
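This snippet uses conformalPred without showing how it was trained; that part of the file is not included above. The following is a hedged sketch of what the missing step plausibly looks like, modeled on the CPWithLR example (Example 73). The constructor arguments of the library's SVM helper are an assumption, so treat this as an outline rather than the project's actual code.

// Assumed reconstruction, mirroring CPWithLR; the SVM helper's parameters may differ.
val (calibration, properTraining) = ICP.splitCalibrationAndTraining(
  training,
  numOfCalibSamples = 32, // balanced calibration set, as in CPWithLR
  bothClasses = true)
val conformalPred = ICP.trainClassifier(
  new SVM(properTraining.cache()), // assumption: takes the proper training set, defaults elsewhere
  numClasses = 2,
  calibration)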
Example 76
Source File: PerceptronClassifier.scala From CSYE7200_Old with MIT License | 5 votes |
package edu.neu.coe.csye7200.nn import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.ml.classification.MultilayerPerceptronClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.mllib.util.MLUtils import org.apache.spark.sql.Row object PerceptronClassifier extends App { val conf = new SparkConf().setAppName("spam").setMaster("local[*]") val sc = new SparkContext(conf) val sqlContext = new org.apache.spark.sql.SQLContext(sc) val sparkHome = "/Applications/spark-1.5.1-bin-hadoop2.6/" val trainingFile = "data/mllib/sample_multiclass_classification_data.txt" // this is used to implicitly convert an RDD to a DataFrame. import sqlContext.implicits._ // Load training data val data = MLUtils.loadLibSVMFile(sc, s"$sparkHome$trainingFile").toDF() // Split the data into train and test val splits = data.randomSplit(Array(0.6, 0.4), seed = 1234L) val train = splits(0) val test = splits(1) // specify layers for the neural network: // input layer of size 4 (features), two intermediate of size 5 and 4 and output of size 3 (classes) val layers = Array[Int](4, 5, 4, 3) // create the trainer and set its parameters val trainer = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(128) .setSeed(1234L) .setMaxIter(100) // train the model val model = trainer.fit(train) // compute precision on the test set val result = model.transform(test) val predictionAndLabels = result.select("prediction", "label") predictionAndLabels.show val evaluator = new MulticlassClassificationEvaluator() .setMetricName("precision") println("Precision:" + evaluator.evaluate(predictionAndLabels)) }
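Beyond the single precision figure, the prediction/label DataFrame already holds enough information for a quick per-class breakdown; a hedged sketch using only DataFrame operations available in the Spark 1.5.x line this example targets.

// Count how often each (label, prediction) pair occurs; a rough confusion matrix.
predictionAndLabels
  .groupBy("label", "prediction")
  .count()
  .orderBy("label", "prediction")
  .show()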
Example 77
Source File: MLLibRandomForestFromFile.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.example import org.apache.spark.mllib.tree.RandomForest import org.apache.spark.mllib.tree.configuration.{Algo, QuantileStrategy, Strategy} import org.apache.spark.mllib.tree.impurity.Entropy import org.apache.spark.mllib.util.MLUtils import reforest.rf.feature.RFStrategyFeatureSQRT import reforest.rf.parameter._ import reforest.util.{CCUtil, CCUtilIO} import scala.util.Random object MLLibRandomForestFromFile { def main(args: Array[String]): Unit = { val property = RFParameterFromFile(args(0)).applyAppName("MLLib") val sc = CCUtil.getSparkContext(property) sc.setLogLevel("error") val timeStart = System.currentTimeMillis() val data = MLUtils.loadLibSVMFile(sc, property.dataset, property.numFeatures, property.sparkCoresMax * 2) val splits = data.randomSplit(Array(0.7, 0.3), 0) val (trainingData, testData) = (splits(0), splits(1)) // Train a RandomForest model. // val categoricalFeaturesInfo = Array.tabulate(200)(i => (i, 5)).toMap val categoricalFeaturesInfo = Map[Int, Int]() val featureSubsetStrategy = "sqrt" val impurity = "entropy" val s = new Strategy(Algo.Classification, Entropy, property.getMaxDepth, property.numClasses, property.getMaxBinNumber, QuantileStrategy.Sort, categoricalFeaturesInfo, 1) val model = RandomForest.trainClassifier(trainingData, s, property.getMaxNumTrees, featureSubsetStrategy, Random.nextInt()) val timeEnd = System.currentTimeMillis() val labelAndPreds = testData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() CCUtilIO.logACCURACY(property, (1-testErr), (timeEnd-timeStart)) println("Time: "+(timeEnd-timeStart)) println("Test Error = " + testErr) if (property.outputTree) { println("Learned classification forest model:\n" + model.toDebugString) } } }
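If the trained forest needs to be reused outside this run, mllib's RandomForestModel supports save and load; a hedged sketch, with the output path purely illustrative.

import org.apache.spark.mllib.tree.model.RandomForestModel

// Persist the trained model and reload it later; the path is a placeholder.
model.save(sc, "target/tmp/reforestRandomForestModel")
val reloadedModel = RandomForestModel.load(sc, "target/tmp/reforestRandomForestModel")
println(reloadedModel.toDebugString.take(500))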
Example 78
Source File: MLLibRandomForest.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.example import org.apache.spark.mllib.tree.RandomForest import org.apache.spark.mllib.tree.configuration.{Algo, QuantileStrategy, Strategy} import org.apache.spark.mllib.tree.impurity.Entropy import org.apache.spark.mllib.util.MLUtils import reforest.rf.feature.RFStrategyFeatureSQRT import reforest.rf.parameter._ import reforest.util.CCUtil import scala.util.Random object MLLibRandomForest { def main(args: Array[String]): Unit = { val property = RFParameterBuilder.apply .addParameter(RFParameterType.Dataset, "data/sample-covtype.libsvm") .addParameter(RFParameterType.NumFeatures, 54) .addParameter(RFParameterType.NumClasses, 10) .addParameter(RFParameterType.NumTrees, 100) .addParameter(RFParameterType.Depth, Array(10)) .addParameter(RFParameterType.BinNumber, Array(8)) .addParameter(RFParameterType.SparkMaster, "local[4]") .addParameter(RFParameterType.SparkCoresMax, 4) .addParameter(RFParameterType.SparkPartition, 4*4) .addParameter(RFParameterType.SparkExecutorMemory, "4096m") .addParameter(RFParameterType.SparkExecutorInstances, 1) .build val sc = CCUtil.getSparkContext(property) sc.setLogLevel("error") val timeStart = System.currentTimeMillis() val data = MLUtils.loadLibSVMFile(sc, property.dataset, property.numFeatures, property.sparkCoresMax * 2) val splits = data.randomSplit(Array(0.6, 0.2, 0.2), 0) val (trainingData, testData) = (splits(0), splits(2)) // Train a RandomForest model. // val categoricalFeaturesInfo = Array.tabulate(200)(i => (i, 5)).toMap val categoricalFeaturesInfo = Map[Int, Int]() val featureSubsetStrategy = "sqrt" val impurity = "entropy" val s = new Strategy(Algo.Classification, Entropy, property.getMaxDepth, property.numClasses, property.getMaxBinNumber, QuantileStrategy.Sort, categoricalFeaturesInfo, 1) val model = RandomForest.trainClassifier(trainingData, s, property.getMaxNumTrees, featureSubsetStrategy, Random.nextInt()) val timeEnd = System.currentTimeMillis() val labelAndPreds = testData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() println("Time: "+(timeEnd-timeStart)) println("Test Error = " + testErr) if (property.outputTree) { println("Learned classification forest model:\n" + model.toDebugString) } } }
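The 0.6/0.2/0.2 split above keeps splits(1) but never uses it; it could serve as a validation set for choosing depth or bin number before the test split is touched. A hedged sketch that evaluates the same model on that middle split, reusing splits and model from above.

// Not in the original: error on the unused validation split.
val validationData = splits(1)
val validationErr = validationData.map { point =>
  (point.label, model.predict(point.features))
}.filter { case (label, prediction) => label != prediction }
 .count.toDouble / validationData.count()
println("Validation Error = " + validationErr)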
Example 79
Source File: LRAccuracyTest.scala From SparseML with Apache License 2.0 | 5 votes |
package MLlib import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel, SparseLogisticRegressionWithLBFGS} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkContext, SparkConf} object LRAccuracyTest { def main(args: Array[String]) { val conf = new SparkConf().setAppName(s"LogisticRegressionTest with $args").setMaster("local") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").map( l => LabeledPoint(l.label, l.features.toSparse)) // Split data into training (60%) and test (40%). val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) // Run training algorithm to build the model val model = new SparseLogisticRegressionWithLBFGS() .setNumClasses(5) .run(training) // Compute raw scores on the test set. val predictionAndLabels = test.map { case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } // Get evaluation metrics. val metrics = new MulticlassMetrics(predictionAndLabels) val precision = metrics.precision println("Precision = " + precision) } }
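MulticlassMetrics can report considerably more than the overall precision printed above; a hedged sketch of a few extra calls on the same metrics object.

// Additional diagnostics from the same MulticlassMetrics instance.
println("Confusion matrix:\n" + metrics.confusionMatrix)
metrics.labels.foreach { label =>
  println(s"class $label: precision = ${metrics.precision(label)}, recall = ${metrics.recall(label)}")
}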
Example 80
Source File: Test_example_CNN.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package tests

import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.storage.StorageLevel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.{ Vector, Vectors }
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import breeze.linalg.{ Matrix => BM, CSCMatrix => BSM, DenseMatrix => BDM, Vector => BV, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy, svd => brzSvd, max => Bmax, min => Bmin, sum => Bsum }
import scala.collection.mutable.ArrayBuffer
import CNN.CNN

object Test_example_CNN {
  def main(args: Array[String]) {
    // 1. Set up the Spark context
    val conf = new SparkConf().setAppName("CNNtest")
    val sc = new SparkContext(conf)

    // 2. Load the data: each line holds 10 label values followed by 28x28 pixel values
    Logger.getRootLogger.setLevel(Level.WARN)
    val data_path = "/deeplearn/train_d3.txt"
    val examples = sc.textFile(data_path).cache()
    val train_d1 = examples.map { line =>
      val f1 = line.split("\t")
      val f = f1.map(f => f.toDouble)
      val y = f.slice(0, 10)
      val x = f.slice(10, f.length)
      (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0)
    }
    val train_d = train_d1.map(f => (f._1, f._2))

    // 3. Set the training options and build the model
    // opts: training options (batch size, number of epochs, cross-validation fraction)
    val opts = Array(50.0, 1.0, 0.0)
    train_d.cache
    val numExamples = train_d.count()
    println(s"numExamples = $numExamples.")
    val CNNmodel = new CNN().
      setMapsize(new BDM(1, 2, Array(28.0, 28.0))).
      setTypes(Array("i", "c", "s", "c", "s")).
      setLayer(5).
      setOnum(10).
      setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)).
      setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)).
      setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)).
      setAlpha(1.0).
      CNNtrain(train_d, opts)

    // 4. Test the model
    val CNNforecast = CNNmodel.predict(train_d)
    val CNNerror = CNNmodel.Loss(CNNforecast)
    println(s"CNNerror = $CNNerror.")
    val printf1 = CNNforecast.map(f => (f.label.data, f.predict_label.data)).take(200)
    println("Predicted values:")
    for (i <- 0 until printf1.length) {
      val outi = printf1(i)._2.mkString("\t")
      println(outi)
    }
  }
}
Example 81
Source File: LogLoss.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.loss import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.util.MLUtils @Since("1.2.0") override def gradient(prediction: Double, label: Double): Double = { - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction)) } override private[spark] def computeError(prediction: Double, label: Double): Double = { val margin = 2.0 * label * prediction // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable. 2.0 * MLUtils.log1pExp(-margin) } }
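computeError above is the doubled logistic loss written in a numerically stable form, and gradient is its derivative with respect to the prediction. A small self-contained sketch outside Spark's private API that reproduces both formulas with plain math calls, handy for checking signs on a concrete point; the sample values are arbitrary.

object LogLossCheck {
  // 2 * log(1 + exp(-2 * label * prediction)); MLUtils.log1pExp adds extra overflow protection.
  def error(prediction: Double, label: Double): Double =
    2.0 * math.log1p(math.exp(-2.0 * label * prediction))

  // Derivative of the expression above with respect to prediction.
  def gradient(prediction: Double, label: Double): Double =
    -4.0 * label / (1.0 + math.exp(2.0 * label * prediction))

  def main(args: Array[String]): Unit = {
    val (prediction, label) = (0.3, 1.0) // arbitrary sample point
    println(s"error = ${error(prediction, label)}, gradient = ${gradient(prediction, label)}")
  }
}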
Example 82
Source File: NormalizerExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.Normalizer import org.apache.spark.mllib.util.MLUtils // $example off$ object NormalizerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("NormalizerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val normalizer1 = new Normalizer() val normalizer2 = new Normalizer(p = Double.PositiveInfinity) // Each sample in data1 will be normalized using $L^2$ norm. val data1 = data.map(x => (x.label, normalizer1.transform(x.features))) // Each sample in data2 will be normalized using $L^\infty$ norm. val data2 = data.map(x => (x.label, normalizer2.transform(x.features))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 83
Source File: StandardScalerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils // $example off$ object StandardScalerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("StandardScalerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
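scaler3 is built from scaler2's std and mean, so the two should transform identically; a small hedged check on one record, reusing data, scaler2 and scaler3 from the snippet above (withMean requires a dense input).

// Not part of the original example: confirm scaler2 and scaler3 agree on one record.
val firstDense = Vectors.dense(data.first().features.toArray)
println(scaler2.transform(firstDense) == scaler3.transform(firstDense))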
Example 84
Source File: SparseNaiveBayes.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.util.MLUtils object SparseNaiveBayes { case class Params( input: String = null, minPartitions: Int = 0, numFeatures: Int = -1, lambda: Double = 1.0) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("SparseNaiveBayes") { head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.") opt[Int]("numPartitions") .text("min number of partitions") .action((x, c) => c.copy(minPartitions = x)) opt[Int]("numFeatures") .text("number of features") .action((x, c) => c.copy(numFeatures = x)) opt[Double]("lambda") .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}") .action((x, c) => c.copy(lambda = x)) arg[String]("<input>") .text("input paths to labeled examples in LIBSVM format") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val minPartitions = if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions val examples = MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions) // Cache examples because it will be used in both training and evaluation. examples.cache() val splits = examples.randomSplit(Array(0.8, 0.2)) val training = splits(0) val test = splits(1) val numTraining = training.count() val numTest = test.count() println(s"numTraining = $numTraining, numTest = $numTest.") val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) val predictionAndLabel = prediction.zip(test.map(_.label)) val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest println(s"Test accuracy = $accuracy.") sc.stop() } } // scalastyle:on println
Example 85
Source File: Correlations.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"Correlations with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Calculate label -- feature correlations val labelRDD = examples.map(_.label) val numFeatures = examples.take(1)(0).features.size val corrType = "pearson" println() println(s"Correlation ($corrType) between label and each feature") println(s"Feature\tCorrelation") var feature = 0 while (feature < numFeatures) { val featureRDD = examples.map(_.features(feature)) val corr = Statistics.corr(labelRDD, featureRDD) println(s"$feature\t$corr") feature += 1 } println() sc.stop() } } // scalastyle:on println
Example 86
Source File: NaiveBayesExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel} import org.apache.spark.mllib.util.MLUtils // $example off$ object NaiveBayesExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("NaiveBayesExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data file. val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") // Split data into training (60%) and test (40%). val Array(training, test) = data.randomSplit(Array(0.6, 0.4)) val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial") val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() // Save and load model model.save(sc, "target/tmp/myNaiveBayesModel") val sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel") // $example off$ } } // scalastyle:on println
Example 87
Source File: NormalizerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.Normalizer import org.apache.spark.mllib.util.MLUtils // $example off$ object NormalizerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("NormalizerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val normalizer1 = new Normalizer() val normalizer2 = new Normalizer(p = Double.PositiveInfinity) // Each sample in data1 will be normalized using $L^2$ norm. val data1 = data.map(x => (x.label, normalizer1.transform(x.features))) // Each sample in data2 will be normalized using $L^\infty$ norm. val data2 = data.map(x => (x.label, normalizer2.transform(x.features))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 88
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.feature.{HashingTF, IDF} import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.{SparseVector => SV} import org.apache.spark.mllib.util.MLUtils //import org.apache.spark.ml.feature.HashingTF //import org.apache.spark.ml.feature.IDF object DocumentClassification { def main(args: Array[String]) { val sc = new SparkContext("local[2]", "First Spark App") val path = "../data/20news-bydate-train/*" val rdd = sc.wholeTextFiles(path) val text = rdd.map { case (file, text) => text } val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head } val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap val dim = math.pow(2, 18).toInt val hashingTF = new HashingTF(dim) var tokens = text.map(doc => TFIDFExtraction.tokenize(doc)) val tf = hashingTF.transform(tokens) tf.cache val v = tf.first.asInstanceOf[SV] val idf = new IDF().fit(tf) val tfidf = idf.transform(tf) val zipped = newsgroups.zip(tfidf) println(zipped.first()) val train = zipped.map { case (topic, vector) => { LabeledPoint(newsgroupsMap(topic), vector) } } //TODO uncomment to generate libsvm format MLUtils.saveAsLibSVMFile(train,"./output/20news-by-date-train-libsvm") train.cache val model = NaiveBayes.train(train, lambda = 0.1) val testPath = "../data/20news-bydate-test/*" val testRDD = sc.wholeTextFiles(testPath) val testLabels = testRDD.map { case (file, text) => val topic = file.split("/").takeRight(2).head newsgroupsMap(topic) } val testTf = testRDD.map { case (file, text) => hashingTF.transform(TFIDFExtraction.tokenize(text)) } val testTfIdf = idf.transform(testTf) val zippedTest = testLabels.zip(testTfIdf) val test = zippedTest.map { case (topic, vector) => { println(topic) println(vector) LabeledPoint(topic, vector) } } //TODO uncomment to generate libsvm format MLUtils.saveAsLibSVMFile(test,"./output/20news-by-date-test-libsvm") val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() println(accuracy) // Updated Dec 2016 by Rajdeep //0.7928836962294211 val metrics = new MulticlassMetrics(predictionAndLabel) println(metrics.accuracy) println(metrics.weightedFalsePositiveRate) println(metrics.weightedPrecision) println(metrics.weightedFMeasure) println(metrics.weightedRecall) //0.7822644376431702 val rawTokens = rdd.map { case (file, text) => text.split(" ") } val rawTF = rawTokens.map(doc => hashingTF.transform(doc)) val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) } val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1) val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) } val rawZippedTest = testLabels.zip(rawTestTF) val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) } val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label)) val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count() println(rawAccuracy) // 0.7661975570897503 val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel) println(rawMetrics.weightedFMeasure) // older value 0.7628947184990661 // dec 2016 : 0.7653320418573546 sc.stop() } }
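The saveAsLibSVMFile calls export the TF-IDF training and test sets; they can be read back with MLUtils.loadLibSVMFile in a later job, which is the round trip this page is about. A brief hedged sketch using the same output paths as above.

// Reload the exported LIBSVM datasets in a separate job.
val trainReloaded = MLUtils.loadLibSVMFile(sc, "./output/20news-by-date-train-libsvm")
val testReloaded = MLUtils.loadLibSVMFile(sc, "./output/20news-by-date-test-libsvm")
println(s"reloaded ${trainReloaded.count()} training and ${testReloaded.count()} test points")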
Example 89
Source File: StandardScalarSample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkConf, SparkContext} object StandardScalarSample { def main(args: Array[String]) { val conf = new SparkConf().setMaster("local").setAppName("Word2Vector") val sc = new SparkContext(conf) val data = MLUtils.loadLibSVMFile(sc, "/home/ubuntu/work/spark-1.6.0-bin-hadoop2.6/data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) println(data1.first()) // Without converting the features into dense vectors, transformation with zero mean will raise // exception on sparse vector. // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) println(data2.first()) } }
Example 90
Source File: StandardScalarSample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkConf, SparkContext} object StandardScalarSample { def main(args: Array[String]) { val conf = new SparkConf().setMaster("local").setAppName("Word2Vector") val sc = new SparkContext(conf) val data = MLUtils.loadLibSVMFile(sc, org.sparksamples.Util.SPARK_HOME + "/data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) println(data1.first()) // Without converting the features into dense vectors, transformation with zero mean will raise // exception on sparse vector. // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) println(data2.first()) } }
Example 91
Source File: StreamingMLUtils.scala From spark-structured-streaming-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib import scala.language.implicitConversions import org.apache.spark.ml.linalg.{SparseVector, DenseVector, Vector} import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} import org.apache.spark.mllib.util.MLUtils object StreamingMLUtils { implicit def mlToMllibVector(v: Vector): OldVector = v match { case dv: DenseVector => OldVectors.dense(dv.toArray) case sv: SparseVector => OldVectors.sparse(sv.size, sv.indices, sv.values) case _ => throw new IllegalArgumentException } def fastSquaredDistance(x: Vector, xNorm: Double, y: Vector, yNorm: Double) = { MLUtils.fastSquaredDistance(x, xNorm, y, yNorm) } }
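MLUtils.fastSquaredDistance is package-private, which is exactly why this thin wrapper lives under org.apache.spark.mllib. A hedged usage sketch from calling code, precomputing the L2 norms the helper expects.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.StreamingMLUtils

val x = Vectors.dense(1.0, 2.0, 3.0)
val y = Vectors.dense(1.5, 1.0, 3.5)
// The helper takes the two vectors plus their precomputed L2 norms.
val d2 = StreamingMLUtils.fastSquaredDistance(x, Vectors.norm(x, 2.0), y, Vectors.norm(y, 2.0))
println(s"squared distance = $d2")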
Example 92
Source File: LogisticRegression.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.alg import org.apache.spark.mllib.classification.LogisticRegressionModel import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.optimization.LBFGS import org.apache.spark.mllib.optimization.LogisticGradient import org.apache.spark.mllib.optimization.SquaredL2Updater import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.UnderlyingAlgorithm //Define a LogisticRegression UnderlyingAlgorithm private object LogisticRegression { def trainingProcedure( input: RDD[LabeledPoint], maxNumItearations: Int, regParam: Double, numCorrections: Int, convergenceTol: Double): (Vector => Double) = { //Train Logistic Regression with LBFGS val numFeatures = input.take(1)(0).features.size val training = input.map(x => (x.label, MLUtils.appendBias(x.features))).cache() val initialWeightsWithIntercept = Vectors.dense(new Array[Double](numFeatures + 1)) val (weightsWithIntercept, _) = LBFGS.runLBFGS( training, new LogisticGradient(), new SquaredL2Updater(), numCorrections, convergenceTol, maxNumItearations, regParam, initialWeightsWithIntercept) //Create the model using the weights val model = new LogisticRegressionModel( Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)), weightsWithIntercept(weightsWithIntercept.size - 1)) //Return raw score predictor model.clearThreshold() model.predict } } class LogisticRegression( private val input: RDD[LabeledPoint], private val maxNumItearations: Int = 100, private val regParam: Double = 0.1, private val numCorrections: Int = 10, private val convergenceTol: Double = 1e-4) extends UnderlyingAlgorithm( LogisticRegression.trainingProcedure( input, maxNumItearations, regParam, numCorrections, convergenceTol)) { override def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { 1-score } else { score } } }
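The nonconformity measure at the end is simply one minus the raw score for positive examples and the raw score itself for negatives, so a confident correct prediction looks highly conformal. A tiny hedged illustration of that logic in isolation, with made-up scores.

// Illustrative only: nonconformity of a raw score of 0.9 under each possible label.
def nonConformity(score: Double, label: Double): Double =
  if (label == 1.0) 1 - score else score

println(nonConformity(0.9, 1.0)) // 0.1: conforms well with the positive class
println(nonConformity(0.9, 0.0)) // 0.9: conforms poorly with the negative class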