org.apache.spark.ml.feature.VectorAssembler Scala Examples
The following examples show how to use org.apache.spark.ml.feature.VectorAssembler.
Example 1
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 6 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
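Each of these pipeline helpers takes an already-configured VectorAssembler together with the raw DataFrame and splices the assembler into the Pipeline stages itself. A minimal invocation sketch, assuming a hypothetical DataFrame df whose numeric feature columns are named "f1", "f2" and "f3" (the column names and call site are illustrative, not taken from the book's code):

// Hypothetical wiring; replace the column names with the real StumbleUpon feature columns.
val assembler = new VectorAssembler()
  .setInputCols(Array("f1", "f2", "f3"))
  .setOutputCol("features")

RandomForestPipeline.randomForestPipeline(assembler, df)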
Example 2
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 6 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object GradientBoostedTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val gbt = new GBTClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxIter(10)

    stages += vectorAssembler
    stages += gbt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame, regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 3
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame

object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x =>
      if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect / totalPoints
    println("Accuracy of LogisticRegression is: ", accuracy)

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/LR.xls")
    holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/Actual.xls")

    savePredictions(holdout, dataFrame, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/LogisticRegression.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame, regressionMetrics: RegressionMetrics, filePath: String) = {
    println("Mean Squared Error:", regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError)

    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 4
Source File: ImportanceSelectorSuite.scala From spark-FeatureSelection with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature.selection.embedded

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.selection.{FeatureSelectionTestBase, FeatureSelectorTestBase}
import org.apache.spark.ml.linalg.Vectors

class ImportanceSelectorSuite extends FeatureSelectionTestBase {
  // Order of feature importances must be: f4 > f3 > f2 > f1
  private val featureWeights = Vectors.dense(Array(0.3, 0.5, 0.7, 0.8))

  test("Test ImportanceSelector: numTopFeatures") {
    val selector = new ImportanceSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName)
      .setFeatureWeights(featureWeights)
      .setOutputCol("filtered").setSelectorType("numTopFeatures").setNumTopFeatures(2)

    val importantColNames = Array("pWidth", "pLength")
    val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset)

    FeatureSelectorTestBase.testSelector[ImportanceSelector, ImportanceSelectorModel](selector, df, importantColNames, "ImportantFeatures")
  }

  test("Test ImportanceSelector: percentile") {
    val selector = new ImportanceSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName)
      .setOutputCol("filtered").setSelectorType("percentile").setPercentile(0.51).setFeatureWeights(featureWeights)

    val importantColNames = Array("pWidth", "pLength")
    val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset)

    FeatureSelectorTestBase.testSelector[ImportanceSelector, ImportanceSelectorModel](selector, df, importantColNames, "ImportantFeatures")
  }

  test("Test ImportanceSelector: randomCutOff") {
    val selector = new ImportanceSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName)
      .setOutputCol("filtered").setSelectorType("randomCutOff").setRandomCutOff(1.0).setFeatureWeights(featureWeights)

    val importantColNames = Array("pWidth", "pLength", "sWidth", "sLength")
    val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset)

    FeatureSelectorTestBase.testSelector[ImportanceSelector, ImportanceSelectorModel](selector, df, importantColNames, "ImportantFeatures")
  }

  test("ImportanceSelector read/write") {
    val nb = new ImportanceSelector
    testEstimatorAndModelReadWrite[ImportanceSelector, ImportanceSelectorModel](nb, dataset,
      FeatureSelectorTestBase.allParamSettings.+("featureWeights" -> featureWeights), FeatureSelectorTestBase.checkModelData)
  }
}
Example 5
Source File: VectorAssemblerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object VectorAssemblerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorAssemblerExample")
      .getOrCreate()

    // $example on$
    val dataset = spark.createDataFrame(
      Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
    ).toDF("id", "hour", "mobile", "userFeatures", "clicked")

    val assembler = new VectorAssembler()
      .setInputCols(Array("hour", "mobile", "userFeatures"))
      .setOutputCol("features")

    val output = assembler.transform(dataset)
    println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
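For the single row above, the assembler prepends the scalar columns and flattens the userFeatures vector, so show(false) should print output roughly of the form:

+-----------------------+-------+
|features               |clicked|
+-----------------------+-------+
|[18.0,1.0,0.0,10.0,0.5]|1.0    |
+-----------------------+-------+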
Example 6
Source File: InteractionExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Interaction
import org.apache.spark.ml.feature.VectorAssembler
// $example off$
import org.apache.spark.sql.SparkSession

object InteractionExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("InteractionExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (1, 1, 2, 3, 8, 4, 5),
      (2, 4, 3, 8, 7, 9, 8),
      (3, 6, 1, 9, 2, 3, 6),
      (4, 10, 8, 6, 9, 4, 5),
      (5, 9, 2, 7, 10, 7, 3),
      (6, 1, 1, 4, 2, 8, 4)
    )).toDF("id1", "id2", "id3", "id4", "id5", "id6", "id7")

    val assembler1 = new VectorAssembler().
      setInputCols(Array("id2", "id3", "id4")).
      setOutputCol("vec1")

    val assembled1 = assembler1.transform(df)

    val assembler2 = new VectorAssembler().
      setInputCols(Array("id5", "id6", "id7")).
      setOutputCol("vec2")

    val assembled2 = assembler2.transform(assembled1).select("id1", "vec1", "vec2")

    val interaction = new Interaction()
      .setInputCols(Array("id1", "vec1", "vec2"))
      .setOutputCol("interactedCol")

    val interacted = interaction.transform(assembled2)

    interacted.show(truncate = false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 7
Source File: LinearRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.regression.bikesharing

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, SparkSession}

object LinearRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def linearRegressionWithVectorFormat(vectorAssembler: VectorAssembler, vectorIndexer: VectorIndexer, dataFrame: DataFrame) = {
    val lr = new LinearRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setRegParam(0.1)
      .setElasticNetParam(1.0)
      .setMaxIter(10)

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr))

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)

    val model = pipeline.fit(training)

    val fullPredictions = model.transform(test).cache()
    val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0))
    val labels = fullPredictions.select("label").rdd.map(_.getDouble(0))
    val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError
    println(s" Root mean squared error (RMSE): $RMSE")
  }

  def linearRegressionWithSVMFormat(spark: SparkSession) = {
    // Load training data
    val training = spark.read.format("libsvm")
      .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt")

    val lr = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for linear regression
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

    // Summarize the model over the training set and print out some metrics
    val trainingSummary = lrModel.summary
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
    trainingSummary.residuals.show()
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    println(s"r2: ${trainingSummary.r2}")
  }
}
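linearRegressionWithVectorFormat expects its two feature stages to be pre-built so that the indexer's output column matches the "features" column the regressor reads. A minimal sketch of that wiring, assuming hypothetical bike-sharing columns "temp", "hum" and "windspeed" and a loaded DataFrame df (all illustrative, not taken from the book's code):

// Hypothetical feature columns; substitute the actual bike-sharing schema.
val assembler = new VectorAssembler()
  .setInputCols(Array("temp", "hum", "windspeed"))
  .setOutputCol("rawFeatures")

val indexer = new VectorIndexer()
  .setInputCol("rawFeatures")
  .setOutputCol("features")
  .setMaxCategories(4)

LinearRegressionPipeline.linearRegressionWithVectorFormat(assembler, indexer, df)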
Example 8
Source File: GeneralizedLinearRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.regression.bikesharing

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{SparkSession, _}

object GeneralizedLinearRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def genLinearRegressionWithVectorFormat(vectorAssembler: VectorAssembler, vectorIndexer: VectorIndexer, dataFrame: DataFrame) = {
    val lr = new GeneralizedLinearRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setFamily("gaussian")
      .setLink("identity")
      .setMaxIter(10)
      .setRegParam(0.3)

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr))

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)

    val model = pipeline.fit(training)

    val fullPredictions = model.transform(test).cache()
    val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0))
    val labels = fullPredictions.select("label").rdd.map(_.getDouble(0))
    val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError
    println(s" Root mean squared error (RMSE): $RMSE")
  }

  def genLinearRegressionWithSVMFormat(spark: SparkSession) = {
    // Load training data
    val training = spark.read.format("libsvm")
      .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt")

    val lr = new GeneralizedLinearRegression()
      .setFamily("gaussian")
      .setLink("identity")
      .setMaxIter(10)
      .setRegParam(0.3)

    // Fit the model
    val model = lr.fit(training)

    // Print the coefficients and intercept for generalized linear regression model
    println(s"Coefficients: ${model.coefficients}")
    println(s"Intercept: ${model.intercept}")

    // Summarize the model over the training set and print out some metrics
    val summary = model.summary
    println(s"Coefficient Standard Errors: ${summary.coefficientStandardErrors.mkString(",")}")
    println(s"T Values: ${summary.tValues.mkString(",")}")
    println(s"P Values: ${summary.pValues.mkString(",")}")
    println(s"Dispersion: ${summary.dispersion}")
    println(s"Null Deviance: ${summary.nullDeviance}")
    println(s"Residual Degree Of Freedom Null: ${summary.residualDegreeOfFreedomNull}")
    println(s"Deviance: ${summary.deviance}")
    println(s"Residual Degree Of Freedom: ${summary.residualDegreeOfFreedom}")
    println(s"AIC: ${summary.aic}")
    println("Deviance Residuals: ")
    summary.residuals().show()
  }
}
Example 9
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame

object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x =>
      if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect / totalPoints
    println("Accuracy of LogisticRegression is: ", accuracy)
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame, regressionMetrics: RegressionMetrics, filePath: String) = {
    println("Mean Squared Error:", regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError)

    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 10
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val nb = new NaiveBayes()

    stages += vectorAssembler
    stages += nb
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 11
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object GradientBoostedTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val gbt = new GBTClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxIter(10)

    stages += vectorAssembler
    stages += gbt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame, regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 12
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += dt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 13
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 14
Source File: LRSelectorSuite.scala From spark-FeatureSelection with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature.selection.embedded

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.selection.{FeatureSelectionTestBase, FeatureSelectorTestBase}
import org.apache.spark.ml.linalg.Matrices

class LRSelectorSuite extends FeatureSelectionTestBase {
  // Order of feature importances must be: f4 > f3 > f2 > f1
  private val lrWeights = Matrices.dense(3, 4, Array(0.1, 0.1, 0.1, 0.2, 0.2, 0.2, -0.8, -0.8, -0.8, 0.9, 0.9, 0.9))

  test("Test LRSelector: numTopFeatures") {
    val selector = new LRSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName).setCoefficientMatrix(lrWeights)
      .setOutputCol("filtered").setSelectorType("numTopFeatures").setNumTopFeatures(2)

    val importantColNames = Array("pWidth", "pLength")
    val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset)

    FeatureSelectorTestBase.testSelector[LRSelector, LRSelectorModel](selector, df, importantColNames, "ImportantFeatures")
  }

  test("Test LRSelector: percentile") {
    val selector = new LRSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName)
      .setOutputCol("filtered").setSelectorType("percentile").setPercentile(0.51).setCoefficientMatrix(lrWeights)

    val importantColNames = Array("pWidth", "pLength")
    val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset)

    FeatureSelectorTestBase.testSelector[LRSelector, LRSelectorModel](selector, df, importantColNames, "ImportantFeatures")
  }

  test("Test LRSelector: randomCutOff") {
    val selector = new LRSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName)
      .setOutputCol("filtered").setSelectorType("randomCutOff").setRandomCutOff(1.0).setCoefficientMatrix(lrWeights)

    val importantColNames = Array("pWidth", "pLength", "sWidth", "sLength")
    val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset)

    FeatureSelectorTestBase.testSelector[LRSelector, LRSelectorModel](selector, df, importantColNames, "ImportantFeatures")
  }

  test("LRSelector read/write") {
    val nb = new LRSelector
    testEstimatorAndModelReadWrite[LRSelector, LRSelectorModel](nb, dataset,
      FeatureSelectorTestBase.allParamSettings.+("coefficientMatrix" -> lrWeights), FeatureSelectorTestBase.checkModelData)
  }
}
Example 15
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val nb = new NaiveBayes()

    stages += vectorAssembler
    stages += nb
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/NB.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/NaiveBayes.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame, regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 16
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object GradientBoostedTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val gbt = new GBTClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxIter(10)

    stages += vectorAssembler
    stages += gbt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame, regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 17
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += dt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/DT.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/DecisionTree.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame, regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 18
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/RF.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/RandomForest.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame, regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 19
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame

object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x =>
      if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect / totalPoints
    println("Accuracy of LogisticRegression is: ", accuracy)

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LR.xls")
    holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/Actual.xls")

    savePredictions(holdout, dataFrame, rm, "/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LogisticRegression.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame, regressionMetrics: RegressionMetrics, filePath: String) = {
    println("Mean Squared Error:", regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError)

    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 20
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val nb = new NaiveBayes()

    stages += vectorAssembler
    stages += nb
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 21
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += dt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 22
Source File: L9-15MLPipeline.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.ml.param.ParamMap

object MLPipelineApp {

  case class Activity(label: Double,
    accelXHand: Double, accelYHand: Double, accelZHand: Double,
    accelXChest: Double, accelYChest: Double, accelZChest: Double,
    accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: MLPipelineApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) == "4" || f(1) == "5")
      .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38)))
      .map(f => f.map(v => v.toDouble))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty) {
          val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5),
            x(6), x(7), x(8), x(9))).toDF()
          val split = accelerometer.randomSplit(Array(0.3, 0.7))
          val test = split(0)
          val train = split(1)

          val assembler = new VectorAssembler()
            .setInputCols(Array(
              "accelXHand", "accelYHand", "accelZHand",
              "accelXChest", "accelYChest", "accelZChest",
              "accelXAnkle", "accelYAnkle", "accelZAnkle"))
            .setOutputCol("vectors")
          val normalizer = new Normalizer()
            .setInputCol(assembler.getOutputCol)
            .setOutputCol("features")
          val regressor = new RandomForestRegressor()

          val pipeline = new Pipeline()
            .setStages(Array(assembler, normalizer, regressor))
          val pMap = ParamMap(normalizer.p -> 1.0)
          val model = pipeline.fit(train, pMap)
          val prediction = model.transform(test)
          prediction.show()
        }
      })

    ssc.start()
    ssc.awaitTermination()
  }

}
Example 23
Source File: InteractionExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Interaction
import org.apache.spark.ml.feature.VectorAssembler
// $example off$
import org.apache.spark.sql.SparkSession

object InteractionExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("InteractionExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (1, 1, 2, 3, 8, 4, 5),
      (2, 4, 3, 8, 7, 9, 8),
      (3, 6, 1, 9, 2, 3, 6),
      (4, 10, 8, 6, 9, 4, 5),
      (5, 9, 2, 7, 10, 7, 3),
      (6, 1, 1, 4, 2, 8, 4)
    )).toDF("id1", "id2", "id3", "id4", "id5", "id6", "id7")

    val assembler1 = new VectorAssembler().
      setInputCols(Array("id2", "id3", "id4")).
      setOutputCol("vec1")

    val assembled1 = assembler1.transform(df)

    val assembler2 = new VectorAssembler().
      setInputCols(Array("id5", "id6", "id7")).
      setOutputCol("vec2")

    val assembled2 = assembler2.transform(assembled1).select("id1", "vec1", "vec2")

    val interaction = new Interaction()
      .setInputCols(Array("id1", "vec1", "vec2"))
      .setOutputCol("interactedCol")

    val interacted = interaction.transform(assembled2)

    interacted.show(truncate = false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 24
Source File: VectorAssemblerSuite.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

class VectorAssemblerSuite extends SparkFeaturePFASuiteBase[VectorAssemblerResult] {

  import spark.implicits._

  val data = Seq((0, 18, 1.0, 3.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
  val df = spark.createDataset(data).toDF("id", "hour", "mobile", "region", "userFeatures", "clicked")

  override val sparkTransformer = new VectorAssembler()
    .setInputCols(Array("hour", "mobile", "region", "userFeatures", "clicked"))
    .setOutputCol("features")

  val result = sparkTransformer.transform(df)
  val columnNames = sparkTransformer.getInputCols.toSeq

  override val input = Array(
    """{"hour":{"double":18},
      |"mobile":{"double":1.0},
      |"region":{"double":3.0},
      |"userFeatures":{"array":[0.0,10.0,0.5]},
      |"clicked":{"double":1.0}}""".stripMargin)

  override val expectedOutput = withColumnAsArray(result, sparkTransformer.getOutputCol).toJSON.collect()
}

case class VectorAssemblerResult(features: Seq[Double]) extends Result
Example 25
Source File: VectorAssembler.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.document.{PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.spark.ml.PFATransformer
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.spark.ml.feature.VectorAssembler
import org.json4s.DefaultFormats

class PFAVectorAssembler(override val sparkTransformer: VectorAssembler) extends PFATransformer {

  import com.ibm.aardpfark.pfa.dsl._

  implicit val formats = DefaultFormats

  private val inputCols = sparkTransformer.getInputCols
  private val outputCol = sparkTransformer.getOutputCol

  type DorSeqD = Either[Double, Seq[Double]]

  override protected def inputSchema: Schema = {
    val builder = SchemaBuilder.record(withUid(inputBaseName)).fields()
    for (inputCol <- inputCols) {
      builder.name(inputCol).`type`()
        .unionOf()
        .doubleType().and()
        .array().items().doubleType()
        .endUnion().noDefault()
    }
    builder.endRecord()
  }

  override protected def outputSchema: Schema = {
    SchemaBuilder.record(withUid(outputBaseName)).fields()
      .name(outputCol).`type`().array().items().doubleType().noDefault()
      .endRecord()
  }

  private val asDouble = As[Double]("x", x => NewArray[Double](x))
  private val asArray = As[Array[Double]]("x", x => x)

  private val castFn = NamedFunctionDef("castToArray",
    FunctionDef[DorSeqD, Seq[Double]]("x") { x =>
      Cast(x, asDouble, asArray)
    }
  )

  override protected def action: PFAExpression = {
    val cols = Let("cols", NewArray[DorSeqD](inputCols.map(c => StringExpr(s"input.$c"))))
    Action(
      cols,
      NewRecord(outputSchema, Map(outputCol -> a.flatten(a.map(cols.ref, castFn.ref))))
    )
  }

  override def pfa: PFADocument = {
    PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withAction(action)
      .withFunction(castFn)
      .pfa
  }
}
Example 26
Source File: GBTLRExample.scala From spark-gbtlr with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.ml

import org.apache.spark.ml.gbtlr.GBTLRClassifier
import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession

// scalastyle:off println

object GBTLRExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[2]")
      .appName("gbtlr example")
      .getOrCreate()

    val startTime = System.currentTimeMillis()

    val dataset = spark.read.option("header", "true").option("inferSchema", "true")
      .option("delimiter", ";").csv("data/bank/bank-full.csv")

    val columnNames = Array("job", "marital", "education",
      "default", "housing", "loan", "contact", "month", "poutcome", "y")
    val indexers = columnNames.map(name => new StringIndexer()
      .setInputCol(name).setOutputCol(name + "_index"))
    val pipeline = new Pipeline().setStages(indexers)
    val data1 = pipeline.fit(dataset).transform(dataset)
    val data2 = data1.withColumnRenamed("y_index", "label")

    val assembler = new VectorAssembler()
    assembler.setInputCols(Array("age", "job_index", "marital_index",
      "education_index", "default_index", "balance", "housing_index",
      "loan_index", "contact_index", "day", "month_index", "duration",
      "campaign", "pdays", "previous", "poutcome_index"))
    assembler.setOutputCol("features")

    val data3 = assembler.transform(data2)
    val data4 = data3.randomSplit(Array(4, 1))

    val gBTLRClassifier = new GBTLRClassifier()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setGBTMaxIter(10)
      .setLRMaxIter(100)
      .setRegParam(0.01)
      .setElasticNetParam(0.5)

    val model = gBTLRClassifier.fit(data4(0))
    val summary = model.evaluate(data4(1))
    val endTime = System.currentTimeMillis()
    val auc = summary.binaryLogisticRegressionSummary
      .asInstanceOf[BinaryLogisticRegressionSummary].areaUnderROC

    println(s"Training and evaluating cost ${(endTime - startTime) / 1000} seconds")
    println(s"The model's auc: ${auc}")
  }
}

// scalastyle:on println
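Note that randomSplit normalizes its weights, so Array(4, 1) yields roughly an 80/20 train/test split. An equivalent, reproducible variant (the seed value here is arbitrary and not part of the original example):

// Same 80/20 split expressed with normalized weights and a fixed seed.
val Array(trainData, testData) = data3.randomSplit(Array(0.8, 0.2), seed = 42L)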
Example 27
Source File: BaseTransformerConverter.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.converter.runtime import com.truecar.mleap.runtime.transformer import org.apache.spark.ml.PipelineModel import org.apache.spark.ml.classification.RandomForestClassificationModel import org.apache.spark.ml.feature.{IndexToString, StandardScalerModel, StringIndexerModel, VectorAssembler} import org.apache.spark.ml.mleap.classification.SVMModel import org.apache.spark.ml.mleap.converter.runtime.classification.{RandomForestClassificationModelToMleap, SupportVectorMachineModelToMleap} import org.apache.spark.ml.mleap.converter.runtime.feature.{IndexToStringToMleap, StandardScalerModelToMleap, StringIndexerModelToMleap, VectorAssemblerModelToMleap} import org.apache.spark.ml.mleap.converter.runtime.regression.{LinearRegressionModelToMleap, RandomForestRegressionModelToMleap} import org.apache.spark.ml.regression.{LinearRegressionModel, RandomForestRegressionModel} trait BaseTransformerConverter extends SparkTransformerConverter { // regression implicit val mleapLinearRegressionModelToMleap: TransformerToMleap[LinearRegressionModel, transformer.LinearRegressionModel] = addConverter(LinearRegressionModelToMleap) implicit val mleapRandomForestRegressionModelToMleap: TransformerToMleap[RandomForestRegressionModel, transformer.RandomForestRegressionModel] = addConverter(RandomForestRegressionModelToMleap) // classification implicit val mleapRandomForestClassificationModelToMleap: TransformerToMleap[RandomForestClassificationModel, transformer.RandomForestClassificationModel] = addConverter(RandomForestClassificationModelToMleap) implicit val mleapSupportVectorMachineModelToMleap: TransformerToMleap[SVMModel, transformer.SupportVectorMachineModel] = addConverter(SupportVectorMachineModelToMleap) //feature implicit val mleapIndexToStringToMleap: TransformerToMleap[IndexToString, transformer.ReverseStringIndexerModel] = addConverter(IndexToStringToMleap) implicit val mleapStandardScalerModelToMleap: TransformerToMleap[StandardScalerModel, transformer.StandardScalerModel] = addConverter(StandardScalerModelToMleap) implicit val mleapStringIndexerModelToMleap: TransformerToMleap[StringIndexerModel, transformer.StringIndexerModel] = addConverter(StringIndexerModelToMleap) implicit val mleapVectorAssemblerToMleap: TransformerToMleap[VectorAssembler, transformer.VectorAssemblerModel] = addConverter(VectorAssemblerModelToMleap) // other implicit val mleapPipelineModelToMleap: TransformerToMleap[PipelineModel, transformer.PipelineModel] = addConverter(PipelineModelToMleap(this)) } object BaseTransformerConverter extends BaseTransformerConverter
Example 28
Source File: TitanicLogisticRegression.scala From spark-spec with MIT License | 5 votes |
package com.github.mrpowers.spark.spec.ml.classification import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.sql.DataFrame object TitanicLogisticRegression extends SparkSessionWrapper { def withVectorizedFeatures( featureColNames: Array[String] = Array("Gender", "Age", "SibSp", "Parch", "Fare"), outputColName: String = "features" )(df: DataFrame): DataFrame = { val assembler: VectorAssembler = new VectorAssembler() .setInputCols(featureColNames) .setOutputCol(outputColName) assembler.transform(df) } def withLabel( inputColName: String = "Survived", outputColName: String = "label" )(df: DataFrame) = { val labelIndexer: StringIndexer = new StringIndexer() .setInputCol(inputColName) .setOutputCol(outputColName) labelIndexer .fit(df) .transform(df) } def model(df: DataFrame = TitanicData.trainingDF()): LogisticRegressionModel = { val trainFeatures: DataFrame = df .transform(withVectorizedFeatures()) .transform(withLabel()) .select("features", "label") // only uses the features and label columns new LogisticRegression() .fit(trainFeatures) } def persistModel(): Unit = { model().save("./tmp/titanic_model/") } }
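The persisted model can later be reloaded and applied to new passenger data through the same vectorization helper. A short scoring sketch grounded in the members above; for brevity it reuses TitanicData.trainingDF(), whereas real scoring would use a held-out frame:

import org.apache.spark.ml.classification.LogisticRegressionModel

// Assumes persistModel() has already been run, so a model exists at this path.
val loaded = LogisticRegressionModel.load("./tmp/titanic_model/")

val scored = loaded.transform(
  TitanicData.trainingDF()
    .transform(TitanicLogisticRegression.withVectorizedFeatures())
)
scored.select("features", "prediction").show(5)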
Example 29
Source File: IrisKMeansClustering.scala From spark-spec with MIT License | 5 votes |
package com.github.mrpowers.spark.spec.ml.clustering import com.github.mrpowers.spark.spec.Config import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper import org.apache.spark.ml.clustering.{KMeans, KMeansModel} import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.sql.DataFrame object IrisKMeansClustering extends SparkSessionWrapper { val irisDF = spark .read .option("header", "true") .option("inferSchema", "true") .csv(Config.get("irisData")) val Array(trainingDF, testDF) = irisDF.randomSplit(Array(0.7, 0.3), seed = 12345) def withVectorizedFeatures( featureColNames: Array[String] = Array("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"), outputColName: String = "features" )(df: DataFrame): DataFrame = { val assembler: VectorAssembler = new VectorAssembler() .setInputCols(featureColNames) .setOutputCol(outputColName) assembler.transform(df) } def model(df: DataFrame = trainingDF): KMeansModel = { val trainFeatures: DataFrame = df .transform(withVectorizedFeatures()) new KMeans() .setK(3) // # of clusters .setSeed(2L) .fit(trainFeatures) } def persistModel(): Unit = { model().save("./tmp/iris_kMeans_model/") } }
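Once fitted, the KMeansModel assigns a cluster id (in the default "prediction" column) to any frame that has been run through the same vectorization helper. A short sketch using the held-out split defined above:

// Vectorize the test split with the same helper, then assign clusters.
val clusteredTest = IrisKMeansClustering
  .model()
  .transform(IrisKMeansClustering.testDF.transform(IrisKMeansClustering.withVectorizedFeatures()))

clusteredTest.select("features", "prediction").show(5)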
Example 30
Source File: LOFSuite.scala From spark-lof with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.outlier import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.{DataTypes, StructField, StructType} import org.apache.spark.sql.functions._ object LOFSuite { def main(args: Array[String]): Unit = { val spark = SparkSession .builder() .appName("LOFExample") .master("local[4]") .getOrCreate() val schema = new StructType(Array( new StructField("col1", DataTypes.DoubleType), new StructField("col2", DataTypes.DoubleType))) val df = spark.read.schema(schema).csv("data/outlier.csv") val assembler = new VectorAssembler() .setInputCols(df.columns) .setOutputCol("features") val data = assembler.transform(df).repartition(4) val startTime = System.currentTimeMillis() val result = new LOF() .setMinPts(5) .transform(data) val endTime = System.currentTimeMillis() result.count() // Outliers have much higher LOF value than normal data result.sort(desc(LOF.lof)).head(10).foreach { row => println(row.get(0) + " | " + row.get(1) + " | " + row.get(2)) } println("Total time = " + (endTime - startTime) / 1000.0 + "s") } }
Example 31
Source File: VectorAssemblerExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.mllib.linalg.Vectors // $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object VectorAssemblerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("VectorAssemblerExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) // $example on$ val dataset = sqlContext.createDataFrame( Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0)) ).toDF("id", "hour", "mobile", "userFeatures", "clicked") val assembler = new VectorAssembler() .setInputCols(Array("hour", "mobile", "userFeatures")) .setOutputCol("features") val output = assembler.transform(dataset) println(output.select("features", "clicked").first()) // $example off$ sc.stop() } } // scalastyle:on println
Example 32
Source File: VectorAssemblerExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object VectorAssemblerExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("VectorAssemblerExample") .getOrCreate() // $example on$ val dataset = spark.createDataFrame( Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0)) ).toDF("id", "hour", "mobile", "userFeatures", "clicked") val assembler = new VectorAssembler() .setInputCols(Array("hour", "mobile", "userFeatures")) .setOutputCol("features") val output = assembler.transform(dataset) println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'") output.select("features", "clicked").show(false) // $example off$ spark.stop() } } // scalastyle:on println
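Note that VectorAssembler fails at transform time if any of its input columns contains a null. A small defensive sketch reusing the dataset and assembler from the example above; dropping incomplete rows is just one option:

// Drop rows with nulls in any of the assembler inputs before assembling.
val cleaned = dataset.na.drop(Seq("hour", "mobile", "userFeatures"))
val safeOutput = assembler.transform(cleaned)
safeOutput.select("features", "clicked").show(false)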
Example 33
Source File: VectorSizeHintExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.{VectorAssembler, VectorSizeHint} import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object VectorSizeHintExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("VectorSizeHintExample") .getOrCreate() // $example on$ val dataset = spark.createDataFrame( Seq( (0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0), (0, 18, 1.0, Vectors.dense(0.0, 10.0), 0.0)) ).toDF("id", "hour", "mobile", "userFeatures", "clicked") val sizeHint = new VectorSizeHint() .setInputCol("userFeatures") .setHandleInvalid("skip") .setSize(3) val datasetWithSize = sizeHint.transform(dataset) println("Rows where 'userFeatures' is not the right size are filtered out") datasetWithSize.show(false) val assembler = new VectorAssembler() .setInputCols(Array("hour", "mobile", "userFeatures")) .setOutputCol("features") // This dataframe can be used by downstream transformers as before val output = assembler.transform(datasetWithSize) println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'") output.select("features", "clicked").show(false) // $example off$ spark.stop() } } // scalastyle:on println
Example 34
Source File: L9-17MLCrossValidation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.Normalizer import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.regression.RandomForestRegressor import org.apache.spark.ml.tuning.CrossValidator import org.apache.spark.ml.tuning.ParamGridBuilder import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object MLCrossValidationApp { case class Activity(label: Double, accelXHand: Double, accelYHand: Double, accelZHand: Double, accelXChest: Double, accelYChest: Double, accelZChest: Double, accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: MLCrossValidationApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) == "4" || f(1) == "5") .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .foreachRDD(rdd => { if (!rdd.isEmpty) { val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() val split = accelerometer.randomSplit(Array(0.3, 0.7)) val test = split(0) val train = split(1) val assembler = new VectorAssembler() .setInputCols(Array( "accelXHand", "accelYHand", "accelZHand", "accelXChest", "accelYChest", "accelZChest", "accelXAnkle", "accelYAnkle", "accelZAnkle")) .setOutputCol("vectors") val normalizer = new Normalizer() .setInputCol(assembler.getOutputCol) .setOutputCol("features") val regressor = new RandomForestRegressor() val pipeline = new Pipeline() .setStages(Array(assembler, normalizer, regressor)) val validator = new CrossValidator() .setEstimator(pipeline) .setEvaluator(new RegressionEvaluator) val pGrid = new ParamGridBuilder() .addGrid(normalizer.p, Array(1.0, 5.0, 10.0)) .addGrid(regressor.numTrees, Array(10, 50, 100)) .build() validator.setEstimatorParamMaps(pGrid) validator.setNumFolds(5) val bestModel = validator.fit(train) val prediction = bestModel.transform(test) prediction.show() } }) ssc.start() ssc.awaitTermination() } }
Example 35
Source File: TypedVectorAssembler.scala From frameless with Apache License 2.0 | 5 votes |
package frameless package ml package feature import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.Vector import shapeless.{HList, HNil, LabelledGeneric} import shapeless.ops.hlist.ToTraversable import shapeless.ops.record.{Keys, Values} import shapeless._ import scala.annotation.implicitNotFound final class TypedVectorAssembler[Inputs] private[ml](vectorAssembler: VectorAssembler, inputCols: Array[String]) extends AppendTransformer[Inputs, TypedVectorAssembler.Output, VectorAssembler] { val transformer: VectorAssembler = vectorAssembler .setInputCols(inputCols) .setOutputCol(AppendTransformer.tempColumnName) } object TypedVectorAssembler { case class Output(vector: Vector) def apply[Inputs](implicit inputsChecker: TypedVectorAssemblerInputsChecker[Inputs]): TypedVectorAssembler[Inputs] = { new TypedVectorAssembler(new VectorAssembler(), inputsChecker.inputCols.toArray) } } @implicitNotFound( msg = "Cannot prove that ${Inputs} is a valid input type. Input type must only contain fields of numeric or boolean types." ) private[ml] trait TypedVectorAssemblerInputsChecker[Inputs] { val inputCols: Seq[String] } private[ml] object TypedVectorAssemblerInputsChecker { implicit def checkInputs[Inputs, InputsRec <: HList, InputsKeys <: HList, InputsVals <: HList]( implicit inputsGen: LabelledGeneric.Aux[Inputs, InputsRec], inputsKeys: Keys.Aux[InputsRec, InputsKeys], inputsKeysTraverse: ToTraversable.Aux[InputsKeys, Seq, Symbol], inputsValues: Values.Aux[InputsRec, InputsVals], inputsTypeCheck: TypedVectorAssemblerInputsValueChecker[InputsVals] ): TypedVectorAssemblerInputsChecker[Inputs] = new TypedVectorAssemblerInputsChecker[Inputs] { val inputCols: Seq[String] = inputsKeys.apply.to[Seq].map(_.name) } } private[ml] trait TypedVectorAssemblerInputsValueChecker[InputsVals] private[ml] object TypedVectorAssemblerInputsValueChecker { implicit def hnilCheckInputsValue: TypedVectorAssemblerInputsValueChecker[HNil] = new TypedVectorAssemblerInputsValueChecker[HNil] {} implicit def hlistCheckInputsValueNumeric[H, T <: HList]( implicit ch: CatalystNumeric[H], tt: TypedVectorAssemblerInputsValueChecker[T] ): TypedVectorAssemblerInputsValueChecker[H :: T] = new TypedVectorAssemblerInputsValueChecker[H :: T] {} implicit def hlistCheckInputsValueBoolean[T <: HList]( implicit tt: TypedVectorAssemblerInputsValueChecker[T] ): TypedVectorAssemblerInputsValueChecker[Boolean :: T] = new TypedVectorAssemblerInputsValueChecker[Boolean :: T] {} }
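TypedVectorAssembler gathers the fields of a case class into a Vector and rejects, at compile time, input types containing non-numeric, non-boolean fields. A minimal usage sketch following the pattern in the frameless ML documentation; the case classes, sample values, and the implicit SparkSession are assumptions for illustration:

import frameless.TypedDataset
import frameless.ml.feature.TypedVectorAssembler
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.SparkSession

implicit val spark: SparkSession = SparkSession.builder().master("local[*]").appName("tva").getOrCreate()

case class House(squareFeet: Double, hasGarden: Boolean, price: Double)
case class Features(squareFeet: Double, hasGarden: Boolean)
case class HouseWithFeatures(squareFeet: Double, hasGarden: Boolean, price: Double, features: Vector)

val houses = TypedDataset.create(Seq(House(20.0, false, 100000.0), House(50.0, true, 200000.0)))

// Compiles only because every field of Features is numeric or boolean.
val assembler = TypedVectorAssembler[Features]
val withFeatures = assembler.transform(houses).as[HouseWithFeatures]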
Example 36
Source File: VectorAssemblerExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object VectorAssemblerExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("VectorAssemblerExample") .getOrCreate() // $example on$ val dataset = spark.createDataFrame( Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0)) ).toDF("id", "hour", "mobile", "userFeatures", "clicked") val assembler = new VectorAssembler() .setInputCols(Array("hour", "mobile", "userFeatures")) .setOutputCol("features") val output = assembler.transform(dataset) println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'") output.select("features", "clicked").show(false) // $example off$ spark.stop() } } // scalastyle:on println
Example 37
Source File: InteractionExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.Interaction import org.apache.spark.ml.feature.VectorAssembler // $example off$ import org.apache.spark.sql.SparkSession object InteractionExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("InteractionExample") .getOrCreate() // $example on$ val df = spark.createDataFrame(Seq( (1, 1, 2, 3, 8, 4, 5), (2, 4, 3, 8, 7, 9, 8), (3, 6, 1, 9, 2, 3, 6), (4, 10, 8, 6, 9, 4, 5), (5, 9, 2, 7, 10, 7, 3), (6, 1, 1, 4, 2, 8, 4) )).toDF("id1", "id2", "id3", "id4", "id5", "id6", "id7") val assembler1 = new VectorAssembler(). setInputCols(Array("id2", "id3", "id4")). setOutputCol("vec1") val assembled1 = assembler1.transform(df) val assembler2 = new VectorAssembler(). setInputCols(Array("id5", "id6", "id7")). setOutputCol("vec2") val assembled2 = assembler2.transform(assembled1).select("id1", "vec1", "vec2") val interaction = new Interaction() .setInputCols(Array("id1", "vec1", "vec2")) .setOutputCol("interactedCol") val interacted = interaction.transform(assembled2) interacted.show(truncate = false) // $example off$ spark.stop() } } // scalastyle:on println
Example 38
Source File: IForestExample.scala From spark-iforest with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.ml import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.iforest.{IForest, IForestModel} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Row, SparkSession} object IForestExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder() .master("local") // test in local mode .appName("iforest example") .getOrCreate() val startTime = System.currentTimeMillis() // Dataset from https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original) val dataset = spark.read.option("inferSchema", "true") .csv("data/anomaly-detection/breastw.csv") // Index label values: 2 -> 0, 4 -> 1 val indexer = new StringIndexer() .setInputCol("_c10") .setOutputCol("label") val assembler = new VectorAssembler() assembler.setInputCols(Array("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7", "_c8", "_c9")) assembler.setOutputCol("features") val iForest = new IForest() .setNumTrees(100) .setMaxSamples(256) .setContamination(0.35) .setBootstrap(false) .setMaxDepth(100) .setSeed(123456L) val pipeline = new Pipeline().setStages(Array(indexer, assembler, iForest)) val model = pipeline.fit(dataset) val predictions = model.transform(dataset) // Save pipeline model model.write.overwrite().save("/tmp/iforest.model") // Load pipeline model val loadedPipelineModel = PipelineModel.load("/tmp/iforest.model") // Get loaded iforest model val loadedIforestModel = loadedPipelineModel.stages(2).asInstanceOf[IForestModel] println(s"The loaded iforest model has no summary: model.hasSummary = ${loadedIforestModel.hasSummary}") val binaryMetrics = new BinaryClassificationMetrics( predictions.select("prediction", "label").rdd.map { case Row(label: Double, ground: Double) => (label, ground) } ) val endTime = System.currentTimeMillis() println(s"Training and predicting time: ${(endTime - startTime) / 1000} seconds.") println(s"The model's auc: ${binaryMetrics.areaUnderROC()}") } } // scalastyle:on println
Example 39
Source File: EnsembleByKeySuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.test.base.TestBase import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.sql.DataFrame class EnsembleByKeySuite extends TestBase with TransformerFuzzing[EnsembleByKey] { test("Should work on Dataframes doubles or vectors") { val scoreDF = session.createDataFrame(Seq( (0, "foo", 1.0, .1), (1, "bar", 4.0, -2.0), (1, "bar", 0.0, -3.0))) .toDF("label1", "label2", "score1", "score2") val va = new VectorAssembler().setInputCols(Array("score1", "score2")).setOutputCol("v1") val scoreDF2 = va.transform(scoreDF) val t = new EnsembleByKey().setKey("label1").setCol("score1") val df1 = t.transform(scoreDF2) df1.printSchema() assert(df1.collect().map(r => (r.getInt(0), r.getDouble(1))).toSet === Set((1, 2.0), (0, 1.0))) val t2 = new EnsembleByKey().setKeys("label1", "label2").setCols("score1", "score2", "v1") val df2 = t2.transform(scoreDF2) val res2 = df2.select("mean(score1)", "mean(v1)").collect().map(r => (r.getDouble(0), r.getAs[DenseVector](1))) val true2 = Set( (2.0, new DenseVector(Array(2.0, -2.5))), (1.0, new DenseVector(Array(1.0, 0.1)))) assert(res2.toSet === true2) } test("should support collapsing or not") { val scoreDF = session.createDataFrame( Seq((0, "foo", 1.0, .1), (1, "bar", 4.0, -2.0), (1, "bar", 0.0, -3.0))) .toDF("label1", "label2", "score1", "score2") val va = new VectorAssembler().setInputCols(Array("score1", "score2")).setOutputCol("v1") val scoreDF2 = va.transform(scoreDF) val t = new EnsembleByKey().setKey("label1").setCol("score1").setCollapseGroup(false) val df1 = t.transform(scoreDF2) assert(df1.collect().map(r => (r.getInt(0), r.getDouble(5))).toSet === Set((1, 2.0), (0, 1.0))) assert(df1.count() == scoreDF.count()) df1.show() } lazy val testDF: DataFrame = { val initialTestDF = session.createDataFrame( Seq((0, "foo", 1.0, .1), (1, "bar", 4.0, -2.0), (1, "bar", 0.0, -3.0))) .toDF("label1", "label2", "score1", "score2") new VectorAssembler().setInputCols(Array("score1", "score2")) .setOutputCol("v1").transform(initialTestDF) } lazy val testModel: EnsembleByKey = new EnsembleByKey().setKey("label1").setCol("score1") .setCollapseGroup(false).setVectorDims(Map("v1"->2)) test("should support passing the vector dims to avoid maerialization") { val df1 = testModel.transform(testDF) assert(df1.collect().map(r => (r.getInt(0), r.getDouble(5))).toSet === Set((1, 2.0), (0, 1.0))) assert(df1.count() == testDF.count()) df1.show() } test("should overwrite a column if instructed") { val scoreDF = session.createDataFrame( Seq((0, "foo", 1.0, .1), (1, "bar", 4.0, -2.0), (1, "bar", 0.0, -3.0))) .toDF("label1", "label2", "score1", "score2") val va = new VectorAssembler().setInputCols(Array("score1", "score2")).setOutputCol("v1") val scoreDF2 = va.transform(scoreDF) val t = new EnsembleByKey().setKey("label1").setCol("score1").setColName("score1").setCollapseGroup(false) val df1 = t.transform(scoreDF2) assert(scoreDF2.columns.toSet === df1.columns.toSet) } test("should rountrip serialize") { testSerialization() } def testObjects(): Seq[TestObject[EnsembleByKey]] = Seq(new TestObject(testModel, testDF)) def reader: EnsembleByKey.type = EnsembleByKey }
Example 40
Source File: VerifyIsolationForest.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.isolationforest import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.core.env.FileUtilities import com.microsoft.ml.spark.core.metrics.MetricConstants import com.microsoft.ml.spark.core.test.benchmarks.Benchmarks import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.{DataFrame, Dataset, Encoders, Row} import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject} import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.scalactic.Tolerance._ import com.microsoft.ml.spark.train.ComputeModelStatistics case class MammographyRecord(feature0: Double, feature1: Double, feature2: Double, feature3: Double, feature4: Double, feature5: Double, label: Double) case class ScoringResult(features: Vector, label: Double, predictedLabel: Double, outlierScore: Double) class VerifyIsolationForest extends Benchmarks with EstimatorFuzzing[IsolationForest] { test ("Verify isolationForestMammographyDataTest") { import session.implicits._ val data = loadMammographyData // Train a new isolation forest model val contamination = 0.02 val isolationForest = new IsolationForest() .setNumEstimators(100) .setBootstrap(false) .setMaxSamples(256) .setMaxFeatures(1.0) .setFeaturesCol("features") .setPredictionCol("predictedLabel") .setScoreCol("outlierScore") .setContamination(0.02) .setContaminationError(contamination * 0.01) .setRandomSeed(1) // Score all training data instances using the new model val isolationForestModel = isolationForest.fit(data) // Calculate area under ROC curve and assert val scores = isolationForestModel.transform(data).as[ScoringResult] val metrics = new ComputeModelStatistics() .setEvaluationMetric(MetricConstants.AucSparkMetric) .setLabelCol("label") .setScoredLabelsCol("predictedLabel") .setScoresCol("outlierScore") .transform(scores) // Expectation from results in the 2008 "Isolation Forest" paper by F. T. Liu, et al. val aurocExpectation = 0.86 val uncert = 0.02 val auroc = metrics.first().getDouble(1) assert(auroc === aurocExpectation +- uncert, "expected area under ROC =" + s" $aurocExpectation +/- $uncert, but observed $auroc") } def loadMammographyData(): DataFrame = { import session.implicits._ val mammographyRecordSchema = Encoders.product[MammographyRecord].schema val fileLocation = FileUtilities.join(BuildInfo.datasetDir,"IsolationForest", "mammography.csv").toString // Open source dataset from http://odds.cs.stonybrook.edu/mammography-dataset/ val rawData = session.read .format("csv") .option("comment", "#") .option("header", "false") .schema(mammographyRecordSchema) .load(fileLocation) val assembler = new VectorAssembler() .setInputCols(Array("feature0", "feature1", "feature2", "feature3", "feature4", "feature5")) .setOutputCol("features") val data = assembler .transform(rawData) .select("features", "label") data } override def reader: MLReadable[_] = IsolationForest override def modelReader: MLReadable[_] = IsolationForestModel override def testObjects(): Seq[TestObject[IsolationForest]] = { val dataset = loadMammographyData.toDF Seq(new TestObject( new IsolationForest(), dataset)) } }
Example 41
Source File: ACMEModel.scala From cdsw-simple-serving with Apache License 2.0 | 5 votes |
// Don't execute these lines in the workbench -- skip to "Start workbench session" package acme import org.apache.spark.ml.PipelineModel import com.cloudera.datascience.cdsw.acme.ACMEData import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.ml.{Pipeline, PipelineModel} import scala.util.Random // Read and cache training data prepared from acme-dataeng: val training = ACMEData.readData() training.cache() training.show() // Build a logistic regression model, val assembler = new VectorAssembler(). setInputCols(training.columns.filter(_ != "Occupancy")). setOutputCol("featureVec") val lr = new LogisticRegression(). setFeaturesCol("featureVec"). setLabelCol("Occupancy"). setRawPredictionCol("rawPrediction") val pipeline = new Pipeline().setStages(Array(assembler, lr)) // and tune that model: val paramGrid = new ParamGridBuilder(). addGrid(lr.regParam, Seq(0.00001, 0.001, 0.1)). addGrid(lr.elasticNetParam, Seq(1.0)). build() val eval = new BinaryClassificationEvaluator(). setLabelCol("Occupancy"). setRawPredictionCol("rawPrediction") val validator = new TrainValidationSplit(). setSeed(Random.nextLong()). setEstimator(pipeline). setEvaluator(eval). setEstimatorParamMaps(paramGrid). setTrainRatio(0.9) val validatorModel = validator.fit(training) val pipelineModel = validatorModel.bestModel.asInstanceOf[PipelineModel] val lrModel = pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel] // Logistic regression model parameters: training.columns.zip(lrModel.coefficients.toArray).foreach(println) // Model hyperparameters: lrModel.getElasticNetParam lrModel.getRegParam // Validation metric (accuracy): validatorModel.validationMetrics.max pipelineModel // End workbench session } }
Example 42
Source File: GiniSelectorSuite.scala From spark-FeatureSelection with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature.selection.filter import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.feature.selection.{FeatureSelectionTestBase, FeatureSelectorTestBase} class GiniSelectorSuite extends FeatureSelectionTestBase { test("Test GiniSelector: numTopFeatures") { val selector = new GiniSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName) .setOutputCol("filtered").setSelectorType("numTopFeatures").setNumTopFeatures(2) val importantColNames = Array("pLength", "pWidth") val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset) FeatureSelectorTestBase.testSelector[GiniSelector, GiniSelectorModel](selector, df, importantColNames, "ImportantFeatures") } test("Test GiniSelector: percentile") { val selector = new GiniSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName) .setOutputCol("filtered").setSelectorType("percentile").setPercentile(0.51) val importantColNames = Array("pLength", "pWidth") val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset) FeatureSelectorTestBase.testSelector[GiniSelector, GiniSelectorModel](selector, df, importantColNames, "ImportantFeatures") } test("Test GiniSelector: randomCutOff") { val selector = new GiniSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName) .setOutputCol("filtered").setSelectorType("randomCutOff").setRandomCutOff(1.0) val importantColNames = Array("pLength", "pWidth", "sLength", "sWidth") val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset) FeatureSelectorTestBase.testSelector[GiniSelector, GiniSelectorModel](selector, df, importantColNames, "ImportantFeatures") } test("GiniSelector read/write") { val nb = new GiniSelector testEstimatorAndModelReadWrite[GiniSelector, GiniSelectorModel](nb, dataset, FeatureSelectorTestBase.allParamSettings, FeatureSelectorTestBase.checkModelData) } }
Example 43
Source File: InfoGainSelectorSuite.scala From spark-FeatureSelection with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature.selection.filter import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.feature.selection.{FeatureSelectionTestBase, FeatureSelectorTestBase} class InfoGainSelectorSuite extends FeatureSelectionTestBase { test("Test InfoGainSelector: numTopFeatures") { val selector = new InfoGainSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName) .setOutputCol("filtered").setSelectorType("numTopFeatures").setNumTopFeatures(2) val importantColNames = Array("pLength", "pWidth") val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset) FeatureSelectorTestBase.testSelector[InfoGainSelector, InfoGainSelectorModel](selector, df, importantColNames, "ImportantFeatures") } test("Test InfoGainSelector: percentile") { val selector = new InfoGainSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName) .setOutputCol("filtered").setSelectorType("percentile").setPercentile(0.51) val importantColNames = Array("pLength", "pWidth") val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset) FeatureSelectorTestBase.testSelector[InfoGainSelector, InfoGainSelectorModel](selector, df, importantColNames, "ImportantFeatures") } test("Test InfoGainSelector: randomCutOff") { val selector = new InfoGainSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName) .setOutputCol("filtered").setSelectorType("randomCutOff").setRandomCutOff(1.0) val importantColNames = Array("pLength", "pWidth", "sLength", "sWidth") val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset) FeatureSelectorTestBase.testSelector[InfoGainSelector, InfoGainSelectorModel](selector, df, importantColNames, "ImportantFeatures") } test("InfoGainSelector read/write") { val nb = new InfoGainSelector testEstimatorAndModelReadWrite[InfoGainSelector, InfoGainSelectorModel](nb, dataset, FeatureSelectorTestBase.allParamSettings, FeatureSelectorTestBase.checkModelData) } }
Example 44
Source File: TrainValidationSplitParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.validation import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.regression.RandomForestRegressor import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.sql.DataFrame class TrainValidationSplitParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = { val regressor = new RandomForestRegressor(). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction") val paramGrid = new ParamGridBuilder() .addGrid(regressor.numTrees, Array(2, 3, 4)) .build() new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new TrainValidationSplit(). setEvaluator(new RegressionEvaluator(). setLabelCol("loan_amount"). setPredictionCol("prediction")). setEstimator(regressor). setEstimatorParamMaps(paramGrid))).fit(dataset) } override val ignoreSerializationTest = true }
Example 45
Source File: VectorIndexerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, VectorIndexer} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class VectorIndexerParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "state") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("state"). setOutputCol("state_index"), new VectorAssembler(). setInputCols(Array("dti", "loan_amount", "state_index")). setOutputCol("features"), new VectorIndexer(). setInputCol("features"). setOutputCol("scaled_features"))).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 46
Source File: DCTParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature import org.apache.spark.ml.feature.{DCT, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class DCTParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler(). setInputCols(Array("dti", "loan_amount")). setOutputCol("features"), new DCT(uid = "dct"). setInverse(true). setInputCol("features"). setOutputCol("filter_features"))).fit(dataset) }
Example 47
Source File: BucketedRandomProjectionLSHParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature import org.apache.spark.ml.feature.{BucketedRandomProjectionLSH, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class BucketedRandomProjectionLSHParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler(). setInputCols(Array("dti", "loan_amount")). setOutputCol("features"), new BucketedRandomProjectionLSH(). setInputCol("features"). setBucketLength(2). setOutputCol("lsh_features"))).fit(dataset) }
Example 48
Source File: MinMaxScalerPipelineParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{CountVectorizer, MinMaxScaler, QuantileDiscretizer, VectorAssembler} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class MinMaxScalerPipelineParitySpec extends SparkParityBase { private val getKeys: Map[String, Double] => Seq[String] = { input: Map[String, Double] => input.keySet.toSeq } val keyUdf = functions.udf(getKeys) override val dataset = spark.createDataFrame(Seq( (Array("1"), 1.0, Map("a" -> 0.1, "b" -> 0.2, "c" -> 0.3), 1), (Array("2"), 10.0, Map("d" -> 0.1, "e" -> 0.2, "c" -> 0.3), 0), (Array("3"), 20.0, Map("x" -> 0.1, "a" -> 0.2, "b" -> 0.3), 0), (Array("4"), 15.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0), (Array("5"), 18.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0), (Array("6"), 25.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 1), (Array("6"), 5.0, Map("a" -> 0.1, "b" -> 0.2, "d" -> 0.3), 0), (Array("7"), 30.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0)) ) .toDF("book_id", "pv", "myInputCol0", "label") .withColumn("myInputCol", keyUdf(functions.col("myInputCol0"))) .drop("myInputCol0") override val sparkTransformer = new Pipeline() .setStages(Array(new CountVectorizer() .setInputCol("book_id") .setOutputCol("book_id_vec") .setMinDF(1) .setMinTF(1) .setBinary(true), new QuantileDiscretizer() .setInputCol("pv") .setOutputCol("pv_bucket") .setNumBuckets(3), new CountVectorizer() .setInputCol("myInputCol") .setOutputCol("myInputCol1_vec") .setMinDF(1) .setMinTF(1) .setBinary(true), new VectorAssembler() .setInputCols(Array("pv_bucket", "book_id_vec", "myInputCol1_vec")) .setOutputCol("vectorFeature"), new MinMaxScaler().setInputCol("vectorFeature").setOutputCol("scaledFeatures"))).fit(dataset) }
Example 49
Source File: VectorSlicerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature import org.apache.spark.ml.feature.{VectorAssembler, VectorSlicer} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class VectorSlicerParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler(). setInputCols(Array("dti", "loan_amount")). setOutputCol("features"), new VectorSlicer(). setIndices(Array(1)). setNames(Array("dti")). setInputCol("features"). setOutputCol("scaled_features"))).fit(dataset) }
Example 50
Source File: NormalizerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.feature.{Normalizer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class NormalizerParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler(). setInputCols(Array("dti", "loan_amount")). setOutputCol("features"), new Normalizer(). setP(3d). setInputCol("features"). setOutputCol("scaled_features"))).fit(dataset) }
Example 51
Source File: PolynomialExpansionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.feature.{PolynomialExpansion, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class PolynomialExpansionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler(). setInputCols(Array("dti", "loan_amount")). setOutputCol("features"), new PolynomialExpansion(). setInputCol("features"). setOutputCol("poly"). setDegree(3))).fit(dataset) }
Example 52
Source File: PcaParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.feature.{PCA, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class PcaParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler(). setInputCols(Array("dti", "loan_amount")). setOutputCol("features"), new PCA(). setInputCol("features"). setOutputCol("pca_features"). setK(2))).fit(dataset) override val unserializedParams = Set("k") }
Example 53
Source File: BinarizerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature import org.apache.spark.ml.feature.{Binarizer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql.DataFrame class BinarizerParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("dti") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler(). setInputCols(Array("dti")). setOutputCol("features"), new Binarizer(). setThreshold(0.12). setInputCol("dti"). setOutputCol("thresholded_features_double"), new Binarizer(). setThreshold(0.12). setInputCol("features"). setOutputCol("thresholded_features"))).fit(dataset) }
Example 54
Source File: MinMaxScalerWithNonDefaultsParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature import org.apache.spark.ml.feature.{MinMaxScaler, VectorAssembler} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class MinMaxScalerWithNonDefaultsParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler(). setInputCols(Array("dti", "loan_amount")). setOutputCol("features"), new MinMaxScaler(). setInputCol("features"). setOutputCol("scaled_features"). setMin(2.0). setMax(4.0))).fit(dataset) }
Example 55
Source File: BisectingKMeansParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.clustering import org.apache.spark.ml.clustering.BisectingKMeans import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class BisectingKMeansParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "fico_score_group_fnl") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new BisectingKMeans(). setFeaturesCol("features"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "k", "maxIter", "seed", "minDivisibleClusterSize") }
Example 56
Source File: CrossValidatorParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.validation import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.regression.{DecisionTreeRegressor, RandomForestRegressor} import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} import org.apache.spark.sql.DataFrame class CrossValidatorParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = { val regressor = new RandomForestRegressor(). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction") val paramGrid = new ParamGridBuilder() .addGrid(regressor.numTrees, Array(2, 3, 4)) .build() new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new CrossValidator(). setEvaluator(new RegressionEvaluator(). setLabelCol("loan_amount"). setPredictionCol("prediction")). setEstimator(regressor). setEstimatorParamMaps(paramGrid))).fit(dataset) } override val ignoreSerializationTest = true }
Example 57
Source File: LinearSVCParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification.parity import org.apache.spark.ml.classification.LinearSVCModel import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class LinearSVCParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti") override val sparkTransformer: Transformer = new Pipeline() .setStages(Array( new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new LinearSVCModel("linear_svc", Vectors.dense(0.44, 0.77), 0.66).setThreshold(0.5).setFeaturesCol("features"))) .fit(dataset) // The string order type is ignored, because once the transformer is built based on some order type, we need to serialize only the string to index map // but not the order in which it has to index. This value we can ignore while we check the transformer values. override val unserializedParams: Set[String] = Set("stringOrderType") }
Example 58
Source File: VectorAssemblerOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.bundle.DataShape import ml.combust.bundle.BundleContext import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.bundle.dsl._ import org.apache.spark.ml.bundle._ import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.sql.mleap.TypeConverters._ import ml.combust.mleap.runtime.types.BundleTypeConverters._ class VectorAssemblerOp extends SimpleSparkOp[VectorAssembler] { override val Model: OpModel[SparkBundleContext, VectorAssembler] = new OpModel[SparkBundleContext, VectorAssembler] { override val klazz: Class[VectorAssembler] = classOf[VectorAssembler] override def opName: String = Bundle.BuiltinOps.feature.vector_assembler override def store(model: Model, obj: VectorAssembler) (implicit context: BundleContext[SparkBundleContext]): Model = { assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz)) val dataset = context.context.dataset.get val inputShapes = obj.getInputCols.map(i => sparkToMleapDataShape(dataset.schema(i), dataset): DataShape) model.withValue("input_shapes", Value.dataShapeList(inputShapes)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): VectorAssembler = { new VectorAssembler(uid = "") } } override def sparkLoad(uid: String, shape: NodeShape, model: VectorAssembler): VectorAssembler = { new VectorAssembler(uid = uid) } override def sparkInputs(obj: VectorAssembler): Seq[ParamSpec] = { Seq("input" -> obj.inputCols) } override def sparkOutputs(obj: VectorAssembler): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
Example 59
Source File: ReebDiagram.scala From spark-tda with Apache License 2.0 | 5 votes |
import java.io.{File, PrintWriter} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.feature.{ReebDiagram, VectorAssembler} import org.apache.spark.sql.functions._ def computeReebDiagram( pathToTextFile: String, quantity: Int, linkThresholdRatio: Double, coreThresholdRatio: Double, topTreeRatio: Double) { def save(f: File)(func: PrintWriter => Unit) { val p = new PrintWriter(f) try { func(p) } finally { p.close() } } val filename = pathToTextFile.split("\\.")(0) val outputFilename = s"$filename-REEB-k${quantity}-l${linkThresholdRatio}-c${coreThresholdRatio}-i${topTreeRatio}.tsv" val points = sc.textFile(pathToTextFile) .map { line => line.trim.split("\\s+") } .zipWithIndex .map { case (row, i) => (i, row(0).toDouble, row(1).toDouble, 0) } .toDF("id", "x", "y", "cover_id") val cardinality = points.count val assembler = new VectorAssembler() .setInputCols(Array("x", "y")) .setOutputCol("feature") val features = assembler .transform(points) val reeb = new ReebDiagram() .setK(quantity) .setLinkThresholdRatio(linkThresholdRatio) .setCoreThresholdRatio(coreThresholdRatio) .setTopTreeSize((topTreeRatio * cardinality).toInt) .setTopTreeLeafSize(quantity) .setIdCol("id") .setCoverCol("cover_id") .setFeaturesCol("feature") .setOutputCol("cluster_id") val transformed = reeb .fit(features) .transform(features) val clusters = Map( transformed .select("cluster_id") .rdd .map(row => row.getLong(0)) .distinct .zipWithIndex .collect(): _*) val result = transformed .select("x", "y", "cluster_id") .rdd .map(row => (row.getDouble(0), row.getDouble(1), row.getLong(2))) .map { case (x, y, clusterId) => (x, y, clusters(clusterId) + 1)} .collect() save(new File(outputFilename)) { println(s"OUTPUT TO: ${outputFilename}") f => result.foreach{ case (x, y, ccid) => f.println(s"${x}\t${y}\t${ccid}") } } }
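The function above is written for an interactive spark-shell session (it assumes sc, toDF, and the ReebDiagram class are already in scope). A hypothetical invocation, with the input path and parameter values chosen purely for illustration:

// Reads whitespace-separated 2D points and writes <name>-REEB-...tsv with a cluster id per point.
computeReebDiagram(
  pathToTextFile = "points.txt",
  quantity = 15,
  linkThresholdRatio = 0.5,
  coreThresholdRatio = 0.5,
  topTreeRatio = 0.1)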
Example 60
Source File: PipelineConstruction.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.ChrunPrediction import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} object PipelineConstruction { // Index labels, adding metadata to the label column. Fit on whole dataset to include all labels in index. val ipindexer = new StringIndexer() .setInputCol("international_plan") .setOutputCol("iplanIndex") val labelindexer = new StringIndexer() .setInputCol("churn") .setOutputCol("label") val featureCols = Array("account_length", "iplanIndex", "num_voice_mail", "total_day_mins", "total_day_calls", "total_evening_mins", "total_evening_calls", "total_night_mins", "total_night_calls", "total_international_mins", "total_international_calls", "total_international_num_calls") val assembler = new VectorAssembler() .setInputCols(featureCols) .setOutputCol("features") }
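The indexers and assembler above are building blocks meant to be chained into a Spark ML Pipeline with an estimator as the final stage. A minimal composition sketch; the DecisionTreeClassifier stage and the churnDF DataFrame are illustrative assumptions, not part of the original object:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.DecisionTreeClassifier

// Hypothetical classifier consuming the assembled "features" vector and the indexed "label".
val dt = new DecisionTreeClassifier()
  .setLabelCol("label")
  .setFeaturesCol("features")

val pipeline = new Pipeline().setStages(Array(
  PipelineConstruction.ipindexer,
  PipelineConstruction.labelindexer,
  PipelineConstruction.assembler,
  dt))

// val churnModel = pipeline.fit(churnDF)  // churnDF: a DataFrame with the raw churn columns (assumed)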
Example 61
Source File: Preproessing.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML import org.apache.spark.ml.feature.{ StringIndexer, StringIndexerModel} import org.apache.spark.ml.feature.VectorAssembler object Preproessing { var trainSample = 1.0 var testSample = 1.0 val train = "data/insurance_train.csv" val test = "data/insurance_test.csv" val spark = SparkSessionCreate.createSession() import spark.implicits._ println("Reading data from " + train + " file") val trainInput = spark.read .option("header", "true") .option("inferSchema", "true") .format("com.databricks.spark.csv") .load(train) .cache val testInput = spark.read .option("header", "true") .option("inferSchema", "true") .format("com.databricks.spark.csv") .load(test) .cache println("Preparing data for training model") var data = trainInput.withColumnRenamed("loss", "label").sample(false, trainSample) var DF = data.na.drop() // Null check if (data == DF) println("No null values in the DataFrame") else { println("Null values exist in the DataFrame") data = DF } val seed = 12345L val splits = data.randomSplit(Array(0.75, 0.25), seed) val (trainingData, validationData) = (splits(0), splits(1)) trainingData.cache validationData.cache val testData = testInput.sample(false, testSample).cache def isCateg(c: String): Boolean = c.startsWith("cat") def categNewCol(c: String): String = if (isCateg(c)) s"idx_${c}" else c // Function to remove categorical columns with too many categories def removeTooManyCategs(c: String): Boolean = !(c matches "cat(109$|110$|112$|113$|116$)") // Function to select only feature columns (omit id and label) def onlyFeatureCols(c: String): Boolean = !(c matches "id|label") // Definitive set of feature columns val featureCols = trainingData.columns .filter(removeTooManyCategs) .filter(onlyFeatureCols) .map(categNewCol) // StringIndexer for categorical columns (OneHotEncoder should be evaluated as well) val stringIndexerStages = trainingData.columns.filter(isCateg) .map(c => new StringIndexer() .setInputCol(c) .setOutputCol(categNewCol(c)) .fit(trainInput.select(c).union(testInput.select(c)))) // VectorAssembler for training features val assembler = new VectorAssembler() .setInputCols(featureCols) .setOutputCol("features") }
Example 62
Source File: MetadataTest.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.feature.operator.{MetadataTransformUtils, VectorCartesian} import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfter, FunSuite} class MetadataTest extends FunSuite with BeforeAndAfter { var spark: SparkSession = _ before { spark = SparkSession.builder().master("local").getOrCreate() } after { spark.close() } test("test_vector_cartesian") { val data = spark.read.format("libsvm") .option("numFeatures", "123") .load("data/a9a/a9a_123d_train_trans.libsvm") .persist() val cartesian = new VectorCartesian() .setInputCols(Array("features", "features")) .setOutputCol("cartesian_features") val assembler = new VectorAssembler() .setInputCols(Array("features", "cartesian_features")) .setOutputCol("assemble_features") val pipeline = new Pipeline() .setStages(Array(cartesian, assembler)) val featureModel = pipeline.fit(data) val crossDF = featureModel.transform(data) crossDF.schema.fields.foreach { field => println("name: " + field.name) println("metadata: " + field.metadata.toString()) } } test("test_three_order_cartesian") { val data = spark.read.format("libsvm") .option("numFeatures", 8) .load("data/abalone/abalone_8d_train.libsvm") .persist() val cartesian = new VectorCartesian() .setInputCols(Array("features", "features")) .setOutputCol("f_f") val cartesian2 = new VectorCartesian() .setInputCols(Array("features", "f_f")) .setOutputCol("f_f_f") val pipeline = new Pipeline() .setStages(Array(cartesian, cartesian2)) val crossDF = pipeline.fit(data).transform(data).persist() // first cartesian, the number of dimensions is 64 println("first cartesian dimension = " + crossDF.select("f_f").schema.fields.last.metadata.getStringArray(MetadataTransformUtils.DERIVATION).length) println(crossDF.select("f_f").schema.fields.last.metadata.getStringArray(MetadataTransformUtils.DERIVATION).mkString(",")) println() // second cartesian, the number of dimensions is 512 println("second cartesian dimension = " + crossDF.select("f_f_f").schema.fields.last.metadata.getStringArray(MetadataTransformUtils.DERIVATION).length) println(crossDF.select("f_f_f").schema.fields.last.metadata.getStringArray(MetadataTransformUtils.DERIVATION).mkString(",")) } }
Example 63
Source File: FeatureCrossSelectorExample.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature.examples

import org.apache.spark.SparkConf
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.operator.{VarianceSelector, VectorCartesian}
import org.apache.spark.sql.SparkSession

object FeatureCrossSelectorExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    val input = conf.get("spark.input.path", "data/a9a/a9a_123d_train_trans.libsvm")
    val numFeatures = conf.get("spark.num.feature", "123")
    val twoOrderNumFeatures = conf.getInt("spark.two.order.num.feature", 123)
    val threeOrderNumFeatures = conf.getInt("spark.three.order.num.feature", 123)

    val spark = SparkSession.builder().master("local").config(conf).getOrCreate()

    val data = spark.read.format("libsvm")
      .option("numFeatures", numFeatures)
      .load(input)
      .persist()

    val cartesian = new VectorCartesian()
      .setInputCols(Array("features", "features"))
      .setOutputCol("f_f")

    val selector = new VarianceSelector()
      .setFeaturesCol("f_f")
      .setOutputCol("selected_f_f")
      .setNumTopFeatures(twoOrderNumFeatures)

    val cartesian2 = new VectorCartesian()
      .setInputCols(Array("features", "selected_f_f"))
      .setOutputCol("f_f_f")

    val selector2 = new VarianceSelector()
      .setFeaturesCol("f_f_f")
      .setOutputCol("selected_f_f_f")
      .setNumTopFeatures(threeOrderNumFeatures)

    val assembler = new VectorAssembler()
      .setInputCols(Array("features", "selected_f_f", "selected_f_f_f"))
      .setOutputCol("assembled_features")

    val pipeline = new Pipeline()
      .setStages(Array(cartesian, selector, cartesian2, selector2, assembler))

    val crossDF = pipeline.fit(data).transform(data).persist()
    data.unpersist()

    // drop returns a new DataFrame, so keep the pruned result instead of discarding it
    val prunedDF = crossDF.drop("f_f", "f_f_f", "selected_f_f", "selected_f_f_f")
    prunedDF.show(1)

    val splitDF = prunedDF.randomSplit(Array(0.9, 0.1))
    val trainDF = splitDF(0).persist()
    val testDF = splitDF(1).persist()

    val originalLR = new LogisticRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setMaxIter(20)
      .setRegParam(0.01)

    val originalPredictions = originalLR.fit(trainDF).transform(testDF)
    originalPredictions.show(1)

    val originalEvaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("rawPrediction")
      .setMetricName("areaUnderROC")

    val originalAUC = originalEvaluator.evaluate(originalPredictions)
    println(s"original features auc: $originalAUC")

    val crossLR = new LogisticRegression()
      .setFeaturesCol("assembled_features")
      .setLabelCol("label")
      .setMaxIter(20)
      .setRegParam(0.01)

    val crossPredictions = crossLR.fit(trainDF).transform(testDF)
    crossPredictions.show(1)

    val crossEvaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("rawPrediction")
      .setMetricName("areaUnderROC")

    val crossAUC = crossEvaluator.evaluate(crossPredictions)
    println(s"cross features auc: $crossAUC")

    spark.close()
  }
}
Example 64
Source File: LocalVectorAssembler.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.feature.VectorAssembler

import scala.collection.mutable

class LocalVectorAssembler(override val sparkTransformer: VectorAssembler)
  extends LocalTransformer[VectorAssembler] {

  override def transform(localData: LocalData): LocalData = {
    if (sparkTransformer.getInputCols.isEmpty) {
      localData
    } else {
      val co = sparkTransformer.getInputCols.toList.map { inName =>
        localData.column(inName) match {
          case Some(inCol) =>
            inCol.data.map {
              case number: java.lang.Number => Seq(number.doubleValue())
              case boolean: java.lang.Boolean => Seq(if (boolean) 1.0 else 0.0)
              case vector: Seq[Number @unchecked] if vector.isInstanceOf[Seq[Number]] =>
                vector.map(_.doubleValue())
              case x =>
                throw new IllegalArgumentException(
                  s"LocalVectorAssembler does not support the ($x) ${x.getClass} type")
            }
          case None =>
            throw new IllegalArgumentException(
              s"LocalVectorAssembler needs $inName column, which doesn't exist")
        }
      }

      val colLen = co.headOption
        .getOrElse(throw new IllegalArgumentException("Input data is empty"))
        .length

      val builder = mutable.ArrayBuffer.empty[Seq[Double]]
      var idx = 0
      while (idx < colLen) {
        val row = co.map { column => column(idx) }
        builder += row.flatten
        idx += 1
      }
      val result = builder.toList

      localData.withColumn(
        LocalDataColumn(
          sparkTransformer.getOutputCol,
          result
        )
      )
    }
  }

  private def assemble(vv: Seq[Seq[Double]]): Seq[Double] = {
    vv.flatten
  }
}

object LocalVectorAssembler
  extends SimpleModelLoader[VectorAssembler]
  with TypedTransformerConverter[VectorAssembler] {

  override def build(metadata: Metadata, data: LocalData): VectorAssembler = {
    val assembler = new VectorAssembler(metadata.uid)
    assembler
      .setInputCols(metadata.getAs[Seq[String]]("inputCols").get.toArray)
      .setOutputCol(metadata.outputCol.get)
  }

  override implicit def toLocal(
    sparkTransformer: VectorAssembler
  ): LocalTransformer[VectorAssembler] = new LocalVectorAssembler(sparkTransformer)
}
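The core of the local transform above is a column-to-row transpose followed by flattening each row. A self-contained sketch of that logic in plain Scala collections is shown below; the names AssembleSketch and assembleRows are illustrative and not part of the spark-ml-serving API.

// Illustrative only: mirrors the assembly loop above with plain collections.
// Each input column is a sequence of per-row value groups (length 1 for scalars,
// length n for vector columns).
object AssembleSketch {
  def assembleRows(columns: List[Seq[Seq[Double]]]): Seq[Seq[Double]] = {
    val rowCount = columns.headOption.map(_.length).getOrElse(0)
    (0 until rowCount).map { idx =>
      columns.map(column => column(idx)).flatten // one assembled feature row
    }
  }

  def main(args: Array[String]): Unit = {
    val numeric = Seq(Seq(1.0), Seq(2.0))           // scalar column, wrapped per value
    val vector  = Seq(Seq(0.1, 0.2), Seq(0.3, 0.4)) // vector column
    println(assembleRows(List(numeric, vector)))
    // prints Vector(List(1.0, 0.1, 0.2), List(2.0, 0.3, 0.4))
  }
}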
Example 65
Source File: ModelPersistence.scala From reactive-machine-learning-systems with MIT License | 5 votes |
package com.reactivemachinelearning

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{QuantileDiscretizer, VectorAssembler}
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}
import org.apache.spark.sql.SparkSession

object ModelPersistence extends App {

  val session = SparkSession.builder.appName("ModelPersistence").getOrCreate()

  val data = Seq(
    (0, 18.0, 0),
    (1, 20.0, 0),
    (2, 8.0, 1),
    (3, 5.0, 1),
    (4, 2.0, 0),
    (5, 21.0, 0),
    (6, 7.0, 1),
    (7, 18.0, 0),
    (8, 3.0, 1),
    (9, 22.0, 0),
    (10, 8.0, 1),
    (11, 2.0, 0),
    (12, 5.0, 1),
    (13, 4.0, 1),
    (14, 1.0, 0),
    (15, 11.0, 0),
    (16, 7.0, 1),
    (17, 15.0, 0),
    (18, 3.0, 1),
    (19, 20.0, 0))

  val instances = session.createDataFrame(data)
    .toDF("id", "seeds", "label")

  val discretizer = new QuantileDiscretizer()
    .setInputCol("seeds")
    .setOutputCol("discretized")
    .setNumBuckets(3)

  val assembler = new VectorAssembler()
    .setInputCols(Array("discretized"))
    .setOutputCol("features")

  val classifier = new LogisticRegression()
    .setMaxIter(5)

  val pipeline = new Pipeline()
    .setStages(Array(discretizer, assembler, classifier))

  val paramMaps = new ParamGridBuilder()
    .addGrid(classifier.regParam, Array(0.0, 0.1))
    .build()

  val evaluator = new BinaryClassificationEvaluator()

  val crossValidator = new CrossValidator()
    .setEstimator(pipeline)
    .setEvaluator(evaluator)
    .setNumFolds(2)
    .setEstimatorParamMaps(paramMaps)

  val model = crossValidator.fit(instances)

  model.write.overwrite().save("my-model")

  val persistedModel = CrossValidatorModel.load("./my-model")
  println(s"UID: ${persistedModel.uid}")
}
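After reloading, the winning pipeline can be pulled out of the CrossValidatorModel and reused for scoring. A short sketch follows; it assumes the SparkSession from the example above is still active, and the object name and the commented newInstances DataFrame are placeholders.

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.tuning.CrossValidatorModel

// Sketch: inspect and reuse the best pipeline found by cross-validation.
object InspectPersistedModel extends App {
  val reloaded = CrossValidatorModel.load("./my-model")
  val bestPipeline = reloaded.bestModel.asInstanceOf[PipelineModel]

  // Print the tuned parameters of every stage (discretizer, assembler, classifier).
  bestPipeline.stages.foreach(stage => println(stage.explainParams()))

  // Scoring new rows with the same schema would reuse the full model:
  // reloaded.transform(newInstances).select("id", "prediction").show()
}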
Example 66
Source File: GBTRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression.GBTRegressor
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._

class GBTRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new GBTRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
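Most of the MLeap parity specs that follow repeat the same skeleton: index the categorical fico_score_group_fnl column, assemble it with dti into a features vector, and fit the estimator under test, with small variations (an extra OneHotEncoderEstimator or a label StringIndexer in some specs). A generic sketch of that shared skeleton is shown below; fitParityPipeline, estimatorUnderTest, and dataset are placeholders introduced here for illustration, not part of the mleap test code.

import org.apache.spark.ml.{Pipeline, PipelineStage, Transformer}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.DataFrame

// Sketch of the skeleton shared by the parity specs; only the final stage and
// the unserializedParams set change from spec to spec.
def fitParityPipeline(dataset: DataFrame, estimatorUnderTest: PipelineStage): Transformer = {
  val indexer = new StringIndexer()
    .setInputCol("fico_score_group_fnl")
    .setOutputCol("fico_index")
  val assembler = new VectorAssembler()
    .setInputCols(Array("fico_index", "dti"))
    .setOutputCol("features")
  new Pipeline().setStages(Array(indexer, assembler, estimatorUnderTest)).fit(dataset)
}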
Example 67
Source File: SupportVectorMachineParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.classification

import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.mleap.classification.SVMModel
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql._

class SupportVectorMachineParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new SVMModel(uid = "svm",
      model = new mllib.classification.SVMModel(weights = Vectors.dense(0.53, 0.67), intercept = 0.77)).
      setRawPredictionCol("raw_prediction").
      setProbabilityCol("probability"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 68
Source File: MultinomialLabelerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature

import ml.combust.mleap.core.feature.{MultinomialLabelerModel, ReverseStringIndexerModel}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.mleap.feature.MultinomialLabeler
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class MultinomialLabelerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new MultinomialLabeler(uid = "multinomial_labeler",
      model = MultinomialLabelerModel(threshold = 0.1,
        indexer = ReverseStringIndexerModel(Seq("fico", "dtizy")))).
      setFeaturesCol("features").
      setProbabilitiesCol("probabilities").
      setLabelsCol("labels"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 69
Source File: TestSparkMl.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.databricks.runtime.testkit

import java.io.File
import java.nio.file.{Files, StandardCopyOption}

import ml.combust.bundle.BundleFile
import org.apache.spark.ml.bundle.SparkBundleContext
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.SparkSession
import com.databricks.spark.avro._
import ml.combust.mleap.spark.SparkSupport._
import ml.combust.mleap.runtime.MleapSupport._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression

class TestSparkMl(session: SparkSession) extends Runnable {
  override def run(): Unit = {
    val sqlContext = session.sqlContext

    // Create a temporary file and copy the contents of the resource avro to it
    val path = Files.createTempFile("mleap-databricks-runtime-testkit", ".avro")
    Files.copy(getClass.getClassLoader.getResource("datasources/lending_club_sample.avro").openStream(),
      path,
      StandardCopyOption.REPLACE_EXISTING)

    val sampleData = sqlContext.read.avro(path.toString)
    sampleData.show()

    val stringIndexer = new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index")

    val featureAssembler = new VectorAssembler().
      setInputCols(Array(stringIndexer.getOutputCol, "dti", "loan_amount")).
      setOutputCol("features")

    val logisticRegression = new LogisticRegression().
      setFeaturesCol(featureAssembler.getOutputCol).
      setLabelCol("approved").
      setPredictionCol("prediction")

    val pipeline = new Pipeline().setStages(Array(stringIndexer, featureAssembler, logisticRegression))

    val model = pipeline.fit(sampleData)

    val modelPath = Files.createTempFile("mleap-databricks-runtime-testkit", ".zip")
    Files.delete(modelPath)

    // Save the model
    {
      println("Writing model to...", modelPath)
      implicit val sbc = SparkBundleContext.defaultContext.withDataset(model.transform(sampleData))
      val bf = BundleFile(new File(modelPath.toString))
      model.writeBundle.save(bf).get
      bf.close()
    }

    // Load the model
    {
      val bf = BundleFile(new File(modelPath.toString))
      bf.loadMleapBundle().get
      bf.close()
    }
  }
}
Example 70
Source File: TestXgboost.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.databricks.runtime.testkit

import java.io.File
import java.nio.file.{Files, StandardCopyOption}

import ml.combust.bundle.BundleFile
import org.apache.spark.ml.bundle.SparkBundleContext
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.SparkSession
import com.databricks.spark.avro._
import ml.combust.mleap.spark.SparkSupport._
import ml.combust.mleap.runtime.MleapSupport._
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
import org.apache.spark.ml.Pipeline

class TestXgboost(session: SparkSession) extends Runnable {
  private val xgboostParams: Map[String, Any] = Map(
    "eta" -> 0.3,
    "max_depth" -> 2,
    "objective" -> "binary:logistic",
    "early_stopping_rounds" -> 2,
    "num_round" -> 15,
    "nworkers" -> 2
  )

  override def run(): Unit = {
    val sqlContext = session.sqlContext

    // Create a temporary file and copy the contents of the resource avro to it
    val path = Files.createTempFile("mleap-databricks-runtime-testkit", ".avro")
    Files.copy(getClass.getClassLoader.getResource("datasources/lending_club_sample.avro").openStream(),
      path,
      StandardCopyOption.REPLACE_EXISTING)

    val sampleData = sqlContext.read.avro(path.toString)
    sampleData.show()

    val stringIndexer = new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index")

    val featureAssembler = new VectorAssembler().
      setInputCols(Array(stringIndexer.getOutputCol, "dti", "loan_amount")).
      setOutputCol("features")

    val logisticRegression = new XGBoostClassifier(xgboostParams).
      setFeaturesCol("features").
      setLabelCol("approved").
      setPredictionCol("prediction")

    val pipeline = new Pipeline().setStages(Array(stringIndexer, featureAssembler, logisticRegression))

    val model = pipeline.fit(sampleData)

    val modelPath = Files.createTempFile("mleap-databricks-runtime-testkit", ".zip")
    Files.delete(modelPath)

    {
      println("Writing model to...", modelPath)
      implicit val sbc = SparkBundleContext.defaultContext.withDataset(model.transform(sampleData))
      val bf = BundleFile(new File(modelPath.toString))
      model.writeBundle.save(bf).get
      bf.close()
    }

    {
      val bf = BundleFile(new File(modelPath.toString))
      bf.loadMleapBundle()
      bf.close()
    }
  }
}
Example 71
Source File: IsotonicRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.IsotonicRegression
import org.apache.spark.sql._

class IsotonicRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
    .sample(withReplacement = true, 0.05)
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler().
    setInputCols(Array("dti")).
    setOutputCol("features"),
    new IsotonicRegression().
      setFeaturesCol("dti").
      setLabelCol("loan_amount").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("labelCol")
}
Example 72
Source File: GeneralizedLinearRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.sql._

class GeneralizedLinearRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new OneHotEncoderEstimator().
      setInputCols(Array("fico_index")).
      setOutputCols(Array("fico")),
    new VectorAssembler().
      setInputCols(Array("fico", "dti")).
      setOutputCol("features"),
    new GeneralizedLinearRegression().
      setFamily("gaussian").
      setLink("log").
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "maxIter", "tol", "regParam", "solver", "variancePower")
}
Example 73
Source File: DecisionTreeRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._

class DecisionTreeRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new DecisionTreeRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
Example 74
Source File: RandomForestRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._

class RandomForestRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
Example 75
Source File: LinearRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class LinearRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new OneHotEncoderEstimator().
      setInputCols(Array("fico_index")).
      setOutputCols(Array("fico")),
    new VectorAssembler().
      setInputCols(Array("fico", "dti")).
      setOutputCol("features"),
    new LinearRegression().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "elasticNetParam", "maxIter", "tol", "epsilon", "labelCol", "loss", "regParam", "solver")
}
Example 76
Source File: AFTSurvivalRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.AFTSurvivalRegression
import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit

class AFTSurvivalRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
    .withColumn("censor", lit(1.0))
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new OneHotEncoderEstimator().
      setInputCols(Array("fico_index")).
      setOutputCols(Array("fico")),
    new VectorAssembler().
      setInputCols(Array("fico", "dti")).
      setOutputCol("features"),
    new AFTSurvivalRegression().
      setQuantileProbabilities(Array(0.5)).
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setQuantilesCol("quant").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("labelCol", "stringOrderType", "maxIter", "tol")
}
Example 77
Source File: VectorAssemblerExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object VectorAssemblerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorAssemblerExample")
      .getOrCreate()

    // $example on$
    val dataset = spark.createDataFrame(
      Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
    ).toDF("id", "hour", "mobile", "userFeatures", "clicked")

    val assembler = new VectorAssembler()
      .setInputCols(Array("hour", "mobile", "userFeatures"))
      .setOutputCol("features")

    val output = assembler.transform(dataset)
    println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
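For the single row in this example, the assembler concatenates hour, mobile, and the three elements of userFeatures into one dense vector, so the show(false) call above is expected to print roughly the following (this mirrors the standard Spark documentation for this example; exact column widths may vary by version):

+-----------------------+-------+
|features               |clicked|
+-----------------------+-------+
|[18.0,1.0,0.0,10.0,0.5]|1.0    |
+-----------------------+-------+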
Example 78
Source File: NaiveBayesClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class NaiveBayesClassifierParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index")).
      setOutputCol("features"),
    new StringIndexer().
      setInputCol("approved").
      setOutputCol("label"),
    new NaiveBayes(uid = "nb").
      setModelType("multinomial").
      setThresholds(Array(0.4)).
      setFeaturesCol("features").
      setLabelCol("label"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "smoothing")
}
Example 79
Source File: RandomForestClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class RandomForestClassifierParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new StringIndexer().
      setInputCol("approved").
      setOutputCol("label"),
    new RandomForestClassifier().
      setThresholds(Array(0.4)).
      setFeaturesCol("features").
      setLabelCol("label"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "seed")
}
Example 80
Source File: GBTClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._

class GBTClassifierParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new StringIndexer().
      setInputCol("approved").
      setOutputCol("label"),
    new GBTClassifier().
      setFeaturesCol("features").
      setLabelCol("label").
      setThresholds(Array(1.0, 1.0)).
      setProbabilityCol("myProbability").
      setPredictionCol("myPrediction").
      setRawPredictionCol("myRawPrediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
Example 81
Source File: MultinomialLogisticRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}

class MultinomialLogisticRegressionParitySpec extends SparkParityBase {

  val labels = Seq(0.0, 1.0, 2.0, 0.0, 1.0, 2.0)
  val ages = Seq(15, 30, 40, 50, 15, 80)
  val heights = Seq(175, 190, 155, 160, 170, 180)
  val weights = Seq(67, 100, 57, 56, 56, 88)

  val rows = spark.sparkContext.parallelize(Seq.tabulate(6) { i =>
    Row(labels(i), ages(i), heights(i), weights(i))
  })

  val schema = new StructType().add("label", DoubleType, nullable = false)
    .add("age", IntegerType, nullable = false)
    .add("height", IntegerType, nullable = false)
    .add("weight", IntegerType, nullable = false)

  override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema)

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("age", "height", "weight")).
      setOutputCol("features"),
    new LogisticRegressionModel(uid = "logr",
      coefficientMatrix = Matrices.dense(3, 3, Array(
        -1.3920551604166562, -0.13119545493644366, 1.5232506153530998,
        0.3129112131192873, -0.21959056436528473, -0.09332064875400257,
        -0.24696506013528507, 0.6122879917796569, -0.36532293164437174)),
      interceptVector = Vectors.dense(0.4965574044951358, -2.1486146169780063, 1.6520572124828703),
      numClasses = 3,
      isMultinomial = true))).fit(dataset)
}
Example 82
Source File: LogisticRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.linalg.Vectors

class LogisticRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new LogisticRegressionModel(uid = "logr",
      coefficients = Vectors.dense(0.44, 0.77),
      intercept = 0.66).setThreshold(0.7).setFeaturesCol("features"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 83
Source File: OneVsRestParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame

class OneVsRestParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new OneVsRest().setClassifier(new LogisticRegression()).
      setLabelCol("fico_index").
      setFeaturesCol("features").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "classifier", "labelCol")
}
Example 84
Source File: DecisionTreeClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class DecisionTreeClassifierParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new StringIndexer().
      setInputCol("approved").
      setOutputCol("label"),
    new DecisionTreeClassifier().
      setThresholds(Array(0.4)).
      setFeaturesCol("features").
      setLabelCol("label"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
Example 85
Source File: GaussianMixtureParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.clustering.GaussianMixture
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class GaussianMixtureParitySpec extends SparkParityBase {
  override val dataset: DataFrame = {
    baseDataset.select("dti", "loan_amount", "fico_score_group_fnl")
  }
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new GaussianMixture().
      setFeaturesCol("features").
      setPredictionCol("prediction").
      setProbabilityCol("probability"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "k", "maxIter", "seed", "tol")
}
Example 86
Source File: KMeansParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class KMeansParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "fico_score_group_fnl")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new KMeans().
      setFeaturesCol("features").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "initMode", "initSteps", "maxIter", "tol", "k", "seed")
}
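Outside of the parity harness, the quality of a fitted clustering pipeline like the one above is usually checked with a clustering metric. A brief sketch using Spark's ClusteringEvaluator (available since Spark 2.3) follows; the silhouetteOf helper and its clusteredDF argument are illustrative placeholders for the transformed output of the pipeline.

import org.apache.spark.ml.evaluation.ClusteringEvaluator
import org.apache.spark.sql.DataFrame

// Sketch: clusteredDF is assumed to carry the "features" and "prediction"
// columns produced by the assembled KMeans pipeline above.
def silhouetteOf(clusteredDF: DataFrame): Double = {
  val evaluator = new ClusteringEvaluator()
    .setFeaturesCol("features")
    .setPredictionCol("prediction")
    .setMetricName("silhouette")
  evaluator.evaluate(clusteredDF)
}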