org.apache.spark.ml.feature.VectorIndexer Scala Examples
The following examples show how to use org.apache.spark.ml.feature.VectorIndexer.
Each example notes the project and license of the source file it comes from.
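For orientation, here is a minimal, self-contained sketch of the core fit/transform pattern the examples share. It assumes Spark 2.x (vectors from org.apache.spark.ml.linalg); the toy DataFrame, the object name VectorIndexerSketch, and the local[*] master are illustrative choices, not taken from any project below.

import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object VectorIndexerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("VectorIndexerSketch")
      .master("local[*]")
      .getOrCreate()

    // Toy data: dimension 0 takes 2 distinct values, dimension 1 takes 3.
    val df = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, -3.1)),
      (1, Vectors.dense(0.0, 2.2)),
      (2, Vectors.dense(1.0, 5.7))
    )).toDF("id", "features")

    // With maxCategories = 2, dimension 0 is detected as categorical and
    // re-encoded as category indices; dimension 1 stays continuous.
    val model = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexed")
      .setMaxCategories(2)
      .fit(df)

    println(s"Categorical feature indices: ${model.categoryMaps.keys.mkString(", ")}")
    model.transform(df).show()

    spark.stop()
  }
}

Fitting scans the data once and, per vector dimension, treats a feature as categorical when it has at most maxCategories distinct values, re-encoding those values as zero-based indices; all other dimensions pass through unchanged. The fitted model then typically sits in a Pipeline ahead of a tree-based estimator, as several of the examples below show.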
Example 1
Source File: VectorIndexerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.VectorIndexer
// $example off$
import org.apache.spark.sql.SparkSession

object VectorIndexerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorIndexerExample")
      .getOrCreate()

    // $example on$
    val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val indexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexed")
      .setMaxCategories(10)

    val indexerModel = indexer.fit(data)

    val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
    println(s"Chose ${categoricalFeatures.size} categorical features: " +
      categoricalFeatures.mkString(", "))

    // Create new column "indexed" with categorical values transformed to indices
    val indexedData = indexerModel.transform(data)
    indexedData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 2
Source File: VectorIndexerParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, VectorIndexer}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class VectorIndexerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "state")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("state").
      setOutputCol("state_index"),
    new VectorAssembler().
      setInputCols(Array("dti", "loan_amount", "state_index")).
      setOutputCol("features"),
    new VectorIndexer().
      setInputCol("features").
      setOutputCol("scaled_features"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 3
Source File: GeneralizedLinearRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.regression.bikesharing

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{SparkSession, _}

object GeneralizedLinearRegressionPipeline {

  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def genLinearRegressionWithVectorFormat(vectorAssembler: VectorAssembler,
                                          vectorIndexer: VectorIndexer,
                                          dataFrame: DataFrame) = {
    val lr = new GeneralizedLinearRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setFamily("gaussian")
      .setLink("identity")
      .setMaxIter(10)
      .setRegParam(0.3)

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr))

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    val model = pipeline.fit(training)

    val fullPredictions = model.transform(test).cache()
    val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0))
    val labels = fullPredictions.select("label").rdd.map(_.getDouble(0))
    val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError
    println(s" Root mean squared error (RMSE): $RMSE")
  }

  def genLinearRegressionWithSVMFormat(spark: SparkSession) = {
    // Load training data
    val training = spark.read.format("libsvm")
      .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt")

    val lr = new GeneralizedLinearRegression()
      .setFamily("gaussian")
      .setLink("identity")
      .setMaxIter(10)
      .setRegParam(0.3)

    // Fit the model
    val model = lr.fit(training)

    // Print the coefficients and intercept for generalized linear regression model
    println(s"Coefficients: ${model.coefficients}")
    println(s"Intercept: ${model.intercept}")

    // Summarize the model over the training set and print out some metrics
    val summary = model.summary
    println(s"Coefficient Standard Errors: ${summary.coefficientStandardErrors.mkString(",")}")
    println(s"T Values: ${summary.tValues.mkString(",")}")
    println(s"P Values: ${summary.pValues.mkString(",")}")
    println(s"Dispersion: ${summary.dispersion}")
    println(s"Null Deviance: ${summary.nullDeviance}")
    println(s"Residual Degree Of Freedom Null: ${summary.residualDegreeOfFreedomNull}")
    println(s"Deviance: ${summary.deviance}")
    println(s"Residual Degree Of Freedom: ${summary.residualDegreeOfFreedom}")
    println(s"AIC: ${summary.aic}")
    println("Deviance Residuals: ")
    summary.residuals().show()
  }
}
Example 4
Source File: LinearRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.regression.bikesharing

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, SparkSession}

object LinearRegressionPipeline {

  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def linearRegressionWithVectorFormat(vectorAssembler: VectorAssembler,
                                       vectorIndexer: VectorIndexer,
                                       dataFrame: DataFrame) = {
    val lr = new LinearRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setRegParam(0.1)
      .setElasticNetParam(1.0)
      .setMaxIter(10)

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr))

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    val model = pipeline.fit(training)

    val fullPredictions = model.transform(test).cache()
    val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0))
    val labels = fullPredictions.select("label").rdd.map(_.getDouble(0))
    val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError
    println(s" Root mean squared error (RMSE): $RMSE")
  }

  def linearRegressionWithSVMFormat(spark: SparkSession) = {
    // Load training data
    val training = spark.read.format("libsvm")
      .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt")

    val lr = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for linear regression
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

    // Summarize the model over the training set and print out some metrics
    val trainingSummary = lrModel.summary
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
    trainingSummary.residuals.show()
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    println(s"r2: ${trainingSummary.r2}")
  }
}
Example 5
Source File: VectorIndexerExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.VectorIndexer
// $example off$
import org.apache.spark.sql.SparkSession

object VectorIndexerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorIndexerExample")
      .getOrCreate()

    // $example on$
    val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val indexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexed")
      .setMaxCategories(10)

    val indexerModel = indexer.fit(data)

    val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
    println(s"Chose ${categoricalFeatures.size} categorical features: " +
      categoricalFeatures.mkString(", "))

    // Create new column "indexed" with categorical values transformed to indices
    val indexedData = indexerModel.transform(data)
    indexedData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 6
Source File: VectorIndexerExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.VectorIndexer
// $example off$
import org.apache.spark.sql.SparkSession

object VectorIndexerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorIndexerExample")
      .getOrCreate()

    // $example on$
    val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val indexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexed")
      .setMaxCategories(10)

    val indexerModel = indexer.fit(data)

    val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
    println(s"Chose ${categoricalFeatures.size} categorical features: " +
      categoricalFeatures.mkString(", "))

    // Create new column "indexed" with categorical values transformed to indices
    val indexedData = indexerModel.transform(data)
    indexedData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 7
Source File: SeedSuite.scala From sparkxgboost with Apache License 2.0
package rotationsymmetry.sxgboost

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.loss.SquareLoss
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext
import rotationsymmetry.sxgboost.utils.TestingUtils._

class SeedSuite extends FunSuite with MLlibTestSparkContext with TestData {

  test("Different runs with the same seed return the same result") {
    val data = sqlContext.createDataFrame(randomLabelPointRDD(sc, 100, 10, 3, 999))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sXGBoost1 = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
      .setNumTrees(1)
      .setSampleRatio(0.5)
      .setFeatureSampleRatio(0.5)
      .setSeed(999)
    val sXGBoostModel1 = sXGBoost1.fit(featureIndexer.transform(data))

    val sXGBoost2 = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
      .setNumTrees(1)
      .setSampleRatio(0.5)
      .setFeatureSampleRatio(0.5)
      .setSeed(999)
    val sXGBoostModel2 = sXGBoost2.fit(featureIndexer.transform(data))

    val sXGBoost3 = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
      .setNumTrees(1)
      .setSampleRatio(0.5)
      .setFeatureSampleRatio(0.5)
      .setSeed(998)
    val sXGBoostModel3 = sXGBoost3.fit(featureIndexer.transform(data))

    val evaluator = new RegressionEvaluator()
    val rmse1 = evaluator.evaluate(sXGBoostModel1.transform(featureIndexer.transform(data)))
    val rmse2 = evaluator.evaluate(sXGBoostModel2.transform(featureIndexer.transform(data)))
    val rmse3 = evaluator.evaluate(sXGBoostModel3.transform(featureIndexer.transform(data)))

    // Same seed => identical models; a different seed should change the result.
    assert(rmse1 === rmse2)
    assert(rmse1 !~= rmse3 relTol 1e-3)
  }
}
Example 8
Source File: SparkXGBoostRegressorSuite.scala From sparkxgboost with Apache License 2.0
package rotationsymmetry.sxgboost

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.loss.SquareLoss
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext
import rotationsymmetry.sxgboost.utils.TestingUtils._

class SparkXGBoostRegressorSuite extends FunSuite with TestData with MLlibTestSparkContext {

  test("Compare with DecisionTree using simple data") {
    val data = sqlContext.createDataFrame(sc.parallelize(simpleData, 2))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sparkXGBoostRegressor = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(1)
      .setNumTrees(1)
    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostRegressor))
    val sXGBoostModel = sparkXGBoostPipeline.fit(data)

    val dt = new DecisionTreeRegressor()
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(1)
    val dtPipeLine = new Pipeline()
      .setStages(Array(featureIndexer, dt))
    val dtModel = dtPipeLine.fit(data)

    val evaluator = new RegressionEvaluator()
    val sXGBoostrmse = evaluator.evaluate(sXGBoostModel.transform(data))
    val dtrmse = evaluator.evaluate(dtModel.transform(data))

    assert(sXGBoostrmse ~== dtrmse relTol 1e-5)
  }

  test("Compare with DecisionTree using random data") {
    val data = sqlContext.createDataFrame(randomLabelPointRDD(sc, 40, 10, 2, 999))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sparkXGBoostRegressor = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
      .setNumTrees(1)
    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostRegressor))
    val sXGBoostModel = sparkXGBoostPipeline.fit(data)

    val dt = new DecisionTreeRegressor()
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
    val dtPipeLine = new Pipeline()
      .setStages(Array(featureIndexer, dt))
    val dtModel = dtPipeLine.fit(data)

    val evaluator = new RegressionEvaluator()
    val sXGBoostrmse = evaluator.evaluate(sXGBoostModel.transform(data))
    val dtrmse = evaluator.evaluate(dtModel.transform(data))

    assert(sXGBoostrmse ~== dtrmse relTol 1e-5)
  }
}
Example 9
Source File: SparkXGBoostClassifierSuite.scala From sparkxgboost with Apache License 2.0
package rotationsymmetry.sxgboost

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.functions.udf
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.loss.LogisticLoss
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext

class SparkXGBoostClassifierSuite extends FunSuite with TestData with MLlibTestSparkContext {

  test("test with simple data") {
    val rawdata = Seq(
      LabeledPoint(0, Vectors.dense(0.0, 0.0)),
      LabeledPoint(0, Vectors.dense(0.0, 0.0)),
      LabeledPoint(1, Vectors.dense(0.0, 0.0)),
      LabeledPoint(1, Vectors.dense(1.0, 0.0)),
      LabeledPoint(1, Vectors.dense(1.0, 0.0)),
      LabeledPoint(0, Vectors.dense(1.0, 0.0)),
      LabeledPoint(1, Vectors.dense(0.0, 1.0)),
      LabeledPoint(1, Vectors.dense(0.0, 1.0)),
      LabeledPoint(0, Vectors.dense(0.0, 1.0)),
      LabeledPoint(0, Vectors.dense(1.0, 1.0)),
      LabeledPoint(0, Vectors.dense(1.0, 1.0)),
      LabeledPoint(1, Vectors.dense(1.0, 1.0))
    )

    val data = sqlContext.createDataFrame(sc.parallelize(rawdata, 2))

    // The "true" label is the XOR of the two binary features.
    val truthUDF = udf { feature: Vector =>
      if (feature(0) == feature(1))
        0.0
      else
        1.0
    }

    val dataWithTruth = data.withColumn("truth", truthUDF(data("features")))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sparkXGBoostClassifier = new SparkXGBoostClassifier(new LogisticLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(2)
      .setNumTrees(1)

    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostClassifier))
    val sXGBoostModel = sparkXGBoostPipeline.fit(data)

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("truth")
      .setPredictionCol("prediction")
      .setMetricName("precision")

    val precision = evaluator.evaluate(sXGBoostModel.transform(dataWithTruth))

    assert(precision === 1.0)
  }
}
Example 10
Source File: GradientBoostedTreeRegressorExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SQLContext

object GradientBoostedTreeRegressorExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("GradientBoostedTreeRegressorExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // NOTE: the original listing was truncated between the imports and the
    // evaluation code below; the setup here is restored from the standard Spark
    // GradientBoostedTreeRegressorExample and may differ from the verbatim source.
    // $example on$
    // Load and parse the data file, converting it to a DataFrame.
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

    // Automatically identify categorical features, and index them.
    // Features with > 4 distinct values are treated as continuous.
    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(4)
      .fit(data)

    // Split the data into training and test sets (30% held out for testing).
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

    // Train a GBT model.
    val gbt = new GBTRegressor()
      .setLabelCol("label")
      .setFeaturesCol("indexedFeatures")
      .setMaxIter(10)

    // Chain indexer and GBT in a Pipeline, then train; fitting also runs the indexer.
    val pipeline = new Pipeline().setStages(Array(featureIndexer, gbt))
    val model = pipeline.fit(trainingData)

    // Make predictions and select example rows to display.
    val predictions = model.transform(testData)
    predictions.select("prediction", "label", "features").show(5)

    // Select (prediction, true label) and compute test error.
    val evaluator = new RegressionEvaluator()
      .setLabelCol("label")           // label column name
      .setPredictionCol("prediction") // prediction column name
      .setMetricName("rmse")          // RMSE reflects the dispersion of the samples
    val rmse = evaluator.evaluate(predictions)
    println("Root Mean Squared Error (RMSE) on test data = " + rmse)

    val gbtModel = model.stages(1).asInstanceOf[GBTRegressionModel]
    println("Learned regression GBT model:\n" + gbtModel.toDebugString)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 11
Source File: RandomForestRegressorExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SQLContext

object RandomForestRegressorExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RandomForestRegressorExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // NOTE: the original listing was truncated between the imports and the
    // evaluation code below; the setup here is restored from the standard Spark
    // RandomForestRegressorExample and may differ from the verbatim source.
    // $example on$
    // Load and parse the data file, converting it to a DataFrame.
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

    // Automatically identify categorical features, and index them.
    // Features with > 4 distinct values are treated as continuous.
    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(4)
      .fit(data)

    // Split the data into training and test sets (30% held out for testing).
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

    // Train a RandomForest model.
    val rf = new RandomForestRegressor()
      .setLabelCol("label")
      .setFeaturesCol("indexedFeatures")

    // Chain indexer and forest in a Pipeline, then train; fitting also runs the indexer.
    val pipeline = new Pipeline().setStages(Array(featureIndexer, rf))
    val model = pipeline.fit(trainingData)

    // Make predictions and select example rows to display.
    val predictions = model.transform(testData)
    predictions.select("prediction", "label", "features").show(5)

    // Select (prediction, true label) and compute test error.
    val evaluator = new RegressionEvaluator()
      .setLabelCol("label")
      // Column that stores the prediction; defaults to "prediction".
      .setPredictionCol("prediction")
      .setMetricName("rmse") // RMSE reflects the dispersion of the samples
    val rmse = evaluator.evaluate(predictions)
    // Root Mean Squared Error (RMSE) on test data = 0.09854713827168428
    println("Root Mean Squared Error (RMSE) on test data = " + rmse)

    val rfModel = model.stages(1).asInstanceOf[RandomForestRegressionModel]
    println("Learned regression forest model:\n" + rfModel.toDebugString)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 12
Source File: VectorIndexerExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.VectorIndexer
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.util._
import org.apache.spark.sql.SQLContext

object VectorIndexerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("VectorIndexerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // NOTE: the original listing was truncated after the imports; the body
    // below is restored from the standard VectorIndexerExample and may differ
    // from the verbatim source.
    // $example on$
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

    val indexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexed")
      .setMaxCategories(10)

    val indexerModel = indexer.fit(data)

    val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
    println(s"Chose ${categoricalFeatures.size} categorical features: " +
      categoricalFeatures.mkString(", "))

    // Create new column "indexed" with categorical values transformed to indices
    val indexedData = indexerModel.transform(data)
    indexedData.show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 13
Source File: VectorIndexerExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.VectorIndexer
// $example off$
import org.apache.spark.sql.SparkSession

object VectorIndexerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorIndexerExample")
      .getOrCreate()

    // $example on$
    val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val indexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexed")
      .setMaxCategories(10)

    val indexerModel = indexer.fit(data)

    val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
    println(s"Chose ${categoricalFeatures.size} " +
      s"categorical features: ${categoricalFeatures.mkString(", ")}")

    // Create new column "indexed" with categorical values transformed to indices
    val indexedData = indexerModel.transform(data)
    indexedData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 14
Source File: VectorIndexerExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.VectorIndexer
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object VectorIndexerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("VectorIndexerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val indexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexed")
      .setMaxCategories(10)

    val indexerModel = indexer.fit(data)

    val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
    println(s"Chose ${categoricalFeatures.size} categorical features: " +
      categoricalFeatures.mkString(", "))

    // Create new column "indexed" with categorical values transformed to indices
    val indexedData = indexerModel.transform(data)
    indexedData.show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println