org.apache.spark.ml.feature.StandardScaler Scala Examples
The following examples show how to use org.apache.spark.ml.feature.StandardScaler.
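Before looking at the individual projects, the shared pattern is worth spelling out: StandardScaler is an estimator, so it is first fit on a dataset to compute per-feature statistics, and the resulting model then transforms the data. Below is a minimal sketch of that pattern; the input path and column names are illustrative rather than taken from any one example.

import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.sql.SparkSession

// Minimal sketch of the common usage pattern; path and column names are illustrative.
val spark = SparkSession.builder.appName("StandardScalerSketch").getOrCreate()
val df = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

val scaler = new StandardScaler()
  .setInputCol("features")        // input must be a Vector column
  .setOutputCol("scaledFeatures")
  .setWithStd(true)               // scale each feature to unit standard deviation
  .setWithMean(false)             // do not center (keeps sparse vectors sparse)

val model = scaler.fit(df)        // estimator step: computes per-feature statistics
model.transform(df).show()        // transformer step: applies the scaling

spark.stop()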
Example 1
Source File: StandardScalerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StandardScaler
// $example off$
import org.apache.spark.sql.SparkSession

object StandardScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StandardScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)

    // Compute summary statistics by fitting the StandardScaler.
    val scalerModel = scaler.fit(dataFrame)

    // Normalize each feature to have unit standard deviation.
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 2
Source File: Preparator.scala From pio-template-sr with Apache License 2.0
package org.template.sr

import org.apache.predictionio.controller.PPreparator
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.feature.StandardScalerModel
import org.apache.spark.sql.SQLContext
import org.apache.spark.mllib.linalg.Vectors

class PreparedData(
  val rows: DataFrame,
  val dsp: DataSourceParams,
  val ssModel: org.apache.spark.mllib.feature.StandardScalerModel
) extends Serializable

class Preparator extends PPreparator[TrainingData, PreparedData] {
  def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    if (trainingData.dsp.useStandardScaler) {
      val training = trainingData.rows
        .map(x => (x._1, x._2, Vectors.dense(x._3)))
        .toDF("label", "censor", "features")

      // Fit the DataFrame-based (spark.ml) StandardScaler and scale the training data.
      val scaler = new StandardScaler()
        .setInputCol("features")
        .setOutputCol("scaledFeatures")
        .setWithStd(trainingData.dsp.standardScalerWithStd)
        .setWithMean(trainingData.dsp.standardScalerWithMean)
      val scalerModel = scaler.fit(training)
      val scaledData = scalerModel.transform(training)
      val s1 = scaledData
        .select("label", "censor", "scaledFeatures")
        .withColumnRenamed("scaledFeatures", "features")

      // Prepare the old RDD-based (spark.mllib) StandardScaler as well.
      val oldScaler = new org.apache.spark.mllib.feature.StandardScaler(
        withMean = trainingData.dsp.standardScalerWithMean,
        withStd = trainingData.dsp.standardScalerWithStd)
      val oldSSModel = oldScaler.fit(trainingData.rows.map(x => Vectors.dense(x._3)))

      new PreparedData(rows = s1, dsp = trainingData.dsp, ssModel = oldSSModel)
    } else {
      new PreparedData(
        rows = trainingData.rows.map(x => (x._1, x._2, Vectors.dense(x._3))).toDF("label", "censor", "features"),
        dsp = trainingData.dsp,
        ssModel = null)
    }
  }
}
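A note on the design choice above: the template fits both the DataFrame-based spark.ml scaler (to produce the scaled training DataFrame) and the RDD-based spark.mllib scaler, presumably so that individual raw feature vectors can be scaled later without building a DataFrame. The following snippet is a hypothetical illustration of how that model could be applied to a single vector; it is not part of the template's source.

import org.apache.spark.mllib.linalg.{Vector, Vectors}

// Hypothetical helper, not part of the template: scale one raw feature vector with
// the RDD-based model kept in PreparedData (identity if scaling was disabled).
def scaleFeatures(pd: PreparedData, raw: Vector): Vector =
  if (pd.ssModel != null) pd.ssModel.transform(raw) else raw

// e.g. scaleFeatures(preparedData, Vectors.dense(1.0, 2.0, 3.0))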
Example 3
Source File: StandardScalerExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StandardScaler
// $example off$
import org.apache.spark.sql.SparkSession

object StandardScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StandardScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)

    // Compute summary statistics by fitting the StandardScaler.
    val scalerModel = scaler.fit(dataFrame)

    // Normalize each feature to have unit standard deviation.
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 4
Source File: StandardScalerExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StandardScaler
// $example off$
import org.apache.spark.sql.SparkSession

object StandardScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StandardScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)

    // Compute summary statistics by fitting the StandardScaler.
    val scalerModel = scaler.fit(dataFrame)

    // Normalize each feature to have unit standard deviation.
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 5
Source File: SparkStageParamTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package org.apache.spark.ml

import com.salesforce.op.stages.SparkStageParam
import com.salesforce.op.test.TestSparkContext
import org.apache.spark.ml.feature.StandardScaler
import org.joda.time.DateTime
import org.json4s.JsonDSL._
import org.json4s._
import org.json4s.jackson.JsonMethods.{parse, _}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfterEach, FlatSpec}

@RunWith(classOf[JUnitRunner])
class SparkStageParamTest extends FlatSpec with TestSparkContext with BeforeAndAfterEach {
  import SparkStageParam._

  var savePath: String = _
  var param: SparkStageParam[StandardScaler] = _
  var stage: StandardScaler = _

  override def beforeEach(): Unit = {
    super.beforeEach()
    savePath = tempDir + "/op-stage-param-test-" + DateTime.now().getMillis
    param = new SparkStageParam[StandardScaler](parent = "test", name = "test", doc = "none")
    // by setting both to be the same, we guarantee that at least one isn't the default value
    stage = new StandardScaler().setWithMean(true).setWithStd(false)
  }

  // easier if test both at the same time
  Spec[SparkStageParam[_]] should "encode and decode properly when is set" in {
    param.savePath = Option(savePath)
    val jsonOut = param.jsonEncode(Option(stage))
    val parsed = parse(jsonOut).asInstanceOf[JObject]
    val updated = parsed ~ ("path" -> savePath) // inject path for decoding

    updated shouldBe JObject(
      "className" -> JString(stage.getClass.getName),
      "uid" -> JString(stage.uid),
      "path" -> JString(savePath)
    )
    val updatedJson = compact(updated)

    param.jsonDecode(updatedJson) match {
      case None => fail("Failed to recover the stage")
      case Some(stageRecovered) =>
        stageRecovered shouldBe a[StandardScaler]
        stageRecovered.uid shouldBe stage.uid
        stageRecovered.getWithMean shouldBe stage.getWithMean
        stageRecovered.getWithStd shouldBe stage.getWithStd
    }
  }

  it should "except out when path is empty" in {
    intercept[RuntimeException](param.jsonEncode(Option(stage))).getMessage shouldBe
      s"Path must be set before Spark stage '${stage.uid}' can be saved"
  }

  it should "have empty path if stage is empty" in {
    param.savePath = Option(savePath)
    val jsonOut = param.jsonEncode(None)
    val parsed = parse(jsonOut)
    parsed shouldBe JObject("className" -> JString(NoClass), "uid" -> JString(NoUID))
    param.jsonDecode(jsonOut) shouldBe None
  }
}
Example 6
Source File: SparkWrapperParamsTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.features.types._
import com.salesforce.op.test.TestCommon
import org.apache.spark.ml.feature.{StandardScaler, StandardScalerModel}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfterEach, FlatSpec}

@RunWith(classOf[JUnitRunner])
class SparkWrapperParamsTest extends FlatSpec with BeforeAndAfterEach with TestCommon {

  private def estimator(sparkMlStageIn: Option[StandardScaler] = None) = {
    new SwUnaryEstimator[Real, Real, StandardScalerModel, StandardScaler](
      inputParamName = "in",
      outputParamName = "out",
      operationName = "test-op",
      sparkMlStageIn = sparkMlStageIn
    )
  }

  Spec[SparkWrapperParams[_]] should "have proper default values for path and stage" in {
    val stage = estimator()
    stage.getStageSavePath() shouldBe None
    stage.getSparkMlStage() shouldBe None
  }

  it should "when setting path, it should also set path to the stage param" in {
    val stage = estimator()
    stage.setStageSavePath("/test/path")
    stage.getStageSavePath() shouldBe Some("/test/path")
  }

  it should "allow set/get spark params on a wrapped stage" in {
    val sparkStage = new StandardScaler()
    val stage = estimator(sparkMlStageIn = Some(sparkStage))
    stage.getSparkMlStage() shouldBe Some(sparkStage)
    for {
      sparkStage <- stage.getSparkMlStage()
      withMean = sparkStage.getOrDefault(sparkStage.withMean)
    } {
      withMean shouldBe false
      sparkStage.set[Boolean](sparkStage.withMean, true)
      sparkStage.get(sparkStage.withMean) shouldBe Some(true)
    }
  }
}
Example 7
Source File: StandardScalerExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StandardScaler
// $example off$
import org.apache.spark.sql.SparkSession

object StandardScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StandardScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)

    // Compute summary statistics by fitting the StandardScaler.
    val scalerModel = scaler.fit(dataFrame)

    // Normalize each feature to have unit standard deviation.
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 8
Source File: StandardScalerExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StandardScaler
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object StandardScalerExample {
  def main(args: Array[String]): Unit = {
    // Pre-Spark-2.0 entry points: SparkConf/SparkContext plus SQLContext instead of SparkSession.
    val conf = new SparkConf().setAppName("StandardScalerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)

    // Compute summary statistics by fitting the StandardScaler.
    val scalerModel = scaler.fit(dataFrame)

    // Normalize each feature to have unit standard deviation.
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 9
Source File: StandardScalerSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{ScalerResult, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class StandardScalerSuite extends SparkFeaturePFASuiteBase[ScalerResult] {
  implicit val enc = ExpressionEncoder[Vector]()

  val inputPath = "data/sample_lda_libsvm_data.txt"
  val dataset = spark.read.format("libsvm").load(inputPath)

  val scaler = new StandardScaler()
    .setInputCol("features")
    .setOutputCol("scaled")
    .setWithMean(true)
    .setWithStd(true)

  override val sparkTransformer = scaler.fit(dataset)

  val result = sparkTransformer.transform(dataset)
  override val input = withColumnAsArray(result, scaler.getInputCol).toJSON.collect()
  override val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()

  test("StandardScaler w/o Mean and Std") {
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setWithMean(false)
      .setWithStd(false)
    val sparkTransformer = scaler.fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("StandardScaler w/o Mean") {
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setWithMean(false)
      .setWithStd(true)
    val sparkTransformer = scaler.fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("StandardScaler w/o Std") {
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setWithMean(true)
      .setWithStd(false)
    val sparkTransformer = scaler.fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }
}