org.apache.spark.ml.feature.StandardScaler Scala Examples
The following examples show how to use org.apache.spark.ml.feature.StandardScaler.
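Before looking at the individual projects, the shared pattern is worth spelling out: StandardScaler is an estimator, so it is first fit on a dataset to compute per-feature statistics, and the resulting model then transforms the data. Below is a minimal sketch of that pattern; the input path and column names are illustrative rather than taken from any one example.

import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.sql.SparkSession

// Minimal sketch of the common usage pattern; path and column names are illustrative.
val spark = SparkSession.builder.appName("StandardScalerSketch").getOrCreate()
val df = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

val scaler = new StandardScaler()
  .setInputCol("features")        // input must be a Vector column
  .setOutputCol("scaledFeatures")
  .setWithStd(true)               // scale each feature to unit standard deviation
  .setWithMean(false)             // do not center (keeps sparse vectors sparse)

val model = scaler.fit(df)        // estimator step: computes per-feature statistics
model.transform(df).show()        // transformer step: applies the scaling

spark.stop()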
Example 1
Source File: StandardScalerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StandardScaler
// $example off$
import org.apache.spark.sql.SparkSession

object StandardScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StandardScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)

    // Compute summary statistics by fitting the StandardScaler.
    val scalerModel = scaler.fit(dataFrame)

    // Normalize each feature to have unit standard deviation.
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 2
Source File: Preparator.scala From pio-template-sr with Apache License 2.0
package org.template.sr

import org.apache.predictionio.controller.PPreparator
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.feature.StandardScalerModel
import org.apache.spark.sql.SQLContext
import org.apache.spark.mllib.linalg.Vectors

class PreparedData(
  val rows: DataFrame,
  val dsp: DataSourceParams,
  val ssModel: org.apache.spark.mllib.feature.StandardScalerModel
) extends Serializable

class Preparator extends PPreparator[TrainingData, PreparedData] {
  def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    if (trainingData.dsp.useStandardScaler) {
      val training = trainingData.rows
        .map(x => (x._1, x._2, Vectors.dense(x._3)))
        .toDF("label", "censor", "features")

      // Fit the DataFrame-based (spark.ml) StandardScaler and scale the training data.
      val scaler = new StandardScaler()
        .setInputCol("features")
        .setOutputCol("scaledFeatures")
        .setWithStd(trainingData.dsp.standardScalerWithStd)
        .setWithMean(trainingData.dsp.standardScalerWithMean)
      val scalerModel = scaler.fit(training)
      val scaledData = scalerModel.transform(training)
      val s1 = scaledData
        .select("label", "censor", "scaledFeatures")
        .withColumnRenamed("scaledFeatures", "features")

      // Prepare the old RDD-based (spark.mllib) StandardScaler as well.
      val oldScaler = new org.apache.spark.mllib.feature.StandardScaler(
        withMean = trainingData.dsp.standardScalerWithMean,
        withStd = trainingData.dsp.standardScalerWithStd)
      val oldSSModel = oldScaler.fit(trainingData.rows.map(x => Vectors.dense(x._3)))

      new PreparedData(rows = s1, dsp = trainingData.dsp, ssModel = oldSSModel)
    } else {
      new PreparedData(
        rows = trainingData.rows.map(x => (x._1, x._2, Vectors.dense(x._3))).toDF("label", "censor", "features"),
        dsp = trainingData.dsp,
        ssModel = null)
    }
  }
}
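A note on the design choice above: the template fits both the DataFrame-based spark.ml scaler (to produce the scaled training DataFrame) and the RDD-based spark.mllib scaler, presumably so that individual raw feature vectors can be scaled later without building a DataFrame. The following snippet is a hypothetical illustration of how that model could be applied to a single vector; it is not part of the template's source.

import org.apache.spark.mllib.linalg.{Vector, Vectors}

// Hypothetical helper, not part of the template: scale one raw feature vector with
// the RDD-based model kept in PreparedData (identity if scaling was disabled).
def scaleFeatures(pd: PreparedData, raw: Vector): Vector =
  if (pd.ssModel != null) pd.ssModel.transform(raw) else raw

// e.g. scaleFeatures(preparedData, Vectors.dense(1.0, 2.0, 3.0))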
Example 3
Source File: StandardScalerExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StandardScaler
// $example off$
import org.apache.spark.sql.SparkSession

object StandardScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StandardScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)

    // Compute summary statistics by fitting the StandardScaler.
    val scalerModel = scaler.fit(dataFrame)

    // Normalize each feature to have unit standard deviation.
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 4
Source File: StandardScalerExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StandardScaler
// $example off$
import org.apache.spark.sql.SparkSession

object StandardScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StandardScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)

    // Compute summary statistics by fitting the StandardScaler.
    val scalerModel = scaler.fit(dataFrame)

    // Normalize each feature to have unit standard deviation.
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 5
Source File: SparkStageParamTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package org.apache.spark.ml

import com.salesforce.op.stages.SparkStageParam
import com.salesforce.op.test.TestSparkContext
import org.apache.spark.ml.feature.StandardScaler
import org.joda.time.DateTime
import org.json4s.JsonDSL._
import org.json4s._
import org.json4s.jackson.JsonMethods.{parse, _}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfterEach, FlatSpec}

@RunWith(classOf[JUnitRunner])
class SparkStageParamTest extends FlatSpec with TestSparkContext with BeforeAndAfterEach {
  import SparkStageParam._

  var savePath: String = _
  var param: SparkStageParam[StandardScaler] = _
  var stage: StandardScaler = _

  override def beforeEach(): Unit = {
    super.beforeEach()
    savePath = tempDir + "/op-stage-param-test-" + DateTime.now().getMillis
    param = new SparkStageParam[StandardScaler](parent = "test", name = "test", doc = "none")
    // by setting both to be the same, we guarantee that at least one isn't the default value
    stage = new StandardScaler().setWithMean(true).setWithStd(false)
  }

  // easier if test both at the same time
  Spec[SparkStageParam[_]] should "encode and decode properly when is set" in {
    param.savePath = Option(savePath)
    val jsonOut = param.jsonEncode(Option(stage))
    val parsed = parse(jsonOut).asInstanceOf[JObject]
    val updated = parsed ~ ("path" -> savePath) // inject path for decoding

    updated shouldBe JObject(
      "className" -> JString(stage.getClass.getName),
      "uid" -> JString(stage.uid),
      "path" -> JString(savePath)
    )
    val updatedJson = compact(updated)

    param.jsonDecode(updatedJson) match {
      case None => fail("Failed to recover the stage")
      case Some(stageRecovered) =>
        stageRecovered shouldBe a[StandardScaler]
        stageRecovered.uid shouldBe stage.uid
        stageRecovered.getWithMean shouldBe stage.getWithMean
        stageRecovered.getWithStd shouldBe stage.getWithStd
    }
  }

  it should "except out when path is empty" in {
    intercept[RuntimeException](param.jsonEncode(Option(stage))).getMessage shouldBe
      s"Path must be set before Spark stage '${stage.uid}' can be saved"
  }

  it should "have empty path if stage is empty" in {
    param.savePath = Option(savePath)
    val jsonOut = param.jsonEncode(None)
    val parsed = parse(jsonOut)
    parsed shouldBe JObject("className" -> JString(NoClass), "uid" -> JString(NoUID))
    param.jsonDecode(jsonOut) shouldBe None
  }
}
Example 6
Source File: SparkWrapperParamsTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.features.types._
import com.salesforce.op.test.TestCommon
import org.apache.spark.ml.feature.{StandardScaler, StandardScalerModel}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfterEach, FlatSpec}

@RunWith(classOf[JUnitRunner])
class SparkWrapperParamsTest extends FlatSpec with BeforeAndAfterEach with TestCommon {

  private def estimator(sparkMlStageIn: Option[StandardScaler] = None) = {
    new SwUnaryEstimator[Real, Real, StandardScalerModel, StandardScaler](
      inputParamName = "in",
      outputParamName = "out",
      operationName = "test-op",
      sparkMlStageIn = sparkMlStageIn
    )
  }

  Spec[SparkWrapperParams[_]] should "have proper default values for path and stage" in {
    val stage = estimator()
    stage.getStageSavePath() shouldBe None
    stage.getSparkMlStage() shouldBe None
  }

  it should "when setting path, it should also set path to the stage param" in {
    val stage = estimator()
    stage.setStageSavePath("/test/path")
    stage.getStageSavePath() shouldBe Some("/test/path")
  }

  it should "allow set/get spark params on a wrapped stage" in {
    val sparkStage = new StandardScaler()
    val stage = estimator(sparkMlStageIn = Some(sparkStage))
    stage.getSparkMlStage() shouldBe Some(sparkStage)
    for {
      sparkStage <- stage.getSparkMlStage()
      withMean = sparkStage.getOrDefault(sparkStage.withMean)
    } {
      withMean shouldBe false
      sparkStage.set[Boolean](sparkStage.withMean, true)
      sparkStage.get(sparkStage.withMean) shouldBe Some(true)
    }
  }
}
Example 7
Source File: StandardScalerExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StandardScaler
// $example off$
import org.apache.spark.sql.SparkSession

object StandardScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StandardScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)

    // Compute summary statistics by fitting the StandardScaler.
    val scalerModel = scaler.fit(dataFrame)

    // Normalize each feature to have unit standard deviation.
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 8
Source File: StandardScalerExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StandardScaler
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object StandardScalerExample {
  def main(args: Array[String]): Unit = {
    // Pre-Spark-2.0 entry points: SparkConf/SparkContext plus SQLContext instead of SparkSession.
    val conf = new SparkConf().setAppName("StandardScalerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true)
      .setWithMean(false)

    // Compute summary statistics by fitting the StandardScaler.
    val scalerModel = scaler.fit(dataFrame)

    // Normalize each feature to have unit standard deviation.
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 9
Source File: StandardScalerSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{ScalerResult, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class StandardScalerSuite extends SparkFeaturePFASuiteBase[ScalerResult] {
  implicit val enc = ExpressionEncoder[Vector]()

  val inputPath = "data/sample_lda_libsvm_data.txt"
  val dataset = spark.read.format("libsvm").load(inputPath)

  val scaler = new StandardScaler()
    .setInputCol("features")
    .setOutputCol("scaled")
    .setWithMean(true)
    .setWithStd(true)

  override val sparkTransformer = scaler.fit(dataset)

  val result = sparkTransformer.transform(dataset)
  override val input = withColumnAsArray(result, scaler.getInputCol).toJSON.collect()
  override val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()

  test("StandardScaler w/o Mean and Std") {
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setWithMean(false)
      .setWithStd(false)
    val sparkTransformer = scaler.fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("StandardScaler w/o Mean") {
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setWithMean(false)
      .setWithStd(true)
    val sparkTransformer = scaler.fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("StandardScaler w/o Std") {
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setWithMean(true)
      .setWithStd(false)
    val sparkTransformer = scaler.fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }
}