org.apache.spark.ml.feature.MinMaxScaler Scala Examples
The following examples show how to use org.apache.spark.ml.feature.MinMaxScaler.
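Before the project-specific examples, here is a minimal, self-contained sketch of typical usage; the column names and toy values are illustrative and not taken from any of the projects below. MinMaxScaler is an Estimator: you fit it on a DataFrame with a vector column to learn each feature's minimum and maximum, then transform to rescale every feature into [getMin, getMax] (default [0.0, 1.0]).

import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object MinMaxScalerQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("MinMaxScalerQuickStart").getOrCreate()

    // A toy DataFrame with a single vector column to rescale.
    val df = spark.createDataFrame(Seq(
      (0, Vectors.dense(10.0, -5.0)),
      (1, Vectors.dense(20.0, 0.0)),
      (2, Vectors.dense(30.0, 5.0))
    )).toDF("id", "features")

    // fit() learns per-feature min/max; transform() rescales to [0.0, 1.0] by default.
    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
    scaler.fit(df).transform(df).show(false)

    spark.stop()
  }
}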
Each example lists its source file, the project it comes from, and its license.
Example 1
Source File: MinMaxScalerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object MinMaxScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MinMaxScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.1, -1.0)),
      (1, Vectors.dense(2.0, 1.1, 1.0)),
      (2, Vectors.dense(3.0, 10.1, 3.0))
    )).toDF("id", "features")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MinMaxScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [min, max].
    val scaledData = scalerModel.transform(dataFrame)
    println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]")
    scaledData.select("features", "scaledFeatures").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
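For readers who want to check the numbers, the rescaling MinMaxScaler performs is, per feature, scaled = (x - colMin) / (colMax - colMin) * (max - min) + min. A small hand-computed sketch (not part of the example project) for the first feature column of the DataFrame above:

// Min-max rescaling applied by hand, assuming the default output range [0.0, 1.0]:
//   scaled = (x - colMin) / (colMax - colMin)
val column = Seq(1.0, 2.0, 3.0)                 // first feature across the three rows
val (colMin, colMax) = (column.min, column.max)
val scaled = column.map(x => (x - colMin) / (colMax - colMin))
// scaled == List(0.0, 0.5, 1.0), matching the first component of each row's scaledFeatures vector
println(scaled)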
Example 2
Source File: get_features_from_peinfo.scala From gsoc_relationship with Apache License 2.0
import com.datastax.spark.connector._
import play.api.libs.json.Json
import play.api.libs.json._
import java.io.{ByteArrayOutputStream, ByteArrayInputStream}
import java.util.zip.{GZIPOutputStream, GZIPInputStream}
import Array.concat
import org.apache.spark.sql.types._
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.Row
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.DenseVector
import PreProcessingConfig._

case class peinfo_results_by_service_name_class(service_name: String, sha256: String)
case class peinfo_results_by_sha256_class(sha256: String, service_name: String, results: Array[Byte])
case class peinfo_join_results_class(sha256: String, service_name: String, results: String)
case class peinfo_int_final_array_rdd_class(sha256: String, array_results: Array[Double])
case class peinfo_binaray_final_array_rdd_class(sha256: String, array_results: Array[Double])
case class peinfo_final_array_rdd_class(sha256: String, array_results: Array[Double])

// Decompress a gzip-compressed byte array into a String.
def unzip(x: Array[Byte]): String = {
  val inputStream = new GZIPInputStream(new ByteArrayInputStream(x))
  val output = scala.io.Source.fromInputStream(inputStream).mkString
  return output
}

// Build a 17-element numeric feature array from the PE sections (.text, .data, .rsrc, .rdata)
// plus the timestamp.
def findAllIntinpeinfo(peinfo_json_results: JsLookupResult, time: Double): Array[Double] = {
  val entropy = peinfo_json_results \\ "entropy"
  val virt_address = peinfo_json_results \\ "virt_address"
  val virt_size = peinfo_json_results \\ "virt_size"
  val size = peinfo_json_results \\ "size"
  var i = 0
  var List = Array.iterate(0.0, 17)(a => a * 0)
  for (k <- (peinfo_json_results \\ "section_name")) {
    k.as[String] match {
      case ".text\u0000\u0000\u0000" => { List(0) = entropy(i).as[Double]; List(1) = Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(2) = virt_size(i).as[Double]; List(3) = size(i).as[Double] }
      case ".data\u0000\u0000\u0000" => { List(4) = entropy(i).as[Double]; List(5) = Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(6) = virt_size(i).as[Double]; List(7) = size(i).as[Double] }
      case ".rsrc\u0000\u0000\u0000" => { List(8) = entropy(i).as[Double]; List(9) = Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(10) = virt_size(i).as[Double]; List(11) = size(i).as[Double] }
      case ".rdata\u0000\u0000" => { List(12) = entropy(i).as[Double]; List(13) = Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(14) = virt_size(i).as[Double]; List(15) = size(i).as[Double] }
      case other => {}
    }
    i = i + 1
  }
  List(16) = time
  return List.toArray
}

// Join the peinfo service results from Cassandra and decompress the raw JSON payloads.
val peinfo_results_by_service_name_meta = sc.cassandraTable[peinfo_results_by_service_name_class](keyspace, service_name_table).where("service_name=?", "peinfo")
val peinfo_results_by_service_name_rdd = peinfo_results_by_service_name_meta.keyBy(x => (x.sha256, x.service_name))
val peinfo_results_by_sha256_meta = sc.cassandraTable[peinfo_results_by_sha256_class](keyspace, sha256_table)
val peinfo_results_by_sha256_rdd = peinfo_results_by_sha256_meta.keyBy(x => (x.sha256, x.service_name))
val peinfo_join_results = peinfo_results_by_service_name_rdd.join(peinfo_results_by_sha256_rdd)
  .map(x => (new peinfo_join_results_class(x._1._1, x._1._2, unzip(x._2._2.results)))).distinct().cache()

// Numeric features: PE-section statistics plus timestamp.
val peinfo_int_final_array_rdd = peinfo_join_results
  .map(x => (x.sha256, (Json.parse(x.results) \ "pe_sections"),
    {if ((Json.parse(x.results) \ "timestamp").isInstanceOf[JsUndefined]) 0.0
     else (Json.parse(x.results) \ "timestamp" \\ "timestamp")(0).as[Double]}))
  .filter(x => !x._2.isInstanceOf[JsUndefined])
  .map(x => new peinfo_int_final_array_rdd_class(x._1, findAllIntinpeinfo(x._2, x._3)))

// Binary features: presence/absence of frequently imported dll.function names.
val peinfo_dllfunction_list = peinfo_join_results.map(x => Json.parse(x.results) \ "imports")
  .filter(x => !x.isInstanceOf[JsUndefined])
  .flatMap(x => x.as[List[Map[String, String]]].map(x => (x("dll") + "." + x("function"))))
  .toDF("func_name").groupBy("func_name").count.sort(desc("count")).filter("count > 10000")
  .rdd.map(r => r.getString(0)).collect().toList

implicit def bool2int(b: Boolean) = if (b) 1 else 0

def findAllBininpeinfo_dllfunction(peinfo_dllfunction: Seq[String]): Array[Double] = {
  val forlist = for (family <- peinfo_dllfunction_list) yield {
    (peinfo_dllfunction.contains(family): Int).toDouble
  }
  return (forlist).toArray
}

val List502 = Array.iterate(0.0, 502)(a => 0.0)
val peinfo_binaray_final_array_rdd = peinfo_join_results
  .map(x => (x.sha256, (Json.parse(x.results) \ "imports")))
  .map(x => new peinfo_binaray_final_array_rdd_class(x._1,
    {if (x._2.isInstanceOf[JsUndefined]) List502
     else findAllBininpeinfo_dllfunction(x._2.as[Seq[Map[String, String]]].map(x => (x("dll") + "." + x("function"))))}))

// Concatenate numeric and binary feature arrays per sha256.
val peinfo_int_final_array_rdd_before_join = peinfo_int_final_array_rdd.map(x => (x.sha256, x.array_results))
val peinfo_binaray_final_array_rdd_before_join = peinfo_binaray_final_array_rdd.map(x => (x.sha256, x.array_results))
val peinfo_array_rdd_by_join = peinfo_int_final_array_rdd_before_join.join(peinfo_binaray_final_array_rdd_before_join)
  .map(x => (x._1, concat(x._2._1, x._2._2)))
val peinfo_final_array_rdd = peinfo_array_rdd_by_join.map(x => new peinfo_final_array_rdd_class(x._1, x._2))

// Scale the combined feature vectors with MinMaxScaler and persist them as Parquet.
val peinfo_schema = new StructType().add("sha256", StringType).add("peinfo", VectorType)
val peinfo_vector_rdd = peinfo_final_array_rdd.map(x => (x.sha256, Vectors.dense(x.array_results)))
val peinfo_vector_rowrdd = peinfo_vector_rdd.map(p => Row(p._1, p._2))
val peinfo_vector_dataframe = spark.createDataFrame(peinfo_vector_rowrdd, peinfo_schema)
val peinfo_scaler = new MinMaxScaler()
  .setInputCol("peinfo")
  .setOutputCol("scaled_peinfo")
val peinfo_scalerModel = peinfo_scaler.fit(peinfo_vector_dataframe)
val peinfo_scaledData_df = peinfo_scalerModel.transform(peinfo_vector_dataframe)
val peinfo_scaledData_rdd = peinfo_scaledData_df.select("sha256", "scaled_peinfo").rdd
  .map(row => (row.getAs[String]("sha256"), row.getAs[DenseVector]("scaled_peinfo")))
  .map(x => new peinfo_final_array_rdd_class(x._1, x._2.toArray))
peinfo_scaledData_rdd.toDF().write.format("parquet").save(peinfo_final_array_file)
Example 3
Source File: MinMaxScalerWithNonDefaultsParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{MinMaxScaler, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class MinMaxScalerWithNonDefaultsParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("dti", "loan_amount")).
      setOutputCol("features"),
    new MinMaxScaler().
      setInputCol("features").
      setOutputCol("scaled_features").
      setMin(2.0).
      setMax(4.0))).fit(dataset)
}
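The non-default part of this spec is the target range: setMin(2.0) and setMax(4.0) make the fitted model map each feature's observed minimum to 2.0 and its maximum to 4.0. A small hand-computed sketch with made-up values (the spec's real "dti"/"loan_amount" data comes from SparkParityBase and is not shown here):

// Rescaling into a custom range [min, max] = [2.0, 4.0]; the input values are illustrative.
val values = Seq(5.0, 7.5, 10.0)
val (lo, hi) = (values.min, values.max)
val (targetMin, targetMax) = (2.0, 4.0)
val rescaled = values.map(x => (x - lo) / (hi - lo) * (targetMax - targetMin) + targetMin)
// rescaled == List(2.0, 3.0, 4.0)
println(rescaled)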
Example 4
Source File: MinMaxScalerPipelineParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, MinMaxScaler, QuantileDiscretizer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class MinMaxScalerPipelineParitySpec extends SparkParityBase {

  private val getKeys: Map[String, Double] => Seq[String] = { input: Map[String, Double] => input.keySet.toSeq }

  val keyUdf = functions.udf(getKeys)

  override val dataset = spark.createDataFrame(Seq(
    (Array("1"), 1.0, Map("a" -> 0.1, "b" -> 0.2, "c" -> 0.3), 1),
    (Array("2"), 10.0, Map("d" -> 0.1, "e" -> 0.2, "c" -> 0.3), 0),
    (Array("3"), 20.0, Map("x" -> 0.1, "a" -> 0.2, "b" -> 0.3), 0),
    (Array("4"), 15.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0),
    (Array("5"), 18.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0),
    (Array("6"), 25.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 1),
    (Array("6"), 5.0, Map("a" -> 0.1, "b" -> 0.2, "d" -> 0.3), 0),
    (Array("7"), 30.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0)))
    .toDF("book_id", "pv", "myInputCol0", "label")
    .withColumn("myInputCol", keyUdf(functions.col("myInputCol0")))
    .drop("myInputCol0")

  override val sparkTransformer = new Pipeline()
    .setStages(Array(
      new CountVectorizer()
        .setInputCol("book_id")
        .setOutputCol("book_id_vec")
        .setMinDF(1)
        .setMinTF(1)
        .setBinary(true),
      new QuantileDiscretizer()
        .setInputCol("pv")
        .setOutputCol("pv_bucket")
        .setNumBuckets(3),
      new CountVectorizer()
        .setInputCol("myInputCol")
        .setOutputCol("myInputCol1_vec")
        .setMinDF(1)
        .setMinTF(1)
        .setBinary(true),
      new VectorAssembler()
        .setInputCols(Array("pv_bucket", "book_id_vec", "myInputCol1_vec"))
        .setOutputCol("vectorFeature"),
      new MinMaxScaler().setInputCol("vectorFeature").setOutputCol("scaledFeatures"))).fit(dataset)
}
Example 5
Source File: MLPipelineTrackerIT.scala From spark-atlas-connector with Apache License 2.0
package com.hortonworks.spark.atlas.ml

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import org.scalatest.Matchers
import com.hortonworks.spark.atlas._
import com.hortonworks.spark.atlas.types._
import com.hortonworks.spark.atlas.TestUtils._

class MLPipelineTrackerIT extends BaseResourceIT with Matchers with WithHiveSupport {
  private val atlasClient = new RestAtlasClient(atlasClientConf)

  def clusterName: String = atlasClientConf.get(AtlasClientConf.CLUSTER_NAME)

  def getTableEntity(tableName: String): SACAtlasEntityWithDependencies = {
    val dbDefinition = createDB("db1", "hdfs:///test/db/db1")
    val sd = createStorageFormat()
    val schema = new StructType()
      .add("user", StringType, false)
      .add("age", IntegerType, true)
    val tableDefinition = createTable("db1", s"$tableName", schema, sd)
    internal.sparkTableToEntity(tableDefinition, clusterName, Some(dbDefinition))
  }

  // Enable it to run integrated test.
  it("pipeline and pipeline model") {
    val uri = "hdfs://"
    val pipelineDir = "tmp/pipeline"
    val modelDir = "tmp/model"

    val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir)
    val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir)
    atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, modelDirEntity))

    val df = sparkSession.createDataFrame(Seq(
      (1, Vectors.dense(0.0, 1.0, 4.0), 1.0),
      (2, Vectors.dense(1.0, 0.0, 4.0), 2.0),
      (3, Vectors.dense(1.0, 0.0, 5.0), 3.0),
      (4, Vectors.dense(0.0, 0.0, 5.0), 4.0)
    )).toDF("id", "features", "label")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("features_scaled")
      .setMin(0.0)
      .setMax(3.0)
    val pipeline = new Pipeline().setStages(Array(scaler))

    val model = pipeline.fit(df)
    pipeline.write.overwrite().save(pipelineDir)

    val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity)
    atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, pipelineEntity))

    val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity)
    atlasClient.createEntitiesWithDependencies(Seq(modelDirEntity, modelEntity))

    val tableEntities1 = getTableEntity("chris1")
    val tableEntities2 = getTableEntity("chris2")

    atlasClient.createEntitiesWithDependencies(tableEntities1)
    atlasClient.createEntitiesWithDependencies(tableEntities2)
  }
}
Example 6
Source File: MLAtlasEntityUtilsSuite.scala From spark-atlas-connector with Apache License 2.0
package com.hortonworks.spark.atlas.types

import java.io.File

import org.apache.atlas.{AtlasClient, AtlasConstants}
import org.apache.atlas.model.instance.AtlasEntity
import org.apache.commons.io.FileUtils
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import org.scalatest.{FunSuite, Matchers}
import com.hortonworks.spark.atlas.TestUtils._
import com.hortonworks.spark.atlas.{AtlasUtils, WithHiveSupport}

class MLAtlasEntityUtilsSuite extends FunSuite with Matchers with WithHiveSupport {

  def getTableEntity(tableName: String): AtlasEntity = {
    val dbDefinition = createDB("db1", "hdfs:///test/db/db1")
    val sd = createStorageFormat()
    val schema = new StructType()
      .add("user", StringType, false)
      .add("age", IntegerType, true)
    val tableDefinition = createTable("db1", s"$tableName", schema, sd)
    val tableEntities = internal.sparkTableToEntity(
      tableDefinition, AtlasConstants.DEFAULT_CLUSTER_NAME, Some(dbDefinition))
    val tableEntity = tableEntities.entity
    tableEntity
  }

  test("pipeline, pipeline model, fit and transform") {
    val uri = "/"
    val pipelineDir = "tmp/pipeline"
    val modelDir = "tmp/model"

    val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir)
    pipelineDirEntity.entity.getAttribute("uri") should be (uri)
    pipelineDirEntity.entity.getAttribute("directory") should be (pipelineDir)
    pipelineDirEntity.dependencies.length should be (0)

    val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir)
    modelDirEntity.entity.getAttribute("uri") should be (uri)
    modelDirEntity.entity.getAttribute("directory") should be (modelDir)
    modelDirEntity.dependencies.length should be (0)

    val df = sparkSession.createDataFrame(Seq(
      (1, Vectors.dense(0.0, 1.0, 4.0), 1.0),
      (2, Vectors.dense(1.0, 0.0, 4.0), 2.0),
      (3, Vectors.dense(1.0, 0.0, 5.0), 3.0),
      (4, Vectors.dense(0.0, 0.0, 5.0), 4.0)
    )).toDF("id", "features", "label")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("features_scaled")
      .setMin(0.0)
      .setMax(3.0)
    val pipeline = new Pipeline().setStages(Array(scaler))

    val model = pipeline.fit(df)
    pipeline.write.overwrite().save(pipelineDir)

    val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity)
    pipelineEntity.entity.getTypeName should be (metadata.ML_PIPELINE_TYPE_STRING)
    pipelineEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be (
      pipeline.uid)
    pipelineEntity.entity.getAttribute("name") should be (pipeline.uid)
    pipelineEntity.entity.getRelationshipAttribute("directory") should be (
      AtlasUtils.entityToReference(pipelineDirEntity.entity, useGuid = false))
    pipelineEntity.dependencies should be (Seq(pipelineDirEntity))

    val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity)
    val modelUid = model.uid.replaceAll("pipeline", "model")
    modelEntity.entity.getTypeName should be (metadata.ML_MODEL_TYPE_STRING)
    modelEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be (modelUid)
    modelEntity.entity.getAttribute("name") should be (modelUid)
    modelEntity.entity.getRelationshipAttribute("directory") should be (
      AtlasUtils.entityToReference(modelDirEntity.entity, useGuid = false))
    modelEntity.dependencies should be (Seq(modelDirEntity))

    FileUtils.deleteDirectory(new File("tmp"))
  }
}
Example 7
Source File: MinMaxScalerExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object MinMaxScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MinMaxScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.1, -1.0)),
      (1, Vectors.dense(2.0, 1.1, 1.0)),
      (2, Vectors.dense(3.0, 10.1, 3.0))
    )).toDF("id", "features")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MinMaxScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [min, max].
    val scaledData = scalerModel.transform(dataFrame)
    println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]")
    scaledData.select("features", "scaledFeatures").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 8
Source File: MinMaxScalerExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object MinMaxScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MinMaxScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.1, -1.0)),
      (1, Vectors.dense(2.0, 1.1, 1.0)),
      (2, Vectors.dense(3.0, 10.1, 3.0))
    )).toDF("id", "features")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MinMaxScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [min, max].
    val scaledData = scalerModel.transform(dataFrame)
    println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]")
    scaledData.select("features", "scaledFeatures").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 9
Source File: OpEstimatorWrapperTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.specific

import com.salesforce.op.features.types._
import com.salesforce.op.test.{PrestigeData, TestFeatureBuilder, _}
import org.apache.spark.ml.feature.{MinMaxScaler, MinMaxScalerModel}
import org.apache.spark.ml.linalg.Vectors
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner
import org.slf4j.LoggerFactory

@RunWith(classOf[JUnitRunner])
class OpEstimatorWrapperTest extends FlatSpec with TestSparkContext with PrestigeData {

  val log = LoggerFactory.getLogger(this.getClass)

  val (ds, education, income, women, prestige) =
    TestFeatureBuilder[OPVector, OPVector, OPVector, OPVector]("education", "income", "women", "prestige",
      prestigeSeq.map(p =>
        (Vectors.dense(p.prestige).toOPVector, Vectors.dense(p.education).toOPVector,
          Vectors.dense(p.income).toOPVector, Vectors.dense(p.women).toOPVector)
      )
    )

  Spec[OpEstimatorWrapper[_, _, _, _]] should "scale variables properly with default min/max params" in {
    val baseScaler = new MinMaxScaler()
    val scalerModel: MinMaxScalerModel = fitScalerModel(baseScaler)
    (scalerModel.getMax - 1.0).abs should be < 1E-6
  }

  it should "scale variables properly with custom min/max params" in {
    val maxParam = 100
    val baseScaler = new MinMaxScaler().setMax(maxParam)
    val scalerModel: MinMaxScalerModel = fitScalerModel(baseScaler)
    (scalerModel.getMax - maxParam).abs should be < 1E-6
  }

  it should "should have the expected feature name" in {
    val wrappedEstimator =
      new OpEstimatorWrapper[OPVector, OPVector, MinMaxScaler, MinMaxScalerModel](new MinMaxScaler()).setInput(income)
    wrappedEstimator.getOutput().name shouldBe wrappedEstimator.getOutputFeatureName
  }

  private def fitScalerModel(baseScaler: MinMaxScaler): MinMaxScalerModel = {
    val scaler =
      new OpEstimatorWrapper[OPVector, OPVector, MinMaxScaler, MinMaxScalerModel](baseScaler).setInput(income)
    val model = scaler.fit(ds)
    val scalerModel = model.getSparkMlStage().get

    if (log.isInfoEnabled) {
      val output = scalerModel.transform(ds)
      output.show(false)
    }
    scalerModel
  }
}
Example 10
Source File: MinMaxScalerExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object MinMaxScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MinMaxScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.1, -1.0)),
      (1, Vectors.dense(2.0, 1.1, 1.0)),
      (2, Vectors.dense(3.0, 10.1, 3.0))
    )).toDF("id", "features")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MinMaxScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [min, max].
    val scaledData = scalerModel.transform(dataFrame)
    println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]")
    scaledData.select("features", "scaledFeatures").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 11
Source File: MinMaxScalerExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.MinMaxScaler
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MinMaxScalerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("MinMaxScalerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MinMaxScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [min, max].
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 12
Source File: MyNormalize.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter4

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.log4j.{Level, Logger}

object MyNormalize {

  def parseWine(str: String): (Int, Vector) = {
    val columns = str.split(",")
    // don't use the entire row of data
    (columns(0).toInt, Vectors.dense(columns(1).toFloat, columns(2).toFloat, columns(3).toFloat))
  }

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("My Normalize")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    import spark.implicits._

    // http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
    val data = spark.read.text("../data/sparkml2/chapter4/wine.data").as[String].map(parseWine)
    val df = data.toDF("id", "feature")
    df.printSchema()
    df.show(false)

    val scale = new MinMaxScaler()
      .setInputCol("feature")
      .setOutputCol("scaled")
      .setMax(1)
      .setMin(-1)

    scale.fit(df).transform(df).select("scaled").show(false)

    spark.stop()
  }
}