org.apache.spark.ml.feature.Bucketizer Scala Examples
The following examples show how to use org.apache.spark.ml.feature.Bucketizer.
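Bucketizer maps a column of continuous values to a column of bucket indices, given an ordered array of split points. The examples below range from the stock Spark BucketizerExample (single-column, plus multi-column via setSplitsArray in Example 7) to third-party usages: an AutoML pipeline wrapper (Example 2), MLeap bundle serialization (Example 3), and test suites that exercise Bucketizer models produced by QuantileDiscretizer (Examples 6 and 9).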
Example 1
Source File: BucketizerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Bucketizer
// $example off$
import org.apache.spark.sql.SparkSession

object BucketizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("BucketizerExample")
      .getOrCreate()

    // $example on$
    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

    val data = Array(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9)
    val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val bucketizer = new Bucketizer()
      .setInputCol("features")
      .setOutputCol("bucketedFeatures")
      .setSplits(splits)

    // Transform original data into its bucket index.
    val bucketedData = bucketizer.transform(dataFrame)

    println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets")
    bucketedData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
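With the splits and data above, the bucket indices should come out as follows. This is an illustration of Bucketizer's half-open interval semantics (each split value falls into the bucket it opens), not output captured from an actual run:

// features -> bucketedFeatures
//   -999.9 -> 0.0
//     -0.5 -> 1.0
//     -0.3 -> 1.0
//      0.0 -> 2.0
//      0.2 -> 2.0
//    999.9 -> 3.0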
Example 2
Source File: BuckerizerWrapper.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne}
import com.tencent.angel.spark.automl.feature.TransformerWrapper
import org.apache.spark.ml.feature.Bucketizer

class BuckerizerWrapper extends TransformerWrapper {

  override val transformer = new Bucketizer()
  override var parent: TransformerWrapper = _

  override val requiredInputCols: Array[String] = Array("features")
  override val requiredOutputCols: Array[String] = Array("outBucketizer")

  override val hasMultiInputs: Boolean = false
  override val hasMultiOutputs: Boolean = false
  override val needAncestorInputs: Boolean = false

  override val relation: InToOutRelation = OneToOne

  override def declareInAndOut(): this.type = {
    transformer.setInputCol(getInputCols(0))
    transformer.setOutputCol(getOutputCols(0))
    this
  }
}
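Note that the wrapper constructs the Bucketizer without configuring its splits; in this AutoML pipeline the split points presumably have to be supplied elsewhere before the transformer can run, since a Bucketizer cannot transform data until setSplits (or setSplitsArray) has been called.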
Example 3
Source File: BucketizerOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.core.feature.HandleInvalid
import ml.combust.mleap.runtime.transformer.feature.BucketizerUtil._
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.Bucketizer

class BucketizerOp extends SimpleSparkOp[Bucketizer] {
  override val Model: OpModel[SparkBundleContext, Bucketizer] = new OpModel[SparkBundleContext, Bucketizer] {
    override val klazz: Class[Bucketizer] = classOf[Bucketizer]

    override def opName: String = Bundle.BuiltinOps.feature.bucketizer

    override def store(model: Model, obj: Bucketizer)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("splits", Value.doubleList(obj.getSplits))
        .withValue("handle_invalid", Value.string(obj.getHandleInvalid))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): Bucketizer = {
      val m = new Bucketizer(uid = "")
        .setSplits(restoreSplits(model.value("splits").getDoubleList.toArray))

      val handleInvalid = model.getValue("handle_invalid")
        .map(_.getString)
        .getOrElse(HandleInvalid.default.asParamString)
      m.set(m.handleInvalid, handleInvalid)
      m
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: Bucketizer): Bucketizer = {
    val m = new Bucketizer(uid = uid).setSplits(model.getSplits)
    m.set(m.handleInvalid, model.getHandleInvalid)
    m
  }

  override def sparkInputs(obj: Bucketizer): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: Bucketizer): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
}
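The store method persists only the splits and the handle_invalid setting, and load rebuilds an equivalent Bucketizer from those two values. In Spark, handleInvalid controls what happens to invalid entries such as NaN: the standard options are "error" (the default), "skip", and "keep", where "keep" routes invalid values into an extra bucket.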
Example 4
Source File: BucketizerExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Bucketizer
// $example off$
import org.apache.spark.sql.SparkSession

object BucketizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("BucketizerExample")
      .getOrCreate()

    // $example on$
    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

    val data = Array(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9)
    val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val bucketizer = new Bucketizer()
      .setInputCol("features")
      .setOutputCol("bucketedFeatures")
      .setSplits(splits)

    // Transform original data into its bucket index.
    val bucketedData = bucketizer.transform(dataFrame)

    println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets")
    bucketedData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 5
Source File: BucketizerExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Bucketizer
// $example off$
import org.apache.spark.sql.SparkSession

object BucketizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("BucketizerExample")
      .getOrCreate()

    // $example on$
    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

    val data = Array(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9)
    val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val bucketizer = new Bucketizer()
      .setInputCol("features")
      .setOutputCol("bucketedFeatures")
      .setSplits(splits)

    // Transform original data into its bucket index.
    val bucketedData = bucketizer.transform(dataFrame)

    println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets")
    bucketedData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 6
Source File: QuantileDiscretizerSpec.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl

import odkl.analysis.spark.TestEnv
import odkl.analysis.spark.util.SQLOperations
import org.apache.spark.ml.feature.Bucketizer
import org.apache.spark.sql.{DataFrame, Dataset}
import org.scalatest.FlatSpec

class QuantileDiscretizerSpec extends FlatSpec with TestEnv with org.scalatest.Matchers with SQLOperations {

  private lazy val data = QuantileDiscretizerSpec._data
  private lazy val model = QuantileDiscretizerSpec._model
  private lazy val transformed = QuantileDiscretizerSpec._transformed

  "QuantileDiscretize" should "found 10 buckets for filled column" in {
    val fullSplits = model.getSplitsArray(0)

    fullSplits should contain theSameElementsInOrderAs (
      Seq(Double.NegativeInfinity) ++ Array.tabulate(10) { i => Math.pow(10, i) } ++ Seq(Double.PositiveInfinity))
  }

  "QuantileDiscretize" should "found 5 buckets for partly filled column" in {
    val fullSplits = model.getSplitsArray(1)

    fullSplits should contain theSameElementsInOrderAs (
      Seq(Double.NegativeInfinity) ++ Array.tabulate(5) { _ + 1.0 } ++ Seq(Double.PositiveInfinity))
  }

  "QuantileDiscretize" should "found 1 bucket for partly filled column" in {
    val fullSplits = model.getSplitsArray(2)

    fullSplits should contain theSameElementsInOrderAs Seq(Double.NegativeInfinity, 1.12, Double.PositiveInfinity)
  }

  "QuantileDiscretize" should "add zero bucket for empty column" in {
    val fullSplits = model.getSplitsArray(3)

    fullSplits should contain theSameElementsInOrderAs Seq(Double.NegativeInfinity, 0.0, Double.PositiveInfinity)
  }

  import sqlc.implicits._

  "Transformed data" should "contain only valid buckets for full column" in {
    val values = transformed.select('full_bucket.as[Double]).distinct().collect().sorted
    values should contain theSameElementsInOrderAs Array.tabulate(10) { _ + 1.0 }
  }

  "Transformed data" should "contain only valid buckets for partly filled column" in {
    val values = transformed.select('partlyEmpty_bucket.as[Option[Double]]).distinct().collect().sorted
    values should contain theSameElementsInOrderAs Seq(None) ++ Array.tabulate(5) { i => Some(i + 1.0) }
  }

  "Transformed data" should "contain only single buckets for constant column" in {
    val values = transformed.select('constant_bucket.as[Double]).distinct().collect().sorted
    values should contain theSameElementsInOrderAs Seq(1.0)
  }

  "Transformed data" should "contain single buckets for empty column" in {
    val values = transformed.select('empty_bucket.as[Option[Double]]).distinct().collect().sorted
    values should contain theSameElementsInOrderAs Seq(None)
  }
}

object QuantileDiscretizerSpec extends TestEnv {

  import sqlc.sparkSession.implicits._

  case class Entry(full: Double, partlyEmpty: Option[Double], constant: Double = 1.12, empty: Option[Double] = None)

  private val entries = Seq(
    Entry(1, Some(1.0)),
    Entry(10, Some(2.0)),
    Entry(100, Some(3.0)),
    Entry(1000, Some(4.0)),
    Entry(10000, Some(5.0)),
    Entry(100000, None),
    Entry(1000000, None),
    Entry(10000000, None),
    Entry(100000000, None),
    Entry(1000000000, None)
  )

  lazy val _data: Dataset[Entry] = (entries ++ entries ++ entries ++ entries).toDS

  lazy val _model: Bucketizer = new QuantileDiscretizer()
    .setNumBuckets(20)
    .setInputCols(_data.schema.fieldNames)
    .setOutputCols(_data.schema.fieldNames.map(_ + "_bucket"))
    .fit(_data)

  lazy val _transformed: DataFrame = _model.transform(_data)
}
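This spec exercises Bucketizer indirectly: fitting a QuantileDiscretizer produces a Bucketizer model, and with multiple input columns the learned split points are read back per column via getSplitsArray. A minimal single-column sketch of the same idea with stock Spark (assuming a SparkSession named spark; the column name and bucket count are illustrative):

import org.apache.spark.ml.feature.QuantileDiscretizer

val df = spark.range(0, 100).selectExpr("cast(id as double) as value")

val model = new QuantileDiscretizer()
  .setInputCol("value")
  .setOutputCol("value_bucket")
  .setNumBuckets(4)
  .fit(df)                                 // fit returns a Bucketizer

println(model.getSplits.mkString(", "))    // the learned split points
model.transform(df).show(5)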
Example 7
Source File: BucketizerExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Bucketizer
// $example off$
import org.apache.spark.sql.SparkSession

object BucketizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("BucketizerExample")
      .getOrCreate()

    // $example on$
    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

    val data = Array(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9)
    val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val bucketizer = new Bucketizer()
      .setInputCol("features")
      .setOutputCol("bucketedFeatures")
      .setSplits(splits)

    // Transform original data into its bucket index.
    val bucketedData = bucketizer.transform(dataFrame)

    println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets")
    bucketedData.show()
    // $example off$

    // $example on$
    val splitsArray = Array(
      Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity),
      Array(Double.NegativeInfinity, -0.3, 0.0, 0.3, Double.PositiveInfinity))

    val data2 = Array(
      (-999.9, -999.9),
      (-0.5, -0.2),
      (-0.3, -0.1),
      (0.0, 0.0),
      (0.2, 0.4),
      (999.9, 999.9))
    val dataFrame2 = spark.createDataFrame(data2).toDF("features1", "features2")

    val bucketizer2 = new Bucketizer()
      .setInputCols(Array("features1", "features2"))
      .setOutputCols(Array("bucketedFeatures1", "bucketedFeatures2"))
      .setSplitsArray(splitsArray)

    // Transform original data into its bucket index.
    val bucketedData2 = bucketizer2.transform(dataFrame2)

    println(s"Bucketizer output with [" +
      s"${bucketizer2.getSplitsArray(0).length-1}, " +
      s"${bucketizer2.getSplitsArray(1).length-1}] buckets for each input column")
    bucketedData2.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
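This is the Spark 2.3+ variant of the example: the second half buckets two input columns at once, each against its own splits, via setInputCols, setOutputCols, and setSplitsArray. For the two-column data above, the expected bucket indices are (an illustration of the splitsArray semantics, not output captured from a run):

// (features1, features2) -> (bucketedFeatures1, bucketedFeatures2)
// (-999.9, -999.9) -> (0.0, 0.0)
// (  -0.5,   -0.2) -> (1.0, 1.0)
// (  -0.3,   -0.1) -> (1.0, 1.0)
// (   0.0,    0.0) -> (2.0, 2.0)
// (   0.2,    0.4) -> (2.0, 3.0)
// ( 999.9,  999.9) -> (3.0, 3.0)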
Example 8
Source File: BucketizerExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Bucketizer
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object BucketizerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("BucketizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

    val data = Array(-0.5, -0.3, 0.0, 0.2)
    val dataFrame = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val bucketizer = new Bucketizer()
      .setInputCol("features")
      .setOutputCol("bucketedFeatures")
      .setSplits(splits)

    // Transform original data into its bucket index.
    val bucketedData = bucketizer.transform(dataFrame)
    bucketedData.show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
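This variant predates SparkSession: it builds a SparkContext and SQLContext directly, which was the usual entry point before Spark 2.0, but the Bucketizer usage itself is unchanged.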
Example 9
Source File: BucketizerSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.{Bucketizer, QuantileDiscretizer}

class BucketizerSuite extends SparkFeaturePFASuiteBase[BucketizerResult] {

  val splits = Array(-0.5, 0.0, 0.5, Double.PositiveInfinity)
  val data = Array(-0.5, -0.3, 0.0, 0.2, 999.9)
  val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

  override val sparkTransformer = new Bucketizer()
    .setInputCol("features")
    .setOutputCol("bucketedFeatures")
    .setSplits(splits)

  val result = sparkTransformer.transform(df)

  override val input = result.select(sparkTransformer.getInputCol).toJSON.collect()
  override val expectedOutput = result.select(sparkTransformer.getOutputCol).toJSON.collect()

  // Additional test for QuantileDiscretizer
  test("Bucketizer result from QuantileDiscretizer") {
    val df = spark.range(10, 1000, 3).toDF("input")
    val qd = new QuantileDiscretizer()
      .setInputCol("input")
      .setOutputCol("bucketedFeatures")
      .setNumBuckets(10)
    val bucketizer = qd.fit(df)
    val expectedOutput = bucketizer.transform(df)
    parityTest(bucketizer, df.select(bucketizer.getInputCol).toJSON.collect(),
      expectedOutput.toJSON.collect())
  }
}

case class BucketizerResult(bucketedFeatures: Double) extends Result
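Unlike the earlier examples, the splits here start at -0.5 rather than Double.NegativeInfinity, so any feature value below -0.5 would be treated as invalid by the Bucketizer (with the default handleInvalid setting of "error" this fails at transform time); the test data is chosen so that its minimum, -0.5, falls exactly on the first split.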