org.apache.spark.ml.feature.OneHotEncoder Scala Examples
The following examples show how to use org.apache.spark.ml.feature.OneHotEncoder.
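A note on versions: these examples use the transformer-style OneHotEncoder from Spark 1.x/2.x, which was deprecated in Spark 2.3 and removed in Spark 3.0, where OneHotEncoderEstimator was renamed to OneHotEncoder. In Spark 3.x the encoder is an Estimator that must be fit before transforming, and it can encode several indexed columns at once. A minimal sketch of the newer API, assuming a Spark 3.x runtime and an indexed DataFrame with a numeric categoryIndex column (as produced by StringIndexer in the examples below):

import org.apache.spark.ml.feature.OneHotEncoder

// Spark 3.x: OneHotEncoder is an Estimator and must be fit first.
val encoder = new OneHotEncoder()
  .setInputCols(Array("categoryIndex"))
  .setOutputCols(Array("categoryVec"))

val model = encoder.fit(indexed)        // learns the category sizes
val encoded = model.transform(indexed)
encoded.show()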
Example 1
Source File: OneHotEncoderExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
// $example off$
import org.apache.spark.sql.SparkSession

object OneHotEncoderExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("OneHotEncoderExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (0, "a"),
      (1, "b"),
      (2, "c"),
      (3, "a"),
      (4, "a"),
      (5, "c")
    )).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")

    val encoded = encoder.transform(indexed)
    encoded.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
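The StringIndexer and OneHotEncoder stages above can also be chained so that the fitted indexer and encoder travel together and can be reapplied to new data. A minimal sketch using Spark's Pipeline API, assuming the same df as in the example (note the Pipeline is given the unfitted StringIndexer; Pipeline.fit() fits it):

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}

// Unfitted stages: Pipeline.fit() fits the StringIndexer itself.
val indexerStage = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")
val encoderStage = new OneHotEncoder()
  .setInputCol("categoryIndex")
  .setOutputCol("categoryVec")

val pipeline = new Pipeline().setStages(Array(indexerStage, encoderStage))
val model = pipeline.fit(df)        // fits the indexer, then appends the encoder
model.transform(df).show()          // same result as the step-by-step version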
Example 2
Source File: LocalOneHotEncoder.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.feature.OneHotEncoder

class LocalOneHotEncoder(override val sparkTransformer: OneHotEncoder)
  extends LocalTransformer[OneHotEncoder] {

  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val col = column.data match {
          case d: List[Number @unchecked] if d.isInstanceOf[Seq[Number]] =>
            d.map(_.doubleValue())
          case x =>
            throw new IllegalArgumentException(s"Incorrect index value: $x")
        }
        col.foreach(x =>
          assert(
            x >= 0.0 && x == x.toInt,
            s"Values from column ${sparkTransformer.getInputCol} must be indices, but got $x."
          )
        )
        val size = col.max.toInt
        val newData = col.map { r =>
          val res = Array.fill(size)(0.0)
          if (r < size) {
            res.update(r.toInt, 1.0)
          }
          res.toList
        }
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}

object LocalOneHotEncoder
  extends SimpleModelLoader[OneHotEncoder]
  with TypedTransformerConverter[OneHotEncoder] {

  override def build(metadata: Metadata, data: LocalData): OneHotEncoder = {
    var ohe = new OneHotEncoder(metadata.uid)
      .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
      .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
    metadata.paramMap.get("dropLast").foreach { x =>
      ohe = ohe.setDropLast(x.asInstanceOf[Boolean])
    }
    ohe
  }

  override implicit def toLocal(transformer: OneHotEncoder) =
    new LocalOneHotEncoder(transformer)
}
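Note how size = col.max.toInt together with the if (r < size) guard reproduces Spark's default dropLast = true behavior: the largest index becomes the all-zeros reference vector rather than getting a slot of its own. A self-contained sketch of that encoding rule (plain Scala with made-up indices, no Spark required):

// Encode indices as dense one-hot lists, dropping the last category.
def encodeDropLast(indices: Seq[Double]): Seq[List[Double]] = {
  val size = indices.max.toInt          // vector length = max index seen
  indices.map { r =>
    val res = Array.fill(size)(0.0)
    if (r < size) res(r.toInt) = 1.0    // max index falls through as all zeros
    res.toList
  }
}

encodeDropLast(Seq(0.0, 1.0, 2.0))
// => List(List(1.0, 0.0), List(0.0, 1.0), List(0.0, 0.0))

One caveat: this local version infers the vector size from the batch at hand, so a batch that happens to lack the top index would yield shorter vectors, whereas Spark's encoder can take the category count from the column's ML attribute metadata.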
Example 3
Source File: OneHotEncoderExample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.regression.bikesharing

import org.apache.spark.sql.SparkSession

object OneHotEncoderExample {
  def main(args: Array[String]): Unit = {
    import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}

    val spark = SparkSession
      .builder()
      .appName("Spark SQL basic example").master("local[1]")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()

    // For implicit conversions like converting RDDs to DataFrames
    import spark.implicits._

    val df = spark.createDataFrame(Seq(
      (0, 3),
      (1, 2),
      (2, 4),
      (3, 3),
      (4, 3),
      (5, 4)
    )).toDF("id", "category")

    // StringIndexer also accepts numeric input: values are cast to strings
    // before being indexed by frequency.
    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")

    val encoded = encoder.transform(indexed)
    encoded.select("id", "categoryVec").show()

    spark.stop()
  }
}
Note: OneHotEncoderExample.scala also appears verbatim in the sparkoscope and multi-tenancy-spark projects (both Apache License 2.0); those copies are identical to Example 1 and are not repeated here.
Example 4
Source File: OneHotEncoderExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object OneHotEncoderExample {
  def main(args: Array[String]): Unit = {
    // Pre-2.0 setup: SparkContext plus SQLContext instead of SparkSession.
    val conf = new SparkConf().setAppName("OneHotEncoderExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val df = sqlContext.createDataFrame(Seq(
      (0, "a"),
      (1, "b"),
      (2, "c"),
      (3, "a"),
      (4, "a"),
      (5, "c")
    )).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")
    val encoded = encoder.transform(indexed)
    encoded.select("id", "categoryVec").show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 5
Source File: OneHotEncoderDemo2.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter11.SparkMachineLearning

import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
import org.apache.spark.sql.SparkSession

object OneHotEncoderDemo2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("OneHotEncoderDemo2")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, "Jason", "Germany"),
          (1, "David", "France"),
          (2, "Martin", "Spain"),
          (3, "Jason", "USA"),
          (4, "Daiel", "UK"),
          (5, "Moahmed", "Bangladesh"),
          (6, "David", "Ireland"),
          (7, "Jason", "Netherlands"))).toDF("id", "name", "address")
    df.show(false)

    val indexer = new StringIndexer()
      .setInputCol("name")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")
    val encoded = encoder.transform(indexed)
    encoded.show()

    spark.stop()
  }
}
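By default the encoder drops the last category (dropLast = true), so the encoded columns never sum to 1 and stay non-collinear with an intercept term in linear models; one of the names above therefore maps to an all-zeros vector. A hedged sketch of keeping an explicit slot for every category instead, reusing the indexed DataFrame from the example (the categoryVecFull column name is made up for illustration):

// Keep a slot for every category instead of an all-zeros reference.
val fullEncoder = new OneHotEncoder()
  .setInputCol("categoryIndex")
  .setOutputCol("categoryVecFull")
  .setDropLast(false)

fullEncoder.transform(indexed).select("name", "categoryVecFull").show(false)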
Example 6
Source File: StringIndexerDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter11.SparkMachineLearning

import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.SparkSession

object StringIndexerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("StringIndexerDemo")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, "Jason", "Germany"),
          (1, "David", "France"),
          (2, "Martin", "Spain"),
          (3, "Jason", "USA"),
          (4, "Daiel", "UK"),
          (5, "Moahmed", "Bangladesh"),
          (6, "David", "Ireland"),
          (7, "Jason", "Netherlands"))).toDF("id", "name", "address")
    df.show(false)

    // Index names by frequency: the most frequent name gets index 0.0.
    val indexer = new StringIndexer()
      .setInputCol("name")
      .setOutputCol("label")
      .fit(df)
    val indexed = indexer.transform(df)
    indexed.show(false)

    spark.stop()
  }
}
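This demo stops after indexing. To connect it back to the section's topic, a short continuation that one-hot encodes the label column, mirroring the earlier examples (assumes the indexed DataFrame from the demo, inserted before spark.stop(); the nameVec column name is made up for illustration):

import org.apache.spark.ml.feature.OneHotEncoder

// One-hot encode the frequency-based indices produced by StringIndexer.
val encoder = new OneHotEncoder()
  .setInputCol("label")
  .setOutputCol("nameVec")

encoder.transform(indexed).select("id", "name", "nameVec").show(false)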