org.apache.spark.ml.feature.VectorSlicer Scala Examples
The following examples show how to use org.apache.spark.ml.feature.VectorSlicer.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: VectorSlicerExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import java.util.Arrays

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
// $example off$
import org.apache.spark.sql.SparkSession

/**
 * Runnable demo of [[VectorSlicer]].
 *
 * Builds a two-row DataFrame with a single vector column named "userFeatures" whose
 * attributes are named "f1", "f2" and "f3", runs a slicer configured with index 1 and
 * the name "f3" over it, and prints the transformed DataFrame.
 */
object VectorSlicerExample {

  /** Entry point; `args` is not read. */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("VectorSlicerExample").getOrCreate()

    // $example on$
    // The same three values, once as a sparse vector and once as a dense vector.
    val rows = Arrays.asList(
      Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))),
      Row(Vectors.dense(-2.0, 2.3, 0.0))
    )

    // Name the three vector slots so they can be selected by name below.
    val baseAttr = NumericAttribute.defaultAttr
    val namedAttrs = Array("f1", "f2", "f3").map(name => baseAttr.withName(name))
    val featureGroup =
      new AttributeGroup("userFeatures", namedAttrs.asInstanceOf[Array[Attribute]])
    val schema = StructType(Array(featureGroup.toStructField()))

    val dataset = spark.createDataFrame(rows, schema)

    val slicer = new VectorSlicer()
      .setInputCol("userFeatures")
      .setOutputCol("features")
      .setIndices(Array(1))
      .setNames(Array("f3"))
    // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))

    slicer.transform(dataset).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 2
Source File: VectorSlicerOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.core.types.TensorShape
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.mleap.TypeConverters.sparkToMleapDataShape
import org.apache.spark.sql.types.StructField

/**
  * Bundle op for Spark's [[VectorSlicer]]: writes the slicer's indices and names
  * (plus the indices resolved for those names and the input size) into a bundle
  * model, and rebuilds a [[VectorSlicer]] from such a model.
  */
class VectorSlicerOp extends SimpleSparkOp[VectorSlicer] {
  override val Model: OpModel[SparkBundleContext, VectorSlicer] =
    new OpModel[SparkBundleContext, VectorSlicer] {
      override val klazz: Class[VectorSlicer] = classOf[VectorSlicer]

      override def opName: String = Bundle.BuiltinOps.feature.vector_slicer

      /**
        * Stores "indices", "names", "named_indices" and "input_size" on `model`.
        *
        * A sample dataset must be present in the bundle context (this is asserted);
        * it is used to resolve feature names to indices and to read the input shape.
        */
      override def store(model: Model, obj: VectorSlicer)
                        (implicit context: BundleContext[SparkBundleContext]): Model = {
        val sampleData = context.context.dataset
        assert(sampleData.isDefined, BundleHelper.sampleDataframeMessage(klazz))
        val dataset = sampleData.get

        // Names are only resolved against the schema when the slicer actually has some.
        val nameIndexPairs: Array[(String, Int)] =
          if (obj.getNames.isEmpty) Array.empty[(String, Int)]
          else extractNamedIndices(obj.getInputCol, obj.getNames, dataset)
        val (names, namedIndices) = nameIndexPairs.unzip

        val inputShape = sparkToMleapDataShape(dataset.schema(obj.getInputCol), dataset)
          .asInstanceOf[TensorShape]

        model
          .withValue("indices", Value.longList(obj.getIndices.map(_.toLong).toSeq))
          .withValue("names", Value.stringList(names))
          .withValue("named_indices", Value.intList(namedIndices))
          .withValue("input_size", Value.int(inputShape.dimensions.get.head))
      }

      /**
        * Rebuilds a slicer from the stored "indices" and "names" values.
        * The returned instance is created with an empty uid.
        */
      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): VectorSlicer = {
        val names = model.value("names").getStringList
        val indices = model.value("indices").getLongList.map(_.toInt).toArray
        new VectorSlicer(uid = "").setIndices(indices).setNames(names.toArray)
      }

      /** Pairs each requested name with its feature index in column `inputCol`. */
      private def extractNamedIndices(inputCol: String,
                                      names: Array[String],
                                      dataset: DataFrame): Array[(String, Int)] = {
        val indices = getFeatureIndicesFromNames(dataset.schema(inputCol), names)
        names.zip(indices)
      }

      /**
        * Looks up the index of every name in the attribute group attached to `col`.
        * Fails via `require` when `col` is not a vector column or a name is unknown.
        */
      private def getFeatureIndicesFromNames(col: StructField,
                                             names: Array[String]): Array[Int] = {
        require(col.dataType.isInstanceOf[VectorUDT],
          s"getFeatureIndicesFromNames expected column $col to be Vector type, but it was type ${col.dataType} instead.")
        val inputAttr = AttributeGroup.fromStructField(col)
        names.map { name =>
          require(inputAttr.hasAttr(name),
            s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
          inputAttr.getAttr(name).index.get
        }
      }
    }

  /** Creates a slicer with the given `uid` carrying over the indices and names of `model`. */
  override def sparkLoad(uid: String, shape: NodeShape, model: VectorSlicer): VectorSlicer =
    new VectorSlicer(uid = uid)
      .setIndices(model.getIndices)
      .setNames(model.getNames)

  /** Exposes the slicer's input column under the port name "input". */
  override def sparkInputs(obj: VectorSlicer): Seq[ParamSpec] =
    Seq("input" -> obj.inputCol)

  /** Exposes the slicer's output column under the port name "output". */
  override def sparkOutputs(obj: VectorSlicer): Seq[SimpleParamSpec] =
    Seq("output" -> obj.outputCol)
}
Example 3
Source File: VectorSlicerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{VectorAssembler, VectorSlicer}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

/**
  * Parity spec for [[VectorSlicer]]: assembles "dti" and "loan_amount" into a
  * "features" vector and slices it with both an index and a name configured.
  */
class VectorSlicerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")

  override val sparkTransformer: Transformer = {
    // Stage 1: build the vector column the slicer reads from.
    val assembler = new VectorAssembler()
      .setInputCols(Array("dti", "loan_amount"))
      .setOutputCol("features")

    // Stage 2: slice by index 1 and by the name "dti".
    val slicer = new VectorSlicer()
      .setIndices(Array(1))
      .setNames(Array("dti"))
      .setInputCol("features")
      .setOutputCol("scaled_features")

    new Pipeline().setStages(Array(assembler, slicer)).fit(dataset)
  }
}
Example 4
Source File: VectorSlicerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import java.util.Arrays

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
// $example off$
import org.apache.spark.sql.SparkSession

/**
 * Small end-to-end usage sample for [[VectorSlicer]].
 *
 * A DataFrame holding one vector column ("userFeatures", attributes "f1".."f3") is
 * passed through a slicer set up with index 1 and name "f3"; the result is shown on
 * standard output.
 */
object VectorSlicerExample {

  /** Program entry point. The command-line arguments are ignored. */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("VectorSlicerExample").getOrCreate()

    // $example on$
    val sparseRow = Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))))
    val denseRow = Row(Vectors.dense(-2.0, 2.3, 0.0))
    val rows = Arrays.asList(sparseRow, denseRow)

    // Attach names to the three vector positions via column metadata.
    val template = NumericAttribute.defaultAttr
    val featureAttrs = Array("f1", "f2", "f3").map(n => template.withName(n))
    val group = new AttributeGroup("userFeatures", featureAttrs.asInstanceOf[Array[Attribute]])

    val dataset = spark.createDataFrame(rows, StructType(Array(group.toStructField())))

    val slicer = new VectorSlicer()
    slicer.setInputCol("userFeatures")
    slicer.setOutputCol("features")

    slicer.setIndices(Array(1))
    slicer.setNames(Array("f3"))
    // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))

    val sliced = slicer.transform(dataset)
    sliced.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 5
Source File: VectorSlicerExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import java.util.Arrays

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
// $example off$
import org.apache.spark.sql.SparkSession

/**
 * Demonstrates [[VectorSlicer]] on a tiny in-memory DataFrame.
 *
 * The input has a single vector column, "userFeatures", with attribute names
 * "f1", "f2", "f3". The slicer is given index 1 and name "f3" and writes to a
 * column called "features"; the transformed frame is printed untruncated.
 */
object VectorSlicerExample {

  /** Runs the demo; `args` is unused. */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorSlicerExample")
      .getOrCreate()

    // $example on$
    val inputRows = Arrays.asList(
      Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))),
      Row(Vectors.dense(-2.0, 2.3, 0.0))
    )

    // Schema with named attributes, so slicing by name is possible.
    val unnamed = NumericAttribute.defaultAttr
    val attrs = Array("f1", "f2", "f3").map(label => unnamed.withName(label))
    val schema = StructType(Array(
      new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]]).toStructField()
    ))

    val dataset = spark.createDataFrame(inputRows, schema)

    val slicer = new VectorSlicer()
      .setInputCol("userFeatures")
      .setOutputCol("features")
    slicer
      .setIndices(Array(1))
      .setNames(Array("f3"))
    // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))

    slicer.transform(dataset).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 6
Source File: VectorSlicerExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import java.util.Arrays import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.feature.VectorSlicer import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType // $example off$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{SQLContext, DataFrame} output.show() println(output.select("userFeatures", "features").first()) // $example off$ sc.stop() } } // scalastyle:on println
Example 7
Source File: VectorSlicerExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import java.util.Arrays

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.StructType
// $example off$

/**
 * Example application for [[VectorSlicer]].
 *
 * Creates a DataFrame with one vector column ("userFeatures") carrying the attribute
 * names "f1", "f2", "f3", applies a slicer configured with index 1 and name "f3",
 * and displays the output DataFrame.
 */
object VectorSlicerExample {

  /** Application entry point; command-line arguments are not used. */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("VectorSlicerExample").getOrCreate()

    // $example on$
    // Two rows holding the same values in sparse and in dense form.
    val rows = Arrays.asList(
      Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))),
      Row(Vectors.dense(-2.0, 2.3, 0.0))
    )

    val attrTemplate = NumericAttribute.defaultAttr
    val namedAttrs = Array("f1", "f2", "f3").map(name => attrTemplate.withName(name))
    val userFeaturesGroup =
      new AttributeGroup("userFeatures", namedAttrs.asInstanceOf[Array[Attribute]])
    val userFeaturesField = userFeaturesGroup.toStructField()

    val dataset = spark.createDataFrame(rows, StructType(Array(userFeaturesField)))

    val slicer = new VectorSlicer()
      .setInputCol("userFeatures")
      .setOutputCol("features")
      .setIndices(Array(1))
      .setNames(Array("f3"))
    // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))

    val transformed = slicer.transform(dataset)
    transformed.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 8
Source File: VectorSlicerExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

/**
 * [[VectorSlicer]] demo driven through a `SparkContext` / `SQLContext` pair.
 *
 * A one-row DataFrame with the vector column "userFeatures" (attributes "f1", "f2",
 * "f3") is transformed by a slicer configured with index 1 and name "f3"; the first
 * row of the "userFeatures" and "features" columns is printed.
 */
object VectorSlicerExample {

  /** Entry point; `args` is not read. */
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("VectorSlicerExample"))
    val sqlContext = new SQLContext(sc)

    // $example on$
    val rows = Array(Row(Vectors.dense(-2.0, 2.3, 0.0)))

    // Column metadata naming each of the three vector slots.
    val baseAttr = NumericAttribute.defaultAttr
    val namedAttrs = Array("f1", "f2", "f3").map(name => baseAttr.withName(name))
    val featureGroup =
      new AttributeGroup("userFeatures", namedAttrs.asInstanceOf[Array[Attribute]])
    val schema = StructType(Array(featureGroup.toStructField()))

    val dataset = sqlContext.createDataFrame(sc.parallelize(rows), schema)

    val slicer = new VectorSlicer()
      .setInputCol("userFeatures")
      .setOutputCol("features")
      .setIndices(Array(1))
      .setNames(Array("f3"))
    // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))

    val firstRow = slicer.transform(dataset).select("userFeatures", "features").first()
    println(firstRow)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 9
Source File: VectorSlicerSuite.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.Vectors

/**
  * PFA suite for [[VectorSlicer]]: slices indices 0, 1 and 3 out of a four-element
  * "features" vector into "selectedFeatures" and exposes the input and expected
  * output as JSON rows.
  */
class VectorSlicerSuite extends SparkFeaturePFASuiteBase[VectorSlicerResult] {

  import spark.implicits._

  // (id, features, label) fixture rows.
  val data = Seq(
    (7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
    (8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
    (9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
  )
  val df = spark.createDataset(data).toDF("id", "features", "label")

  override val sparkTransformer = new VectorSlicer()
    .setInputCol("features")
    .setOutputCol("selectedFeatures")
    .setIndices(Array(0, 1, 3))

  val result = sparkTransformer.transform(df)

  /** Collects the JSON rows produced after passing `column` of `result` to `withColumnAsArray`. */
  private def jsonRowsFor(column: String) =
    withColumnAsArray(result, column).toJSON.collect()

  override val input = jsonRowsFor(sparkTransformer.getInputCol)
  override val expectedOutput = jsonRowsFor(sparkTransformer.getOutputCol)
}

/** Result record carrying the sliced vector as a sequence of doubles. */
case class VectorSlicerResult(selectedFeatures: Seq[Double]) extends Result