org.apache.spark.ml.attribute.NumericAttribute Scala Examples
The following examples show how to use org.apache.spark.ml.attribute.NumericAttribute.
Each example comes from an open-source project; the source file, project, and license are noted above each listing.
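Before diving into the project code, here is a minimal, self-contained sketch (not taken from any of the projects below; the object name and column values are illustrative) of what NumericAttribute is for: it describes one numeric entry of a vector column by name and optional statistics, is grouped with other attributes into an AttributeGroup, attached to the column as metadata, and recovered later from the schema.

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object NumericAttributeQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("NumericAttributeQuickStart").getOrCreate()
    import spark.implicits._

    // Describe individual numeric features; every builder call returns a new immutable attribute.
    val age = NumericAttribute.defaultAttr.withName("age").withMin(0.0).withMax(120.0)
    val income = NumericAttribute.defaultAttr.withName("income")

    // Group the attributes so that each slot of a vector column is documented.
    val group = new AttributeGroup("features", Array[Attribute](age, income))

    // Attach the group to a vector column as metadata.
    val df = Seq(Tuple1(Vectors.dense(42.0, 50000.0))).toDF("features")
    val withMeta = df.select(df("features").as("features", group.toMetadata()))

    // The same information can be read back from the schema.
    val recovered = AttributeGroup.fromStructField(withMeta.schema("features"))
    recovered.attributes.get.foreach(a => println(a.name.getOrElse("") + ": " + a.attrType.name))

    spark.stop()
  }
}

This attach-and-recover pattern is what the examples below rely on, whether they slice vectors, assemble features, annotate tree inputs, or serialize models.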
Example 1
Source File: VectorSlicerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import java.util.Arrays

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
// $example off$
import org.apache.spark.sql.SparkSession

object VectorSlicerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorSlicerExample")
      .getOrCreate()

    // $example on$
    val data = Arrays.asList(
      Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))),
      Row(Vectors.dense(-2.0, 2.3, 0.0))
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])

    val dataset = spark.createDataFrame(data, StructType(Array(attrGroup.toStructField())))

    val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")

    slicer.setIndices(Array(1)).setNames(Array("f3"))
    // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))

    val output = slicer.transform(dataset)
    output.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 2
Source File: VectorSlicerSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{StructField, StructType}

class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    val slicer = new VectorSlicer().setInputCol("feature")
    ParamsSuite.checkParams(slicer)
    assert(slicer.getIndices.length === 0)
    assert(slicer.getNames.length === 0)
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {
        slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true))))
      }
    }
  }

  test("feature validity checks") {
    import VectorSlicer._
    assert(validIndices(Array(0, 1, 8, 2)))
    assert(validIndices(Array.empty[Int]))
    assert(!validIndices(Array(-1)))
    assert(!validIndices(Array(1, 2, 1)))

    assert(validNames(Array("a", "b")))
    assert(validNames(Array.empty[String]))
    assert(!validNames(Array("", "b")))
    assert(!validNames(Array("a", "b", "a")))
  }

  test("Test vector slicer") {
    val data = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())
    )

    // Expected after selecting indices 1, 4
    val expected = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]])

    val resultAttrs = Array("f1", "f4").map(defaultAttr.withName)
    val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]])

    val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) }
    val df = spark.createDataFrame(rdd,
      StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField())))

    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    def validateResults(df: DataFrame): Unit = {
      df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 === vec2)
      }
      val resultMetadata = AttributeGroup.fromStructField(df.schema("result"))
      val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected"))
      assert(resultMetadata.numAttributes === expectedMetadata.numAttributes)
      resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) =>
        assert(a === b)
      }
    }

    vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty)
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array(1)).setNames(Array("f4"))
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4"))
    validateResults(vectorSlicer.transform(df))
  }

  test("read/write") {
    val t = new VectorSlicer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setIndices(Array(1, 3))
      .setNames(Array("a", "d"))
    testDefaultReadWrite(t)
  }
}
Example 3
Source File: OneHotEncoderOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import org.apache.spark.ml.attribute.{Attribute, BinaryAttribute, NominalAttribute, NumericAttribute}
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.feature.OneHotEncoderModel
import org.apache.spark.sql.types.StructField

import scala.util.{Failure, Try}

object OneHotEncoderOp {
  def sizeForField(field: StructField): Int = {
    val attr = Attribute.fromStructField(field)

    (attr match {
      case nominal: NominalAttribute =>
        if (nominal.values.isDefined) {
          Try(nominal.values.get.length)
        } else if (nominal.numValues.isDefined) {
          Try(nominal.numValues.get)
        } else {
          Failure(new RuntimeException(s"invalid nominal value for field ${field.name}"))
        }
      case binary: BinaryAttribute =>
        Try(2)
      case _: NumericAttribute =>
        Failure(new RuntimeException(s"invalid numeric attribute for field ${field.name}"))
      case _ =>
        Failure(new RuntimeException(s"unsupported attribute for field ${field.name}")) // optimistic about unknown attributes
    }).get
  }
}

class OneHotEncoderOp extends SimpleSparkOp[OneHotEncoderModel] {

  override val Model: OpModel[SparkBundleContext, OneHotEncoderModel] =
    new OpModel[SparkBundleContext, OneHotEncoderModel] {
      override val klazz: Class[OneHotEncoderModel] = classOf[OneHotEncoderModel]

      override def opName: String = Bundle.BuiltinOps.feature.one_hot_encoder

      override def store(model: Model, obj: OneHotEncoderModel)
                        (implicit context: BundleContext[SparkBundleContext]): Model = {
        assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz))

        val df = context.context.dataset.get
        val categorySizes = obj.getInputCols.map { f ⇒ OneHotEncoderOp.sizeForField(df.schema(f)) }

        model.withValue("category_sizes", Value.intList(categorySizes))
          .withValue("drop_last", Value.boolean(obj.getDropLast))
          .withValue("handle_invalid", Value.string(obj.getHandleInvalid))
      }

      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): OneHotEncoderModel = {
        new OneHotEncoderModel(uid = "", categorySizes = model.value("category_sizes").getIntList.toArray)
          .setDropLast(model.value("drop_last").getBoolean)
          .setHandleInvalid(model.value("handle_invalid").getString)
      }
    }

  override def sparkLoad(uid: String, shape: NodeShape, model: OneHotEncoderModel): OneHotEncoderModel = {
    new OneHotEncoderModel(uid = uid, categorySizes = model.categorySizes)
      .setDropLast(model.getDropLast)
      .setHandleInvalid(model.getHandleInvalid)
  }

  override def sparkInputs(obj: OneHotEncoderModel): Seq[ParamSpec] = Seq(ParamSpec("input", obj.inputCols))

  override def sparkOutputs(obj: OneHotEncoderModel): Seq[ParamSpec] = Seq(ParamSpec("output", obj.outputCols))
}
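The sizeForField helper above only inspects column metadata, so it can be exercised without a full DataFrame. A hedged sketch follows; the field name and category values are illustrative and not part of the mleap sources:

import org.apache.spark.ml.attribute.NominalAttribute

// A column whose metadata declares three category labels resolves to a size of 3;
// a column annotated only with a NumericAttribute would make sizeForField throw.
val colorField = NominalAttribute.defaultAttr
  .withName("color")
  .withValues("red", "green", "blue")
  .toStructField()
println(OneHotEncoderOp.sizeForField(colorField)) // 3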
Example 4
Source File: ReverseStringIndexerOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.core.types.{DataShape, ScalarShape}
import org.apache.spark.ml.attribute.{Attribute, BinaryAttribute, NominalAttribute, NumericAttribute}
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.feature.IndexToString
import org.apache.spark.sql.types.StructField
import ml.combust.mleap.runtime.types.BundleTypeConverters._

import scala.util.{Failure, Try}

object ReverseStringIndexerOp {
  def labelsForField(field: StructField): Array[String] = {
    val attr = Attribute.fromStructField(field)

    (attr match {
      case nominal: NominalAttribute =>
        if (nominal.values.isDefined) {
          Try(nominal.values.get)
        } else {
          Failure(new RuntimeException(s"invalid nominal value for field ${field.name}"))
        }
      case _: BinaryAttribute =>
        Failure(new RuntimeException(s"invalid binary attribute for field ${field.name}"))
      case _: NumericAttribute =>
        Failure(new RuntimeException(s"invalid numeric attribute for field ${field.name}"))
      case _ =>
        Failure(new RuntimeException(s"unsupported attribute for field ${field.name}")) // optimistic about unknown attributes
    }).get
  }
}

class ReverseStringIndexerOp extends SimpleSparkOp[IndexToString] {

  override val Model: OpModel[SparkBundleContext, IndexToString] =
    new OpModel[SparkBundleContext, IndexToString] {
      override val klazz: Class[IndexToString] = classOf[IndexToString]

      override def opName: String = Bundle.BuiltinOps.feature.reverse_string_indexer

      override def store(model: Model, obj: IndexToString)
                        (implicit context: BundleContext[SparkBundleContext]): Model = {
        val labels = obj.get(obj.labels).getOrElse {
          assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz))
          val df = context.context.dataset.get
          ReverseStringIndexerOp.labelsForField(df.schema(obj.getInputCol))
        }

        model.withValue("labels", Value.stringList(labels)).
          withValue("input_shape", Value.dataShape(ScalarShape(false)))
      }

      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): IndexToString = {
        model.getValue("input_shape").map(_.getDataShape: DataShape).foreach { shape =>
          require(shape.isScalar, "cannot deserialize non-scalar input to Spark IndexToString model")
        }

        new IndexToString(uid = "").setLabels(model.value("labels").getStringList.toArray)
      }
    }

  override def sparkLoad(uid: String, shape: NodeShape, model: IndexToString): IndexToString = {
    new IndexToString(uid = uid).setLabels(model.getLabels)
  }

  override def sparkInputs(obj: IndexToString): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: IndexToString): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
}
Example 5
Source File: TreeUtils.scala From spark-sql-perf with Apache License 2.0
package org.apache.spark.ml

import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute}
import org.apache.spark.sql.DataFrame

object TreeUtils {
  def setMetadata(
      data: DataFrame,
      featuresColName: String,
      featureArity: Array[Int]): DataFrame = {
    val featuresAttributes = featureArity.zipWithIndex.map { case (arity: Int, feature: Int) =>
      if (arity > 0) {
        NominalAttribute.defaultAttr.withIndex(feature).withNumValues(arity)
      } else {
        NumericAttribute.defaultAttr.withIndex(feature)
      }
    }
    val featuresMetadata = new AttributeGroup("features", featuresAttributes).toMetadata()
    data.select(data(featuresColName).as(featuresColName, featuresMetadata))
  }
}
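As a usage sketch only (the DataFrame `df` and the arity values are hypothetical, not part of spark-sql-perf): to mark the first slot of a two-slot vector column as categorical with three values and the second as continuous, pass the per-feature arity array.

// Hypothetical call: `df` is assumed to have a vector column named "features" with two slots.
// Arity 3 -> NominalAttribute with three values; arity 0 -> plain NumericAttribute.
val annotated = TreeUtils.setMetadata(df, "features", Array(3, 0))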
Example 6
Source File: VectorAssemblerSuite.scala From iolap with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.col

class VectorAssemblerSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new VectorAssembler)
  }

  test("assemble") {
    import org.apache.spark.ml.feature.VectorAssembler.assemble
    assert(assemble(0.0) === Vectors.sparse(1, Array.empty, Array.empty))
    assert(assemble(0.0, 1.0) === Vectors.sparse(2, Array(1), Array(1.0)))
    val dv = Vectors.dense(2.0, 0.0)
    assert(assemble(0.0, dv, 1.0) === Vectors.sparse(4, Array(1, 3), Array(2.0, 1.0)))
    val sv = Vectors.sparse(2, Array(0, 1), Array(3.0, 4.0))
    assert(assemble(0.0, dv, 1.0, sv) ===
      Vectors.sparse(6, Array(1, 3, 4, 5), Array(2.0, 1.0, 3.0, 4.0)))
    for (v <- Seq(1, "a", null)) {
      intercept[SparkException](assemble(v))
      intercept[SparkException](assemble(1.0, v))
    }
  }

  test("assemble should compress vectors") {
    import org.apache.spark.ml.feature.VectorAssembler.assemble
    val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0))
    assert(v1.isInstanceOf[SparseVector])
    val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0)))
    assert(v2.isInstanceOf[DenseVector])
  }

  test("VectorAssembler") {
    val df = sqlContext.createDataFrame(Seq(
      (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L)
    )).toDF("id", "x", "y", "name", "z", "n")
    val assembler = new VectorAssembler()
      .setInputCols(Array("x", "y", "z", "n"))
      .setOutputCol("features")
    assembler.transform(df).select("features").collect().foreach {
      case Row(v: Vector) =>
        assert(v === Vectors.sparse(6, Array(1, 2, 4, 5), Array(1.0, 2.0, 3.0, 10.0)))
    }
  }

  test("ML attributes") {
    val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", "safari")
    val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0)
    val user = new AttributeGroup("user", Array(
      NominalAttribute.defaultAttr.withName("gender").withValues("male", "female"),
      NumericAttribute.defaultAttr.withName("salary")))
    val row = (1.0, 0.5, 1, Vectors.dense(1.0, 1000.0), Vectors.sparse(2, Array(1), Array(2.0)))
    val df = sqlContext.createDataFrame(Seq(row)).toDF("browser", "hour", "count", "user", "ad")
      .select(
        col("browser").as("browser", browser.toMetadata()),
        col("hour").as("hour", hour.toMetadata()),
        col("count"), // "count" is an integer column without ML attribute
        col("user").as("user", user.toMetadata()),
        col("ad")) // "ad" is a vector column without ML attribute
    val assembler = new VectorAssembler()
      .setInputCols(Array("browser", "hour", "count", "user", "ad"))
      .setOutputCol("features")
    val output = assembler.transform(df)
    val schema = output.schema
    val features = AttributeGroup.fromStructField(schema("features"))
    assert(features.size === 7)
    val browserOut = features.getAttr(0)
    assert(browserOut === browser.withIndex(0).withName("browser"))
    val hourOut = features.getAttr(1)
    assert(hourOut === hour.withIndex(1).withName("hour"))
    val countOut = features.getAttr(2)
    assert(countOut === NumericAttribute.defaultAttr.withName("count").withIndex(2))
    val userGenderOut = features.getAttr(3)
    assert(userGenderOut === user.getAttr("gender").withName("user_gender").withIndex(3))
    val userSalaryOut = features.getAttr(4)
    assert(userSalaryOut === user.getAttr("salary").withName("user_salary").withIndex(4))
    assert(features.getAttr(5) === NumericAttribute.defaultAttr.withIndex(5))
    assert(features.getAttr(6) === NumericAttribute.defaultAttr.withIndex(6))
  }
}
Example 7
Source File: VectorSlicerExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import java.util.Arrays

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
// $example off$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

// ... (the body of this example, from the object declaration through the construction
// of `output`, is not included in this listing; only the trailing lines survive) ...

    output.show()
    println(output.select("userFeatures", "features").first())
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 8
Source File: VectorSlicerSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") { // parameters
    val slicer = new VectorSlicer
    ParamsSuite.checkParams(slicer)
    // indices
    assert(slicer.getIndices.length === 0)
    // names
    assert(slicer.getNames.length === 0)
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {
        slicer.validateParams()
      }
    }
  }

  test("feature validity checks") { // feature validity checks
    import VectorSlicer._
    // returns true if the given feature indices are valid
    assert(validIndices(Array(0, 1, 8, 2)))
    assert(validIndices(Array.empty[Int]))
    assert(!validIndices(Array(-1)))
    assert(!validIndices(Array(1, 2, 1)))
    // returns true if the given feature names are valid
    assert(validNames(Array("a", "b")))
    assert(validNames(Array.empty[String]))
    assert(!validNames(Array("", "b")))
    assert(!validNames(Array("a", "b", "a")))
  }

  test("Test vector slicer") { // test the vector slicer
    val sqlContext = new SQLContext(sc)

    val data = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())
    )

    // Expected after selecting indices 1, 4
    val expected = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]])

    val resultAttrs = Array("f1", "f4").map(defaultAttr.withName)
    val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]])

    val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) }
    val df = sqlContext.createDataFrame(rdd,
      StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField())))

    // VectorSlicer is a transformer that takes a feature vector as input and outputs a subset of the original features.
    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    def validateResults(df: DataFrame): Unit = {
      df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 === vec2)
      }
      val resultMetadata = AttributeGroup.fromStructField(df.schema("result"))
      val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected"))
      assert(resultMetadata.numAttributes === expectedMetadata.numAttributes)
      resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) =>
        assert(a === b)
      }
    }

    // transform turns one DataFrame into another DataFrame
    vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty)
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array(1)).setNames(Array("f4"))
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4"))
    validateResults(vectorSlicer.transform(df))
  }
}
Example 9
Source File: VectorExplodeSpec.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl

import odkl.analysis.spark.TestEnv
import odkl.analysis.spark.util.SQLOperations
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{functions, Row}
import org.apache.spark.sql.types.{StructType, StructField, DoubleType}
import org.scalatest.FlatSpec

class VectorExplodeSpec extends FlatSpec with TestEnv with org.scalatest.Matchers
  with SQLOperations with WithModels with HasMetricsBlock {

  case class Point(id: Int, vector: Vector, mean: Vector)

  lazy val data = sqlc.createDataFrame(Seq(
    Point(1, Vectors.dense(1.0, 3.0), Vectors.dense(10.0, 30.0)),
    Point(2, Vectors.dense(2.0, 4.0), Vectors.sparse(2, Array(1), Array(20.0)))
  ))

  lazy val withMetadata = data.withColumn(
    "vector",
    data("vector").as("vector", new AttributeGroup("vector", Array[Attribute](
      NumericAttribute.defaultAttr.withName("fixed"),
      NumericAttribute.defaultAttr.withName("var")
    )).toMetadata()))
    .withColumn(
      "mean",
      data("mean").as("mean", new AttributeGroup("vector", Array[Attribute](
        NumericAttribute.defaultAttr.withName("fixed"),
        NumericAttribute.defaultAttr.withName("var")
      )).toMetadata()))

  lazy val explode = new VectorExplode().transform(withMetadata)

  "Explode " should " add data" in {
    val result = explode.orderBy("id", "value").collect()

    result(0).getInt(0) should be(1)
    result(0).getString(1) should be("fixed")
    result(0).getDouble(2) should be(1.0)
    result(0).getDouble(3) should be(10.0)

    result(1).getInt(0) should be(1)
    result(1).getString(1) should be("var")
    result(1).getDouble(2) should be(3.0)
    result(1).getDouble(3) should be(30.0)

    result(2).getInt(0) should be(2)
    result(2).getString(1) should be("fixed")
    result(2).getDouble(2) should be(2.0)
    result(2).isNullAt(3) should be(true)

    result(3).getInt(0) should be(2)
    result(3).getString(1) should be("var")
    result(3).getDouble(2) should be(4.0)
    result(3).getDouble(3) should be(20.0)
  }

  "Explode " should " create schema" in {
    val fields = explode.schema.fields

    fields(0).name should be("id")
    fields(1).name should be("value")
    fields(2).name should be("vector")
    fields(3).name should be("mean")
  }
}
Example 10
Source File: VectorSlicerExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import java.util.Arrays

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.StructType
// $example off$

object VectorSlicerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorSlicerExample")
      .getOrCreate()

    // $example on$
    val data = Arrays.asList(
      Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))),
      Row(Vectors.dense(-2.0, 2.3, 0.0))
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])

    val dataset = spark.createDataFrame(data, StructType(Array(attrGroup.toStructField())))

    val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")

    slicer.setIndices(Array(1)).setNames(Array("f3"))
    // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))

    val output = slicer.transform(dataset)
    output.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 11
Source File: VectorSlicerExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object VectorSlicerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("VectorSlicerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val data = Array(Row(Vectors.dense(-2.0, 2.3, 0.0)))

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])

    val dataRDD = sc.parallelize(data)
    val dataset = sqlContext.createDataFrame(dataRDD, StructType(Array(attrGroup.toStructField())))

    val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")

    slicer.setIndices(Array(1)).setNames(Array("f3"))
    // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))

    val output = slicer.transform(dataset)
    println(output.select("userFeatures", "features").first())
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println