org.apache.spark.ml.linalg.VectorUDT Scala Examples
The following examples show how to use org.apache.spark.ml.linalg.VectorUDT.
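Every example below uses VectorUDT in one of two ways: declaring a vector-typed column in a DataFrame schema (new VectorUDT as the DataType of a StructField), or validating that an input column is vector-typed before transforming it (dataType.isInstanceOf[VectorUDT]). Because the class is package-private to Spark in the releases these projects target, the examples either live under an org.apache.spark.* package themselves or go through a small factory such as MatrixUtils.vectorUDT or SQLUtils.newVectorUDT() shown further down. Below is a minimal sketch of both patterns under that same package-placement assumption; the package, object, and column names are illustrative only, not taken from any of the projects listed here.

package org.apache.spark.ml.example // hypothetical package; placed under org.apache.spark because VectorUDT is private[spark]

import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.types.{StructField, StructType}

object VectorUDTUsage {

  // Pattern 1: declare a vector-typed column in a DataFrame schema.
  val schema: StructType = new StructType()
    .add(StructField("features", new VectorUDT, nullable = false))

  // Pattern 2: validate that an existing field is vector-typed before using it.
  def requireVectorColumn(field: StructField): Unit =
    require(field.dataType.isInstanceOf[VectorUDT],
      s"Column ${field.name} must be of type VectorUDT but got ${field.dataType}.")
}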
Example 1
Source File: DCT.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType

  @Since("1.5.0")
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
}
Example 2
Source File: HashBasedDeduplicatorSpec.scala From pravda-ml with Apache License 2.0

package odkl.analysis.spark.texts

import odkl.analysis.spark.TestEnv
import org.apache.spark.ml.odkl.texts.HashBasedDeduplicator
import org.apache.spark.ml.linalg.{VectorUDT, Vectors}
import org.apache.spark.ml.odkl.MatrixUtils
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructType}
import org.scalatest.FlatSpec

class HashBasedDeduplicatorSpec extends FlatSpec with TestEnv with org.scalatest.Matchers {

  "cotrect HashBasedDeduplicator " should " remove similar vectors based on hash " in {
    val vectorsSize = 10000

    val vector1 = (Vectors.sparse(vectorsSize, Array(5, 6, 7), Array(1.0, 1.0, 1.0)), 1L, "vector1")
    val vector2 = (Vectors.sparse(vectorsSize, Array(5, 6, 7), Array(1.0, 1.0, 0.0)), 1L, "vector2")
    val vector3 = (Vectors.sparse(vectorsSize, Array(5, 6, 7), Array(1.0, 0.0, 1.0)), 2L, "vector3") //pretty similar, but in 2nd bucket
    val vector4 = (Vectors.sparse(vectorsSize, Array(1, 2), Array(1.0, 1.0)), 1L, "vector4") //completly another but in 1-st bucket

    val schema = new StructType()
      .add("vector", MatrixUtils.vectorUDT)
      .add("hash", LongType)
      .add("alias", StringType)

    val dataFrame = sqlc.createDataFrame(
      sc.parallelize(Seq(vector1, vector2, vector3, vector4).map(Row.fromTuple(_))), schema)

    val deduplicator = new HashBasedDeduplicator()
      .setInputColHash("hash")
      .setInputColVector("vector")
      .setSimilarityTreshold(0.80)

    val answer = deduplicator.transform(dataFrame)
      .collect().map(row => (row.getLong(1), row.getString(2)))

    assert(answer.exists(_._2 == "vector1"))  //should stay
    assert(!answer.exists(_._2 == "vector2")) //should be removed
    assert(answer.exists(_._2 == "vector3"))  //should stay cause in other bucket (FalseNegative)
    assert(answer.exists(_._2 == "vector4"))  //should stay cause different (FalsePositive)
  }
}
Example 3
Source File: ForecastPipelineStage.scala From uberdata with Apache License 2.0

package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import org.apache.spark.ml.param.shared.{HasNFutures, HasPredictionCol, HasValidationCol}
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.types.{StructType, StringType, StructField, MapType}

trait ForecastPipelineStage
    extends PipelineStage
    with HasNFutures
    with HasPredictionCol
    with HasValidationCol {

  def setValidationCol(value: String): this.type = set(validationCol, value)

  override def transformSchema(schema: StructType): StructType = {
    schema
      .add(StructField($(validationCol), new VectorUDT))
      .add(StructField(IUberdataForecastUtil.ALGORITHM, StringType))
      .add(StructField(IUberdataForecastUtil.PARAMS, MapType(StringType, StringType)))
  }
}
Example 4
Source File: ElementwiseProduct.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.VectorImplicits._
import org.apache.spark.sql.types.DataType

  @Since("2.0.0")
  def getScalingVec: Vector = getOrDefault(scalingVec)

  override protected def createTransformFunc: Vector => Vector = {
    require(params.contains(scalingVec), s"transformation requires a weight vector")
    val elemScaler = new feature.ElementwiseProduct($(scalingVec))
    v => elemScaler.transform(v)
  }

  override protected def outputDataType: DataType = new VectorUDT()
}

@Since("2.0.0")
object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] {

  @Since("2.0.0")
  override def load(path: String): ElementwiseProduct = super.load(path)
}
Example 5
Source File: Normalizer.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param.{DoubleParam, ParamValidators}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
import org.apache.spark.sql.types.DataType

  @Since("1.4.0")
  def setP(value: Double): this.type = set(p, value)

  override protected def createTransformFunc: Vector => Vector = {
    val normalizer = new feature.Normalizer($(p))
    vector => normalizer.transform(OldVectors.fromML(vector)).asML
  }

  override protected def outputDataType: DataType = new VectorUDT()
}

@Since("1.6.0")
object Normalizer extends DefaultParamsReadable[Normalizer] {

  @Since("1.6.0")
  override def load(path: String): Normalizer = super.load(path)
}
Example 6
Source File: BinaryClassificationEvaluator.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType

  @Since("1.2.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "areaUnderROC")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    val scoreAndLabels =
      dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label)
        case Row(rawPrediction: Double, label: Double) => (rawPrediction, label)
      }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val metric = $(metricName) match {
      case "areaUnderROC" => metrics.areaUnderROC()
      case "areaUnderPR" => metrics.areaUnderPR()
    }
    metrics.unpersist()
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "areaUnderROC" => true
    case "areaUnderPR" => true
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): BinaryClassificationEvaluator = super.load(path)
}
Example 7
Source File: MetadataUtils.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.ml.util

import scala.collection.immutable.HashMap

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.types.StructField

  def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
    require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
      + s" to be Vector type, but it was type ${col.dataType} instead.")
    val inputAttr = AttributeGroup.fromStructField(col)
    names.map { name =>
      require(inputAttr.hasAttr(name),
        s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
      inputAttr.getAttr(name).index.get
    }
  }
}
Example 8
Source File: VectorSlicerSuite.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{StructField, StructType}

class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    val slicer = new VectorSlicer().setInputCol("feature")
    ParamsSuite.checkParams(slicer)
    assert(slicer.getIndices.length === 0)
    assert(slicer.getNames.length === 0)
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {
        slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true))))
      }
    }
  }

  test("feature validity checks") {
    import VectorSlicer._
    assert(validIndices(Array(0, 1, 8, 2)))
    assert(validIndices(Array.empty[Int]))
    assert(!validIndices(Array(-1)))
    assert(!validIndices(Array(1, 2, 1)))

    assert(validNames(Array("a", "b")))
    assert(validNames(Array.empty[String]))
    assert(!validNames(Array("", "b")))
    assert(!validNames(Array("a", "b", "a")))
  }

  test("Test vector slicer") {
    val data = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())
    )

    // Expected after selecting indices 1, 4
    val expected = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]])

    val resultAttrs = Array("f1", "f4").map(defaultAttr.withName)
    val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]])

    val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) }
    val df = spark.createDataFrame(rdd,
      StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField())))

    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    def validateResults(df: DataFrame): Unit = {
      df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 === vec2)
      }
      val resultMetadata = AttributeGroup.fromStructField(df.schema("result"))
      val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected"))
      assert(resultMetadata.numAttributes === expectedMetadata.numAttributes)
      resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) =>
        assert(a === b)
      }
    }

    vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty)
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array(1)).setNames(Array("f4"))
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4"))
    validateResults(vectorSlicer.transform(df))
  }

  test("read/write") {
    val t = new VectorSlicer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setIndices(Array(1, 3))
      .setNames(Array("a", "d"))
    testDefaultReadWrite(t)
  }
}
Example 9
Source File: NameAssigner.scala From pravda-ml with Apache License 2.0

package org.apache.spark.ml.odkl

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasInputCols
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Dataset, functions}
import org.apache.spark.sql.types.{Metadata, StringType, StructField, StructType}

class NameAssigner(override val uid: String) extends Transformer with HasInputCols {

  def setInputCols(column: String*): this.type = set(inputCols, column.toArray)

  def this() = this(Identifiable.randomUID("NameAssigner"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    $(inputCols)

    $(inputCols).foldLeft(dataset.toDF)((data, column) => {
      val metadata: Metadata = dataset.schema(column).metadata
      val attributes = AttributeGroup.fromStructField(
        StructField(column, new VectorUDT, nullable = false, metadata = metadata))

      val map = attributes.attributes
        .map(arr => arr.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
        .getOrElse(Map())

      val func = functions.udf[String, Number](x => if (x == null) {
        null
      } else {
        val i = x.intValue()
        map.getOrElse(i, i.toString)
      })

      data.withColumn(column, func(data(column)).as(column, metadata))
    }).toDF
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.map(f => if ($(inputCols).contains(f.name)) {
      StructField(f.name, StringType, f.nullable, f.metadata)
    } else {
      f
    }))
}
Example 10
Source File: MatrixUtils.scala From pravda-ml with Apache License 2.0

package org.apache.spark.ml.odkl

import org.apache.spark.ml.linalg.{DenseMatrix, Matrix, VectorUDT}

object MatrixUtils {

  def vectorUDT = new VectorUDT()

  def transformDense(matrix: DenseMatrix, transformer: (Int, Int, Double) => Double): DenseMatrix = {
    matrix.foreachActive((i, j, v) => {
      matrix(i, j) = transformer(i, j, v)
    })
    matrix
  }

  def applyNonZeros(source: Matrix, target: DenseMatrix,
                    transformer: (Int, Int, Double, Double) => Double): DenseMatrix = {
    source.foreachActive((i, j, v) => {
      val index = target.index(i, j)
      target.values(index) = transformer(i, j, v, target.values(index))
    })
    target
  }

  def applyAll(source: Matrix, target: DenseMatrix,
               transformer: (Int, Int, Double, Double) => Double): DenseMatrix = {
    for (j <- 0 until source.numCols; i <- 0 until source.numRows) {
      val index = target.index(i, j)
      target.values(index) = transformer(i, j, source(i, j), target.values(index))
    }
    target
  }
}
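A note on the design choice: the one-line vectorUDT factory above is presumably there because the VectorUDT constructor cannot be reached from ordinary user packages, so pravda-ml parks the helper inside an org.apache.spark.ml.odkl package and test code such as the HashBasedDeduplicatorSpec in Example 2 builds its vector-typed schema with MatrixUtils.vectorUDT instead of calling new VectorUDT directly.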
Example 11
Source File: VectorExplode.scala From pravda-ml with Apache License 2.0

package org.apache.spark.ml.odkl

import odkl.analysis.spark.util.collection.OpenHashMap
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.odkl.SparkSqlUtils
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, functions}

class VectorExplode(override val uid: String) extends Transformer with DefaultParamsWritable {

  val valueCol = new Param[String](this, "valueCol", "Name of the column to store value name.")

  def setValueCol(value: String): this.type = set(valueCol, value)

  setDefault(valueCol -> "value")

  def this() = this(Identifiable.randomUID("vectorExplode"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val vectors: Array[StructField] = dataset.schema.fields.filter(_.dataType.isInstanceOf[VectorUDT])

    val resultSchema = StructType(Seq(
      StructField($(valueCol), StringType, nullable = false)) ++
      vectors.map(f => StructField(f.name, DoubleType, nullable = true))
    )

    val arraySize = resultSchema.size - 1

    val names: Array[Map[Int, String]] = vectors.map(
      f => {
        AttributeGroup.fromStructField(f).attributes
          .map(attributes => attributes.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
          .getOrElse(Map())
      })

    val maxCapacity = names.map(_.size).max

    val explodeVectors: (Row => Array[Row]) = (r: Row) => {
      val accumulator = new OpenHashMap[String, Array[Double]](maxCapacity)

      for (i <- 0 until r.length) {
        val vector = r.getAs[Vector](i)

        vector.foreachActive((index, value) => {
          val name = names(i).getOrElse(index, s"${vectors(i).name}_$index")

          accumulator.changeValue(
            name,
            Array.tabulate(arraySize) { ind => if (i == ind) value else Double.NaN },
            v => { v(i) = value; v })
        })
      }

      accumulator.map(x => new GenericRowWithSchema(
        (Seq(x._1) ++ x._2.toSeq.map(v => if (v.isNaN) null else v)).toArray,
        resultSchema)).toArray
    }

    val vectorsStruct = functions.struct(vectors.map(f => dataset(f.name)): _*)
    val explodeUDF = SparkSqlUtils.customUDF(explodeVectors, ArrayType(resultSchema),
      Some(Seq(vectorsStruct.expr.dataType)))
    val expression = functions.explode(explodeUDF(vectorsStruct))

    dataset
      .withColumn(uid, expression)
      .select(
        dataset.schema.fields.filterNot(_.dataType.isInstanceOf[VectorUDT]).map(f => dataset(f.name)) ++
          resultSchema.fields.map(f => functions.expr(s"$uid.${f.name}").as(f.name)): _*)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields.map(x => x.dataType match {
      case vector: VectorUDT => StructField(x.name, typeFromVector(x))
      case _ => x
    }))

  def typeFromVector(field: StructField): StructType = {
    val attributes = AttributeGroup.fromStructField(field)
    StructType(attributes.attributes
      .map(_.map(a => a.name.getOrElse(s"_${a.index.get}")))
      .getOrElse(Array.tabulate(attributes.size) { i => s"_$i" })
      .map(name => StructField(name, DoubleType, nullable = false)))
  }
}
Example 12
Source File: VectorizeEncoder.scala From uberdata with Apache License 2.0

package org.apache.spark.ml

import eleflow.uberdata.core.data.DataTransformer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, StructType}

class VectorizeEncoder(override val uid: String)
    extends Transformer
    with HasIdCol
    with HasTimeCol
    with HasInputCols
    with HasLabelCol
    with HasGroupByCol
    with HasOutputCol
    with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("vectorizer"))

  def setIdCol(input: String) = set(idCol, input)

  def setLabelCol(input: String) = set(labelCol, input)

  def setGroupByCol(toGroupBy: String) = set(groupByCol, Some(toGroupBy))

  def setInputCol(input: Array[String]) = set(inputCols, input)

  def setTimeCol(time: String) = set(timeCol, Some(time))

  def setOutputCol(output: String) = set(outputCol, output)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val context = dataSet.sqlContext.sparkContext
    val input = context.broadcast($(inputCols))
    val allColumnNames = dataSet.schema.map(_.name)

    val nonInputColumnIndexes = context.broadcast(
      allColumnNames.zipWithIndex.filter(
        f => !$(inputCols).contains(f._1) || f._1 == $(groupByCol).get || f._1 == $(idCol)
          || f._1 == $(timeCol).getOrElse("")))

    val result = dataSet.rdd.map { case (row: Row) =>
      val rowSeq = row.toSeq
      val nonInputColumns = nonInputColumnIndexes.value.map {
        case (_, index) => rowSeq(index)
      }
      val size = input.value.length
      val (values, indices) = input.value
        .filter(col => row.getAs(col) != null)
        .map { column =>
          DataTransformer.toDouble(row.getAs(column))
        }
        .zipWithIndex
        .filter(f => f._1 != 0d)
        .unzip
      Row(
        nonInputColumns :+ org.apache.spark.ml.linalg.Vectors
          .sparse(size, indices.toArray, values.toArray): _*
      )
    }
    val newSchema = transformSchema(dataSet.schema)
    dataSet.sqlContext.createDataFrame(result, newSchema)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(
      schema.filter(
        col =>
          !$(inputCols).contains(col.name) || col.name == $(groupByCol).getOrElse("") ||
            col.name == $(idCol) || col.name == $(labelCol) || col.name == $(timeCol).getOrElse("")
      )
    ).add(StructField($(outputCol), new VectorUDT))
}
Example 13
Source File: ChiSquareTest.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.ml.stat

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.util.SchemaUtils
import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
import org.apache.spark.mllib.stat.{Statistics => OldStatistics}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

  @Since("2.2.0")
  def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = {
    val spark = dataset.sparkSession
    import spark.implicits._

    SchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT)
    SchemaUtils.checkNumericType(dataset.schema, labelCol)
    val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol)).as[(Double, Vector)]
      .rdd.map { case (label, features) => OldLabeledPoint(label, OldVectors.fromML(features)) }
    val testResults = OldStatistics.chiSqTest(rdd)
    val pValues: Vector = Vectors.dense(testResults.map(_.pValue))
    val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom)
    val statistics: Vector = Vectors.dense(testResults.map(_.statistic))
    spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics)))
  }
}
Example 14
Source File: SimpleVectorAssembler.scala From albedo with MIT License

package org.apache.spark.ml.feature

import org.apache.spark.SparkException
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row}

import scala.collection.mutable.ArrayBuilder

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)

    val schema = dataset.schema
    val assembleFunc = udf { r: Row =>
      SimpleVectorAssembler.assemble(r.toSeq: _*)
    }
    val args = $(inputCols).map { c =>
      schema(c).dataType match {
        case DoubleType => dataset(c)
        case _: VectorUDT => dataset(c)
        case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid")
      }
    }

    dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputColNames = $(inputCols)
    val outputColName = $(outputCol)
    val inputDataTypes = inputColNames.map(name => schema(name).dataType)
    inputDataTypes.foreach {
      case _: NumericType | BooleanType =>
      case t if t.isInstanceOf[VectorUDT] =>
      case other =>
        throw new IllegalArgumentException(s"Data type $other is not supported.")
    }
    if (schema.fieldNames.contains(outputColName)) {
      throw new IllegalArgumentException(s"Output column $outputColName already exists.")
    }
    StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true))
  }

  override def copy(extra: ParamMap): SimpleVectorAssembler = defaultCopy(extra)
}

object SimpleVectorAssembler extends DefaultParamsReadable[SimpleVectorAssembler] {

  override def load(path: String): SimpleVectorAssembler = super.load(path)

  def assemble(vv: Any*): Vector = {
    val indices = ArrayBuilder.make[Int]
    val values = ArrayBuilder.make[Double]
    var cur = 0
    vv.foreach {
      case v: Double =>
        if (v != 0.0) {
          indices += cur
          values += v
        }
        cur += 1
      case vec: Vector =>
        vec.foreachActive { case (i, v) =>
          if (v != 0.0) {
            indices += cur + i
            values += v
          }
        }
        cur += vec.size
      case null =>
        // TODO: output Double.NaN?
        throw new SparkException("Values to assemble cannot be null.")
      case o =>
        throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.")
    }
    Vectors.sparse(cur, indices.result(), values.result()).compressed
  }
}
Example 15
Source File: SQLUtils.scala From glow with Apache License 2.0

package org.apache.spark.sql

import org.apache.spark.TaskContext
import org.apache.spark.ml.linalg.{MatrixUDT, VectorUDT}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types._

object SQLUtils {

  def verifyHasFields(schema: StructType, fields: Seq[StructField]): Unit = {
    fields.foreach { field =>
      val candidateFields = schema.fields.filter(_.name == field.name)
      if (candidateFields.length != 1) {
        throw new IllegalArgumentException(
          s"Schema must have exactly one field called ${field.name}. Current schema is $schema")
      }
      val schemaField = candidateFields.head
      if (!SQLUtils.structFieldsEqualExceptNullability(schemaField, field)) {
        throw new IllegalArgumentException(s"Schema must have a field $field")
      }
    }
  }

  def structFieldsEqualExceptNullability(f1: StructField, f2: StructField): Boolean = {
    f1.name == f2.name && f1.dataType.asNullable == f2.dataType.asNullable
  }

  def dataTypesEqualExceptNullability(dt1: DataType, dt2: DataType): Boolean = {
    dt1.asNullable == dt2.asNullable
  }

  def setTaskContext(context: TaskContext): Unit = {
    TaskContext.setTaskContext(context)
  }

  def newMatrixUDT(): MatrixUDT = {
    new MatrixUDT()
  }

  def newVectorUDT(): VectorUDT = {
    new VectorUDT()
  }

  def newAnalysisException(msg: String): AnalysisException = {
    new AnalysisException(msg)
  }

  def anyDataType: ADT = AnyDataType

  type ADT = AbstractDataType

  def getSessionExtensions(session: SparkSession): SparkSessionExtensions = {
    session.extensions
  }

  // Used to create an empty RDD with 1 partition to be compatible with our partition-based functionality
  def createEmptyRDD(sess: SparkSession, numPartitions: Int = 1): RDD[InternalRow] = {
    sess.sparkContext.parallelize(Seq.empty[InternalRow], numPartitions)
  }
}
Example 16
Source File: MultinomialLabeler.scala From mleap with Apache License 2.0

package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.MultinomialLabelerModel
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasFeaturesCol
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{udf, col}
import ml.combust.mleap.core.util.VectorConverters._

class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"),
                         val model: MultinomialLabelerModel) extends Transformer
  with HasFeaturesCol
  with HasProbabilitiesCol
  with HasLabelsCol {

  def setFeaturesCol(value: String): this.type = set(featuresCol, value)

  def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value)

  def setLabelsCol(value: String): this.type = set(labelsCol, value)

  @org.apache.spark.annotation.Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val probabilitiesUdf = udf {
      (vector: Vector) => model.top(vector).map(_._1).toArray
    }

    val labelsUdf = udf {
      (vector: Vector) => model.topLabels(vector).toArray
    }

    dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))).
      withColumn($(labelsCol), labelsUdf(col($(featuresCol))))
  }

  override def copy(extra: ParamMap): Transformer =
    copyValues(new MultinomialLabeler(uid, model), extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT],
      s"Features column must be of type NumericType but got ${schema($(featuresCol)).dataType}")
    val inputFields = schema.fields
    require(!inputFields.exists(_.name == $(probabilitiesCol)),
      s"Output column ${$(probabilitiesCol)} already exists.")
    require(!inputFields.exists(_.name == $(labelsCol)),
      s"Output column ${$(labelsCol)} already exists.")

    StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)),
      StructField($(labelsCol), ArrayType(StringType))))
  }
}
Example 17
Source File: IDF.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.ml.feature

import org.apache.hadoop.fs.Path

import org.apache.spark.annotation.Since
import org.apache.spark.ml._
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType

  @Since("2.0.0")
  def idf: Vector = idfModel.idf.asML

  @Since("1.6.0")
  override def write: MLWriter = new IDFModelWriter(this)
}

@Since("1.6.0")
object IDFModel extends MLReadable[IDFModel] {

  private[IDFModel] class IDFModelWriter(instance: IDFModel) extends MLWriter {

    private case class Data(idf: Vector)

    override protected def saveImpl(path: String): Unit = {
      DefaultParamsWriter.saveMetadata(instance, path, sc)
      val data = Data(instance.idf)
      val dataPath = new Path(path, "data").toString
      sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
    }
  }

  private class IDFModelReader extends MLReader[IDFModel] {

    private val className = classOf[IDFModel].getName

    override def load(path: String): IDFModel = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
      val dataPath = new Path(path, "data").toString
      val data = sparkSession.read.parquet(dataPath)
      val Row(idf: Vector) = MLUtils.convertVectorColumnsToML(data, "idf")
        .select("idf")
        .head()
      val model = new IDFModel(metadata.uid, new feature.IDFModel(OldVectors.fromML(idf)))
      DefaultParamsReader.getAndSetParams(model, metadata)
      model
    }
  }

  @Since("1.6.0")
  override def read: MLReader[IDFModel] = new IDFModelReader

  @Since("1.6.0")
  override def load(path: String): IDFModel = super.load(path)
}
Example 18
Source File: VectorSlicerOp.scala From mleap with Apache License 2.0

package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.core.types.TensorShape
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.mleap.TypeConverters.sparkToMleapDataShape
import org.apache.spark.sql.types.StructField

class VectorSlicerOp extends SimpleSparkOp[VectorSlicer] {
  override val Model: OpModel[SparkBundleContext, VectorSlicer] = new OpModel[SparkBundleContext, VectorSlicer] {
    override val klazz: Class[VectorSlicer] = classOf[VectorSlicer]

    override def opName: String = Bundle.BuiltinOps.feature.vector_slicer

    override def store(model: Model, obj: VectorSlicer)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz))

      val dataset = context.context.dataset.get
      val namedIndicesMap: Array[(String, Int)] = if (obj.getNames.nonEmpty) {
        extractNamedIndices(obj.getInputCol, obj.getNames, dataset)
      } else {
        Array()
      }
      val (names, namedIndices) = namedIndicesMap.unzip
      val inputShape = sparkToMleapDataShape(dataset.schema(obj.getInputCol), dataset).asInstanceOf[TensorShape]

      model.withValue("indices", Value.longList(obj.getIndices.map(_.toLong).toSeq)).
        withValue("names", Value.stringList(names)).
        withValue("named_indices", Value.intList(namedIndices)).
        withValue("input_size", Value.int(inputShape.dimensions.get.head))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): VectorSlicer = {
      val names = model.value("names").getStringList

      new VectorSlicer(uid = "").setIndices(model.value("indices").getLongList.map(_.toInt).toArray).
        setNames(names.toArray)
    }

    private def extractNamedIndices(inputCol: String,
                                    names: Array[String],
                                    dataset: DataFrame): Array[(String, Int)] = {
      names.zip(getFeatureIndicesFromNames(dataset.schema(inputCol), names))
    }

    private def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
      require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
        + s" to be Vector type, but it was type ${col.dataType} instead.")
      val inputAttr = AttributeGroup.fromStructField(col)
      names.map { name =>
        require(inputAttr.hasAttr(name),
          s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
        inputAttr.getAttr(name).index.get
      }
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: VectorSlicer): VectorSlicer = {
    new VectorSlicer(uid = uid).setIndices(model.getIndices).setNames(model.getNames)
  }

  override def sparkInputs(obj: VectorSlicer): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: VectorSlicer): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
}
Example 19
Source File: InteractionOp.scala From mleap with Apache License 2.0

package org.apache.spark.ml.bundle.ops.feature

import ml.bundle.DataShape
import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.core.annotation.SparkCode
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute}
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.feature.Interaction
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.mleap.TypeConverters._
import ml.combust.mleap.runtime.types.BundleTypeConverters._
import org.apache.spark.sql.types.{BooleanType, NumericType}

class InteractionOp extends SimpleSparkOp[Interaction] {
  override val Model: OpModel[SparkBundleContext, Interaction] = new OpModel[SparkBundleContext, Interaction] {
    override val klazz: Class[Interaction] = classOf[Interaction]

    override def opName: String = Bundle.BuiltinOps.feature.interaction

    override def store(model: Model, obj: Interaction)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz))

      val dataset = context.context.dataset.get
      val spec = buildSpec(obj.getInputCols, dataset)
      val inputShapes = obj.getInputCols.map(v => sparkToMleapDataShape(dataset.schema(v), dataset): DataShape)

      val m = model.withValue("num_inputs", Value.int(spec.length)).
        withValue("input_shapes", Value.dataShapeList(inputShapes))

      spec.zipWithIndex.foldLeft(m) {
        case (m2, (numFeatures, index)) => m2.withValue(s"num_features$index", Value.intList(numFeatures))
      }
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): Interaction = {
      // No need to do anything here, everything is handled through Spark meta data
      new Interaction()
    }

    @SparkCode(uri = "https://github.com/apache/spark/blob/branch-2.1/mllib/src/main/scala/org/apache/spark/ml/feature/Interaction.scala")
    private def buildSpec(inputCols: Array[String], dataset: DataFrame): Array[Array[Int]] = {
      def getNumFeatures(attr: Attribute): Int = {
        attr match {
          case nominal: NominalAttribute =>
            math.max(1, nominal.getNumValues.getOrElse(
              throw new IllegalArgumentException("Nominal features must have attr numValues defined.")))
          case _ =>
            1 // numeric feature
        }
      }

      inputCols.map(dataset.schema.apply).map { f =>
        f.dataType match {
          case _: NumericType | BooleanType =>
            Array(getNumFeatures(Attribute.fromStructField(f)))
          case _: VectorUDT =>
            val attrs = AttributeGroup.fromStructField(f).attributes.getOrElse(
              throw new IllegalArgumentException("Vector attributes must be defined for interaction."))
            attrs.map(getNumFeatures)
        }
      }
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: Interaction): Interaction = {
    new Interaction(uid = uid)
  }

  override def sparkInputs(obj: Interaction): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCols)
  }

  override def sparkOutputs(obj: Interaction): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
}
Example 20
Source File: IsotonicRegressionOp.scala From mleap with Apache License 2.0

package org.apache.spark.ml.bundle.ops.regression

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.regression.IsotonicRegressionModel
import org.apache.spark.mllib.regression

class IsotonicRegressionOp extends SimpleSparkOp[IsotonicRegressionModel] {
  override val Model: OpModel[SparkBundleContext, IsotonicRegressionModel] =
    new OpModel[SparkBundleContext, IsotonicRegressionModel] {
      override val klazz: Class[IsotonicRegressionModel] = classOf[IsotonicRegressionModel]

      override def opName: String = Bundle.BuiltinOps.regression.isotonic_regression

      override def store(model: Model, obj: IsotonicRegressionModel)
                        (implicit context: BundleContext[SparkBundleContext]): Model = {
        assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz))

        var m = model.withValue("boundaries", Value.doubleList(obj.boundaries.toArray.toSeq)).
          withValue("predictions", Value.doubleList(obj.predictions.toArray.toSeq)).
          withValue("isotonic", Value.boolean(obj.getIsotonic))

        if (context.context.dataset.get.schema(obj.getFeaturesCol).dataType.isInstanceOf[VectorUDT]) {
          m = m.withValue("feature_index", Value.long(obj.getFeatureIndex))
        }

        m
      }

      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): IsotonicRegressionModel = {
        val oldModel = new regression.IsotonicRegressionModel(
          boundaries = model.value("boundaries").getDoubleList.toArray,
          predictions = model.value("predictions").getDoubleList.toArray,
          isotonic = model.value("isotonic").getBoolean)
        val m = new IsotonicRegressionModel(uid = "", oldModel = oldModel)
        model.getValue("feature_index").foreach(i => m.setFeatureIndex(i.getLong.toInt))

        m
      }
    }

  override def sparkLoad(uid: String, shape: NodeShape, model: IsotonicRegressionModel): IsotonicRegressionModel = {
    val oldModel = new regression.IsotonicRegressionModel(boundaries = model.boundaries.toArray,
      predictions = model.predictions.toArray,
      isotonic = model.getIsotonic)
    new IsotonicRegressionModel(uid = uid, oldModel = oldModel).setFeatureIndex(model.getFeatureIndex)
  }

  override def sparkInputs(obj: IsotonicRegressionModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: IsotonicRegressionModel): Seq[SimpleParamSpec] = {
    Seq("prediction" -> obj.predictionCol)
  }
}
Example 21
Source File: BundleTypeConverters.scala From mleap with Apache License 2.0

package org.apache.spark.ml.bundle

import ml.bundle
import ml.bundle.{BasicType, DataShapeType}
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._

import scala.language.implicitConversions

trait BundleTypeConverters {

  implicit def sparkToBundleDataShape(field: StructField)
                                     (implicit dataset: DataFrame): bundle.DataShape = {
    field.dataType match {
      case BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType | StringType
           | ArrayType(ByteType, false) => bundle.DataShape(DataShapeType.SCALAR)
      case ArrayType(_, _) => bundle.DataShape(DataShapeType.LIST)
      case _: VectorUDT =>
        // collect size information from dataset if necessary
        bundle.DataShape(bundle.DataShapeType.TENSOR)
      case _ => throw new IllegalArgumentException(s"invalid shape for field $field")
    }
  }

  implicit def sparkToBundleBasicType(dataType: DataType)
                                     (implicit dataset: DataFrame): bundle.BasicType = {
    dataType match {
      case BooleanType => BasicType.BOOLEAN
      case ByteType => BasicType.BYTE
      case ShortType => BasicType.SHORT
      case IntegerType => BasicType.INT
      case LongType => BasicType.LONG
      case FloatType => BasicType.FLOAT
      case DoubleType => BasicType.DOUBLE
      case StringType => BasicType.STRING
      case ArrayType(ByteType, _) => BasicType.BYTE_STRING
      case ArrayType(dt, _) => sparkToBundleBasicType(dt)
      case _: VectorUDT => BasicType.DOUBLE
      case _ => throw new IllegalArgumentException(s"invalid spark basic type $dataType")
    }
  }

  implicit def sparkToBundleDataType(field: StructField)
                                    (implicit dataset: DataFrame): bundle.DataType = {
    bundle.DataType(field.dataType, Some(field))
  }
}

object BundleTypeConverters extends BundleTypeConverters
Example 32
Source File: ElementwiseProduct.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.sql.types.DataType @Since("2.0.0") def getScalingVec: Vector = getOrDefault(scalingVec) override protected def createTransformFunc: Vector => Vector = { require(params.contains(scalingVec), s"transformation requires a weight vector") val elemScaler = new feature.ElementwiseProduct($(scalingVec)) v => elemScaler.transform(v) } override protected def outputDataType: DataType = new VectorUDT() } @Since("2.0.0") object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { @Since("2.0.0") override def load(path: String): ElementwiseProduct = super.load(path) }
Example 33
Source File: ElementwiseProduct.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.VectorImplicits._
import org.apache.spark.sql.types.DataType

  @Since("2.0.0")
  def getScalingVec: Vector = getOrDefault(scalingVec)

  override protected def createTransformFunc: Vector => Vector = {
    require(params.contains(scalingVec), s"transformation requires a weight vector")
    val elemScaler = new feature.ElementwiseProduct($(scalingVec))
    v => elemScaler.transform(v)
  }

  override protected def outputDataType: DataType = new VectorUDT()
}

@Since("2.0.0")
object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] {

  @Since("2.0.0")
  override def load(path: String): ElementwiseProduct = super.load(path)
}
Example 34
Source File: Normalizer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param.{DoubleParam, ParamValidators}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
import org.apache.spark.sql.types.DataType

  @Since("1.4.0")
  def setP(value: Double): this.type = set(p, value)

  override protected def createTransformFunc: Vector => Vector = {
    val normalizer = new feature.Normalizer($(p))
    vector => normalizer.transform(OldVectors.fromML(vector)).asML
  }

  override protected def outputDataType: DataType = new VectorUDT()
}

@Since("1.6.0")
object Normalizer extends DefaultParamsReadable[Normalizer] {

  @Since("1.6.0")
  override def load(path: String): Normalizer = super.load(path)
}
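A minimal usage sketch, assuming an active SparkSession named spark and made-up input vectors.

import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.linalg.Vectors

val df = spark.createDataFrame(Seq(
  (0, Vectors.dense(1.0, 0.5, -1.0)),
  (1, Vectors.dense(2.0, 1.0, 1.0))
)).toDF("id", "features")

val normalizer = new Normalizer()
  .setInputCol("features")
  .setOutputCol("normFeatures")
  .setP(1.0)                       // normalize each vector to unit L1 norm; the default is p = 2.0

normalizer.transform(df).show(truncate = false)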
Example 35
Source File: DCT.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._
import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType

  @Since("1.5.0")
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
}
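A minimal usage sketch, assuming an active SparkSession named spark; the input vectors are illustrative only.

import org.apache.spark.ml.feature.DCT
import org.apache.spark.ml.linalg.Vectors

val df = spark.createDataFrame(Seq(
  Tuple1(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
  Tuple1(Vectors.dense(-1.0, 2.0, 4.0, -7.0))
)).toDF("features")

val dct = new DCT()
  .setInputCol("features")
  .setOutputCol("featuresDCT")
  .setInverse(false)               // forward DCT; setInverse(true) applies the inverse transform

dct.transform(df).select("featuresDCT").show(truncate = false)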
Example 36
Source File: BinaryClassificationEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType

  @Since("1.2.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "areaUnderROC")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    val scoreAndLabels =
      dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label)
        case Row(rawPrediction: Double, label: Double) => (rawPrediction, label)
      }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val metric = $(metricName) match {
      case "areaUnderROC" => metrics.areaUnderROC()
      case "areaUnderPR" => metrics.areaUnderPR()
    }
    metrics.unpersist()
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "areaUnderROC" => true
    case "areaUnderPR" => true
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): BinaryClassificationEvaluator = super.load(path)
}
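A minimal usage sketch. The training and test DataFrames are assumptions (binary "label" plus "features" columns); the evaluator reads the VectorUDT rawPrediction column that a classifier adds.

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

val model = new LogisticRegression().fit(training)
val predictions = model.transform(test)          // adds "rawPrediction" (VectorUDT) and "prediction"

val evaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
  .setMetricName("areaUnderROC")                 // or "areaUnderPR"

val auc = evaluator.evaluate(predictions)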
Example 37
Source File: MetadataUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util

import scala.collection.immutable.HashMap

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.types.StructField

  def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
    require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
      + s" to be Vector type, but it was type ${col.dataType} instead.")
    val inputAttr = AttributeGroup.fromStructField(col)
    names.map { name =>
      require(inputAttr.hasAttr(name),
        s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
      inputAttr.getAttr(name).index.get
    }
  }
}
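A sketch of what the helper expects: a Vector column whose metadata names its features. MetadataUtils is package-private in Spark, so a real caller would have to live inside Spark's ml packages; the column and feature names below are illustrative.

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.util.MetadataUtils

// Attach per-feature names to a vector column via ML attribute metadata.
val attrs = Array("f0", "f1", "f2")
  .map(NumericAttribute.defaultAttr.withName)
  .asInstanceOf[Array[Attribute]]
val featuresField = new AttributeGroup("features", attrs).toStructField()

// Resolve names back to their positions in the vector.
val indices = MetadataUtils.getFeatureIndicesFromNames(featuresField, Array("f2", "f0"))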
Example 38
Source File: VectorSlicerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{StructField, StructType}

class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    val slicer = new VectorSlicer().setInputCol("feature")
    ParamsSuite.checkParams(slicer)
    assert(slicer.getIndices.length === 0)
    assert(slicer.getNames.length === 0)
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {
        slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true))))
      }
    }
  }

  test("feature validity checks") {
    import VectorSlicer._
    assert(validIndices(Array(0, 1, 8, 2)))
    assert(validIndices(Array.empty[Int]))
    assert(!validIndices(Array(-1)))
    assert(!validIndices(Array(1, 2, 1)))

    assert(validNames(Array("a", "b")))
    assert(validNames(Array.empty[String]))
    assert(!validNames(Array("", "b")))
    assert(!validNames(Array("a", "b", "a")))
  }

  test("Test vector slicer") {
    val data = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())
    )

    // Expected after selecting indices 1, 4
    val expected = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]])

    val resultAttrs = Array("f1", "f4").map(defaultAttr.withName)
    val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]])

    val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) }
    val df = spark.createDataFrame(rdd,
      StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField())))

    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    def validateResults(df: DataFrame): Unit = {
      df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 === vec2)
      }
      val resultMetadata = AttributeGroup.fromStructField(df.schema("result"))
      val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected"))
      assert(resultMetadata.numAttributes === expectedMetadata.numAttributes)
      resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) =>
        assert(a === b)
      }
    }

    vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty)
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array(1)).setNames(Array("f4"))
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4"))
    validateResults(vectorSlicer.transform(df))
  }

  test("read/write") {
    val t = new VectorSlicer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setIndices(Array(1, 3))
      .setNames(Array("a", "d"))
    testDefaultReadWrite(t)
  }
}
Example 39
Source File: XGBoostBaseBestModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.evaluation.TimeSeriesEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasGroupByCol
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType}

trait BaseXGBoostBestModelFinder[G, M <: org.apache.spark.ml.ForecastBaseModel[M]]
    extends BestModelFinder[G, M]
    with HasGroupByCol {

  protected def buildTrainSchema(sparkContext: SparkContext): Broadcast[StructType] = sparkContext.broadcast {
    StructType(
      Seq(
        StructField($(groupByCol).get, FloatType),
        StructField(IUberdataForecastUtil.FEATURES_COL_NAME, ArrayType(new VectorUDT))))
  }

  protected def xGBoostEvaluation(row: Row,
                                  model: Booster,
                                  broadcastEvaluator: Broadcast[TimeSeriesEvaluator[G]],
                                  id: G,
                                  parameters: ParamMap): ModelParamEvaluation[G] = {
    val featuresArray = row
      .getAs[Array[org.apache.spark.ml.linalg.Vector]](IUberdataForecastUtil.FEATURES_COL_NAME)
      .map { vec =>
        val values = vec.toArray.map(DataTransformer.toFloat)
        LabeledPoint(values.head, null, values.tail)
      }
    val features = new DMatrix(featuresArray.toIterator)
    log.warn(s"Evaluating forecast for id $id, with xgboost")
    val prediction = model.predict(features).flatten
    val (forecastToBeValidated, _) = prediction.splitAt(featuresArray.length)
    val toBeValidated = featuresArray.zip(forecastToBeValidated)
    val metric = broadcastEvaluator.value.evaluate(toBeValidated.map(f =>
      (f._1.label.toDouble, f._2.toDouble)))
    val metricName = broadcastEvaluator.value.getMetricName
    new ModelParamEvaluation[G](
      id,
      metric,
      parameters,
      Some(metricName),
      SupportedAlgorithm.XGBoostAlgorithm)
  }
}
Example 40
Source File: MovingAverage.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types._

  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(windowSize -> 3)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataSet.schema)
    val sparkContext = dataSet.sqlContext.sparkContext
    val inputType = outputSchema($(inputCol)).dataType
    val inputTypeBr = sparkContext.broadcast(inputType)
    val dataSetRdd = dataSet.rdd
    val inputColName = sparkContext.broadcast($(inputCol))
    val inputColIndex = dataSet.columns.indexOf($(inputCol))
    val inputColIndexBr = sparkContext.broadcast(inputColIndex)
    val windowSizeBr = sparkContext.broadcast($(windowSize))
    val maRdd = dataSetRdd.map { case (row: Row) =>
      val (array, rawValue) = if (inputTypeBr.value.isInstanceOf[VectorUDT]) {
        val vector = row.getAs[org.apache.spark.ml.linalg.Vector](inputColName.value)
        (vector.toArray, Vectors.dense(vector.toArray.drop(windowSizeBr.value - 1)))
      } else {
        val iterable = row.getAs[Iterable[Double]](inputColName.value)
        (iterable.toArray, Vectors.dense(iterable.toArray.drop(windowSizeBr.value - 1)))
      }
      val (before, after) = row.toSeq.splitAt(inputColIndexBr.value)
      Row(
        (before :+ rawValue) ++ after.tail :+ MovingAverageCalc
          .simpleMovingAverageArray(array, windowSizeBr.value): _*
      )
    }
    dataSet.sqlContext.createDataFrame(maRdd, outputSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema.add(StructField($(outputCol), ArrayType(DoubleType)))
  }

  override def copy(extra: ParamMap): MovingAverage[T] = defaultCopy(extra)
}

object MovingAverageCalc {
  private[ml] def simpleMovingAverageArray(values: Array[Double], period: Int): Array[Double] = {
    (for (i <- 1 to values.length)
      yield //TODO rollback this comment with the right size of features to make the meanaverage return
        // the features values for the first values of the calc
        if (i < period) 0d // values(i)
        else values.slice(i - period, i).sum / period).toArray.dropWhile(_ == 0d)
  }
}

object MovingAverage extends DefaultParamsReadable[MovingAverage[_]] {

  override def load(path: String): MovingAverage[_] = super.load(path)
}
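A worked sketch of the moving-average helper in isolation. It is private[ml], so a real caller would have to sit under the org.apache.spark.ml package; the input values are made up.

// With period = 3, positions 1 and 2 yield the 0d placeholder, which dropWhile
// then strips, leaving one average per full window.
val values = Array(1.0, 2.0, 3.0, 4.0, 5.0)
val averages = MovingAverageCalc.simpleMovingAverageArray(values, period = 3)
// averages == Array(2.0, 3.0, 4.0)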