org.apache.spark.mllib.linalg.VectorUDT Scala Examples
The following examples show how to use org.apache.spark.mllib.linalg.VectorUDT.
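Before the project examples, here is a minimal sketch of what VectorUDT is for: it is the SQL data type that lets a DataFrame column hold MLlib vectors. This is illustrative only; the SparkContext sc and SQLContext sqlContext names are assumed, as in a Spark 1.x shell.

import org.apache.spark.mllib.linalg.{Vectors, VectorUDT}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructField, StructType}

// Declare a column of MLlib vectors by using VectorUDT in the schema.
val schema = StructType(Seq(StructField("features", new VectorUDT, nullable = false)))
val rows = sc.parallelize(Seq(Row(Vectors.dense(1.0, 2.0, 3.0))))
val df = sqlContext.createDataFrame(rows, schema)
df.printSchema() // features: vector (nullable = false)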
Each example is taken from an open-source project; the source file name, project, and license are noted where available.
Example 1
Source File: DLEstimatorBase.scala From BigDL with Apache License 2.0
package org.apache.spark.ml

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol, HasPredictionCol}
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row}

abstract class DLEstimatorBase[Learner <: DLEstimatorBase[Learner, M], M <: DLTransformerBase[M]]
  extends Estimator[M] with HasLabelCol {

  protected def internalFit(dataFrame: DataFrame): M

  override def fit(dataFrame: DataFrame): M = {
    transformSchema(dataFrame.schema, logging = true)
    internalFit(dataFrame)
  }

  override def copy(extra: ParamMap): Learner = defaultCopy(extra)
}
Example 2
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml._
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: DataFrame): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val idf = udf { vec: Vector => idfModel.transform(vec) }
    dataset.withColumn($(outputCol), idf(col($(inputCol))))
  }

  override def transformSchema(schema: StructType): StructType = {
    validateAndTransformSchema(schema)
  }

  override def copy(extra: ParamMap): IDFModel = {
    val copied = new IDFModel(uid, idfModel)
    copyValues(copied, extra)
  }
}
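For context, the IDFModel fragment above is normally produced by fitting the spark.ml IDF estimator. A minimal usage sketch, assuming a DataFrame df with a term-frequency vector column named "tf" (both names are placeholders):

import org.apache.spark.ml.feature.IDF

val idf = new IDF()
  .setInputCol("tf")
  .setOutputCol("tfidf")
val idfModel = idf.fit(df)            // returns an IDFModel like the one above
val rescaled = idfModel.transform(df) // adds the "tfidf" vector column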
Example 3
Source File: BinaryClassificationEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "areaUnderROC")

  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol))
      .map { case Row(rawPrediction: Vector, label: Double) =>
        (rawPrediction(1), label)
      }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val metric = $(metricName) match {
      case "areaUnderROC" => metrics.areaUnderROC()
      case "areaUnderPR" => metrics.areaUnderPR()
      case other => throw new IllegalArgumentException(s"Does not support metric $other.")
    }
    metrics.unpersist()
    metric
  }

  override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}
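A usage sketch for the evaluator above, assuming a DataFrame named predictions with a vector rawPrediction column and a double label column (the spark.ml default column names):

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

val evaluator = new BinaryClassificationEvaluator()
  .setRawPredictionCol("rawPrediction")
  .setLabelCol("label")
  .setMetricName("areaUnderROC")
val auc = evaluator.evaluate(predictions) // fails fast if rawPrediction is not a VectorUDT column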
Example 4
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml._
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: DataFrame): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val idf = udf { vec: Vector => idfModel.transform(vec) }
    dataset.withColumn($(outputCol), idf(col($(inputCol))))
  }

  override def transformSchema(schema: StructType): StructType = {
    validateAndTransformSchema(schema)
  }

  override def copy(extra: ParamMap): IDFModel = {
    val copied = new IDFModel(uid, idfModel)
    copyValues(copied, extra).setParent(parent)
  }
}
Example 5
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.sql.types.DataType

  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}
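Applying the DCT transformer above is a one-liner once the input column holds vectors. A minimal sketch, assuming a DataFrame df with a vector column "features" (placeholder names):

import org.apache.spark.ml.feature.DCT

val dct = new DCT()
  .setInputCol("features")
  .setOutputCol("featuresDCT")
  .setInverse(false)
val transformed = dct.transform(df) // validateInputType rejects non-VectorUDT input columns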
Example 6
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml._
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StructField, StructType}

  override def transform(dataset: DataFrame): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val pcaOp = udf { pcaModel.transform _ }
    dataset.withColumn($(outputCol), pcaOp(col($(inputCol))))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[VectorUDT],
      s"Input column ${$(inputCol)} must be a vector column")
    require(!schema.fieldNames.contains($(outputCol)),
      s"Output column ${$(outputCol)} already exists.")
    val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false)
    StructType(outputFields)
  }

  override def copy(extra: ParamMap): PCAModel = {
    val copied = new PCAModel(uid, pcaModel)
    copyValues(copied, extra).setParent(parent)
  }
}
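The PCAModel above is obtained by fitting the spark.ml PCA estimator. A brief sketch, assuming a DataFrame df with a vector column "features" and a projection to 3 components (all placeholders):

import org.apache.spark.ml.feature.PCA

val pca = new PCA()
  .setInputCol("features")
  .setOutputCol("pcaFeatures")
  .setK(3)
val pcaModel = pca.fit(df)
val projected = pcaModel.transform(df) // appends "pcaFeatures" as a new VectorUDT column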
Example 7
Source File: BinaryClassificationEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

  def setLabelCol(value: String): this.type = set(labelCol, value)

  // area under the ROC curve
  setDefault(metricName -> "areaUnderROC")

  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol))
      .map { case Row(rawPrediction: Vector, label: Double) =>
        (rawPrediction(1), label)
      }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val metric = $(metricName) match {
      // an area under the ROC curve of 1.0 indicates a perfect classifier
      case "areaUnderROC" => metrics.areaUnderROC()
      // precision and recall
      case "areaUnderPR" => metrics.areaUnderPR()
    }
    metrics.unpersist()
    metric
  }

  override def isLargerBetter: Boolean = $(metricName) match {
    case "areaUnderROC" => true // an area of 1.0 is a perfect classifier, 0.5 is random performance
    case "areaUnderPR" => true  // precision and recall
  }

  override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}
Example 8
Source File: MetadataUtils.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.util

import scala.collection.immutable.HashMap

import org.apache.spark.ml.attribute._
import org.apache.spark.mllib.linalg.VectorUDT
import org.apache.spark.sql.types.StructField

  def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
    require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
      + s" to be Vector type, but it was type ${col.dataType} instead.")
    val inputAttr = AttributeGroup.fromStructField(col)
    names.map { name =>
      require(inputAttr.hasAttr(name),
        s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
      inputAttr.getAttr(name).index.get
    }
  }
}
Example 9
Source File: Normalizer.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.{DoubleParam, ParamValidators}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.types.DataType

  def setP(value: Double): this.type = set(p, value)

  override protected def createTransformFunc: Vector => Vector = {
    val normalizer = new feature.Normalizer($(p))
    normalizer.transform
  }

  override protected def outputDataType: DataType = new VectorUDT()
}

@Since("1.6.0")
object Normalizer extends DefaultParamsReadable[Normalizer] {

  @Since("1.6.0")
  override def load(path: String): Normalizer = super.load(path)
}
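A usage sketch for the Normalizer above, assuming a DataFrame df with a vector column "features" (placeholder names); p = 1.0 requests L1 normalization:

import org.apache.spark.ml.feature.Normalizer

val normalizer = new Normalizer()
  .setInputCol("features")
  .setOutputCol("normFeatures")
  .setP(1.0)
val l1Normalized = normalizer.transform(df) // output column is again a VectorUDT column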
Example 10
Source File: DCT.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.sql.types.DataType

  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
}
Example 11
Source File: BinaryClassificationEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

  @Since("1.2.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "areaUnderROC")

  @Since("1.2.0")
  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol))
      .map { case Row(rawPrediction: Vector, label: Double) =>
        (rawPrediction(1), label)
      }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val metric = $(metricName) match {
      case "areaUnderROC" => metrics.areaUnderROC()
      case "areaUnderPR" => metrics.areaUnderPR()
    }
    metrics.unpersist()
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "areaUnderROC" => true
    case "areaUnderPR" => true
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): BinaryClassificationEvaluator = super.load(path)
}
Example 12
Source File: LibSVMRelation.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.source.libsvm

import com.google.common.base.Objects

import org.apache.spark.Logging
import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrameReader, DataFrame, Row, SQLContext}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

@Since("1.6.0")
class DefaultSource extends RelationProvider with DataSourceRegister {

  @Since("1.6.0")
  override def shortName(): String = "libsvm"

  @Since("1.6.0")
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String])
    : BaseRelation = {
    val path = parameters.getOrElse("path",
      throw new IllegalArgumentException("'path' must be specified"))
    val numFeatures = parameters.getOrElse("numFeatures", "-1").toInt
    val vectorType = parameters.getOrElse("vectorType", "sparse")
    new LibSVMRelation(path, numFeatures, vectorType)(sqlContext)
  }
}
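Because DefaultSource registers the short name "libsvm", the relation can be used through the regular DataFrame reader. A sketch, assuming a Spark 1.6 SQLContext named sqlContext and a LibSVM file path of your own:

// "numFeatures" and "vectorType" are the optional parameters parsed in createRelation above.
val df = sqlContext.read
  .format("libsvm")
  .option("numFeatures", "780")
  .option("vectorType", "sparse")
  .load("/path/to/sample_libsvm_data.txt")
// df has a "label" (double) column and a "features" (VectorUDT) column.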
Example 13
Source File: MetadataUtils.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.util

import scala.collection.immutable.HashMap

import org.apache.spark.ml.attribute._
import org.apache.spark.mllib.linalg.VectorUDT
import org.apache.spark.sql.types.StructField

  def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
    require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
      + s" to be Vector type, but it was type ${col.dataType} instead.")
    val inputAttr = AttributeGroup.fromStructField(col)
    names.map { name =>
      require(inputAttr.hasAttr(name),
        s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
      inputAttr.getAttr(name).index.get
    }
  }
}
Example 14
Source File: DataFrameToMleap.scala From mleap with Apache License 2.0
package org.apache.spark.ml.mleap.converter

import com.truecar.mleap.runtime.types.StringArrayType
import com.truecar.mleap.spark
import com.truecar.mleap.spark.SparkDataset
import com.truecar.mleap.runtime.types
import com.truecar.mleap.spark.SparkLeapFrame
import org.apache.spark.ml.mleap
import org.apache.spark.mllib.linalg.VectorUDT
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.types._
import com.truecar.mleap.runtime.{Row => MleapRow}

case class DataFrameToMleap(dataset: DataFrame) {
  def toMleap: SparkLeapFrame = {
    val mleapFields = dataset.schema.fields.flatMap { field =>
      field.dataType match {
        case _: NumericType | BooleanType | StringType =>
          Seq(types.StructField(field.name, types.DoubleType))
        case _: VectorUDT =>
          Seq(types.StructField(field.name, types.VectorType))
        case _: StringType =>
          Seq(types.StructField(field.name, types.StringType))
        case dataType: ArrayType =>
          dataType.elementType match {
            case StringType => Seq(types.StructField(field.name, StringArrayType))
            case _ => Seq()
          }
        case _ => Seq()
      }
    }

    toMleap(types.StructType(mleapFields))
  }

  def toMleap(schema: types.StructType): SparkLeapFrame = {
    val sparkSchema = dataset.schema

    // cast MLeap field numeric types to DoubleTypes
    val mleapCols = schema.fields.map { field =>
      field.dataType match {
        case types.DoubleType =>
          dataset.col(field.name).cast(DoubleType).as(s"mleap.${field.name}")
        case types.StringType =>
          dataset.col(field.name).cast(StringType).as(s"mleap.${field.name}")
        case types.VectorType =>
          dataset.col(field.name).cast(new mleap.VectorUDT()).as(s"mleap.${field.name}")
        case types.StringArrayType =>
          dataset.col(field.name).cast(new ArrayType(StringType, containsNull = false)).as(s"mleap.${field.name}")
      }
    }

    val cols = Seq(dataset.col("*")) ++ mleapCols
    val castDataset = dataset.select(cols: _*)

    val sparkIndices = sparkSchema.fields.indices
    val mleapIndices =
      (sparkSchema.fields.length until (sparkSchema.fields.length + schema.fields.length)).toArray

    val rdd = castDataset.rdd.map { row =>
      // finish converting Spark data structure to MLeap
      // TODO: make a Spark UDT for MleapVector and just
      // cast like we do for numeric types
      val mleapValues = mleapIndices.map(row.get)
      val mleapRow = MleapRow(mleapValues: _*)
      val sparkValues: IndexedSeq[Any] = sparkIndices.map(row.get)

      (mleapRow, sparkValues)
    }

    val mleapDataset = SparkDataset(rdd)
    SparkLeapFrame(schema, sparkSchema, mleapDataset)
  }
}
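A usage sketch for the converter above, grounded only in the case class signature shown and assuming an existing Spark DataFrame df:

// Convert a Spark DataFrame (including its VectorUDT columns) into an MLeap SparkLeapFrame.
val leapFrame = DataFrameToMleap(df).toMleap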
Example 15
Source File: StructTypeToSpark.scala From mleap with Apache License 2.0
package org.apache.spark.ml.mleap.converter

import com.truecar.mleap.runtime.types
import org.apache.spark.mllib.linalg.VectorUDT
import org.apache.spark.sql.types._

case class StructTypeToSpark(schema: types.StructType) {
  def toSpark: StructType = {
    val fields = schema.fields.map { field =>
      field.dataType match {
        case types.DoubleType => StructField(field.name, DoubleType)
        case types.StringType => StructField(field.name, StringType)
        case types.VectorType => StructField(field.name, new VectorUDT())
        case types.StringArrayType => StructField(field.name, new ArrayType(StringType, containsNull = false))
      }
    }
    StructType(fields)
  }
}
Example 16
Source File: StructTypeToMleap.scala From mleap with Apache License 2.0
package org.apache.spark.ml.mleap.converter

import com.truecar.mleap.runtime.types
import org.apache.spark.SparkException
import org.apache.spark.mllib.linalg.VectorUDT
import org.apache.spark.sql.types._

case class StructTypeToMleap(schema: StructType) {
  def toMleap: types.StructType = {
    val leapFields = schema.fields.map { field =>
      val sparkType = field.dataType
      val sparkTypeName = sparkType.typeName
      val dataType = sparkType match {
        case _: NumericType | BooleanType => types.DoubleType
        case _: StringType => types.StringType
        case _: VectorUDT => types.VectorType
        case dataType: ArrayType if dataType.elementType == StringType => types.StringArrayType
        case _ => throw new SparkException(s"unsupported MLeap datatype: $sparkTypeName")
      }
      types.StructField(field.name, dataType)
    }
    types.StructType(leapFields)
  }
}
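The converters in Examples 15 and 16 are inverses at the schema level. A sketch, grounded in the case class signatures shown and assuming a Spark DataFrame df:

// Spark schema -> MLeap schema (VectorUDT columns become types.VectorType).
val mleapSchema = StructTypeToMleap(df.schema).toMleap

// MLeap schema -> Spark schema (types.VectorType columns become new VectorUDT()).
val sparkSchema = StructTypeToSpark(mleapSchema).toSpark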