org.apache.spark.ml.param.Param Scala Examples
The following examples show how to use org.apache.spark.ml.param.Param.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
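As a quick orientation before the examples (this snippet is not taken from any of the projects below), a typical Param declaration inside a custom Transformer looks roughly like the sketch that follows; the ConstantLabeler class and its labelValue parameter are made up purely for illustration.

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Hypothetical transformer used only to illustrate the usual Param lifecycle:
// declare the Param, register a default, expose a fluent setter, and read it with $(...).
class ConstantLabeler(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("constantLabeler"))

  // A Param is declared with its parent, a name, and a doc string.
  val labelValue: Param[String] =
    new Param[String](this, "labelValue", "Constant value written into the label column.")

  def setLabelValue(value: String): this.type = set(labelValue, value)
  def getLabelValue: String = $(labelValue)

  setDefault(labelValue -> "unknown")

  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.withColumn("label", lit($(labelValue)))

  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields :+ StructField("label", StringType, nullable = false))

  override def copy(extra: ParamMap): ConstantLabeler = defaultCopy(extra)
}

The examples below repeat the same pattern in different projects: new Param[...](this, name, doc), setDefault, a setter built on set, and reads via $(param).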
Example 1
Source File: VectorExplode.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl

import odkl.analysis.spark.util.collection.OpenHashMap
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.odkl.SparkSqlUtils
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, functions}

class VectorExplode(override val uid: String) extends Transformer with DefaultParamsWritable {

  val valueCol = new Param[String](this, "valueCol", "Name of the column to store value name.")

  def setValueCol(value: String) : this.type = set(valueCol, value)

  setDefault(valueCol -> "value")

  def this() = this(Identifiable.randomUID("vectorExplode"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val vectors: Array[StructField] = dataset.schema.fields.filter(_.dataType.isInstanceOf[VectorUDT])

    val resultSchema = StructType(Seq(
      StructField($(valueCol), StringType, nullable = false)) ++
      vectors.map(f => StructField(f.name, DoubleType, nullable = true))
    )

    val arraySize = resultSchema.size - 1

    val names: Array[Map[Int, String]] = vectors.map(
      f => {
        AttributeGroup.fromStructField(f).attributes
          .map(attributes => attributes.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
          .getOrElse(Map())
      })

    val maxCapacity = names.map(_.size).max

    val explodeVectors : (Row => Array[Row]) = (r: Row ) => {
      val accumulator = new OpenHashMap[String,Array[Double]](maxCapacity)

      for(i <- 0 until r.length) {
        val vector = r.getAs[Vector](i)

        vector.foreachActive((index, value) => {
          val name = names(i).getOrElse(index, s"${vectors(i).name}_$index")

          accumulator.changeValue(
            name,
            Array.tabulate(arraySize) {ind => if(i == ind) value else Double.NaN},
            v => {v(i) = value; v})
        })
      }

      accumulator.map(x => new GenericRowWithSchema(
        (Seq(x._1) ++ x._2.toSeq.map(v => if (v.isNaN) null else v)).toArray,
        resultSchema)).toArray
    }

    val vectorsStruct = functions.struct(vectors.map(f => dataset(f.name)): _*)
    val explodeUDF = SparkSqlUtils.customUDF(explodeVectors, ArrayType(resultSchema), Some(Seq(vectorsStruct.expr.dataType)))
    val expression = functions.explode(explodeUDF(vectorsStruct))

    dataset
      .withColumn(uid, expression)
      .select(
        dataset.schema.fields.filterNot(_.dataType.isInstanceOf[VectorUDT]).map(f => dataset(f.name)) ++
          resultSchema.fields.map(f => functions.expr(s"$uid.${f.name}").as(f.name)) :_*)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields.map(x => x.dataType match {
      case vector: VectorUDT => StructField(x.name, typeFromVector(x))
      case _ => x
    }))

  def typeFromVector(field: StructField): StructType = {
    val attributes = AttributeGroup.fromStructField(field)
    StructType(attributes.attributes
      .map(_.map(a => a.name.getOrElse(s"_${a.index.get}")))
      .getOrElse(Array.tabulate(attributes.size) { i => s"_$i" })
      .map(name => StructField(name, DoubleType, nullable = false)))
  }
}
Example 2
Source File: MulticlassClassificationEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType

  @Since("1.5.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "f1")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels =
      dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(prediction: Double, label: Double) => (prediction, label)
      }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "f1" => metrics.weightedFMeasure
      case "weightedPrecision" => metrics.weightedPrecision
      case "weightedRecall" => metrics.weightedRecall
      case "accuracy" => metrics.accuracy
    }
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = true

  @Since("1.5.0")
  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
}
Example 3
Source File: RegressionEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}

  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
}
Example 4
Source File: RegressionEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => -metrics.rootMeanSquaredError
      case "mse" => -metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => -metrics.meanAbsoluteError
    }
    metric
  }

  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}
Example 5
Source File: TransformerWrapper.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.Transformer
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class TransformerWrapper(
    executionContext: ExecutionContext,
    transformer: Transformer)
  extends ML.Model[TransformerWrapper] {

  override def copy(extra: ParamMap): TransformerWrapper = {
    val params = ParamTransformer.transform(extra)
    val transformerCopy = transformer.replicate().set(params: _*)
    new TransformerWrapper(executionContext, transformerCopy)
  }

  override def transformDF(dataset: sql.DataFrame): sql.DataFrame = {
    transformer._transform(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF()))
      .sparkDataFrame
  }

  override def transformSchema(schema: StructType): StructType = {
    transformer._transformSchema(schema).get
  }

  override lazy val params: Array[Param[_]] = {
    transformer.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("TransformerWrapper")
}
Example 6
Source File: EvaluatorWrapper.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.evaluation
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.Evaluator
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class EvaluatorWrapper(
    context: ExecutionContext,
    evaluator: Evaluator)
  extends ML.Evaluator {

  override def evaluateDF(dataset: sql.DataFrame): Double = {
    evaluator.evaluate(context)(())(DataFrame.fromSparkDataFrame(dataset.toDF())).value
  }

  override def copy(extra: ParamMap): evaluation.Evaluator = {
    val params = ParamTransformer.transform(extra)
    val evaluatorCopy = evaluator.replicate().set(params: _*)
    new EvaluatorWrapper(context, evaluatorCopy)
  }

  override lazy val params: Array[Param[_]] = {
    evaluator.params.map(new ParamWrapper(uid, _))
  }

  override def isLargerBetter: Boolean = evaluator.isLargerBetter

  override val uid: String = Identifiable.randomUID("EvaluatorWrapper")
}
Example 7
Source File: SparkStageParam.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages

import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams
import org.apache.hadoop.fs.Path
import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.param.{Param, ParamPair, Params}
import org.apache.spark.ml.util.{Identifiable, MLReader, MLWritable}
import org.apache.spark.util.SparkUtils
import org.json4s.JsonAST.{JObject, JValue}
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import org.json4s.{DefaultFormats, Formats, JString}

class SparkStageParam[S <: PipelineStage with Params]
(
  parent: String,
  name: String,
  doc: String,
  isValid: Option[S] => Boolean
) extends Param[Option[S]](parent, name, doc, isValid) {

  import SparkStageParam._

  override def jsonDecode(jsonStr: String): Option[S] = {
    val json = parse(jsonStr)
    val uid = (json \ "uid").extractOpt[String]
    val path = (json \ "path").extractOpt[String]

    path -> uid match {
      case (None, _) | (_, None) | (_, Some(NoUID)) =>
        savePath = None
        None
      case (Some(p), Some(stageUid)) =>
        savePath = Option(p)
        val stagePath = new Path(p, stageUid).toString
        val className = (json \ "className").extract[String]
        val cls = SparkUtils.classForName(className)
        val stage = cls.getMethod("read").invoke(null).asInstanceOf[MLReader[PipelineStage]].load(stagePath)
        Option(stage).map(_.asInstanceOf[S])
    }
  }
}

object SparkStageParam {
  implicit val formats: Formats = DefaultFormats
  val NoClass = ""
  val NoUID = ""

  def updateParamsMetadataWithPath(jValue: JValue, path: String): JValue = jValue match {
    case JObject(pairs) => JObject(
      pairs.map {
        case (SparkWrapperParams.SparkStageParamName, j) =>
          SparkWrapperParams.SparkStageParamName -> j.merge(JObject("path" -> JString(path)))
        case param => param
      }
    )
    case j => throw new IllegalArgumentException(s"Cannot recognize JSON Spark params metadata: $j")
  }
}
Example 8
Source File: MimeTypeDetector.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature

import java.io.InputStream

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.UnaryTransformer
import org.apache.commons.io.input.BoundedInputStream
import org.apache.spark.ml.param.{LongParam, Param, Params}
import org.apache.tika.detect.{DefaultDetector, Detector}
import org.apache.tika.metadata.{HttpHeaders, Metadata}
import org.apache.tika.mime.MediaType

  def detect(in: InputStream, typeHint: String): MediaType = {
    val meta =
      if (typeHint == null || typeHint.isEmpty) emptyMeta
      else {
        val meta = new Metadata()
        meta.add(HttpHeaders.CONTENT_TYPE, typeHint)
        meta
      }
    // parses the input stream and detects the media type
    detector.detect(in, meta)
  }
}
Example 9
Source File: DateToUnitCircleTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.sequence.SequenceTransformer
import com.salesforce.op.utils.spark.OpVectorMetadata
import com.salesforce.op.{FeatureHistory, UID}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.{Param, Params}

import scala.reflect.runtime.universe.TypeTag

trait DateToUnitCircleParams extends Params {

  final val timePeriod: Param[String] = new Param[String](parent = this,
    name = "timePeriods",
    doc = "The time period to extract from the timestamp",
    isValid = (value: String) => TimePeriod.values.map(_.entryName).contains(value)
  )

  setDefault(timePeriod, TimePeriod.HourOfDay.entryName)
}

class DateToUnitCircleTransformer[T <: Date]
(
  uid: String = UID[DateToUnitCircleTransformer[_]]
)(implicit tti: TypeTag[T], val ttiv: TypeTag[T#Value]) extends SequenceTransformer[T, OPVector](
  operationName = "dateToUnitCircle",
  uid = uid
) with DateToUnitCircleParams {

  override def transformFn: Seq[T] => OPVector = timestamp => {
    val randians = timestamp.flatMap(ts =>
      DateToUnitCircle.convertToRandians(ts.v, getTimePeriod)).toArray
    Vectors.dense(randians).toOPVector
  }

  override def onGetMetadata(): Unit = {
    super.onGetMetadata()
    val timePeriod = getTimePeriod
    val columns = inN.flatMap{ f =>
      DateToUnitCircle.metadataValues(timePeriod)
        .map(iv => f.toColumnMetaData().copy(descriptorValue = Option(iv)))
    }
    val history = inN.flatMap(f =>
      Seq(f.name -> FeatureHistory(originFeatures = f.originFeatures, stages = f.stages)))
    setMetadata(OpVectorMetadata(getOutputFeatureName, columns, history.toMap).toMetadata)
  }
}

private[op] object DateToUnitCircle {

  def metadataValues(timePeriod: TimePeriod): Seq[String] = Seq(s"x_$timePeriod", s"y_$timePeriod")

  def convertToBin(timestamp: Long, timePeriodDesired: TimePeriod): Double =
    getPeriodWithSize(timestamp, timePeriodDesired)._1

  def convertToRandians(timestamp: Option[Long], timePeriodDesired: TimePeriod): Array[Double] =
    timestamp.map { ts =>
      val (timePeriod, periodSize) = getPeriodWithSize(ts, timePeriodDesired)
      val radians = (2 * math.Pi * timePeriod) / periodSize
      Array(math.cos(radians), math.sin(radians))
    }.getOrElse(Array(0.0, 0.0))

  private def getPeriodWithSize(timestamp: Long, timePeriod: TimePeriod): (Double, Int) = {
    val tpv = timePeriod.extractTimePeriodVal(timestamp)
    val period = if (tpv.min == 1) tpv.value - 1 else tpv.value
    (period.toDouble, tpv.max)
  }
}
Example 10
Source File: MulticlassClassificationEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{SchemaUtils, Identifiable}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.types.DoubleType

  def setLabelCol(value: String): this.type = set(labelCol, value)

  // F1-Measure is a combined evaluation metric derived from both precision and recall
  setDefault(metricName -> "f1")

  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      // F1-Measure combines precision and recall into a single score
      case "f1" => metrics.weightedFMeasure
      case "precision" => metrics.precision // precision
      case "recall" => metrics.recall // recall
      case "weightedPrecision" => metrics.weightedPrecision // weighted precision
      case "weightedRecall" => metrics.weightedRecall // weighted recall
    }
    metric
  }

  override def isLargerBetter: Boolean = $(metricName) match {
    case "f1" => true // F1-Measure
    case "precision" => true // precision
    case "recall" => true // recall
    case "weightedPrecision" => true // weighted precision
    case "weightedRecall" => true // weighted recall
  }

  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}
Example 11
Source File: RegressionEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

  def setLabelCol(value: String): this.type = set(labelCol, value)

  // the default metric is root mean squared error
  setDefault(metricName -> "rmse")

  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      // root mean squared error
      case "rmse" => metrics.rootMeanSquaredError
      // mean squared error
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      // mean absolute error
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false // root mean squared error
    case "mse" => false // mean squared error
    case "r2" => true // coefficient of determination
    case "mae" => false // mean absolute error
  }

  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}
Example 12
Source File: LanguageAwareAnalyzer.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.util.StopwordAnalyzerBase
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.HasOutputCol
import org.apache.spark.ml.param.{Param, ParamMap, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  def this() = this(Identifiable.randomUID("languageAnalyzer"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), stemmTextUDF(dataset.col($(inputColLang)), dataset.col($(inputColText)))).toDF
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputColText) equals $(outputCol)) {
      val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputColText)))
      SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), ArrayType(StringType, true))
    } else {
      SchemaUtils.appendColumn(schema, $(outputCol), ArrayType(StringType, true))
    }
  }
}

object LanguageAwareAnalyzer extends DefaultParamsReadable[LanguageAwareAnalyzer] {
  override def load(path: String): LanguageAwareAnalyzer = super.load(path)
}
Example 13
Source File: LanguageDetectorTransformer.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts

import com.google.common.base.Optional
import com.optimaize.langdetect.LanguageDetector
import com.optimaize.langdetect.i18n.LdLocale
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}

import scala.collection.Map

  def setOutputCol(value: String): this.type = set(outputCol, value)

  def this() = this(Identifiable.randomUID("languageDetector"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), languageDetection(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), StringType)
  }

  @transient object languageDetectorWrapped extends Serializable {
    val languageDetector: LanguageDetector =
      LanguageDetectorUtils.buildLanguageDetector(
        LanguageDetectorUtils.readListLangsBuiltIn(),
        $(minimalConfidence),
        $(languagePriors).toMap)
  }
}
Example 14
Source File: RegexpReplaceTransformer.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StringType, StructType}

  def setInputCol(value: String): this.type = set(inputCol, value)

  def this() = this(Identifiable.randomUID("RegexpReplaceTransformer"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), regexp_replace(dataset.col($(inputCol)), $(regexpPattern), $(regexpReplacement)))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) equals $(outputCol)) {
      val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputCol)))
      SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), StringType)
    } else {
      SchemaUtils.appendColumn(schema, $(outputCol), StringType)
    }
  }
}

object RegexpReplaceTransformer extends DefaultParamsReadable[RegexpReplaceTransformer] {
  override def load(path: String): RegexpReplaceTransformer = super.load(path)
}
Example 15
Source File: HasConfigurations.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.hyperopt

import org.apache.spark.ml.odkl.ModelWithSummary
import org.apache.spark.ml.odkl.ModelWithSummary.Block
import org.apache.spark.ml.param.{Param, Params}
import org.apache.spark.repro.MetricsExtractor
import org.apache.spark.repro.ReproContext.logMetircs
import org.apache.spark.sql.{DataFrame, functions}

trait HasConfigurations extends Params with MetricsExtractor {
  val configurations: Block = Block("configurations")

  val configurationIndexColumn = new Param[String](this, "configurationIndexColumn",
    "Name of the column to store id of config for further analysis.")
  val resultingMetricColumn = new Param[String](this, "resultingMetricColumn",
    "Name of the column to store resulting metrics for further analysis.")
  val errorColumn = new Param[String](this, "errorColumn",
    "Name of the column to store text of the error if occurs.")

  def getConfigurationIndexColumn: String = $(configurationIndexColumn)
  def setConfigurationIndexColumn(value: String): this.type = set(configurationIndexColumn, value)

  def getResultingMetricColumn: String = $(resultingMetricColumn)
  def setResultingMetricColumn(value: String): this.type = set(resultingMetricColumn, value)

  def getErrorColumn: String = $(errorColumn)
  def setErrorColumn(value: String): this.type = set(errorColumn, value)

  setDefault(
    configurationIndexColumn -> "configurationIndex",
    resultingMetricColumn -> "resultingMetric",
    errorColumn -> "error"
  )

  protected def extractImpl(model: ModelWithSummary[_]) : Option[DataFrame] = {
    // Report only resulting metrics to the context, assuming that detailed metrics
    // were reported by forks.
    model.summary.blocks.get(configurations).map(data => data.select(
      data(getConfigurationIndexColumn).as("invertedStep"),
      data(getResultingMetricColumn).as("value"),
      functions.lit("target").as("metric")
    ))
  }
}
Example 16
Source File: ElementwiseProduct.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.VectorImplicits._
import org.apache.spark.sql.types.DataType

  @Since("2.0.0")
  def getScalingVec: Vector = getOrDefault(scalingVec)

  override protected def createTransformFunc: Vector => Vector = {
    require(params.contains(scalingVec), s"transformation requires a weight vector")
    val elemScaler = new feature.ElementwiseProduct($(scalingVec))
    v => elemScaler.transform(v)
  }

  override protected def outputDataType: DataType = new VectorUDT()
}

@Since("2.0.0")
object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] {

  @Since("2.0.0")
  override def load(path: String): ElementwiseProduct = super.load(path)
}
Example 17
Source File: SimpleReproContext.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.repro

import org.apache.spark.ml.param.{Param, ParamPair, Params}
import org.apache.spark.ml.util.MLWritable
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession, functions}

class SimpleReproContext private
(spark: SparkSession, basePath: String, tags: Seq[(String,String)]) extends ReproContext {

  def this(basePath: String)(implicit spark: SparkSession) = this(spark, basePath, Seq())

  var accumulatedMetrics : Seq[DataFrame] = Seq()

  var accumulatedParams: Seq[(Seq[String], Iterable[ParamPair[_]])] = Seq()

  override def persistEstimator(estimator: MLWritable): Unit = {
    estimator.save(basePath + "/estimator")
  }

  override def persistModel(model: MLWritable): Unit = {
    model.save(basePath + "/model")
  }

  override def dive(tags: Seq[(String, String)]): ReproContext = new SimpleReproContext(
    spark, basePath, this.tags ++ tags)

  override def logParamPairs(params: Iterable[ParamPair[_]], path: Seq[String]): Unit =
    accumulatedParams = accumulatedParams :+ path -> params

  override def logMetircs(metrics: => DataFrame): Unit =
    accumulatedMetrics = accumulatedMetrics :+ metrics

  override def start(): Unit = {
    import spark.implicits._
    accumulatedParams.map {
      case (path, params) => params.view
        .map(x => x.param.name -> x.param.asInstanceOf[Param[Any]].jsonEncode(x.value))
        .toSeq
        .toDF("param", "value")
        .withColumn("path", functions.lit(path.mkString("/")))
    }.reduce(_ unionByName _)
      .write.parquet(taggedPrefix + "/params")
  }

  override def finish(): Unit = {
    accumulatedMetrics.reduceOption(_ unionByName _).foreach(
      _.write.parquet(taggedPrefix + "/metrics"))
  }

  private def taggedPrefix: String = {
    tags.map(x => x._1 + "=" + x._2).mkString(basePath + "/", "/", "")
  }
}
Example 18
Source File: MetricsExtractor.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.repro

import org.apache.spark.ml.feature.SQLTransformer
import org.apache.spark.ml.odkl.ModelWithSummary
import org.apache.spark.ml.param.{Param, Params}
import org.apache.spark.sql.DataFrame

trait MetricsExtractor extends Params {
  val extractExpression = new Param[String](this, "extractExpression",
    "Optional SQL expression for transforming metrics before uploading to repro context")

  def setExtractExpression(value: String) : this.type = set(extractExpression, value)

  final def extract(model: ModelWithSummary[_]): Option[DataFrame] = {
    extractImpl(model)
      .map(data => get(extractExpression)
        .map(expression => {
          new SQLTransformer().setStatement(expression).transform(data)
        })
        .getOrElse(data))
  }

  protected def extractImpl(model: ModelWithSummary[_]): Option[DataFrame]
}
Example 19
Source File: SQLTransformer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType

  @Since("1.6.0")
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val tableName = Identifiable.randomUID(uid)
    dataset.createOrReplaceTempView(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val result = dataset.sparkSession.sql(realStatement)
    // Call SessionCatalog.dropTempView to avoid unpersisting the possibly cached dataset.
    dataset.sparkSession.sessionState.catalog.dropTempView(tableName)
    result
  }

  @Since("1.6.0")
  override def transformSchema(schema: StructType): StructType = {
    val spark = SparkSession.builder().getOrCreate()
    val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty))
    val dummyDF = spark.createDataFrame(dummyRDD, schema)
    val tableName = Identifiable.randomUID(uid)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    dummyDF.createOrReplaceTempView(tableName)
    val outputSchema = spark.sql(realStatement).schema
    spark.catalog.dropTempView(tableName)
    outputSchema
  }

  @Since("1.6.0")
  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
}
Example 20
Source File: ElementwiseProduct.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.VectorImplicits._
import org.apache.spark.sql.types.DataType

  @Since("2.0.0")
  def getScalingVec: Vector = getOrDefault(scalingVec)

  override protected def createTransformFunc: Vector => Vector = {
    require(params.contains(scalingVec), s"transformation requires a weight vector")
    val elemScaler = new feature.ElementwiseProduct($(scalingVec))
    v => elemScaler.transform(v)
  }

  override protected def outputDataType: DataType = new VectorUDT()
}

@Since("2.0.0")
object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] {

  @Since("2.0.0")
  override def load(path: String): ElementwiseProduct = super.load(path)
}
Example 21
Source File: MulticlassClassificationEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType

  @Since("1.5.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "f1")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels =
      dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(prediction: Double, label: Double) => (prediction, label)
      }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "f1" => metrics.weightedFMeasure
      case "weightedPrecision" => metrics.weightedPrecision
      case "weightedRecall" => metrics.weightedRecall
      case "accuracy" => metrics.accuracy
    }
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = true

  @Since("1.5.0")
  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
}
Example 22
Source File: RegressionEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}

  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
}
Example 23
Source File: SageMakerAlgorithmParams.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.algorithms

import org.apache.spark.ml.param.{IntParam, Param, Params, ParamValidators}

  val featureDim : IntParam = new IntParam(this, "feature_dim",
    "The dimension of the input vectors. Must be > 0.", ParamValidators.gtEq(1))
  def getFeatureDim: Int = $(featureDim)

  protected def autoOrAboveParamValidator(lowerBound: Double,
                                          inclusive: Boolean): String => Boolean = {
    (value: String) =>
      try {
        value == "auto" || {
          if (inclusive) {
            value.toDouble >= lowerBound
          } else {
            value.toDouble > lowerBound
          }
        }
      } catch {
        case e: NumberFormatException => false
      }
  }

  protected def inArrayOrAboveParamValidator(validValues: Array[String],
                                             lowerBound: Double): String => Boolean = {
    (value: String) =>
      try {
        validValues.contains(value) || value.toDouble > lowerBound
      } catch {
        case e: NumberFormatException => false
      }
  }

  protected def parseTrueAndFalse(param: Param[String]): Boolean = {
    $(param) match {
      case "True" => true
      case "False" => false
      case _ => throw new IllegalArgumentException("Param is neither 'True' nor 'False'")
    }
  }
}
Example 24
Source File: SQLTransformer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.param.{ParamMap, Param}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util._
import org.apache.spark.sql.{SQLContext, DataFrame, Row}
import org.apache.spark.sql.types.StructType

  @Since("1.6.0")
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  @Since("1.6.0")
  override def transform(dataset: DataFrame): DataFrame = {
    val tableName = Identifiable.randomUID(uid)
    dataset.registerTempTable(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val outputDF = dataset.sqlContext.sql(realStatement)
    outputDF
  }

  @Since("1.6.0")
  override def transformSchema(schema: StructType): StructType = {
    val sc = SparkContext.getOrCreate()
    val sqlContext = SQLContext.getOrCreate(sc)
    val dummyRDD = sc.parallelize(Seq(Row.empty))
    val dummyDF = sqlContext.createDataFrame(dummyRDD, schema)
    dummyDF.registerTempTable(tableIdentifier)
    val outputSchema = sqlContext.sql($(statement)).schema
    outputSchema
  }

  @Since("1.6.0")
  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
}
Example 25
Source File: MulticlassClassificationEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, SchemaUtils, Identifiable}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.types.DoubleType

  @Since("1.5.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "f1")

  @Since("1.5.0")
  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "f1" => metrics.weightedFMeasure
      case "precision" => metrics.precision
      case "recall" => metrics.recall
      case "weightedPrecision" => metrics.weightedPrecision
      case "weightedRecall" => metrics.weightedRecall
    }
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "f1" => true
    case "precision" => true
    case "recall" => true
    case "weightedPrecision" => true
    case "weightedRecall" => true
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
}
Example 26
Source File: RegressionEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}

  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("1.4.0")
  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    val predictionColName = $(predictionCol)
    val predictionType = schema($(predictionCol)).dataType
    require(predictionType == FloatType || predictionType == DoubleType,
      s"Prediction column $predictionColName must be of type float or double, " +
        s" but not $predictionType")
    val labelColName = $(labelCol)
    val labelType = schema($(labelCol)).dataType
    require(labelType == FloatType || labelType == DoubleType,
      s"Label column $labelColName must be of type float or double, but not $labelType")

    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
}
Example 27
Source File: RankingMetricFormatter.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._
import ws.vinta.albedo.evaluators.RankingEvaluator._

class RankingMetricFormatter(override val uid: String, val sourceType: String)
  extends Transformer with DefaultParamsWritable {

  def this(sourceType: String) = {
    this(Identifiable.randomUID("rankingMetricFormatter"), sourceType)
  }

  val userCol = new Param[String](this, "userCol", "User column name")

  def getUserCol: String = $(userCol)

  def setUserCol(value: String): this.type = set(userCol, value)
  setDefault(userCol -> "user")

  val itemCol = new Param[String](this, "itemCol", "Item column name")

  def getItemCol: String = $(itemCol)

  def setItemCol(value: String): this.type = set(itemCol, value)
  setDefault(itemCol -> "item")

  val predictionCol = new Param[String](this, "predictionCol", "Prediction column name")

  def getPredictionCol: String = $(predictionCol)

  def setPredictionCol(value: String): this.type = set(predictionCol, value)
  setDefault(predictionCol -> "prediction")

  val topK = new IntParam(this, "topK", "Recommend top-k items for every user")

  def getTopK: Int = $(topK)

  def setTopK(value: Int): this.type = set(topK, value)
  setDefault(topK -> 15)

  override def transformSchema(schema: StructType): StructType = {
    Map($(userCol) -> IntegerType, $(itemCol) -> IntegerType)
      .foreach {
        case (columnName: String, expectedDataType: DataType) => {
          val actualDataType = schema(columnName).dataType
          require(actualDataType.equals(expectedDataType),
            s"Column $columnName must be of type $expectedDataType but was actually $actualDataType.")
        }
      }

    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    sourceType match {
      case "als" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), col($(predictionCol)).desc, $(topK)))
      case "lr" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), toArrayUDF(col($(predictionCol))).getItem(1).desc, $(topK)))
    }
  }

  override def copy(extra: ParamMap): RankingMetricFormatter = {
    val copied = new RankingMetricFormatter(uid, sourceType)
    copyValues(copied, extra)
  }
}

object RankingMetricFormatter extends DefaultParamsReadable[RankingMetricFormatter]
Example 28
Source File: ContentRecommender.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.recommenders

import org.apache.http.HttpHost
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.elasticsearch.action.search.SearchRequest
import org.elasticsearch.client.{RestClient, RestHighLevelClient}
import org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item
import org.elasticsearch.index.query.QueryBuilders._
import org.elasticsearch.search.SearchHit
import org.elasticsearch.search.builder.SearchSourceBuilder
import ws.vinta.albedo.closures.DBFunctions._

class ContentRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("contentRecommender"))
  }

  val enableEvaluationMode = new Param[Boolean](this, "enableEvaluationMode", "Should be enable for evaluation only")

  def getEnableEvaluationMode: Boolean = $(enableEvaluationMode)

  def setEnableEvaluationMode(value: Boolean): this.type = set(enableEvaluationMode, value)
  setDefault(enableEvaluationMode -> false)

  override def source = "content"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    import userDF.sparkSession.implicits._

    val userRecommendedItemDF = userDF
      .as[Int]
      .flatMap {
        case (userId) => {
          // When a More Like This query is built from document ids,
          // the documents used as the query condition are filtered out of the results,
          // which is not appropriate at evaluation time.
          // So we instead use the k repos after the first topK as the query condition.
          val limit = $(topK)
          val offset = if ($(enableEvaluationMode)) $(topK) else 0
          val repoIds = selectUserStarredRepos(userId, limit, offset)

          val lowClient = RestClient.builder(new HttpHost("127.0.0.1", 9200, "http")).build()
          val highClient = new RestHighLevelClient(lowClient)

          val fields = Array("description", "full_name", "language", "topics")
          val texts = Array("")
          val items = repoIds.map((itemId: Int) => new Item("repo", "repo_info_doc", itemId.toString))
          val queryBuilder = moreLikeThisQuery(fields, texts, items)
            .minTermFreq(2)
            .maxQueryTerms(50)

          val searchSourceBuilder = new SearchSourceBuilder()
          searchSourceBuilder.query(queryBuilder)
          searchSourceBuilder.size($(topK))
          searchSourceBuilder.from(0)

          val searchRequest = new SearchRequest()
          searchRequest.indices("repo")
          searchRequest.types("repo_info_doc")
          searchRequest.source(searchSourceBuilder)

          val searchResponse = highClient.search(searchRequest)
          val hits = searchResponse.getHits
          val searchHits = hits.getHits

          val userItemScoreTuples = searchHits.map((searchHit: SearchHit) => {
            val itemId = searchHit.getId.toInt
            val score = searchHit.getScore
            (userId, itemId, score)
          })

          lowClient.close()

          userItemScoreTuples
        }
      }
      .toDF($(userCol), $(itemCol), $(scoreCol))
      .withColumn($(sourceCol), lit(source))

    userRecommendedItemDF
  }
}
Example 29
Source File: TransformerWrapper.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.Transformer
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class TransformerWrapper(
    executionContext: ExecutionContext,
    transformer: Transformer)
  extends ML.Model[TransformerWrapper] {

  override def copy(extra: ParamMap): TransformerWrapper = {
    val params = ParamTransformer.transform(extra)
    val transformerCopy = transformer.replicate().set(params: _*)
    new TransformerWrapper(executionContext, transformerCopy)
  }

  override def transformDF(dataset: sql.DataFrame): sql.DataFrame = {
    transformer._transform(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF()))
      .sparkDataFrame
  }

  override def transformSchema(schema: StructType): StructType = {
    transformer._transformSchema(schema).get
  }

  override lazy val params: Array[Param[_]] = {
    transformer.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("TransformerWrapper")
}
Example 30
Source File: EvaluatorWrapper.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.evaluation
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.Evaluator
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class EvaluatorWrapper(
    context: ExecutionContext,
    evaluator: Evaluator)
  extends ML.Evaluator {

  override def evaluateDF(dataset: sql.DataFrame): Double = {
    evaluator.evaluate(context)(())(DataFrame.fromSparkDataFrame(dataset.toDF())).value
  }

  override def copy(extra: ParamMap): evaluation.Evaluator = {
    val params = ParamTransformer.transform(extra)
    val evaluatorCopy = evaluator.replicate().set(params: _*)
    new EvaluatorWrapper(context, evaluatorCopy)
  }

  override lazy val params: Array[Param[_]] = {
    evaluator.params.map(new ParamWrapper(uid, _))
  }

  override def isLargerBetter: Boolean = evaluator.isLargerBetter

  override val uid: String = Identifiable.randomUID("EvaluatorWrapper")
}
Example 31
Source File: IsotonicRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.regression

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.regression.IsotonicRegressionModel
import org.apache.spark.mllib.regression

class IsotonicRegressionOp extends SimpleSparkOp[IsotonicRegressionModel] {
  override val Model: OpModel[SparkBundleContext, IsotonicRegressionModel] =
    new OpModel[SparkBundleContext, IsotonicRegressionModel] {
      override val klazz: Class[IsotonicRegressionModel] = classOf[IsotonicRegressionModel]

      override def opName: String = Bundle.BuiltinOps.regression.isotonic_regression

      override def store(model: Model, obj: IsotonicRegressionModel)
                        (implicit context: BundleContext[SparkBundleContext]): Model = {
        assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz))

        var m = model.withValue("boundaries", Value.doubleList(obj.boundaries.toArray.toSeq)).
          withValue("predictions", Value.doubleList(obj.predictions.toArray.toSeq)).
          withValue("isotonic", Value.boolean(obj.getIsotonic))
        if(context.context.dataset.get.schema(obj.getFeaturesCol).dataType.isInstanceOf[VectorUDT]) {
          m = m.withValue("feature_index", Value.long(obj.getFeatureIndex))
        }

        m
      }

      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): IsotonicRegressionModel = {
        val oldModel = new regression.IsotonicRegressionModel(
          boundaries = model.value("boundaries").getDoubleList.toArray,
          predictions = model.value("predictions").getDoubleList.toArray,
          isotonic = model.value("isotonic").getBoolean)
        val m = new IsotonicRegressionModel(uid = "", oldModel = oldModel)
        model.getValue("feature_index").foreach(i => m.setFeatureIndex(i.getLong.toInt))

        m
      }
    }

  override def sparkLoad(uid: String, shape: NodeShape, model: IsotonicRegressionModel): IsotonicRegressionModel = {
    val oldModel = new regression.IsotonicRegressionModel(
      boundaries = model.boundaries.toArray,
      predictions = model.predictions.toArray,
      isotonic = model.getIsotonic)
    new IsotonicRegressionModel(uid = uid, oldModel = oldModel).setFeatureIndex(model.getFeatureIndex)
  }

  override def sparkInputs(obj: IsotonicRegressionModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: IsotonicRegressionModel): Seq[SimpleParamSpec] = {
    Seq("prediction" -> obj.predictionCol)
  }
}
Example 32
Source File: ElementwiseProduct.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.VectorImplicits._
import org.apache.spark.sql.types.DataType

  @Since("2.0.0")
  def getScalingVec: Vector = getOrDefault(scalingVec)

  override protected def createTransformFunc: Vector => Vector = {
    require(params.contains(scalingVec), s"transformation requires a weight vector")
    val elemScaler = new feature.ElementwiseProduct($(scalingVec))
    v => elemScaler.transform(v)
  }

  override protected def outputDataType: DataType = new VectorUDT()
}

@Since("2.0.0")
object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] {

  @Since("2.0.0")
  override def load(path: String): ElementwiseProduct = super.load(path)
}
Example 33
Source File: MulticlassClassificationEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType

  @Since("1.5.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "f1")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels =
      dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(prediction: Double, label: Double) => (prediction, label)
      }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "f1" => metrics.weightedFMeasure
      case "weightedPrecision" => metrics.weightedPrecision
      case "weightedRecall" => metrics.weightedRecall
      case "accuracy" => metrics.accuracy
    }
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = true

  @Since("1.5.0")
  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
}
Example 34
Source File: RegressionEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
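A matching sketch for the regression evaluator; `predictions` is again assumed to hold "prediction" and "label" columns from a fitted regressor, and setMetricName is defined in the elided class body.

import org.apache.spark.ml.evaluation.RegressionEvaluator

val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("rmse")            // one of: rmse, mse, r2, mae

val rmse = evaluator.evaluate(predictions)
println(s"rmse = $rmse")            // lower is better, hence isLargerBetter == false for rmse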
Example 35
Source File: ParamUtils.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common.utils import io.hydrosphere.spark_ml_serving.common.Metadata import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.Param object ParamUtils { def set[TransformerT <: Transformer, ParamT](transformer: TransformerT, param: Param[ParamT], metadata: Metadata): TransformerT = { transformer.set(param, extract(param, metadata)) } def extract[T](param: Param[T], metadata: Metadata): T = { metadata.getAs[Any](param.name).getOrElse(throw new IllegalArgumentException(param.name)) match { case p: BigInt => p.intValue().asInstanceOf[T] case p => p.asInstanceOf[T] } } }
Example 36
Source File: Sampler.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature.preprocess import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.Identifiable import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import scala.util.Random class Sampler(fraction: Double, override val uid: String, seed: Int = Random.nextInt) extends Transformer { def this(fraction: Double) = this(fraction, Identifiable.randomUID("sampler")) final def getOutputCol: String = $(inputCol) override def transform(dataset: Dataset[_]): DataFrame = { dataset.sample(false, fraction, seed).toDF } override def transformSchema(schema: StructType): StructType = { schema } override def copy(extra: ParamMap): Sampler = defaultCopy(extra) } object Sampler { def main(args: Array[String]): Unit = { val ss = SparkSession .builder .master("local") .appName("preprocess") .getOrCreate() val training = ss.read.format("libsvm") .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt") println(training.count) val sampler = new Sampler(0.5) .setInputCol("features") val pipeline = new Pipeline() .setStages(Array(sampler)) val model = pipeline.fit(training) val test = ss.read.format("libsvm") .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt") model.transform(test).select("*") .collect() .foreach { case Row(label: Double, vector: Vector) => println(s"($label, " + s"${vector.toSparse.indices.mkString("[", ",", "]")}, " + s"${vector.toSparse.values.mkString("[", ",", "]")}") } ss.stop() } }
Example 37
Source File: ElementwiseProductOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.ElementwiseProduct import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.Param class ElementwiseProductOp extends SimpleSparkOp[ElementwiseProduct] { override val Model: OpModel[SparkBundleContext, ElementwiseProduct] = new OpModel[SparkBundleContext, ElementwiseProduct] { override val klazz: Class[ElementwiseProduct] = classOf[ElementwiseProduct] override def opName: String = Bundle.BuiltinOps.feature.elementwise_product override def store(model: Model, obj: ElementwiseProduct) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("scaling_vec", Value.vector(obj.getScalingVec.toArray)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): ElementwiseProduct = { new ElementwiseProduct(uid = "").setScalingVec(Vectors.dense(model.value("scaling_vec").getTensor[Double].toArray)) } } override def sparkLoad(uid: String, shape: NodeShape, model: ElementwiseProduct): ElementwiseProduct = { new ElementwiseProduct(uid = uid).setScalingVec(model.getScalingVec) } override def sparkInputs(obj: ElementwiseProduct): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: ElementwiseProduct): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol ) } }
Example 38
Source File: DCTOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.mleap.core.types.TensorShape import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.DCT import org.apache.spark.ml.param.Param import org.apache.spark.sql.mleap.TypeConverters.sparkToMleapDataShape class DCTOp extends SimpleSparkOp[DCT] { override val Model: OpModel[SparkBundleContext, DCT] = new OpModel[SparkBundleContext, DCT] { override val klazz: Class[DCT] = classOf[DCT] override def opName: String = Bundle.BuiltinOps.feature.dct override def store(model: Model, obj: DCT) (implicit context: BundleContext[SparkBundleContext]): Model = { val dataset = context.context.dataset.get val inputShape = sparkToMleapDataShape(dataset.schema(obj.getInputCol), dataset).asInstanceOf[TensorShape] model.withValue("inverse", Value.boolean(obj.getInverse)) .withValue("input_size", Value.int(inputShape.dimensions.get.head)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): DCT = { new DCT(uid = "").setInverse(model.value("inverse").getBoolean) } } override def sparkLoad(uid: String, shape: NodeShape, model: DCT): DCT = { new DCT(uid = uid).setInverse(model.getInverse) } override def sparkInputs(obj: DCT): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: DCT): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
Example 39
Source File: IDFOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.IDFModel import org.apache.spark.ml.param.Param import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.Vectors class IDFOp extends SimpleSparkOp[IDFModel] { override val Model: OpModel[SparkBundleContext, IDFModel] = new OpModel[SparkBundleContext, IDFModel] { override val klazz: Class[IDFModel] = classOf[IDFModel] override def opName: String = Bundle.BuiltinOps.feature.idf override def store(model: Model, obj: IDFModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("idf", Value.vector(obj.idf.toArray)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): IDFModel = { val idfModel = new feature.IDFModel(Vectors.dense(model.value("idf").getTensor[Double].toArray)) new IDFModel(uid = "", idfModel = idfModel) } } override def sparkLoad(uid: String, shape: NodeShape, model: IDFModel): IDFModel = { new IDFModel(uid = uid, idfModel = new feature.IDFModel(Vectors.dense(model.idf.toArray))) } override def sparkInputs(obj: IDFModel): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: IDFModel): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
Example 40
Source File: CountVectorizerOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.CountVectorizerModel import org.apache.spark.ml.param.Param class CountVectorizerOp extends SimpleSparkOp[CountVectorizerModel] { override val Model: OpModel[SparkBundleContext, CountVectorizerModel] = new OpModel[SparkBundleContext, CountVectorizerModel] { override val klazz: Class[CountVectorizerModel] = classOf[CountVectorizerModel] override def opName: String = Bundle.BuiltinOps.feature.count_vectorizer override def store(model: Model, obj: CountVectorizerModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("vocabulary", Value.stringList(obj.vocabulary)). withValue("binary", Value.boolean(obj.getBinary)). withValue("min_tf", Value.double(obj.getMinTF)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): CountVectorizerModel = { new CountVectorizerModel(uid = "", vocabulary = model.value("vocabulary").getStringList.toArray). setBinary(model.value("binary").getBoolean). setMinTF(model.value("min_tf").getDouble) } } override def sparkLoad(uid: String, shape: NodeShape, model: CountVectorizerModel): CountVectorizerModel = { new CountVectorizerModel(uid = uid, vocabulary = model.vocabulary) .setBinary(model.getBinary) .setMinTF(model.getMinTF) } override def sparkInputs(obj: CountVectorizerModel): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: CountVectorizerModel): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
Example 41
Source File: HashingTermFrequencyOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.bundle.dsl._ import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.HashingTF import org.apache.spark.ml.param.Param class HashingTermFrequencyOp extends SimpleSparkOp[HashingTF] { override val Model: OpModel[SparkBundleContext, HashingTF] = new OpModel[SparkBundleContext, HashingTF] { override val klazz: Class[HashingTF] = classOf[HashingTF] override def opName: String = Bundle.BuiltinOps.feature.hashing_term_frequency override def store(model: Model, obj: HashingTF) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("num_features", Value.long(obj.getNumFeatures)). withValue("binary", Value.boolean(obj.getBinary)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): HashingTF = { new HashingTF(uid = "").setNumFeatures(model.value("num_features").getLong.toInt). setBinary(model.value("binary").getBoolean) } } override def sparkLoad(uid: String, shape: NodeShape, model: HashingTF): HashingTF = { new HashingTF(uid = uid).setBinary(model.getBinary).setNumFeatures(model.getNumFeatures) } override def sparkInputs(obj: HashingTF): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: HashingTF): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
Example 42
Source File: AFTSurvivalRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.Param import org.apache.spark.ml.regression.AFTSurvivalRegressionModel class AFTSurvivalRegressionOp extends SimpleSparkOp[AFTSurvivalRegressionModel] { override val Model: OpModel[SparkBundleContext, AFTSurvivalRegressionModel] = new OpModel[SparkBundleContext, AFTSurvivalRegressionModel] { override val klazz: Class[AFTSurvivalRegressionModel] = classOf[AFTSurvivalRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.aft_survival_regression override def store(model: Model, obj: AFTSurvivalRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)). withValue("quantile_probabilities", Value.doubleList(obj.getQuantileProbabilities)). withValue("scale", Value.double(obj.scale)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): AFTSurvivalRegressionModel = { new AFTSurvivalRegressionModel(uid = "", coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble, scale = model.value("scale").getDouble). setQuantileProbabilities(model.value("quantile_probabilities").getDoubleList.toArray) } } override def sparkLoad(uid: String, shape: NodeShape, model: AFTSurvivalRegressionModel): AFTSurvivalRegressionModel = { new AFTSurvivalRegressionModel(uid = uid, coefficients = model.coefficients, intercept = model.intercept, scale = model.scale).setQuantileProbabilities(model.getQuantileProbabilities) } override def sparkInputs(obj: AFTSurvivalRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: AFTSurvivalRegressionModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol, "quantiles" -> obj.quantilesCol) } }
Example 43
Source File: GBTRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.bundle.serializer.ModelSerializer import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.param.Param import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, GBTRegressionModel} class GBTRegressionOp extends SimpleSparkOp[GBTRegressionModel] { override val Model: OpModel[SparkBundleContext, GBTRegressionModel] = new OpModel[SparkBundleContext, GBTRegressionModel] { override val klazz: Class[GBTRegressionModel] = classOf[GBTRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.gbt_regression override def store(model: Model, obj: GBTRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { var i = 0 val trees = obj.trees.map { tree => val name = s"tree$i" ModelSerializer(context.bundleContext(name)).write(tree).get i = i + 1 name } model.withValue("num_features", Value.long(obj.numFeatures)). withValue("tree_weights", Value.doubleList(obj.treeWeights)). withValue("trees", Value.stringList(trees)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): GBTRegressionModel = { val numFeatures = model.value("num_features").getLong.toInt val treeWeights = model.value("tree_weights").getDoubleList.toArray val models = model.value("trees").getStringList.map { tree => ModelSerializer(context.bundleContext(tree)).read().get.asInstanceOf[DecisionTreeRegressionModel] }.toArray new GBTRegressionModel(uid = "", _trees = models, _treeWeights = treeWeights, numFeatures = numFeatures) } } override def sparkLoad(uid: String, shape: NodeShape, model: GBTRegressionModel): GBTRegressionModel = { new GBTRegressionModel(uid = uid, _trees = model.trees, _treeWeights = model.treeWeights, numFeatures = model.numFeatures) } override def sparkInputs(obj: GBTRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: GBTRegressionModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol) } }
Example 44
Source File: RandomForestRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.bundle.serializer.ModelSerializer import ml.combust.bundle.dsl._ import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.bundle.tree.decision.SparkNodeWrapper import org.apache.spark.ml.param.Param import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, RandomForestRegressionModel} class RandomForestRegressionOp extends SimpleSparkOp[RandomForestRegressionModel] { implicit val nodeWrapper = SparkNodeWrapper override val Model: OpModel[SparkBundleContext, RandomForestRegressionModel] = new OpModel[SparkBundleContext, RandomForestRegressionModel] { override val klazz: Class[RandomForestRegressionModel] = classOf[RandomForestRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.random_forest_regression override def store(model: Model, obj: RandomForestRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { var i = 0 val trees = obj.trees.map { tree => val name = s"tree$i" ModelSerializer(context.bundleContext(name)).write(tree).get i = i + 1 name } model.withValue("num_features", Value.long(obj.numFeatures)). withValue("tree_weights", Value.doubleList(obj.treeWeights)). withValue("trees", Value.stringList(trees)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): RandomForestRegressionModel = { val numFeatures = model.value("num_features").getLong.toInt val treeWeights = model.value("tree_weights").getDoubleList // TODO: get rid of this when Spark supports setting tree weights for(weight <- treeWeights) { require(weight == 1.0, "tree weights must be 1.0 for Spark") } val models = model.value("trees").getStringList.map { tree => ModelSerializer(context.bundleContext(tree)).read().get.asInstanceOf[DecisionTreeRegressionModel] }.toArray new RandomForestRegressionModel(uid = "", numFeatures = numFeatures, _trees = models) } } override def sparkLoad(uid: String, shape: NodeShape, model: RandomForestRegressionModel): RandomForestRegressionModel = { new RandomForestRegressionModel(uid = uid, _trees = model.trees, numFeatures = model.numFeatures) } override def sparkInputs(obj: RandomForestRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: RandomForestRegressionModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol) } }
Example 45
Source File: DecisionTreeRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.bundle.dsl._ import ml.combust.bundle.tree.decision.TreeSerializer import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.bundle.tree.decision.SparkNodeWrapper import org.apache.spark.ml.param.Param import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, LinearRegressionModel} class DecisionTreeRegressionOp extends SimpleSparkOp[DecisionTreeRegressionModel] { implicit val nodeWrapper = SparkNodeWrapper override val Model: OpModel[SparkBundleContext, DecisionTreeRegressionModel] = new OpModel[SparkBundleContext, DecisionTreeRegressionModel] { override val klazz: Class[DecisionTreeRegressionModel] = classOf[DecisionTreeRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.decision_tree_regression override def store(model: Model, obj: DecisionTreeRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { TreeSerializer[org.apache.spark.ml.tree.Node](context.file("tree"), withImpurities = false).write(obj.rootNode) model.withValue("num_features", Value.long(obj.numFeatures)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): DecisionTreeRegressionModel = { val rootNode = TreeSerializer[org.apache.spark.ml.tree.Node](context.file("tree"), withImpurities = false).read().get new DecisionTreeRegressionModel(uid = "", rootNode = rootNode, numFeatures = model.value("num_features").getLong.toInt) } } override def sparkLoad(uid: String, shape: NodeShape, model: DecisionTreeRegressionModel): DecisionTreeRegressionModel = { new DecisionTreeRegressionModel(uid = uid, rootNode = model.rootNode, numFeatures = model.numFeatures) } override def sparkInputs(obj: DecisionTreeRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: DecisionTreeRegressionModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol) } }
Example 46
Source File: SQLTransformer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.Transformer import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.types.StructType @Since("1.6.0") def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val tableName = Identifiable.randomUID(uid) dataset.createOrReplaceTempView(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val result = dataset.sparkSession.sql(realStatement) dataset.sparkSession.catalog.dropTempView(tableName) result } @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { val spark = SparkSession.builder().getOrCreate() val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty)) val dummyDF = spark.createDataFrame(dummyRDD, schema) val tableName = Identifiable.randomUID(uid) val realStatement = $(statement).replace(tableIdentifier, tableName) dummyDF.createOrReplaceTempView(tableName) val outputSchema = spark.sql(realStatement).schema spark.catalog.dropTempView(tableName) outputSchema } @Since("1.6.0") override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } @Since("1.6.0") object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { @Since("1.6.0") override def load(path: String): SQLTransformer = super.load(path) }
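A minimal usage sketch, assuming an active SparkSession named spark; setStatement is part of the elided class body. The __THIS__ placeholder is swapped for a temporary view over the input dataset, as transform() above shows.

import org.apache.spark.ml.feature.SQLTransformer

val df = spark.createDataFrame(Seq(
  (0, 1.0, 3.0),
  (2, 2.0, 5.0)
)).toDF("id", "v1", "v2")

val sqlTrans = new SQLTransformer()
  .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

sqlTrans.transform(df).show()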
Example 47
Source File: LinearRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.bundle.dsl._ import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.Param import org.apache.spark.ml.regression.LinearRegressionModel class LinearRegressionOp extends SimpleSparkOp[LinearRegressionModel] { override val Model: OpModel[SparkBundleContext, LinearRegressionModel] = new OpModel[SparkBundleContext, LinearRegressionModel] { override val klazz: Class[LinearRegressionModel] = classOf[LinearRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.linear_regression override def store(model: Model, obj: LinearRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): LinearRegressionModel = { new LinearRegressionModel(uid = "", coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble) } } override def sparkLoad(uid: String, shape: NodeShape, model: LinearRegressionModel): LinearRegressionModel = { new LinearRegressionModel(uid = uid, coefficients = model.coefficients, intercept = model.intercept) } override def sparkInputs(obj: LinearRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: LinearRegressionModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol) } }
Example 48
Source File: ParamUtil.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.util import org.apache.spark.ml.param.{Param, Params} trait ParamUtil { def setOptional[T](obj1: Params, obj2: Params, param1: Param[T], param2: Param[T]): Unit = { if(obj2.isSet(param2)) { obj1.set(param1, obj2.get(param2).get) } else { obj1.clear(param1) } } } object ParamUtil extends ParamUtil
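A small usage sketch under the assumption that both objects expose the same public Param, here illustrated with HashingTF's numFeatures:

import org.apache.spark.ml.bundle.util.ParamUtil
import org.apache.spark.ml.feature.HashingTF

val source = new HashingTF().setNumFeatures(1024)   // param explicitly set on the source
val target = new HashingTF()

// numFeatures is set on `source`, so its value is copied onto `target`;
// had it been unset, setOptional would clear the param on `target` instead
ParamUtil.setOptional(target, source, target.numFeatures, source.numFeatures)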
Example 49
Source File: GaussianProcessParams.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.commons.kernel.{Kernel, RBFKernel} import org.apache.spark.ml.param.shared.{HasAggregationDepth, HasMaxIter, HasSeed, HasTol} import org.apache.spark.ml.param.{DoubleParam, IntParam, Param} private[ml] trait GaussianProcessParams extends PredictorParams with HasMaxIter with HasTol with HasAggregationDepth with HasSeed { final val activeSetProvider = new Param[ActiveSetProvider](this, "activeSetProvider", "the class which provides the active set used by Projected Process Approximation") final val kernel = new Param[() => Kernel](this, "kernel", "function of no arguments which returns " + "the kernel of the prior Gaussian Process") final val datasetSizeForExpert = new IntParam(this, "datasetSizeForExpert", "The number of data points fed to each expert. " + "Time and space complexity of training quadratically grows with it.") final val sigma2 = new DoubleParam(this, "sigma2", "The variance of noise in the inputs. The value is added to the diagonal of the " + "kernel Matrix. Also prevents numerical issues associated with inversion " + "of a computationally singular matrix ") final val activeSetSize = new IntParam(this, "activeSetSize", "Number of latent functions to project the process onto. " + "The size of the produced model and prediction complexity " + "linearly depend on this value.") def setActiveSetProvider(value : ActiveSetProvider): this.type = set(activeSetProvider, value) setDefault(activeSetProvider -> RandomActiveSetProvider) def setDatasetSizeForExpert(value: Int): this.type = set(datasetSizeForExpert, value) setDefault(datasetSizeForExpert -> 100) def setMaxIter(value: Int): this.type = set(maxIter, value) setDefault(maxIter -> 100) def setSigma2(value: Double): this.type = set(sigma2, value) setDefault(sigma2 -> 1e-3) def setKernel(value: () => Kernel): this.type = set(kernel, value) setDefault(kernel -> (() => new RBFKernel())) def setTol(value: Double): this.type = set(tol, value) setDefault(tol -> 1E-6) def setActiveSetSize(value: Int): this.type = set(activeSetSize, value) setDefault(activeSetSize -> 100) def setSeed(value: Long): this.type = set(seed, value) }
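A hypothetical sketch of how a concrete estimator mixing in this trait might be configured; the class name GaussianProcessRegression and the `trainingData` DataFrame are assumptions for illustration, and the parameter values are arbitrary.

// Assumed: GaussianProcessRegression extends Predictor and mixes in GaussianProcessParams
val gp = new GaussianProcessRegression()
  .setKernel(() => new RBFKernel())   // the prior kernel is supplied lazily as a () => Kernel
  .setSigma2(1e-4)                    // noise variance added to the kernel matrix diagonal
  .setDatasetSizeForExpert(200)
  .setActiveSetSize(300)
  .setMaxIter(50)

val model = gp.fit(trainingData)      // `trainingData` assumed to carry features/label columns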
Example 50
Source File: SQLTransformer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.Transformer import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.types.StructType @Since("1.6.0") def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val tableName = Identifiable.randomUID(uid) dataset.createOrReplaceTempView(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val result = dataset.sparkSession.sql(realStatement) dataset.sparkSession.catalog.dropTempView(tableName) result } @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { val spark = SparkSession.builder().getOrCreate() val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty)) val dummyDF = spark.createDataFrame(dummyRDD, schema) val tableName = Identifiable.randomUID(uid) val realStatement = $(statement).replace(tableIdentifier, tableName) dummyDF.createOrReplaceTempView(tableName) val outputSchema = spark.sql(realStatement).schema spark.catalog.dropTempView(tableName) outputSchema } @Since("1.6.0") override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } @Since("1.6.0") object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { @Since("1.6.0") override def load(path: String): SQLTransformer = super.load(path) }
Example 51
Source File: ElementwiseProduct.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.sql.types.DataType @Since("2.0.0") def getScalingVec: Vector = getOrDefault(scalingVec) override protected def createTransformFunc: Vector => Vector = { require(params.contains(scalingVec), s"transformation requires a weight vector") val elemScaler = new feature.ElementwiseProduct($(scalingVec)) v => elemScaler.transform(v) } override protected def outputDataType: DataType = new VectorUDT() } @Since("2.0.0") object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { @Since("2.0.0") override def load(path: String): ElementwiseProduct = super.load(path) }
Example 52
Source File: MulticlassClassificationEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 53
Source File: RegressionEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
Example 54
Source File: Cleaner.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions.clean import com.hankcs.hanlp.HanLP import config.paramconf.{HasOutputCol, HasInputCol} import functions.MySchemaUtils import functions.clean.chinese.BCConvert import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, Dataset} setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1) override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema, logging = true) val cleanFunc = udf {line: String => var cleaned = "" getFanJian match { case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line) case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line) case _ => cleaned = line } getQuanBan match { case "q2b" => cleaned = BCConvert.qj2bj(cleaned) case "b2q" => cleaned = BCConvert.bj2qj(cleaned) case _ => cleaned = cleaned } cleaned } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record => val outputIndex = record.fieldIndex($(outputCol)) record.getString(outputIndex).length >= getMinLineLen } } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.typeName.equals(StringType.typeName), s"Input type must be StringType but got $inputType.") MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable) } } object Cleaner extends DefaultParamsReadable[Cleaner] { override def load(path: String): Cleaner = super.load(path) }
Example 55
Source File: AnnotatorParam.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.param import java.util.{Date, TimeZone} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.Identifiable import org.json4s._ import org.json4s.jackson.JsonMethods._ import org.json4s.jackson.Serialization.write object SerializableFormat extends Formats with Serializable { class SerializableDateFormat extends DateFormat { def timezone: TimeZone = throw new Exception("SerializableFormat does not implement dateformat") override def format(d: Date): String = throw new Exception("SerializableFormat does not implement dateformat") override def parse(s: String): Option[Date] = throw new Exception("SerializableFormat does not implement dateformat") } override def dateFormat: DateFormat = new SerializableDateFormat } implicit val formats = SerializableFormat override def jsonEncode(value: A): String = write(value.serialize) override def jsonDecode(json: String): A = parse(json).extract[B].deserialize }
Example 56
Source File: NerOverwriter.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.ner import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel} import org.apache.spark.ml.param.{Param, StringArrayParam} import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} def getNewResult: String = $(newResult) setDefault( newResult -> "I-OVERWRITE" ) override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = { var annotationsOverwritten = annotations annotationsOverwritten.map { tokenAnnotation => val stopWordsSet = $(stopWords).toSet if (stopWordsSet.contains(tokenAnnotation.metadata("word"))) { Annotation( outputAnnotatorType, tokenAnnotation.begin, tokenAnnotation.end, $(newResult), tokenAnnotation.metadata ) } else { Annotation( outputAnnotatorType, tokenAnnotation.begin, tokenAnnotation.end, tokenAnnotation.result, tokenAnnotation.metadata ) } } } } object NerOverwriter extends DefaultParamsReadable[NerOverwriter]
Example 57
Source File: HasStorageRef.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.storage import com.johnsnowlabs.nlp.ParamsAndFeaturesWritable import org.apache.spark.ml.param.Param import org.apache.spark.sql.Dataset trait HasStorageRef extends ParamsAndFeaturesWritable { val storageRef = new Param[String](this, "storageRef", "storage unique identifier") setDefault(storageRef, this.uid) def createDatabaseConnection(database: Database.Name): RocksDBConnection = RocksDBConnection.getOrCreate(database, $(storageRef)) def setStorageRef(value: String): this.type = { if (get(storageRef).nonEmpty) throw new UnsupportedOperationException(s"Cannot override storage ref on $this. " + s"Please re-use current ref: $getStorageRef") set(this.storageRef, value) } def getStorageRef: String = $(storageRef) def validateStorageRef(dataset: Dataset[_], inputCols: Array[String], annotatorType: String): Unit = { require(isDefined(storageRef), "This Annotator does not have a storage reference defined. This could be an outdated " + "model or an incorrectly created one. Make sure storageRef param is defined and set.") require(HasStorageRef.getStorageRefFromInput(dataset, inputCols, annotatorType) == $(storageRef), s"Found input column with storage metadata. But such ref does not match to the ref this annotator requires. " + s"Make sure you are loading the annotator with ref: ${$(storageRef)}") } } object HasStorageRef { def getStorageRefFromInput(dataset: Dataset[_], inputCols: Array[String], annotatorType: String): String = { val storageCol = dataset.schema.fields .find(f => inputCols.contains(f.name) && f.metadata.getString("annotatorType") == annotatorType) .getOrElse(throw new Exception(s"Could not find a column of type $annotatorType. Make sure your pipeline is correct.")) .name val storage_meta = dataset.select(storageCol).schema.fields.head.metadata require(storage_meta.contains("ref"), s"Could not find a ref name in column $storageCol. " + s"Make sure $storageCol was created appropriately with a valid storageRef") storage_meta.getString("ref") } }
Example 58
Source File: S2CellTransformer.scala From spark-ext with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import com.google.common.geometry.{S2LatLng, S2CellId} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} class S2CellTransformer(override val uid: String) extends Transformer { def this() = this(Identifiable.randomUID("S2CellTransformer")) // Input/Output column names val latCol: Param[String] = new Param[String](this, "latCol", "latitude column") val lonCol: Param[String] = new Param[String](this, "lonCol", "longitude column") val cellCol: Param[String] = new Param[String](this, "cellCol", "S2 Cell Id column") val level: Param[Int] = new IntParam(this, "level", "S2 Level [0, 30]", (i: Int) => ParamValidators.gtEq(0)(i) && ParamValidators.ltEq(30)(i)) // Default parameters setDefault( latCol -> "lat", lonCol -> "lon", cellCol -> "cell", level -> 10 ) def getLatCol: String = $(latCol) def getLonCol: String = $(lonCol) def getCellCol: String = $(cellCol) def getLevel: Int = $(level) def setLatCol(value: String): this.type = set(latCol, value) def setLonCol(value: String): this.type = set(lonCol, value) def setCellCol(value: String): this.type = set(cellCol, value) def setLevel(value: Int): this.type = set(level, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val currentLevel = $(level) val t = udf { (lat: Double, lon: Double) => val cellId = S2CellId.fromLatLng(S2LatLng.fromDegrees(lat, lon)) cellId.parent(currentLevel).toToken } val metadata = outputSchema($(cellCol)).metadata dataset.select(col("*"), t(col($(latCol)), col($(lonCol))).as($(cellCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val latColumnName = $(latCol) val latDataType = schema(latColumnName).dataType require(latDataType == DoubleType, s"The latitude column $latColumnName must be Double type, " + s"but got $latDataType.") val lonColumnName = $(lonCol) val lonDataType = schema(lonColumnName).dataType require(lonDataType == DoubleType, s"The longitude column $lonColumnName must be Double type, " + s"but got $lonDataType.") val inputFields = schema.fields val outputColName = $(cellCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") val attr = NominalAttribute.defaultAttr.withName($(cellCol)) val outputFields = inputFields :+ attr.toStructField() StructType(outputFields) } override def copy(extra: ParamMap): S2CellTransformer = defaultCopy(extra) }
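A brief usage sketch, assuming an active SparkSession named spark; the coordinates are illustrative. With the defaults above, the transformer reads "lat"/"lon" and appends a nominal "cell" column holding the S2 cell token.

val cities = spark.createDataFrame(Seq(
  ("New York", 40.7128, -74.0060),
  ("London", 51.5074, -0.1278)
)).toDF("city", "lat", "lon")

val s2 = new S2CellTransformer().setLevel(6)   // coarser level -> larger cells
val withCells = s2.transform(cities)           // adds the "cell" token column
withCells.show()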
Example 59
Source File: EvaluationUtils.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.automl import com.microsoft.ml.spark.core.metrics.MetricConstants import com.microsoft.ml.spark.core.schema.SchemaConstants import com.microsoft.ml.spark.train.{TrainClassifier, TrainRegressor, TrainedClassifierModel, TrainedRegressorModel} import org.apache.spark.injections.RegressionUtils import org.apache.spark.ml.classification.{ClassificationModel, Classifier} import org.apache.spark.ml.{PipelineStage, Transformer} import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.regression._ object EvaluationUtils { val ModelTypeUnsupportedErr = "Model type not supported for evaluation" // Find type of trained models def getModelType(model: PipelineStage): String = { model match { case _: TrainRegressor => SchemaConstants.RegressionKind case _: TrainClassifier => SchemaConstants.ClassificationKind case _: Classifier[_, _, _] => SchemaConstants.ClassificationKind case regressor: PipelineStage if RegressionUtils.isRegressor(regressor) => SchemaConstants.RegressionKind case _: DecisionTreeRegressor => SchemaConstants.RegressionKind case _: GBTRegressor => SchemaConstants.RegressionKind case _: RandomForestRegressor => SchemaConstants.RegressionKind case _: TrainedRegressorModel => SchemaConstants.RegressionKind case _: TrainedClassifierModel => SchemaConstants.ClassificationKind case evm: BestModel => getModelType(evm.getBestModel) case _: ClassificationModel[_, _] => SchemaConstants.ClassificationKind case _: RegressionModel[_, _] => SchemaConstants.RegressionKind case _ => throw new Exception(ModelTypeUnsupportedErr) } } def getMetricWithOperator(model: PipelineStage, evaluationMetric: String): (String, Ordering[Double]) = { val modelType = getModelType(model) getMetricWithOperator(modelType, evaluationMetric) } def getMetricWithOperator(modelType: String, evaluationMetric: String): (String, Ordering[Double]) = { val chooseHighest = Ordering.Double val chooseLowest = Ordering.Double.reverse val (evaluationMetricColumnName, operator): (String, Ordering[Double]) = modelType match { case SchemaConstants.RegressionKind => evaluationMetric match { case MetricConstants.MseSparkMetric => (MetricConstants.MseColumnName, chooseLowest) case MetricConstants.RmseSparkMetric => (MetricConstants.RmseColumnName, chooseLowest) case MetricConstants.R2SparkMetric => (MetricConstants.R2ColumnName, chooseHighest) case MetricConstants.MaeSparkMetric => (MetricConstants.MaeColumnName, chooseLowest) case _ => throw new Exception("Metric is not supported for regressors") } case SchemaConstants.ClassificationKind => evaluationMetric match { case MetricConstants.AucSparkMetric => (MetricConstants.AucColumnName, chooseHighest) case MetricConstants.PrecisionSparkMetric => (MetricConstants.PrecisionColumnName, chooseHighest) case MetricConstants.RecallSparkMetric => (MetricConstants.RecallColumnName, chooseHighest) case MetricConstants.AccuracySparkMetric => (MetricConstants.AccuracyColumnName, chooseHighest) case _ => throw new Exception("Metric is not supported for classifiers") } case _ => throw new Exception("Model type not supported for evaluation") } (evaluationMetricColumnName, operator) } def getModelParams(model: Transformer): ParamMap = { model match { case reg: TrainedRegressorModel => reg.getParamMap case cls: TrainedClassifierModel => cls.getParamMap case evm: BestModel => getModelParams(evm.getBestModel) case _ => throw new Exception("Model type not supported for evaluation") } } def modelParamsToString(model: Transformer): String = getModelParams(model).toSeq.map(pv => s"${pv.param.name}: ${pv.value}").sorted.mkString(", ") }
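An illustrative sketch of how the metric/ordering pair returned above might be used to pick the better of several candidate scores; the score values are made up, and only types referenced by the example itself are used.

import com.microsoft.ml.spark.automl.EvaluationUtils
import com.microsoft.ml.spark.core.metrics.MetricConstants
import com.microsoft.ml.spark.core.schema.SchemaConstants

val (metricColumn, ordering) =
  EvaluationUtils.getMetricWithOperator(SchemaConstants.RegressionKind, MetricConstants.RmseSparkMetric)

val candidateScores = Seq(0.92, 0.85, 1.07)
// chooseLowest is Ordering.Double.reverse, so max under it returns the smallest RMSE (0.85)
val bestScore = candidateScores.max(ordering)
println(s"best $metricColumn = $bestScore")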
Example 60
Source File: SQLTransformer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.Transformer import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.types.StructType @Since("1.6.0") def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val tableName = Identifiable.randomUID(uid) dataset.createOrReplaceTempView(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val result = dataset.sparkSession.sql(realStatement) dataset.sparkSession.catalog.dropTempView(tableName) result } @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { val spark = SparkSession.builder().getOrCreate() val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty)) val dummyDF = spark.createDataFrame(dummyRDD, schema) val tableName = Identifiable.randomUID(uid) val realStatement = $(statement).replace(tableIdentifier, tableName) dummyDF.createOrReplaceTempView(tableName) val outputSchema = spark.sql(realStatement).schema spark.catalog.dropTempView(tableName) outputSchema } @Since("1.6.0") override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } @Since("1.6.0") object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { @Since("1.6.0") override def load(path: String): SQLTransformer = super.load(path) }