org.apache.spark.ml.param.Params Scala Examples
The following examples show how to use org.apache.spark.ml.param.Params.
Each snippet is taken from an open-source project; the original source file, project, and license are noted above it.
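Before turning to the project-specific examples, it helps to see the shape of the API they all share: a stage owns typed Param fields, stores values in an embedded param map, and exposes them through getters and setters. The sketch below is a minimal, hypothetical parameter holder (the class and parameter names are invented for illustration and are not taken from any example on this page):

import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators, Params}
import org.apache.spark.ml.util.Identifiable

// Hypothetical standalone Params holder: one validated integer parameter with a default,
// a getter/setter pair, and the copy() implementation that Params requires.
class MyParams(override val uid: String) extends Params {

  def this() = this(Identifiable.randomUID("myParams"))

  // The param is constructed with `this` as its parent, so this instance owns it.
  val maxIterations: IntParam = new IntParam(this, "maxIterations",
    "maximum number of iterations (>= 1)", ParamValidators.gtEq(1))
  setDefault(maxIterations -> 10)

  def getMaxIterations: Int = $(maxIterations)
  def setMaxIterations(value: Int): this.type = set(maxIterations, value)

  override def copy(extra: ParamMap): MyParams = defaultCopy(extra)
}

With that in place, new MyParams().setMaxIterations(20) validates and stores the value, and explainParams() or extractParamMap() behave the same way as for the stages in the examples below.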
Example 1
Source File: SwSequenceEstimator.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStageN
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag

private[stages] final class SwSequenceModel[I <: FeatureType, O <: FeatureType, T <: Model[T] with Params]
(
  val inputParamName: String,
  val operationName: String,
  val outputParamName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String
)(
  implicit val tti: TypeTag[I],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends Model[SwSequenceModel[I, O, T]] with SwTransformerN[I, O, T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 2
Source File: DefaultMLWriter.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.serialization

import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import org.apache.spark.ml.param.{ParamPair, Params}
import org.apache.spark.ml.util.MLWriter
import org.json4s.JsonDSL._
import org.json4s._
import org.json4s.jackson.JsonMethods._

import io.deepsense.deeplang.doperables.Transformer
import io.deepsense.sparkutils.ML.MLWriterWithSparkContext

class DefaultMLWriter[T <: Params](instance: T) extends MLWriter with MLWriterWithSparkContext {

  def saveImpl(path: String): Unit = {
    val modelPath = Transformer.modelFilePath(path)
    saveMetadata(instance, path, sc)
    CustomPersistence.save(sparkContext, instance, modelPath)
  }

  // Copied from org.apache.spark.ml.util.DefaultParamWriter.
  // We need to be consistent with Spark Format, but this method is private.
  private def saveMetadata(
      instance: Params,
      path: String,
      sc: SparkContext,
      extraMetadata: Option[JObject] = None,
      paramMap: Option[JValue] = None): Unit = {
    val uid = instance.uid
    val cls = instance.getClass.getName
    val params = instance.extractParamMap().toSeq.asInstanceOf[Seq[ParamPair[Any]]]
    val jsonParams = paramMap.getOrElse(render(params.map { case ParamPair(p, v) =>
      p.name -> parse(p.jsonEncode(v))
    }.toList))
    val basicMetadata = ("class" -> cls) ~
      ("timestamp" -> System.currentTimeMillis()) ~
      ("sparkVersion" -> sc.version) ~
      ("uid" -> uid) ~
      ("paramMap" -> jsonParams)
    val metadata = extraMetadata match {
      case Some(jObject) =>
        basicMetadata ~ jObject
      case None =>
        basicMetadata
    }
    val metadataPath = new Path(path, "metadata").toString
    val metadataJson = compact(render(metadata))
    sc.parallelize(Seq(metadataJson), 1).saveAsTextFile(metadataPath)
  }
}
Example 3
Source File: SageMakerAlgorithmParams.scala From sagemaker-spark with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.algorithms

import org.apache.spark.ml.param.{IntParam, Param, Params, ParamValidators}

  val featureDim : IntParam = new IntParam(this, "feature_dim",
    "The dimension of the input vectors. Must be > 0.", ParamValidators.gtEq(1))
  def getFeatureDim: Int = $(featureDim)

  protected def autoOrAboveParamValidator(lowerBound: Double,
                                          inclusive: Boolean): String => Boolean = {
    (value: String) =>
      try {
        value == "auto" || {
          if (inclusive) {
            value.toDouble >= lowerBound
          } else {
            value.toDouble > lowerBound
          }
        }
      } catch {
        case e: NumberFormatException => false
      }
  }

  protected def inArrayOrAboveParamValidator(validValues: Array[String],
                                             lowerBound: Double): String => Boolean = {
    (value: String) =>
      try {
        validValues.contains(value) || value.toDouble > lowerBound
      } catch {
        case e: NumberFormatException => false
      }
  }

  protected def parseTrueAndFalse(param: Param[String]): Boolean = {
    $(param) match {
      case "True" => true
      case "False" => false
      case _ => throw new IllegalArgumentException("Param is neither 'True' nor 'False'")
    }
  }
}
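The validators above are plain String => Boolean functions, which is exactly the shape Param[String] accepts for its isValid argument. The following is a hypothetical parameter declaration showing how such a validator is typically wired in; the parameter name is invented for illustration and the snippet assumes it sits inside the same trait as the helpers above:

// Hypothetical String param that accepts either "auto" or any number strictly greater than 0,
// reusing autoOrAboveParamValidator from the snippet above.
val miniBatchFraction: Param[String] = new Param[String](this, "mini_batch_fraction",
  "Fraction of the data to use per mini-batch; either 'auto' or a number > 0.",
  autoOrAboveParamValidator(lowerBound = 0.0, inclusive = false))

def getMiniBatchFraction: String = $(miniBatchFraction)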
Example 4
Source File: HasParallelism.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.param.shared

import scala.concurrent.ExecutionContext

import org.apache.spark.ml.param.{IntParam, Params, ParamValidators}
import org.apache.spark.util.ThreadUtils

  private[ml] def getExecutionContext: ExecutionContext = {
    getParallelism match {
      case 1 =>
        ThreadUtils.sameThread
      case n =>
        ExecutionContext.fromExecutorService(ThreadUtils
          .newDaemonCachedThreadPool(s"${this.getClass.getSimpleName}-thread-pool", n))
    }
  }
}
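The pattern is straightforward: a parallelism of 1 reuses the calling thread, while anything larger gets its own thread pool. Since ThreadUtils is Spark-internal, the sketch below mirrors the same idea with plain Scala concurrency primitives (the helper name and signature are invented for illustration):

import java.util.concurrent.Executors

import scala.concurrent.duration.Duration
import scala.concurrent.{Await, ExecutionContext, Future}

// Hypothetical helper in the spirit of HasParallelism: run independent tasks on an
// execution context sized by a parallelism setting and collect the results.
def runInParallel[A, B](inputs: Seq[A], parallelism: Int)(task: A => B): Seq[B] = {
  val service = Executors.newFixedThreadPool(math.max(1, parallelism))
  implicit val ec: ExecutionContext = ExecutionContext.fromExecutorService(service)
  try {
    val futures = inputs.map(a => Future(task(a)))
    futures.map(f => Await.result(f, Duration.Inf))
  } finally {
    service.shutdown()
  }
}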
Example 5
Source File: MetricsExtractor.scala From pravda-ml with Apache License 2.0
package org.apache.spark.repro

import org.apache.spark.ml.feature.SQLTransformer
import org.apache.spark.ml.odkl.ModelWithSummary
import org.apache.spark.ml.param.{Param, Params}
import org.apache.spark.sql.DataFrame

trait MetricsExtractor extends Params {
  val extractExpression = new Param[String](this, "extractExpression",
    "Optional SQL expression for transforming metrics before uploading to repro context")

  def setExtractExpression(value: String) : this.type = set(extractExpression, value)

  final def extract(model: ModelWithSummary[_]): Option[DataFrame] = {
    extractImpl(model)
      .map(data => get(extractExpression)
        .map(expression => {
          new SQLTransformer().setStatement(expression).transform(data)
        })
        .getOrElse(data))
  }

  protected def extractImpl(model: ModelWithSummary[_]): Option[DataFrame]
}
Example 6
Source File: SimpleReproContext.scala From pravda-ml with Apache License 2.0
package org.apache.spark.repro

import org.apache.spark.ml.param.{Param, ParamPair, Params}
import org.apache.spark.ml.util.MLWritable
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession, functions}

class SimpleReproContext private (spark: SparkSession, basePath: String, tags: Seq[(String,String)]) extends ReproContext {

  def this(basePath: String)(implicit spark: SparkSession) = this(spark, basePath, Seq())

  var accumulatedMetrics : Seq[DataFrame] = Seq()

  var accumulatedParams: Seq[(Seq[String], Iterable[ParamPair[_]])] = Seq()

  override def persistEstimator(estimator: MLWritable): Unit = {
    estimator.save(basePath + "/estimator")
  }

  override def persistModel(model: MLWritable): Unit = {
    model.save(basePath + "/model")
  }

  override def dive(tags: Seq[(String, String)]): ReproContext = new SimpleReproContext(
    spark, basePath, this.tags ++ tags)

  override def logParamPairs(params: Iterable[ParamPair[_]], path: Seq[String]): Unit =
    accumulatedParams = accumulatedParams :+ path -> params

  override def logMetircs(metrics: => DataFrame): Unit = accumulatedMetrics = accumulatedMetrics :+ metrics

  override def start(): Unit = {
    import spark.implicits._
    accumulatedParams.map {
      case (path, params) => params.view
        .map(x => x.param.name -> x.param.asInstanceOf[Param[Any]].jsonEncode(x.value))
        .toSeq
        .toDF("param", "value")
        .withColumn("path", functions.lit(path.mkString("/")))
    }.reduce(_ unionByName _)
      .write.parquet(taggedPrefix + "/params")
  }

  override def finish(): Unit = {
    accumulatedMetrics.reduceOption(_ unionByName _).foreach(
      _.write.parquet(taggedPrefix + "/metrics"))
  }

  private def taggedPrefix: String = {
    tags.map(x => x._1 + "=" + x._2).mkString(basePath + "/", "/", "")
  }
}
Example 7
Source File: HasConfigurations.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.hyperopt

import org.apache.spark.ml.odkl.ModelWithSummary
import org.apache.spark.ml.odkl.ModelWithSummary.Block
import org.apache.spark.ml.param.{Param, Params}
import org.apache.spark.repro.MetricsExtractor
import org.apache.spark.repro.ReproContext.logMetircs
import org.apache.spark.sql.{DataFrame, functions}

trait HasConfigurations extends Params with MetricsExtractor {
  val configurations: Block = Block("configurations")

  val configurationIndexColumn = new Param[String](this, "configurationIndexColumn",
    "Name of the column to store id of config for further analysis.")
  val resultingMetricColumn = new Param[String](this, "resultingMetricColumn",
    "Name of the column to store resulting metrics for further analysis.")
  val errorColumn = new Param[String](this, "errorColumn",
    "Name of the column to store text of the error if occurs.")

  def getConfigurationIndexColumn: String = $(configurationIndexColumn)
  def setConfigurationIndexColumn(value: String): this.type = set(configurationIndexColumn, value)

  def getResultingMetricColumn: String = $(resultingMetricColumn)
  def setResultingMetricColumn(value: String): this.type = set(resultingMetricColumn, value)

  def getErrorColumn: String = $(errorColumn)
  def setErrorColumn(value: String): this.type = set(errorColumn, value)

  setDefault(
    configurationIndexColumn -> "configurationIndex",
    resultingMetricColumn -> "resultingMetric",
    errorColumn -> "error"
  )

  protected def extractImpl(model: ModelWithSummary[_]) : Option[DataFrame] = {
    // Report only resulting metrics to the context assuming that detailed metrics
    // where reported by forks.
    model.summary.blocks.get(configurations).map(data => data.select(
      data(getConfigurationIndexColumn).as("invertedStep"),
      data(getResultingMetricColumn).as("value"),
      functions.lit("target").as("metric")
    )
    )
  }
}
Example 8
Source File: URLElimminator.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, Params}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}

  def setInputCol(value: String): this.type = set(inputCol, value)

  def this() = this(Identifiable.randomUID("URLEliminator"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), filterTextUDF(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), StringType)
    } else {
      schema
    }
  }
}

object URLElimminator extends DefaultParamsReadable[URLElimminator] {
  override def load(path: String): URLElimminator = super.load(path)
}
Example 9
Source File: RegexpReplaceTransformer.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StringType, StructType}

  def setInputCol(value: String): this.type = set(inputCol, value)

  def this() = this(Identifiable.randomUID("RegexpReplaceTransformer"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol),
      regexp_replace(dataset.col($(inputCol)), $(regexpPattern), $(regexpReplacement)))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) equals $(outputCol)) {
      val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputCol)))
      SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), StringType)
    } else {
      SchemaUtils.appendColumn(schema, $(outputCol), StringType)
    }
  }
}

object RegexpReplaceTransformer extends DefaultParamsReadable[RegexpReplaceTransformer] {
  override def load(path: String): RegexpReplaceTransformer = super.load(path)
}
Example 10
Source File: NGramExtractor.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamPair, ParamValidators, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}

  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(new ParamPair[Int](upperN, 2), new ParamPair[Int](lowerN, 1))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val lowerBound = $(lowerN)
    val upperBound = $(upperN)
    val nGramUDF = udf[Seq[String], Seq[String]](NGramUtils.nGramFun(_,lowerBound,upperBound))
    dataset.withColumn($(outputCol), nGramUDF(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), new ArrayType(StringType, true))
    } else {
      schema
    }
  }
}

object NGramExtractor extends DefaultParamsReadable[NGramExtractor] {
  override def load(path: String): NGramExtractor = super.load(path)
}
Example 11
Source File: LanguageAwareAnalyzer.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.util.StopwordAnalyzerBase
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.HasOutputCol
import org.apache.spark.ml.param.{Param, ParamMap, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  def this() = this(Identifiable.randomUID("languageAnalyzer"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol),
      stemmTextUDF(dataset.col($(inputColLang)), dataset.col($(inputColText)))).toDF
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputColText) equals $(outputCol)) {
      val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputColText)))
      SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), ArrayType(StringType, true))
    } else {
      SchemaUtils.appendColumn(schema, $(outputCol), ArrayType(StringType, true))
    }
  }
}

object LanguageAwareAnalyzer extends DefaultParamsReadable[LanguageAwareAnalyzer] {
  override def load(path: String): LanguageAwareAnalyzer = super.load(path)
}
Example 12
Source File: XGBoostUtils.scala From pravda-ml with Apache License 2.0
package ml.dmlc.xgboost4j.scala.spark

import ml.dmlc.xgboost4j.scala.Booster
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.{BooleanParam, Params}
import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol}
import org.apache.spark.sql.{Dataset, functions}

object XGBoostUtils {
  def getBooster(x: XGBoostClassificationModel): Booster = x._booster

  def getBooster(x: XGBoostRegressionModel): Booster = x._booster
}

trait OkXGBoostParams extends HasFeaturesCol with HasPredictionCol {
  this: Params =>

  val densifyInput = new BooleanParam(this, "densifyInput",
    "In order to fix the difference between spark abd xgboost sparsity treatment")
  val predictAsDouble = new BooleanParam(this, "predictAsDouble",
    "Whenver to cast XGBoost prediction to double matching common behavior for other predictors.")
  val addRawTrees = new BooleanParam(this, "addRawTrees",
    "Whenever to add raw trees block to model summary.")
  val addSignificance = new BooleanParam(this, "addSignificance",
    "Whenever to add feature significance block to model summary.")

  def setAddSignificance(value: Boolean): this.type = set(addSignificance, value)

  def setAddRawTrees(value: Boolean): this.type = set(addRawTrees, value)

  def setDensifyInput(value: Boolean): this.type = set(densifyInput, value)

  def setPredictAsDouble(value: Boolean): this.type = set(predictAsDouble, value)

  protected def densifyIfNeeded(dataset: Dataset[_]) : Dataset[_] = {
    if ($(densifyInput)) {
      val densify = functions.udf((x: Vector) => x.toDense)
      val col = getFeaturesCol
      val metadata = dataset.schema(col).metadata

      dataset.withColumn(
        col,
        densify(dataset(col)).as(col, metadata))
    } else {
      dataset
    }
  }
}

trait OkXGBoostClassifierParams extends XGBoostClassifierParams with OkXGBoostParams

trait OkXGBoostRegressorParams extends XGBoostRegressorParams with OkXGBoostParams
Example 13
Source File: DateToUnitCircleTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.sequence.SequenceTransformer
import com.salesforce.op.utils.spark.OpVectorMetadata
import com.salesforce.op.{FeatureHistory, UID}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.{Param, Params}

import scala.reflect.runtime.universe.TypeTag

trait DateToUnitCircleParams extends Params {

  final val timePeriod: Param[String] = new Param[String](parent = this, name = "timePeriods",
    doc = "The time period to extract from the timestamp",
    isValid = (value: String) => TimePeriod.values.map(_.entryName).contains(value)
  )

  setDefault(timePeriod, TimePeriod.HourOfDay.entryName)

class DateToUnitCircleTransformer[T <: Date]
(
  uid: String = UID[DateToUnitCircleTransformer[_]]
)(implicit tti: TypeTag[T], val ttiv: TypeTag[T#Value]) extends SequenceTransformer[T, OPVector](
  operationName = "dateToUnitCircle",
  uid = uid
) with DateToUnitCircleParams {

  override def transformFn: Seq[T] => OPVector = timestamp => {
    val randians = timestamp.flatMap(ts =>
      DateToUnitCircle.convertToRandians(ts.v, getTimePeriod)).toArray
    Vectors.dense(randians).toOPVector
  }

  override def onGetMetadata(): Unit = {
    super.onGetMetadata()
    val timePeriod = getTimePeriod
    val columns = inN.flatMap{ f =>
      DateToUnitCircle.metadataValues(timePeriod)
        .map(iv => f.toColumnMetaData().copy(descriptorValue = Option(iv)))
    }
    val history = inN.flatMap(f => Seq(f.name -> FeatureHistory(originFeatures = f.originFeatures, stages = f.stages)))
    setMetadata(OpVectorMetadata(getOutputFeatureName, columns, history.toMap).toMetadata)
  }
}

private[op] object DateToUnitCircle {

  def metadataValues(timePeriod: TimePeriod): Seq[String] = Seq(s"x_$timePeriod", s"y_$timePeriod")

  def convertToBin(timestamp: Long, timePeriodDesired: TimePeriod): Double =
    getPeriodWithSize(timestamp, timePeriodDesired)._1

  def convertToRandians(timestamp: Option[Long], timePeriodDesired: TimePeriod): Array[Double] =
    timestamp.map { ts =>
      val (timePeriod, periodSize) = getPeriodWithSize(ts, timePeriodDesired)
      val radians = (2 * math.Pi * timePeriod) / periodSize
      Array(math.cos(radians), math.sin(radians))
    }.getOrElse(Array(0.0, 0.0))

  private def getPeriodWithSize(timestamp: Long, timePeriod: TimePeriod): (Double, Int) = {
    val tpv = timePeriod.extractTimePeriodVal(timestamp)
    val period = if (tpv.min == 1) tpv.value - 1 else tpv.value
    (period.toDouble, tpv.max)
  }
}
Example 14
Source File: MimeTypeDetector.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import java.io.InputStream

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.UnaryTransformer
import org.apache.commons.io.input.BoundedInputStream
import org.apache.spark.ml.param.{LongParam, Param, Params}
import org.apache.tika.detect.{DefaultDetector, Detector}
import org.apache.tika.metadata.{HttpHeaders, Metadata}
import org.apache.tika.mime.MediaType

  def detect(in: InputStream, typeHint: String): MediaType = {
    val meta =
      if (typeHint == null || typeHint.isEmpty) emptyMeta
      else {
        val meta = new Metadata()
        meta.add(HttpHeaders.CONTENT_TYPE, typeHint)
        meta
      }
    // parses the input stream and detects the media type
    detector.detect(in, meta)
  }
}
Example 15
Source File: SwUnaryEstimator.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage1
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag

private[stages] final class SwUnaryModel[I <: FeatureType, O <: FeatureType, T <: Model[T] with Params]
(
  val inputParamName: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String
)(
  implicit val tti: TypeTag[I],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends Model[SwUnaryModel[I, O, T]] with SwTransformer1[I, O, T] with SparkWrapperParams[T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 16
Source File: VParams.scala From spark-vlbfgs with Apache License 2.0
package org.apache.spark.ml

import org.apache.spark.ml.param.{BooleanParam, IntParam, ParamValidators, Params}

private trait VParams extends Params {

  // column number of each block in feature block matrix
  val colsPerBlock: IntParam = new IntParam(this, "colsPerBlock",
    "column number of each block in feature block matrix.", ParamValidators.gt(0))
  setDefault(colsPerBlock -> 10000)

  def getColsPerBlock: Int = $(colsPerBlock)

  // row number of each block in feature block matrix
  val rowsPerBlock: IntParam = new IntParam(this, "rowsPerBlock",
    "row number of each block in feature block matrix.", ParamValidators.gt(0))
  setDefault(rowsPerBlock -> 10000)

  def getRowsPerBlock: Int = $(rowsPerBlock)

  // row partition number of feature block matrix
  // equals to partition number of coefficient vector
  val rowPartitions: IntParam = new IntParam(this, "rowPartitions",
    "row partition number of feature block matrix.", ParamValidators.gt(0))
  setDefault(rowPartitions -> 10)

  def getRowPartitions: Int = $(rowPartitions)

  // column partition number of feature block matrix
  val colPartitions: IntParam = new IntParam(this, "colPartitions",
    "column partition number of feature block matrix.", ParamValidators.gt(0))
  setDefault(colPartitions -> 10)

  def getColPartitions: Int = $(colPartitions)

  // Whether to eager persist distributed vector.
  val eagerPersist: BooleanParam = new BooleanParam(this, "eagerPersist",
    "Whether to eager persist distributed vector.")
  setDefault(eagerPersist -> false)

  def getEagerPersist: Boolean = $(eagerPersist)

  // The number of corrections used in the LBFGS update.
  val numCorrections: IntParam = new IntParam(this, "numCorrections",
    "The number of corrections used in the LBFGS update.")
  setDefault(numCorrections -> 10)

  def getNumCorrections: Int = $(numCorrections)

  val generatingFeatureMatrixBuffer: IntParam = new IntParam(this, "generatingFeatureMatrixBuffer",
    "Buffer size when generating features block matrix.")
  setDefault(generatingFeatureMatrixBuffer -> 1000)

  def getGeneratingFeatureMatrixBuffer: Int = $(generatingFeatureMatrixBuffer)

  val rowPartitionSplitNumOnGeneratingFeatureMatrix: IntParam = new IntParam(this,
    "rowPartitionSplitsNumOnGeneratingFeatureMatrix",
    "row partition splits number on generating features matrix."
  )
  setDefault(rowPartitionSplitNumOnGeneratingFeatureMatrix -> 1)

  def getRowPartitionSplitNumOnGeneratingFeatureMatrix: Int = $(rowPartitionSplitNumOnGeneratingFeatureMatrix)

  val compressFeatureMatrix: BooleanParam = new BooleanParam(this,
    "compressFeatureMatrix",
    "compress feature matrix."
  )
  setDefault(compressFeatureMatrix -> false)

  def getCompressFeatureMatrix: Boolean = $(compressFeatureMatrix)
}
Example 17
Source File: SwUnaryTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage1
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.Params
import org.apache.spark.sql._

import scala.reflect.runtime.universe.TypeTag

class SwUnaryTransformer[I <: FeatureType, O <: FeatureType, T <: Transformer with Params]
(
  val inputParamName: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String = UID[SwUnaryTransformer[I, O, T]]
)(
  implicit val tti: TypeTag[I],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends SwTransformer1[I, O, T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 18
Source File: SwSequenceTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStageN
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.Params
import org.apache.spark.sql._

import scala.reflect.runtime.universe.TypeTag

class SwSequenceTransformer[I <: FeatureType, O <: FeatureType, T <: Transformer with Params]
(
  val inputParamName: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String = UID[SwSequenceTransformer[I, O, T]]
)(
  implicit val tti: TypeTag[I],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends SwTransformerN[I, O, T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 19
Source File: SwBinaryEstimator.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage2
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag

private[stages] final class SwBinaryModel[I1 <: FeatureType,
I2 <: FeatureType, O <: FeatureType, T <: Model[T] with Params]
(
  val inputParam1Name: String,
  val inputParam2Name: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String
)(
  implicit val tti1: TypeTag[I1],
  val tti2: TypeTag[I2],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends Model[SwBinaryModel[I1, I2, O, T]] with SwTransformer2[I1, I2, O, T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 20
Source File: SwQuaternaryTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage4
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.Params
import org.apache.spark.sql._

import scala.reflect.runtime.universe.TypeTag

class SwQuaternaryTransformer[I1 <: FeatureType, I2 <: FeatureType, I3 <: FeatureType, I4 <: FeatureType,
O <: FeatureType, T <: Transformer with Params]
(
  val inputParam1Name: String,
  val inputParam2Name: String,
  val inputParam3Name: String,
  val inputParam4Name: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String = UID[SwQuaternaryTransformer[I1, I2, I3, I4, O, T]]
)(
  implicit val tti1: TypeTag[I1],
  val tti2: TypeTag[I2],
  val tti3: TypeTag[I3],
  val tti4: TypeTag[I4],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends SwTransformer4[I1, I2, I3, I4, O, T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 21
Source File: SwTernaryTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage3
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Model, Transformer}
import org.apache.spark.sql._

import scala.reflect.runtime.universe.TypeTag

class SwTernaryTransformer[I1 <: FeatureType, I2 <: FeatureType, I3 <: FeatureType, O <: FeatureType,
T <: Model[T] with Params]
(
  val inputParam1Name: String,
  val inputParam2Name: String,
  val inputParam3Name: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String = UID[SwTernaryTransformer[I1, I2, I3, O, T]]
)(
  implicit val tti1: TypeTag[I1],
  val tti2: TypeTag[I2],
  val tti3: TypeTag[I3],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends SwTransformer3[I1, I2, I3, O, T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 22
Source File: SwBinaryTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage2
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.Params
import org.apache.spark.sql.{DataFrame, Dataset}

import scala.reflect.runtime.universe.TypeTag

class SwBinaryTransformer[I1 <: FeatureType, I2 <: FeatureType, O <: FeatureType,
T <: Transformer with Params]
(
  val inputParam1Name: String,
  val inputParam2Name: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String = UID[SwBinaryTransformer[I1, I2, O, T]]
)(
  implicit val tti1: TypeTag[I1],
  val tti2: TypeTag[I2],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends SwTransformer2[I1, I2, O, T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 23
Source File: SwTransformerSpec.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.test

import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage
import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.Params

import scala.reflect.ClassTag
import scala.reflect.runtime.universe.WeakTypeTag

  def sparkStage: Option[SparkTransformerType] = transformer.getSparkMlStage()

  it should "have a Spark stage set" in {
    sparkStage match {
      case None => fail("Spark stage is not set")
      case Some(s) =>
        withClue(s"Spark stage type is '${s.getClass.getName}' (expected '${stc.runtimeClass.getName}'):") {
          s.isInstanceOf[SparkTransformerType] shouldBe true
        }
    }
  }

  it should "have input column names set" in {
    transformer.getInputColParamNames() should not be empty
  }

  it should "have output column name set" in {
    transformer.getOutputColParamNames() should not be empty
  }

  it should "have inputs set on Spark stage" in {
    transformer.getInputColParamNames().flatMap(name => sparkStage.flatMap(s => s.get(s.getParam(name)))) shouldBe transformer.getInputFeatures().map(_.name)
  }

  it should "have output set on Spark stage" in {
    transformer.getOutputColParamNames().flatMap(name => sparkStage.flatMap(s => s.get(s.getParam(name)))) shouldBe Array(transformer.getOutputFeatureName)
  }
}
Example 24
Source File: SparkStageParam.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages

import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams
import org.apache.hadoop.fs.Path
import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.param.{Param, ParamPair, Params}
import org.apache.spark.ml.util.{Identifiable, MLReader, MLWritable}
import org.apache.spark.util.SparkUtils
import org.json4s.JsonAST.{JObject, JValue}
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import org.json4s.{DefaultFormats, Formats, JString}

class SparkStageParam[S <: PipelineStage with Params]
(
  parent: String,
  name: String,
  doc: String,
  isValid: Option[S] => Boolean
) extends Param[Option[S]](parent, name, doc, isValid) {

  import SparkStageParam._

  override def jsonDecode(jsonStr: String): Option[S] = {
    val json = parse(jsonStr)
    val uid = (json \ "uid").extractOpt[String]
    val path = (json \ "path").extractOpt[String]

    path -> uid match {
      case (None, _) | (_, None) | (_, Some(NoUID)) =>
        savePath = None
        None
      case (Some(p), Some(stageUid)) =>
        savePath = Option(p)
        val stagePath = new Path(p, stageUid).toString
        val className = (json \ "className").extract[String]
        val cls = SparkUtils.classForName(className)
        val stage = cls.getMethod("read").invoke(null).asInstanceOf[MLReader[PipelineStage]].load(stagePath)
        Option(stage).map(_.asInstanceOf[S])
    }
  }
}

object SparkStageParam {
  implicit val formats: Formats = DefaultFormats
  val NoClass = ""
  val NoUID = ""

  def updateParamsMetadataWithPath(jValue: JValue, path: String): JValue = jValue match {
    case JObject(pairs) => JObject(
      pairs.map {
        case (SparkWrapperParams.SparkStageParamName, j) =>
          SparkWrapperParams.SparkStageParamName -> j.merge(JObject("path" -> JString(path)))
        case param => param
      }
    )
    case j => throw new IllegalArgumentException(s"Cannot recognize JSON Spark params metadata: $j")
  }
}
Example 25
Source File: DefaultMLWriter.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.serialization

import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import org.apache.spark.ml.param.{ParamPair, Params}
import org.apache.spark.ml.util.MLWriter
import org.json4s.JsonDSL._
import org.json4s._
import org.json4s.jackson.JsonMethods._

import ai.deepsense.deeplang.doperables.Transformer
import ai.deepsense.sparkutils.ML.MLWriterWithSparkContext

class DefaultMLWriter[T <: Params](instance: T) extends MLWriter with MLWriterWithSparkContext {

  def saveImpl(path: String): Unit = {
    val modelPath = Transformer.modelFilePath(path)
    saveMetadata(instance, path, sc)
    CustomPersistence.save(sparkContext, instance, modelPath)
  }

  // Copied from org.apache.spark.ml.util.DefaultParamWriter.
  // We need to be consistent with Spark Format, but this method is private.
  private def saveMetadata(
      instance: Params,
      path: String,
      sc: SparkContext,
      extraMetadata: Option[JObject] = None,
      paramMap: Option[JValue] = None): Unit = {
    val uid = instance.uid
    val cls = instance.getClass.getName
    val params = instance.extractParamMap().toSeq.asInstanceOf[Seq[ParamPair[Any]]]
    val jsonParams = paramMap.getOrElse(render(params.map { case ParamPair(p, v) =>
      p.name -> parse(p.jsonEncode(v))
    }.toList))
    val basicMetadata = ("class" -> cls) ~
      ("timestamp" -> System.currentTimeMillis()) ~
      ("sparkVersion" -> sc.version) ~
      ("uid" -> uid) ~
      ("paramMap" -> jsonParams)
    val metadata = extraMetadata match {
      case Some(jObject) =>
        basicMetadata ~ jObject
      case None =>
        basicMetadata
    }
    val metadataPath = new Path(path, "metadata").toString
    val metadataJson = compact(render(metadata))
    sc.parallelize(Seq(metadataJson), 1).saveAsTextFile(metadataPath)
  }
}
Example 26
Source File: ParamsAndFeaturesWritable.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp

import org.apache.spark.ml.param.Params
import org.apache.spark.ml.util.{DefaultParamsWritable, MLWriter}
import org.apache.spark.sql.SparkSession

class FeaturesWriter[T](annotatorWithFeatures: HasFeatures, baseWriter: MLWriter, onWritten: (String, SparkSession) => Unit)
  extends MLWriter with HasFeatures {

  override protected def saveImpl(path: String): Unit = {
    baseWriter.save(path)

    for (feature <- annotatorWithFeatures.features) {
      if (feature.orDefault.isDefined)
        feature.serializeInfer(sparkSession, path, feature.name, feature.getOrDefault)
    }

    onWritten(path, sparkSession)
  }
}

trait ParamsAndFeaturesWritable extends DefaultParamsWritable with Params with HasFeatures {

  protected def onWrite(path: String, spark: SparkSession): Unit = {}

  override def write: MLWriter = {
    new FeaturesWriter(
      this,
      super.write,
      (path: String, spark: SparkSession) => onWrite(path, spark)
    )
  }
}
Example 27
Source File: HasEmbeddingsProperties.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.AnnotatorType
import org.apache.spark.ml.param.{BooleanParam, IntParam, Params}
import org.apache.spark.sql.Column
import org.apache.spark.sql.types.MetadataBuilder

trait HasEmbeddingsProperties extends Params {

  val dimension = new IntParam(this, "dimension", "Number of embedding dimensions")

  def setDimension(value: Int): this.type = set(this.dimension, value)
  def getDimension: Int = $(dimension)

  protected def wrapEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.WORD_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }

  protected def wrapSentenceEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.SENTENCE_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }
}
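Because the wrap* methods are protected, the trait is only useful once mixed into a stage. A minimal, hypothetical mix-in (the class name is invented for illustration) looks like this:

import com.johnsnowlabs.nlp.embeddings.HasEmbeddingsProperties
import org.apache.spark.ml.param.{ParamMap, Params}
import org.apache.spark.ml.util.Identifiable

// Hypothetical stage that mixes in the trait to expose the `dimension` param.
class MyEmbeddingsStage(override val uid: String) extends Params with HasEmbeddingsProperties {
  def this() = this(Identifiable.randomUID("myEmbeddingsStage"))
  override def copy(extra: ParamMap): MyEmbeddingsStage = defaultCopy(extra)
}

val stage = new MyEmbeddingsStage().setDimension(300)
stage.getDimension // 300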
Example 28
Source File: WordLengthFilter.scala From mleap with Apache License 2.0
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.WordLengthFilterModel
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators, Params}
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

  final def getWordLength: Int = $(wordLength)
}

class WordLengthFilter(override val uid: String) extends Transformer
  with WordLengthFilterParams
  with DefaultParamsWritable {

  val defaultLength = 3
  var model: WordLengthFilterModel = new WordLengthFilterModel(defaultLength) //Initialize with default filter length 3

  def this(model: WordLengthFilterModel) = this(uid = Identifiable.randomUID("filter_words"))
  def this() = this(new WordLengthFilterModel)

  def setInputCol(value: String): this.type = set(inputCol, value)
  def setOutputCol(value: String): this.type = set(outputCol, value)
  def setWordLength(value: Int = defaultLength): this.type = set(wordLength, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    if(defaultLength != getWordLength) model = new WordLengthFilterModel(getWordLength)
    val filterWordsUdf = udf {
      (words: Seq[String]) => model(words)
    }

    dataset.withColumn($(outputCol), filterWordsUdf(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    require(schema($(inputCol)).dataType.isInstanceOf[ArrayType],
      s"Input column must be of type ArrayType(StringType,true) but got ${schema($(inputCol)).dataType}")
    val inputFields = schema.fields

    require(!inputFields.exists(_.name == $(outputCol)),
      s"Output column ${$(outputCol)} already exists.")

    StructType(schema.fields :+ StructField($(outputCol), ArrayType(StringType, true)))
  }
}

object WordLengthFilter extends DefaultParamsReadable[WordLengthFilter] {
  override def load(path: String): WordLengthFilter = super.load(path)
}
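Usage follows the standard Transformer pattern. A minimal sketch, assuming a SparkSession named spark is in scope and using invented column names and data:

import spark.implicits._

// Hypothetical usage: keep only sufficiently long tokens in an array<string> column,
// with a word-length threshold of 4.
val df = Seq(
  Tuple1(Seq("a", "to", "word", "another")),
  Tuple1(Seq("of", "lengthy", "tokens"))
).toDF("words")

val filtered = new WordLengthFilter()
  .setInputCol("words")
  .setOutputCol("filtered_words")
  .setWordLength(4)
  .transform(df)

filtered.show(truncate = false)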
Example 29
Source File: ParamUtil.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.util

import org.apache.spark.ml.param.{Param, Params}

trait ParamUtil {
  def setOptional[T](obj1: Params,
                     obj2: Params,
                     param1: Param[T],
                     param2: Param[T]): Unit = {
    if(obj2.isSet(param2)) {
      obj1.set(param1, obj2.get(param2).get)
    } else {
      obj1.clear(param1)
    }
  }
}

object ParamUtil extends ParamUtil
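setOptional mirrors the set/clear semantics of Params: the target parameter is set only when the source parameter was explicitly set, and cleared otherwise. A hypothetical usage with two Binarizer stages (chosen purely for illustration) could look like this:

import org.apache.spark.ml.bundle.util.ParamUtil
import org.apache.spark.ml.feature.Binarizer

val source = new Binarizer().setThreshold(0.75)
val target = new Binarizer()

// Copies `threshold` to `target` because it was explicitly set on `source`;
// if it had not been set, the call would clear it on `target` instead.
ParamUtil.setOptional(target, source, target.threshold, source.threshold)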