org.apache.spark.ml.Model Scala Examples
The following examples show how to use org.apache.spark.ml.Model.
Each example is taken from an open source project; the project, source file, and license are noted above the code so you can refer back to the original implementation.
Example 1
Source File: SparkModelConverter.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.specific

import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.base.binary.OpTransformer2
import com.salesforce.op.stages.impl.classification._
import com.salesforce.op.stages.impl.regression._
import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostRegressionModel}
import org.apache.spark.ml.classification._
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.regression._
import org.apache.spark.ml.{Model, PredictionModel}

// Excerpt: the enclosing object declaration is restored here to match the file name.
object SparkModelConverter {

  // TODO remove when loco and model selector are updated
  def toOPUnchecked(
    model: Model[_],
    uid: String
  ): OpTransformer2[RealNN, OPVector, Prediction] = {
    model match {
      case m: LogisticRegressionModel => new OpLogisticRegressionModel(m, uid = uid)
      case m: RandomForestClassificationModel => new OpRandomForestClassificationModel(m, uid = uid)
      case m: NaiveBayesModel => new OpNaiveBayesModel(m, uid)
      case m: DecisionTreeClassificationModel => new OpDecisionTreeClassificationModel(m, uid = uid)
      case m: GBTClassificationModel => new OpGBTClassificationModel(m, uid = uid)
      case m: LinearSVCModel => new OpLinearSVCModel(m, uid = uid)
      case m: MultilayerPerceptronClassificationModel =>
        new OpMultilayerPerceptronClassificationModel(m, uid = uid)
      case m: LinearRegressionModel => new OpLinearRegressionModel(m, uid = uid)
      case m: RandomForestRegressionModel => new OpRandomForestRegressionModel(m, uid = uid)
      case m: GBTRegressionModel => new OpGBTRegressionModel(m, uid = uid)
      case m: DecisionTreeRegressionModel => new OpDecisionTreeRegressionModel(m, uid = uid)
      case m: GeneralizedLinearRegressionModel => new OpGeneralizedLinearRegressionModel(m, uid = uid)
      case m: XGBoostClassificationModel => new OpXGBoostClassificationModel(m, uid = uid)
      case m: XGBoostRegressionModel => new OpXGBoostRegressionModel(m, uid = uid)
      case m => throw new RuntimeException(s"model conversion not implemented for model $m")
    }
  }
}
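The converter is applied to an already-fitted Spark model. A minimal usage sketch, assuming the method lives on the SparkModelConverter object (as the file name suggests) and that trainDf is a hypothetical training DataFrame with "label" and "features" columns:

  import org.apache.spark.ml.classification.LogisticRegression

  // Fit a plain Spark model, then wrap it as a TransmogrifAI transformer
  val fitted = new LogisticRegression().fit(trainDf)
  val opStage = SparkModelConverter.toOPUnchecked(fitted, uid = "opLogReg_000000000001")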
Example 2
Source File: SerializableSparkModel.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkModel[M <: Model[M]](val sparkModel: M)
  extends ML.Model[SerializableSparkModel[M]]
  with MLWritable {

  override def copy(extra: ParamMap): SerializableSparkModel[M] =
    new SerializableSparkModel(sparkModel.copy(extra))

  override def write: MLWriter = {
    sparkModel match {
      case w: MLWritable => w.write
      case _ => new DefaultMLWriter(this)
    }
  }

  override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset)

  override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema)

  override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae"
}

// This class may seem unused, but it is used reflectively by spark deserialization mechanism
object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] {
  override def read: MLReader[SerializableSparkModel[_]] = {
    new DefaultMLReader[SerializableSparkModel[_]]()
  }
}
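A minimal usage sketch, assuming logRegModel is a hypothetical fitted LogisticRegressionModel; wrapping it makes the model persistable through the deepsense serialization machinery:

  // Wrap a fitted Spark model and persist it via its MLWriter
  val wrapped = new SerializableSparkModel(logRegModel)
  wrapped.write.overwrite().save("/tmp/serializable-logreg")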
Example 3
Source File: SerializableSparkEstimator.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
}
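A minimal usage sketch, assuming df is a hypothetical training DataFrame; fitting the wrapper yields a SerializableSparkModel instead of a bare Spark model:

  import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}

  val estimator =
    new SerializableSparkEstimator[LogisticRegressionModel, LogisticRegression](new LogisticRegression())
  val model: SerializableSparkModel[LogisticRegressionModel] = estimator.fitDF(df)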
Example 4
Source File: OpPipelineStageReaderWriterTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages

import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.stages.OpPipelineStageReaderWriter._
import com.salesforce.op.test.PassengerSparkFixtureTest
import com.salesforce.op.utils.reflection.ReflectionUtils
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.{Model, Transformer}
import org.apache.spark.sql.types.{DataType, Metadata, MetadataBuilder}
import org.json4s.JsonAST.JValue
import org.json4s.jackson.JsonMethods.{compact, parse, pretty, render}
import org.json4s.{JArray, JObject}
import org.scalatest.FlatSpec
import org.slf4j.LoggerFactory

// TODO: consider adding a read/write test for a spark wrapped stage as well
private[stages] abstract class OpPipelineStageReaderWriterTest
  extends FlatSpec with PassengerSparkFixtureTest {

  val meta = new MetadataBuilder().putString("foo", "bar").build()
  val expectedFeaturesLength = 1
  def stage: OpPipelineStageBase with Transformer
  val expected: Array[Real]
  val hasOutputName = true

  private val log = LoggerFactory.getLogger(this.getClass)
  private lazy val savePath = tempDir + "/" + this.getClass.getSimpleName + "-" + System.currentTimeMillis()
  private lazy val writer = new OpPipelineStageWriter(stage)
  private lazy val stageJsonString: String = writer.writeToJsonString(savePath)
  private lazy val stageJson: JValue = parse(stageJsonString)
  private lazy val isModel = stage.isInstanceOf[Model[_]]
  private val FN = FieldNames

  Spec(this.getClass) should "write stage uid" in {
    log.info(pretty(stageJson))
    (stageJson \ FN.Uid.entryName).extract[String] shouldBe stage.uid
  }

  it should "write class name" in {
    (stageJson \ FN.Class.entryName).extract[String] shouldBe stage.getClass.getName
  }

  it should "write params map" in {
    val params = extractParams(stageJson).extract[Map[String, Any]]
    if (hasOutputName) {
      params should have size 4
      params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema", "outputFeatureName")
    } else {
      params should have size 3
      params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema")
    }
  }

  it should "write outputMetadata" in {
    val params = extractParams(stageJson)
    val metadataStr = compact(render(extractParams(stageJson) \ "outputMetadata"))
    val metadata = Metadata.fromJson(metadataStr)
    metadata shouldBe stage.getMetadata()
  }

  it should "write inputSchema" in {
    val schemaStr = compact(render(extractParams(stageJson) \ "inputSchema"))
    val schema = DataType.fromJson(schemaStr)
    schema shouldBe stage.getInputSchema()
  }

  it should "write input features" in {
    val jArray = (extractParams(stageJson) \ "inputFeatures").extract[JArray]
    jArray.values should have length expectedFeaturesLength
    val obj = jArray(0).extract[JObject]
    obj.values.keys shouldBe Set("name", "isResponse", "isRaw", "uid", "typeName", "stages", "originFeatures")
  }

  it should "write model ctor args" in {
    if (stage.isInstanceOf[Model[_]]) {
      val ctorArgs = (stageJson \ FN.CtorArgs.entryName).extract[JObject]
      val (_, args) = ReflectionUtils.bestCtorWithArgs(stage)
      ctorArgs.values.keys shouldBe args.map(_._1).toSet
    }
  }

  it should "load stage correctly" in {
    val reader = new OpPipelineStageReader(stage)
    val stageLoaded = reader.loadFromJsonString(stageJsonString, path = savePath)
    stageLoaded shouldBe a[OpPipelineStageBase]
    stageLoaded shouldBe a[Transformer]
    stageLoaded.getOutput() shouldBe a[FeatureLike[_]]
    val _ = stage.asInstanceOf[Transformer].transform(passengersDataSet)
    val transformed = stageLoaded.asInstanceOf[Transformer].transform(passengersDataSet)
    transformed.collect(stageLoaded.getOutput().asInstanceOf[FeatureLike[Real]]) shouldBe expected
    stageLoaded.uid shouldBe stage.uid
    stageLoaded.operationName shouldBe stage.operationName
    stageLoaded.getInputFeatures() shouldBe stage.getInputFeatures()
    stageLoaded.getInputSchema() shouldBe stage.getInputSchema()
  }

  private def extractParams(stageJson: JValue): JValue = {
    val defaultParamsMap = stageJson \ FN.DefaultParamMap.entryName
    val paramsMap = stageJson \ FN.ParamMap.entryName
    defaultParamsMap.merge(paramsMap)
  }
}
Example 5
Source File: ModelSelectorNames.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.selector

import com.salesforce.op.evaluators.{EvaluationMetrics, _}
import com.salesforce.op.features.types._
import com.salesforce.op.stages._
import com.salesforce.op.stages.base.binary.OpTransformer2
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.{DataFrame, Dataset}

case object ModelSelectorNames {
  val TrainValSplitResults = "trainValidationSplitResults"
  val CrossValResults = "crossValidationResults"
  val TrainingEval = "trainingSetEvaluationResults"
  val HoldOutEval = "testSetEvaluationResults"
  val ResampleValues = "resamplingValues"
  val CuttValues = "cuttValues"
  val PreSplitterDataCount = "preSplitterDataCount"
  val BestModelUid = "bestModelUID"
  val BestModelName = "bestModelName"
  val Positive = "positiveLabels"
  val Negative = "negativeLabels"
  val Desired = "desiredFraction"
  val UpSample = "upSamplingFraction"
  val DownSample = "downSamplingFraction"
  val idColName = "rowId"
  val LabelsKept = "labelsKept"
  val LabelsDropped = "labelsDropped"
  val LabelsDroppedTotal = "labelsDroppedTotal"

  type ModelType = Model[_ <: Model[_]] with OpTransformer2[RealNN, OPVector, Prediction]
  type EstimatorType = Estimator[_ <: Model[_]] with OpPipelineStage2[RealNN, OPVector, Prediction]

  // Stage param names
  val inputParam1Name = "labelCol"
  val inputParam2Name = "featuresCol"
  val outputParamName = "outputFeatureName"
}

// Excerpt: this method belongs to the model selector's model class, defined later in the same file.
private[op] def evaluateModel(data: Dataset[_]): DataFrame = {
  val scored = transform(data)
  val metrics = evaluate(scored)
  val metadata = ModelSelectorSummary.fromMetadata(getMetadata().getSummaryMetadata())
    .copy(holdoutEvaluation = Option(metrics))
  setMetadata(metadata.toMetadata().toSummaryMetadata())
  scored
}
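The ModelType and EstimatorType aliases describe the stages a model selector works with. A small illustrative sketch (the describeBestModel helper is hypothetical, not part of the project):

  import com.salesforce.op.stages.impl.selector.ModelSelectorNames

  def describeBestModel(best: ModelSelectorNames.ModelType): String =
    s"${ModelSelectorNames.BestModelName} = ${best.getClass.getSimpleName}, " +
      s"${ModelSelectorNames.BestModelUid} = ${best.uid}"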
Example 6
Source File: SwUnaryEstimator.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage1
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag

private[stages] final class SwUnaryModel[I <: FeatureType, O <: FeatureType, T <: Model[T] with Params]
(
  val inputParamName: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String
)(
  implicit val tti: TypeTag[I],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends Model[SwUnaryModel[I, O, T]] with SwTransformer1[I, O, T] with SparkWrapperParams[T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 7
Source File: SwSequenceEstimator.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStageN
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag

private[stages] final class SwSequenceModel[I <: FeatureType, O <: FeatureType, T <: Model[T] with Params]
(
  val inputParamName: String,
  val operationName: String,
  val outputParamName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String
)(
  implicit val tti: TypeTag[I],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends Model[SwSequenceModel[I, O, T]] with SwTransformerN[I, O, T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 8
Source File: SwBinaryEstimator.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage2
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag

private[stages] final class SwBinaryModel[I1 <: FeatureType,
I2 <: FeatureType, O <: FeatureType, T <: Model[T] with Params]
(
  val inputParam1Name: String,
  val inputParam2Name: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String
)(
  implicit val tti1: TypeTag[I1],
  val tti2: TypeTag[I2],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends Model[SwBinaryModel[I1, I2, O, T]] with SwTransformer2[I1, I2, O, T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 9
Source File: SwTernaryTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage3
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Model, Transformer}
import org.apache.spark.sql._

import scala.reflect.runtime.universe.TypeTag

class SwTernaryTransformer[I1 <: FeatureType, I2 <: FeatureType, I3 <: FeatureType, O <: FeatureType,
T <: Model[T] with Params]
(
  val inputParam1Name: String,
  val inputParam2Name: String,
  val inputParam3Name: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String = UID[SwTernaryTransformer[I1, I2, I3, O, T]]
)(
  implicit val tti1: TypeTag[I1],
  val tti2: TypeTag[I2],
  val tti3: TypeTag[I3],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends SwTransformer3[I1, I2, I3, O, T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 10
Source File: CrossValidation.scala From Scala-for-Machine-Learning-Second-Edition with MIT License
package org.scalaml.spark.mlpipeline

import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel}
import org.apache.spark.ml.{Model, Pipeline, PipelineStage}
import org.apache.spark.sql._

// Excerpt: these methods belong to an enclosing cross-validation class (elided here)
// that provides the `estimator` and `numFolds` members they reference.

@throws(classOf[IllegalArgumentException])
protected def apply(
  trainDf: DataFrame,
  stages: Array[PipelineStage],
  grid: Array[ParamMap]
): CrossValidatorModel = {
  require(stages.size > 0, "Cannot cross-validate pipeline without stages")
  require(grid.size > 0, "Cannot cross-validate with undefined grid")

  val pipeline = new Pipeline().setStages(stages ++ Array[PipelineStage](estimator))
  new CrossValidator()
    .setEstimator(pipeline)
    .setEstimatorParamMaps(grid)
    .setEvaluator(new BinaryClassificationEvaluator)
    .setNumFolds(numFolds)
    .fit(trainDf)
}

protected def evaluate(
  trainDf: DataFrame,
  stages: Array[PipelineStage],
  grid: Array[ParamMap]
): Evaluator = this(trainDf, stages, grid).getEvaluator
Example 11
Source File: OpEstimatorSpec.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.test

import java.io.File

import com.salesforce.op.features.types._
import com.salesforce.op.stages._
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset
import org.scalactic.Equality
import org.scalatest.events.{Event, TestFailed}
import org.scalatest.{Args, Reporter}

import scala.collection.mutable.ArrayBuffer
import scala.reflect._
import scala.reflect.runtime.universe._

// Excerpt: this method belongs to the enclosing OpEstimatorSpec test base class (elided here).
private def modelSpec(): Unit = {
  // Define transformer spec for the fitted model reusing the same inputs & Spark context
  val modelSpec = new OpTransformerSpec[O, ModelType] {
    override implicit val featureTypeEquality: Equality[O] = OpEstimatorSpec.this.featureTypeEquality
    override implicit val seqEquality: Equality[Seq[O]] = OpEstimatorSpec.this.seqEquality
    lazy val transformer: ModelType = OpEstimatorSpec.this.model
    lazy val inputData: Dataset[_] = OpEstimatorSpec.this.inputData
    lazy val expectedResult: Seq[O] = OpEstimatorSpec.this.expectedResult
    override implicit lazy val spark = OpEstimatorSpec.this.spark
    override def specName: String = "model"
    override def tempDir: File = OpEstimatorSpec.this.tempDir
  }

  // Register all model spec tests
  for {
    testName <- modelSpec.testNames
  } registerTest(testName) {
    // Run test & collect failures
    val failures = ArrayBuffer.empty[TestFailed]
    val reporter = new Reporter {
      def apply(event: Event): Unit = event match {
        case f: TestFailed => failures += f
        case _ =>
      }
    }
    // Note: We set 'runTestInNewInstance = true' to avoid restarting Spark context on every test run
    val args = Args(reporter, runTestInNewInstance = true)
    modelSpec.run(testName = Some(testName), args = args)
    // Propagate the failure if any
    for {failure <- failures.headOption} {
      failure.throwable.map(fail(failure.message, _)).getOrElse(fail(failure.message))
    }
  }
}
Example 12
Source File: TypedEstimator.scala From frameless with Apache License 2.0
package frameless
package ml

import frameless.ops.SmartProject
import org.apache.spark.ml.{Estimator, Model}

trait TypedEstimator[Inputs, Outputs, M <: Model[M]] {
  val estimator: Estimator[M]

  def fit[T, F[_]](ds: TypedDataset[T])(
    implicit
    smartProject: SmartProject[T, Inputs],
    F: SparkDelay[F]
  ): F[AppendTransformer[Inputs, Outputs, M]] = {
    implicit val sparkSession = ds.dataset.sparkSession
    F.delay {
      val inputDs = smartProject.apply(ds)
      val model = estimator.fit(inputDs.dataset)
      new AppendTransformer[Inputs, Outputs, M] {
        val transformer: M = model
      }
    }
  }
}
Example 13
Source File: SerializableSparkModel.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import ai.deepsense.sparkutils.ML

class SerializableSparkModel[M <: Model[M]](val sparkModel: M)
  extends ML.Model[SerializableSparkModel[M]]
  with MLWritable {

  override def copy(extra: ParamMap): SerializableSparkModel[M] =
    new SerializableSparkModel(sparkModel.copy(extra))

  override def write: MLWriter = {
    sparkModel match {
      case w: MLWritable => w.write
      case _ => new DefaultMLWriter(this)
    }
  }

  override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset)

  override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema)

  override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae"
}

// This class may seem unused, but it is used reflectively by spark deserialization mechanism
object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] {
  override def read: MLReader[SerializableSparkModel[_]] = {
    new DefaultMLReader[SerializableSparkModel[_]]()
  }
}
Example 14
Source File: SerializableSparkEstimator.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
}
Example 15
Source File: IsolationForest.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.isolationforest

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util._
import org.apache.spark.ml.{Estimator, Model}
import com.linkedin.relevance.isolationforest.{IsolationForestParams,
  IsolationForest => IsolationForestSource, IsolationForestModel => IsolationForestModelSource}
import com.microsoft.ml.spark.core.contracts.Wrappable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.StructType

object IsolationForest extends DefaultParamsReadable[IsolationForest]

class IsolationForest(override val uid: String, val that: IsolationForestSource)
  extends Estimator[IsolationForestModel]
  with IsolationForestParams with DefaultParamsWritable with Wrappable {

  def this(uid: String) = this(uid, new IsolationForestSource(uid))
  def this() = this(Identifiable.randomUID("IsolationForest"))

  override def copy(extra: ParamMap): IsolationForest =
    new IsolationForest(uid, that.copy(extra))

  override def fit(data: Dataset[_]): IsolationForestModel =
    new IsolationForestModel(uid, that.fit(data))

  override def transformSchema(schema: StructType): StructType =
    that.transformSchema(schema)
}

class IsolationForestModel(override val uid: String, val that: IsolationForestModelSource)
  extends Model[IsolationForestModel] with MLWritable {

  override def copy(extra: ParamMap): IsolationForestModel =
    new IsolationForestModel(uid, that.copy(extra))

  override def transform(data: Dataset[_]): DataFrame = that.transform(data)

  override def transformSchema(schema: StructType): StructType = that.transformSchema(schema)

  override def write: MLWriter = that.write
}

class IsolationForestModelReader extends MLReader[IsolationForestModel] with Serializable {
  override def load(path: String): IsolationForestModel = {
    val that = IsolationForestModelSource.load(path)
    new IsolationForestModel(that.uid, that)
  }
}

object IsolationForestModel extends MLReadable[IsolationForestModel] {
  override def read: MLReader[IsolationForestModel] = new IsolationForestModelReader
}
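A minimal usage sketch, assuming featuresDf is a hypothetical DataFrame in the format expected by the underlying LinkedIn isolation-forest library; only methods defined in the wrapper above are used:

  val forest = new IsolationForest()
  val model: IsolationForestModel = forest.fit(featuresDf)   // wraps the fitted LinkedIn model
  val scored = model.transform(featuresDf)
  model.write.overwrite().save("/tmp/isolation-forest-model")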
Example 16
Source File: StringToShortIndexer.scala From spark-ext with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkException
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.util.collection.OpenHashMap

class StringToShortIndexer(override val uid: String) extends Estimator[StringToShortIndexerModel]
  with StringIndexerBase {

  def this() = this(Identifiable.randomUID("strShortIdx"))

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def fit(dataset: DataFrame): StringToShortIndexerModel = {
    val counts = dataset.select(col($(inputCol)).cast(StringType))
      .map(_.getString(0))
      .countByValue()
    val labels = counts.toSeq.sortBy(-_._2).map(_._1).toArray
    require(labels.length <= Short.MaxValue,
      s"Unique labels count (${labels.length}) should be less than Short.MaxValue (${Short.MaxValue})")
    copyValues(new StringToShortIndexerModel(uid, labels).setParent(this))
  }

  override def transformSchema(schema: StructType): StructType = {
    validateAndTransformSchema(schema)
  }

  override def copy(extra: ParamMap): StringToShortIndexer = defaultCopy(extra)
}

class StringToShortIndexerModel (
    override val uid: String,
    val labels: Array[String]) extends Model[StringToShortIndexerModel] with StringIndexerBase {

  def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), labels)

  require(labels.length <= Short.MaxValue,
    s"Unique labels count (${labels.length}) should be less than Short.MaxValue (${Short.MaxValue})")

  private val labelToIndex: OpenHashMap[String, Short] = {
    val n = labels.length.toShort
    val map = new OpenHashMap[String, Short](n)
    var i: Short = 0
    while (i < n) {
      map.update(labels(i), i)
      i = (i + 1).toShort
    }
    map
  }

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: DataFrame): DataFrame = {
    if (!dataset.schema.fieldNames.contains($(inputCol))) {
      logInfo(s"Input column ${$(inputCol)} does not exist during transformation. " +
        "Skip StringToShortIndexerModel.")
      return dataset
    }

    val indexer = udf { label: String =>
      if (labelToIndex.contains(label)) {
        labelToIndex(label)
      } else {
        // TODO: handle unseen labels
        throw new SparkException(s"Unseen label: $label.")
      }
    }
    val outputColName = $(outputCol)
    val metadata = NominalAttribute.defaultAttr
      .withName(outputColName).withValues(labels).toMetadata()
    dataset.select(col("*"),
      indexer(dataset($(inputCol)).cast(StringType)).as(outputColName, metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    if (schema.fieldNames.contains($(inputCol))) {
      validateAndTransformSchema(schema)
    } else {
      // If the input column does not exist during transformation, we skip StringToShortIndexerModel.
      schema
    }
  }

  override def copy(extra: ParamMap): StringToShortIndexerModel = {
    val copied = new StringToShortIndexerModel(uid, labels)
    copyValues(copied, extra).setParent(parent)
  }
}
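A minimal usage sketch, assuming df is a hypothetical DataFrame with a string "category" column (note the example targets the older DataFrame-based Estimator API shown above):

  val indexer = new StringToShortIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex")
  val model = indexer.fit(df)          // fails if there are more than Short.MaxValue distinct labels
  val indexed = model.transform(df)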
Example 17
Source File: RecursivePipeline.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp

import org.apache.spark.internal.Logging
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset}

import scala.collection.mutable.ListBuffer

class RecursivePipeline(override val uid: String, baseStages: Array[PipelineStage]) extends Pipeline {

  def this() = this(Identifiable.randomUID("RECURSIVE_PIPELINE"), Array.empty)

  def this(uid: String) = this(uid, Array.empty)

  def this(pipeline: Pipeline) = this(pipeline.uid, pipeline.getStages)

  this.setStages(baseStages)

  override def fit(dataset: Dataset[_]): PipelineModel = {
    transformSchema(dataset.schema, logging = true)
    val theStages = $(stages)
    var indexOfLastEstimator = -1
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      stage match {
        case _: Estimator[_] =>
          indexOfLastEstimator = index
        case _ =>
      }
    }
    var curDataset = dataset
    val transformers = ListBuffer.empty[Transformer]
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      if (index <= indexOfLastEstimator) {
        val transformer = stage match {
          case estimator: HasRecursiveFit[_] =>
            estimator.recursiveFit(curDataset, new Pipeline(uid).setStages(transformers.toArray).fit(dataset))
          case estimator: Estimator[_] =>
            estimator.fit(curDataset)
          case t: Transformer =>
            t
          case _ =>
            throw new IllegalArgumentException(
              s"Does not support stage $stage of type ${stage.getClass}")
        }
        if (index < indexOfLastEstimator) {
          curDataset = transformer.transform(curDataset)
        }
        transformers += transformer
      } else {
        transformers += stage.asInstanceOf[Transformer]
      }
    }

    createPipeline(dataset, transformers.toArray)
  }
}

class RecursivePipelineModel(override val uid: String, innerPipeline: PipelineModel)
  extends Model[RecursivePipelineModel] with MLWritable with Logging {

  def this(pipeline: PipelineModel) = this(pipeline.uid, pipeline)

  // drops right at most because is itself included
  private def createRecursiveAnnotators(dataset: Dataset[_]): PipelineModel =
    new Pipeline(uid).setStages(innerPipeline.stages.dropRight(1)).fit(dataset)

  override def copy(extra: ParamMap): RecursivePipelineModel = {
    new RecursivePipelineModel(uid, innerPipeline.copy(extra))
  }

  override def write: MLWriter = {
    innerPipeline.write
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    innerPipeline.stages.foldLeft(dataset.toDF)((cur, transformer) => transformer match {
      case t: HasRecursiveTransform[_] => t.recursiveTransform(cur, createRecursiveAnnotators(dataset))
      case t: AnnotatorModel[_] if t.getLazyAnnotator => cur
      case t: Transformer => t.transform(cur)
    })
  }

  override def transformSchema(schema: StructType): StructType = {
    innerPipeline.transformSchema(schema)
  }
}
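A minimal usage sketch, assuming documentAssembler and tokenizer are hypothetical, already-configured spark-nlp annotator stages and trainingDf is a hypothetical input DataFrame; the class is used exactly like a regular Pipeline:

  val pipeline = new RecursivePipeline().setStages(Array(documentAssembler, tokenizer))
  val pipelineModel = pipeline.fit(trainingDf)
  val annotated = pipelineModel.transform(trainingDf)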
Example 18
Source File: AnnotatorApproach.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp

import com.johnsnowlabs.storage.HasStorage
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.{Estimator, Model, PipelineModel, Transformer}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.types.{ArrayType, MetadataBuilder, StructField, StructType}
import org.apache.spark.ml.util.DefaultParamsWritable

// Excerpt: this method belongs to the AnnotatorApproach abstract class (its declaration is elided here).
override final def transformSchema(schema: StructType): StructType = {
  require(validate(schema), s"Wrong or missing inputCols annotators in $uid.\n" +
    msgHelper(schema) +
    s"\nMake sure such annotators exist in your pipeline, " +
    s"with the right output names and that they have following annotator types: " +
    s"${inputAnnotatorTypes.mkString(", ")}")
  val metadataBuilder: MetadataBuilder = new MetadataBuilder()
  metadataBuilder.putString("annotatorType", outputAnnotatorType)
  val outputFields = schema.fields :+
    StructField(getOutputCol, ArrayType(Annotation.dataType), nullable = false, metadataBuilder.build)
  StructType(outputFields)
}