org.apache.spark.ml.Model Scala Examples

The following examples show how to use org.apache.spark.ml.Model. Each example notes the source file, the open-source project it comes from, and that project's license.
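Before turning to the project examples, here is a minimal sketch of a custom Model subclass, assuming a hypothetical IdentityModel that simply passes its input through unchanged; it is not taken from any project below, but it shows the three members every concrete Model must provide: copy, transform, and transformSchema.

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.StructType

// Hypothetical fitted "model" that leaves its input unchanged.
class IdentityModel(override val uid: String) extends Model[IdentityModel] {

  def this() = this(Identifiable.randomUID("identityModel"))

  // A Model carries Params, so copy must duplicate the stage with any extra params applied.
  override def copy(extra: ParamMap): IdentityModel =
    copyValues(new IdentityModel(uid), extra).setParent(parent)

  // A Model is a Transformer: transform produces the output DataFrame.
  override def transform(dataset: Dataset[_]): DataFrame = dataset.toDF()

  // transformSchema declares the output schema; here it is unchanged.
  override def transformSchema(schema: StructType): StructType = schema
}

A fitted Model like this is what an Estimator's fit method returns, which is the pattern most of the wrapper classes in the examples below build on.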
Example 1
Source File: SparkModelConverter.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.specific

import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.base.binary.OpTransformer2
import com.salesforce.op.stages.impl.classification._
import com.salesforce.op.stages.impl.regression._
import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostRegressionModel}
import org.apache.spark.ml.classification._
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.regression._
import org.apache.spark.ml.{Model, PredictionModel}


// Enclosing object (named after the source file); remaining members are elided in this excerpt.
object SparkModelConverter {

  // TODO remove when loco and model selector are updated
  def toOPUnchecked(
    model: Model[_],
    uid: String
  ): OpTransformer2[RealNN, OPVector, Prediction] = {
    model match {
      case m: LogisticRegressionModel => new OpLogisticRegressionModel(m, uid = uid)
      case m: RandomForestClassificationModel => new OpRandomForestClassificationModel(m, uid = uid)
      case m: NaiveBayesModel => new OpNaiveBayesModel(m, uid)
      case m: DecisionTreeClassificationModel => new OpDecisionTreeClassificationModel(m, uid = uid)
      case m: GBTClassificationModel => new OpGBTClassificationModel(m, uid = uid)
      case m: LinearSVCModel => new OpLinearSVCModel(m, uid = uid)
      case m: MultilayerPerceptronClassificationModel => new OpMultilayerPerceptronClassificationModel(m, uid = uid)
      case m: LinearRegressionModel => new OpLinearRegressionModel(m, uid = uid)
      case m: RandomForestRegressionModel => new OpRandomForestRegressionModel(m, uid = uid)
      case m: GBTRegressionModel => new OpGBTRegressionModel(m, uid = uid)
      case m: DecisionTreeRegressionModel => new OpDecisionTreeRegressionModel(m, uid = uid)
      case m: GeneralizedLinearRegressionModel => new OpGeneralizedLinearRegressionModel(m, uid = uid)
      case m: XGBoostClassificationModel => new OpXGBoostClassificationModel(m, uid = uid)
      case m: XGBoostRegressionModel => new OpXGBoostRegressionModel(m, uid = uid)
      case m => throw new RuntimeException(s"model conversion not implemented for model $m")
    }
  }

} 
Example 2
Source File: SerializableSparkModel.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkModel[M <: Model[M]](val sparkModel: M)
  extends ML.Model[SerializableSparkModel[M]]
  with MLWritable {

  override def copy(extra: ParamMap): SerializableSparkModel[M] =
    new SerializableSparkModel(sparkModel.copy(extra))

  override def write: MLWriter = {
    sparkModel match {
      case w: MLWritable => w.write
      case _ => new DefaultMLWriter(this)
    }
  }

  override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset)

  override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema)

  override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae"
}

// This class may seem unused, but it is used reflectively by the Spark deserialization mechanism
object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] {
  override def read: MLReader[SerializableSparkModel[_]] = {
    new DefaultMLReader[SerializableSparkModel[_]]()
  }
} 
Example 3
Source File: SerializableSparkEstimator.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
} 
Example 4
Source File: OpPipelineStageReaderWriterTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages

import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.stages.OpPipelineStageReaderWriter._
import com.salesforce.op.test.PassengerSparkFixtureTest
import com.salesforce.op.utils.reflection.ReflectionUtils
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.{Model, Transformer}
import org.apache.spark.sql.types.{DataType, Metadata, MetadataBuilder}
import org.json4s.JsonAST.JValue
import org.json4s.jackson.JsonMethods.{compact, parse, pretty, render}
import org.json4s.{JArray, JObject}
import org.scalatest.FlatSpec
import org.slf4j.LoggerFactory


// TODO: consider adding a read/write test for a spark wrapped stage as well
private[stages] abstract class OpPipelineStageReaderWriterTest
  extends FlatSpec with PassengerSparkFixtureTest {

  val meta = new MetadataBuilder().putString("foo", "bar").build()
  val expectedFeaturesLength = 1
  def stage: OpPipelineStageBase with Transformer
  val expected: Array[Real]
  val hasOutputName = true

  private val log = LoggerFactory.getLogger(this.getClass)
  private lazy val savePath = tempDir + "/" + this.getClass.getSimpleName + "-" + System.currentTimeMillis()
  private lazy val writer = new OpPipelineStageWriter(stage)
  private lazy val stageJsonString: String = writer.writeToJsonString(savePath)
  private lazy val stageJson: JValue = parse(stageJsonString)
  private lazy val isModel = stage.isInstanceOf[Model[_]]
  private val FN = FieldNames

  Spec(this.getClass) should "write stage uid" in {
    log.info(pretty(stageJson))
    (stageJson \ FN.Uid.entryName).extract[String] shouldBe stage.uid
  }
  it should "write class name" in {
    (stageJson \ FN.Class.entryName).extract[String] shouldBe stage.getClass.getName
  }
  it should "write params map" in {
    val params = extractParams(stageJson).extract[Map[String, Any]]
    if (hasOutputName) {
      params should have size 4
      params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema", "outputFeatureName")
    } else {
      params should have size 3
      params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema")
    }
  }
  it should "write outputMetadata" in {
    val params = extractParams(stageJson)
    val metadataStr = compact(render(params \ "outputMetadata"))
    val metadata = Metadata.fromJson(metadataStr)
    metadata shouldBe stage.getMetadata()
  }
  it should "write inputSchema" in {
    val schemaStr = compact(render(extractParams(stageJson) \ "inputSchema"))
    val schema = DataType.fromJson(schemaStr)
    schema shouldBe stage.getInputSchema()
  }
  it should "write input features" in {
    val jArray = (extractParams(stageJson) \ "inputFeatures").extract[JArray]
    jArray.values should have length expectedFeaturesLength
    val obj = jArray(0).extract[JObject]
    obj.values.keys shouldBe Set("name", "isResponse", "isRaw", "uid", "typeName", "stages", "originFeatures")
  }
  it should "write model ctor args" in {
    if (stage.isInstanceOf[Model[_]]) {
      val ctorArgs = (stageJson \ FN.CtorArgs.entryName).extract[JObject]
      val (_, args) = ReflectionUtils.bestCtorWithArgs(stage)
      ctorArgs.values.keys shouldBe args.map(_._1).toSet
    }
  }
  it should "load stage correctly" in {
    val reader = new OpPipelineStageReader(stage)
    val stageLoaded = reader.loadFromJsonString(stageJsonString, path = savePath)
    stageLoaded shouldBe a[OpPipelineStageBase]
    stageLoaded shouldBe a[Transformer]
    stageLoaded.getOutput() shouldBe a[FeatureLike[_]]
    val _ = stage.asInstanceOf[Transformer].transform(passengersDataSet)
    val transformed = stageLoaded.asInstanceOf[Transformer].transform(passengersDataSet)
    transformed.collect(stageLoaded.getOutput().asInstanceOf[FeatureLike[Real]]) shouldBe expected
    stageLoaded.uid shouldBe stage.uid
    stageLoaded.operationName shouldBe stage.operationName
    stageLoaded.getInputFeatures() shouldBe stage.getInputFeatures()
    stageLoaded.getInputSchema() shouldBe stage.getInputSchema()
  }

  private def extractParams(stageJson: JValue): JValue = {
    val defaultParamsMap = stageJson \ FN.DefaultParamMap.entryName
    val paramsMap = stageJson \ FN.ParamMap.entryName
    defaultParamsMap.merge(paramsMap)
  }

} 
Example 5
Source File: ModelSelectorNames.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.selector

import com.salesforce.op.evaluators.{EvaluationMetrics, _}
import com.salesforce.op.features.types._
import com.salesforce.op.stages._
import com.salesforce.op.stages.base.binary.OpTransformer2
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.{DataFrame, Dataset}

case object ModelSelectorNames {
  val TrainValSplitResults = "trainValidationSplitResults"
  val CrossValResults = "crossValidationResults"
  val TrainingEval = "trainingSetEvaluationResults"
  val HoldOutEval = "testSetEvaluationResults"
  val ResampleValues = "resamplingValues"
  val CuttValues = "cuttValues"
  val PreSplitterDataCount = "preSplitterDataCount"
  val BestModelUid = "bestModelUID"
  val BestModelName = "bestModelName"
  val Positive = "positiveLabels"
  val Negative = "negativeLabels"
  val Desired = "desiredFraction"
  val UpSample = "upSamplingFraction"
  val DownSample = "downSamplingFraction"
  val idColName = "rowId"
  val LabelsKept = "labelsKept"
  val LabelsDropped = "labelsDropped"
  val LabelsDroppedTotal = "labelsDroppedTotal"

  type ModelType = Model[_ <: Model[_]] with OpTransformer2[RealNN, OPVector, Prediction]
  type EstimatorType = Estimator[_ <: Model[_]] with OpPipelineStage2[RealNN, OPVector, Prediction]

  // Stage param names
  val inputParam1Name = "labelCol"
  val inputParam2Name = "featuresCol"
  val outputParamName = "outputFeatureName"
}


  private[op] def evaluateModel(data: Dataset[_]): DataFrame = {
    val scored = transform(data)
    val metrics = evaluate(scored)
    val metadata = ModelSelectorSummary.fromMetadata(getMetadata().getSummaryMetadata())
      .copy(holdoutEvaluation = Option(metrics))
    setMetadata(metadata.toMetadata().toSummaryMetadata())
    scored
  }
} 
Example 6
Source File: SwUnaryEstimator.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage1
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag


private[stages] final class SwUnaryModel[I <: FeatureType, O <: FeatureType, T <: Model[T] with Params]
(
  val inputParamName: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String
)(
  implicit val tti: TypeTag[I],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends Model[SwUnaryModel[I, O, T]] with SwTransformer1[I, O, T] with SparkWrapperParams[T] {

  setSparkMlStage(sparkMlStageIn)

} 
Example 7
Source File: SwSequenceEstimator.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStageN
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag


private[stages] final class SwSequenceModel[I <: FeatureType, O <: FeatureType, T <: Model[T] with Params]
(
  val inputParamName: String,
  val operationName: String,
  val outputParamName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String
)(
  implicit val tti: TypeTag[I],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends Model[SwSequenceModel[I, O, T]] with SwTransformerN[I, O, T] {

  setSparkMlStage(sparkMlStageIn)

} 
Example 8
Source File: SwBinaryEstimator.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage2
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag



private[stages] final class SwBinaryModel[I1 <: FeatureType,
I2 <: FeatureType, O <: FeatureType, T <: Model[T] with Params]
(
  val inputParam1Name: String,
  val inputParam2Name: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String
)(
  implicit val tti1: TypeTag[I1],
  val tti2: TypeTag[I2],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends Model[SwBinaryModel[I1, I2, O, T]] with SwTransformer2[I1, I2, O, T] {

  setSparkMlStage(sparkMlStageIn)
} 
Example 9
Source File: SwTernaryTransformer.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage3
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Model, Transformer}
import org.apache.spark.sql._

import scala.reflect.runtime.universe.TypeTag



class SwTernaryTransformer[I1 <: FeatureType, I2 <: FeatureType, I3 <: FeatureType, O <: FeatureType,
T <: Model[T] with Params]
(
  val inputParam1Name: String,
  val inputParam2Name: String,
  val inputParam3Name: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String = UID[SwTernaryTransformer[I1, I2, I3, O, T]]
)(
  implicit val tti1: TypeTag[I1],
  val tti2: TypeTag[I2],
  val tti3: TypeTag[I3],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends SwTransformer3[I1, I2, I3, O, T] {

  setSparkMlStage(sparkMlStageIn)

} 
Example 10
Source File: CrossValidation.scala    From Scala-for-Machine-Learning-Second-Edition   with MIT License
package org.scalaml.spark.mlpipeline

import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel}
import org.apache.spark.ml.{Model, Pipeline, PipelineStage}
import org.apache.spark.sql._



  @throws(classOf[IllegalArgumentException])
  protected def apply(
    trainDf: DataFrame,
    stages: Array[PipelineStage],
    grid: Array[ParamMap]
  ): CrossValidatorModel = {
    require(stages.size > 0, "Cannot cross-validate pipeline without stages")
    require(grid.size > 0, "Cannot cross-validate with undefined grid")

    val pipeline = new Pipeline().setStages(stages ++ Array[PipelineStage](estimator))
    new CrossValidator()
      .setEstimator(pipeline)
      .setEstimatorParamMaps(grid)
      .setEvaluator(new BinaryClassificationEvaluator)
      .setNumFolds(numFolds)
      .fit(trainDf)
  }

  protected def evaluate(
    trainDf: DataFrame,
    stages: Array[PipelineStage],
    grid: Array[ParamMap]
  ): Evaluator = this(trainDf, stages, grid).getEvaluator
} 
Example 11
Source File: OpEstimatorSpec.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.test

import java.io.File

import com.salesforce.op.features.types._
import com.salesforce.op.stages._
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset
import org.scalactic.Equality
import org.scalatest.events.{Event, TestFailed}
import org.scalatest.{Args, Reporter}

import scala.collection.mutable.ArrayBuffer
import scala.reflect._
import scala.reflect.runtime.universe._


  private def modelSpec(): Unit = {
    // Define transformer spec for the fitted model reusing the same inputs & Spark context
    val modelSpec = new OpTransformerSpec[O, ModelType] {
      override implicit val featureTypeEquality: Equality[O] = OpEstimatorSpec.this.featureTypeEquality
      override implicit val seqEquality: Equality[Seq[O]] = OpEstimatorSpec.this.seqEquality
      lazy val transformer: ModelType = OpEstimatorSpec.this.model
      lazy val inputData: Dataset[_] = OpEstimatorSpec.this.inputData
      lazy val expectedResult: Seq[O] = OpEstimatorSpec.this.expectedResult
      override implicit lazy val spark = OpEstimatorSpec.this.spark
      override def specName: String = "model"
      override def tempDir: File = OpEstimatorSpec.this.tempDir
    }

    // Register all model spec tests
    for {
      testName <- modelSpec.testNames
    } registerTest(testName) {
      // Run test & collect failures
      val failures = ArrayBuffer.empty[TestFailed]
      val reporter = new Reporter {
        def apply(event: Event): Unit = event match {
          case f: TestFailed => failures += f
          case _ =>
        }
      }
      // Note: We set 'runTestInNewInstance = true' to avoid restarting Spark context on every test run
      val args = Args(reporter, runTestInNewInstance = true)
      modelSpec.run(testName = Some(testName), args = args)

      // Propagate the failure if any
      for {failure <- failures.headOption} {
        failure.throwable.map(fail(failure.message, _)).getOrElse(fail(failure.message))
      }
    }
  }

} 
Example 12
Source File: TypedEstimator.scala    From frameless   with Apache License 2.0
package frameless
package ml

import frameless.ops.SmartProject
import org.apache.spark.ml.{Estimator, Model}


trait TypedEstimator[Inputs, Outputs, M <: Model[M]] {
  val estimator: Estimator[M]

  def fit[T, F[_]](ds: TypedDataset[T])(
    implicit
    smartProject: SmartProject[T, Inputs],
    F: SparkDelay[F]
  ): F[AppendTransformer[Inputs, Outputs, M]] = {
    implicit val sparkSession = ds.dataset.sparkSession
    F.delay {
      val inputDs = smartProject.apply(ds)
      val model = estimator.fit(inputDs.dataset)
      new AppendTransformer[Inputs, Outputs, M] {
        val transformer: M = model
      }
    }
  }
} 
Example 13
Source File: SerializableSparkModel.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import ai.deepsense.sparkutils.ML

class SerializableSparkModel[M <: Model[M]](val sparkModel: M)
  extends ML.Model[SerializableSparkModel[M]]
  with MLWritable {

  override def copy(extra: ParamMap): SerializableSparkModel[M] =
    new SerializableSparkModel(sparkModel.copy(extra))

  override def write: MLWriter = {
    sparkModel match {
      case w: MLWritable => w.write
      case _ => new DefaultMLWriter(this)
    }
  }

  override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset)

  override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema)

  override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae"
}

// This class may seem unused, but it is used reflectively by the Spark deserialization mechanism
object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] {
  override def read: MLReader[SerializableSparkModel[_]] = {
    new DefaultMLReader[SerializableSparkModel[_]]()
  }
} 
Example 14
Source File: SerializableSparkEstimator.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
} 
Example 15
Source File: IsolationForest.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.isolationforest

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util._
import org.apache.spark.ml.{Estimator, Model}
import com.linkedin.relevance.isolationforest.{IsolationForestParams,
  IsolationForest => IsolationForestSource, IsolationForestModel => IsolationForestModelSource}
import com.microsoft.ml.spark.core.contracts.Wrappable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.StructType

object IsolationForest extends DefaultParamsReadable[IsolationForest]

class IsolationForest(override val uid: String, val that: IsolationForestSource)
  extends Estimator[IsolationForestModel]
  with IsolationForestParams with DefaultParamsWritable with Wrappable {

  def this(uid: String) = this(uid, new IsolationForestSource(uid))

  def this() = this(Identifiable.randomUID("IsolationForest"))

  override def copy(extra: ParamMap): IsolationForest =
    new IsolationForest(uid, that.copy(extra))

  override def fit(data: Dataset[_]): IsolationForestModel =
    new IsolationForestModel(uid, that.fit(data))

  override def transformSchema(schema: StructType): StructType =
    that.transformSchema(schema)
}

class IsolationForestModel(override val uid: String, val that: IsolationForestModelSource)
  extends Model[IsolationForestModel]
  with MLWritable {

  override def copy(extra: ParamMap): IsolationForestModel =
    new IsolationForestModel(uid, that.copy(extra))

  override def transform(data: Dataset[_]): DataFrame =
    that.transform(data)

  override def transformSchema(schema: StructType): StructType =
    that.transformSchema(schema)

  override def write: MLWriter = that.write
}

class IsolationForestModelReader extends MLReader[IsolationForestModel] with Serializable {
  override def load(path: String): IsolationForestModel = {
    val that = IsolationForestModelSource.load(path)

    new IsolationForestModel(that.uid, that)
  }
}
object IsolationForestModel extends MLReadable[IsolationForestModel] {
  override def read: MLReader[IsolationForestModel] = new IsolationForestModelReader
} 
Example 16
Source File: StringToShortIndexer.scala    From spark-ext   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkException
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.util.collection.OpenHashMap


class StringToShortIndexer(override val uid: String) extends Estimator[StringToShortIndexerModel]
with StringIndexerBase {

  def this() = this(Identifiable.randomUID("strShortIdx"))

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def fit(dataset: DataFrame): StringToShortIndexerModel = {
    val counts = dataset.select(col($(inputCol)).cast(StringType))
      .map(_.getString(0))
      .countByValue()
    val labels = counts.toSeq.sortBy(-_._2).map(_._1).toArray
    require(labels.length <= Short.MaxValue,
      s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})")
    copyValues(new StringToShortIndexerModel(uid, labels).setParent(this))
  }

  override def transformSchema(schema: StructType): StructType = {
    validateAndTransformSchema(schema)
  }

  override def copy(extra: ParamMap): StringToShortIndexer = defaultCopy(extra)
}

class StringToShortIndexerModel (
  override val uid: String,
  val labels: Array[String]) extends Model[StringToShortIndexerModel] with StringIndexerBase {

  def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), labels)

  require(labels.length <= Short.MaxValue,
    s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})")

  private val labelToIndex: OpenHashMap[String, Short] = {
    val n = labels.length.toShort
    val map = new OpenHashMap[String, Short](n)
    var i: Short = 0
    while (i < n) {
      map.update(labels(i), i)
      i = (i + 1).toShort
    }
    map
  }

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: DataFrame): DataFrame = {
    if (!dataset.schema.fieldNames.contains($(inputCol))) {
      logInfo(s"Input column ${$(inputCol)} does not exist during transformation. " +
        "Skip StringToShortIndexerModel.")
      return dataset
    }

    val indexer = udf { label: String =>
      if (labelToIndex.contains(label)) {
        labelToIndex(label)
      } else {
        // TODO: handle unseen labels
        throw new SparkException(s"Unseen label: $label.")
      }
    }
    val outputColName = $(outputCol)
    val metadata = NominalAttribute.defaultAttr
      .withName(outputColName).withValues(labels).toMetadata()
    dataset.select(col("*"),
      indexer(dataset($(inputCol)).cast(StringType)).as(outputColName, metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    if (schema.fieldNames.contains($(inputCol))) {
      validateAndTransformSchema(schema)
    } else {
      // If the input column does not exist during transformation, we skip StringToShortIndexerModel.
      schema
    }
  }

  override def copy(extra: ParamMap): StringToShortIndexerModel = {
    val copied = new StringToShortIndexerModel(uid, labels)
    copyValues(copied, extra).setParent(parent)
  }
} 
Example 17
Source File: RecursivePipeline.scala    From spark-nlp   with Apache License 2.0
package com.johnsnowlabs.nlp

import org.apache.spark.internal.Logging
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset}

import scala.collection.mutable.ListBuffer

class RecursivePipeline(override val uid: String, baseStages: Array[PipelineStage]) extends Pipeline {

  def this() = this(Identifiable.randomUID("RECURSIVE_PIPELINE"), Array.empty)

  def this(uid: String) = this(uid, Array.empty)

  def this(pipeline: Pipeline) = this(pipeline.uid, pipeline.getStages)

  this.setStages(baseStages)

  
  override def fit(dataset: Dataset[_]): PipelineModel = {
    transformSchema(dataset.schema, logging = true)
    val theStages = $(stages)
    var indexOfLastEstimator = -1
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      stage match {
        case _: Estimator[_] =>
          indexOfLastEstimator = index
        case _ =>
      }
    }
    var curDataset = dataset
    val transformers = ListBuffer.empty[Transformer]
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      if (index <= indexOfLastEstimator) {
        val transformer = stage match {
          case estimator: HasRecursiveFit[_] =>
            estimator.recursiveFit(curDataset, new Pipeline(uid).setStages(transformers.toArray).fit(dataset))
          case estimator: Estimator[_] =>
            estimator.fit(curDataset)
          case t: Transformer =>
            t
          case _ =>
            throw new IllegalArgumentException(
              s"Does not support stage $stage of type ${stage.getClass}")
        }
        if (index < indexOfLastEstimator) {
          curDataset = transformer.transform(curDataset)
        }
        transformers += transformer
      } else {
        transformers += stage.asInstanceOf[Transformer]
      }
    }

    createPipeline(dataset, transformers.toArray)
  }

}

class RecursivePipelineModel(override val uid: String, innerPipeline: PipelineModel)
  extends Model[RecursivePipelineModel] with MLWritable with Logging {

  def this(pipeline: PipelineModel) = this(pipeline.uid, pipeline)

  // drops the rightmost stage because that stage is the current model itself
  private def createRecursiveAnnotators(dataset: Dataset[_]): PipelineModel =
    new Pipeline(uid).setStages(innerPipeline.stages.dropRight(1)).fit(dataset)

  override def copy(extra: ParamMap): RecursivePipelineModel = {
    new RecursivePipelineModel(uid, innerPipeline.copy(extra))
  }

  override def write: MLWriter = {
    innerPipeline.write
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    innerPipeline.stages.foldLeft(dataset.toDF)((cur, transformer) => transformer match {
      case t: HasRecursiveTransform[_] => t.recursiveTransform(cur, createRecursiveAnnotators(dataset))
      case t: AnnotatorModel[_] if t.getLazyAnnotator => cur
      case t: Transformer => t.transform(cur)
    })
  }

  override def transformSchema(schema: StructType): StructType = {
    innerPipeline.transformSchema(schema)
  }
} 
Example 18
Source File: AnnotatorApproach.scala    From spark-nlp   with Apache License 2.0
package com.johnsnowlabs.nlp

import com.johnsnowlabs.storage.HasStorage
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.{Estimator, Model, PipelineModel, Transformer}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.types.{ArrayType, MetadataBuilder, StructField, StructType}
import org.apache.spark.ml.util.DefaultParamsWritable


  override final def transformSchema(schema: StructType): StructType = {
    require(validate(schema), s"Wrong or missing inputCols annotators in $uid.\n" +
      msgHelper(schema) +
      s"\nMake sure such annotators exist in your pipeline, " +
      s"with the right output names and that they have following annotator types: " +
      s"${inputAnnotatorTypes.mkString(", ")}")
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", outputAnnotatorType)
    val outputFields = schema.fields :+
      StructField(getOutputCol, ArrayType(Annotation.dataType), nullable = false, metadataBuilder.build)
    StructType(outputFields)
  }
}