org.apache.spark.ml.util.MLWriter Scala Examples
The following examples show how to use org.apache.spark.ml.util.MLWriter.
You can go to the original project or source file by following the links above each example.
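All of the examples below rely on the same two hooks: a class mixes in MLWritable and returns a custom MLWriter, whose protected saveImpl(path) method does the actual persistence (MLWriter.save handles overwrite checks and path validation before calling it). As a quick orientation, here is a minimal sketch of that pattern; the SimpleWrapper class and its single labelCount field are illustrative and do not come from any of the projects below.

import org.apache.hadoop.fs.Path
import org.apache.spark.ml.util.{MLWritable, MLWriter}

// Illustrative only: a wrapper that persists one metadata value as a JSON line.
class SimpleWrapper(val labelCount: Long) extends MLWritable {
  override def write: MLWriter = new SimpleWrapper.SimpleWrapperWriter(this)
}

object SimpleWrapper {

  class SimpleWrapperWriter(instance: SimpleWrapper) extends MLWriter {

    // MLWriter.save(path) validates the path and overwrite flag, then calls saveImpl.
    override protected def saveImpl(path: String): Unit = {
      val metadataPath = new Path(path, "metadata").toString
      val json = s"""{"class":"${instance.getClass.getName}","labelCount":${instance.labelCount}}"""
      // Write a single-partition text file, the same trick the examples below use.
      sc.parallelize(Seq(json), 1).saveAsTextFile(metadataPath)
    }
  }
}

Saving then amounts to new SimpleWrapper(3).write.save("/tmp/simple-wrapper"), or write.overwrite().save(...) to replace an existing directory.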
Example 1
Source File: MultilayerPerceptronClassifierWrapper.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel,
    val labelCount: Long,
    val layers: Array[Int],
    val weights: Array[Double]
  ) extends MLWritable {

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
  }

  // MLWritable contract: delegate to the writer defined in the companion object.
  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  // (the fit(...) factory used from SparkR is omitted in this excerpt)

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val labelCount = (rMetadata \ "labelCount").extract[Long]
      val layers = (rMetadata \ "layers").extract[Array[Int]]
      val weights = (rMetadata \ "weights").extract[Array[Double]]

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline, labelCount, layers, weights)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("labelCount" -> instance.labelCount) ~
        ("layers" -> instance.layers.toSeq) ~
        ("weights" -> instance.weights.toArray.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 2
Source File: MathUnary.scala From mleap with Apache License 2.0
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType}
import org.apache.spark.sql.functions.udf

// Only the reader half of MathUnary is shown in this excerpt; the MathUnary transformer class and
// its MLWriter counterpart are omitted. The enclosing object and reader declarations below follow
// the usual MLReadable pattern, and their exact names are assumptions.
object MathUnary extends MLReadable[MathUnary] {

  override def read: MLReader[MathUnary] = new MathUnaryReader

  class MathUnaryReader extends MLReader[MathUnary] {

    private val className = classOf[MathUnary].getName

    override def load(path: String): MathUnary = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
      val dataPath = new Path(path, "data").toString

      val data = sparkSession.read.parquet(dataPath).select("operation").head()
      val operation = data.getAs[String](0)

      val model = MathUnaryModel(UnaryOperation.forName(operation))
      val transformer = new MathUnary(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }
}
Example 3
Source File: MultilayerPerceptronClassifierWrapper.scala From sparkoscope with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  val weights: Array[Double] = mlpModel.weights.toArray
  val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  // MLWritable contract: delegate to the writer defined in the companion object.
  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"

  // (the fit(...) factory used from SparkR is omitted in this excerpt)

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 4
Source File: RecursivePipeline.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp

import org.apache.spark.internal.Logging
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset}

import scala.collection.mutable.ListBuffer

class RecursivePipeline(override val uid: String, baseStages: Array[PipelineStage]) extends Pipeline {

  def this() = this(Identifiable.randomUID("RECURSIVE_PIPELINE"), Array.empty)

  def this(uid: String) = this(uid, Array.empty)

  def this(pipeline: Pipeline) = this(pipeline.uid, pipeline.getStages)

  this.setStages(baseStages)

  override def fit(dataset: Dataset[_]): PipelineModel = {
    transformSchema(dataset.schema, logging = true)
    val theStages = $(stages)
    var indexOfLastEstimator = -1
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      stage match {
        case _: Estimator[_] =>
          indexOfLastEstimator = index
        case _ =>
      }
    }
    var curDataset = dataset
    val transformers = ListBuffer.empty[Transformer]
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      if (index <= indexOfLastEstimator) {
        val transformer = stage match {
          case estimator: HasRecursiveFit[_] =>
            estimator.recursiveFit(curDataset, new Pipeline(uid).setStages(transformers.toArray).fit(dataset))
          case estimator: Estimator[_] =>
            estimator.fit(curDataset)
          case t: Transformer =>
            t
          case _ =>
            throw new IllegalArgumentException(
              s"Does not support stage $stage of type ${stage.getClass}")
        }
        if (index < indexOfLastEstimator) {
          curDataset = transformer.transform(curDataset)
        }
        transformers += transformer
      } else {
        transformers += stage.asInstanceOf[Transformer]
      }
    }

    createPipeline(dataset, transformers.toArray)
  }
}

class RecursivePipelineModel(override val uid: String, innerPipeline: PipelineModel)
  extends Model[RecursivePipelineModel] with MLWritable with Logging {

  def this(pipeline: PipelineModel) = this(pipeline.uid, pipeline)

  // drops the rightmost stage because the current stage itself is included in the pipeline
  private def createRecursiveAnnotators(dataset: Dataset[_]): PipelineModel =
    new Pipeline(uid).setStages(innerPipeline.stages.dropRight(1)).fit(dataset)

  override def copy(extra: ParamMap): RecursivePipelineModel = {
    new RecursivePipelineModel(uid, innerPipeline.copy(extra))
  }

  override def write: MLWriter = {
    innerPipeline.write
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    innerPipeline.stages.foldLeft(dataset.toDF)((cur, transformer) => transformer match {
      case t: HasRecursiveTransform[_] => t.recursiveTransform(cur, createRecursiveAnnotators(dataset))
      case t: AnnotatorModel[_] if t.getLazyAnnotator => cur
      case t: Transformer => t.transform(cur)
    })
  }

  override def transformSchema(schema: StructType): StructType = {
    innerPipeline.transformSchema(schema)
  }
}
Example 5
Source File: ParamsAndFeaturesWritable.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp

import org.apache.spark.ml.param.Params
import org.apache.spark.ml.util.{DefaultParamsWritable, MLWriter}
import org.apache.spark.sql.SparkSession

class FeaturesWriter[T](annotatorWithFeatures: HasFeatures,
                        baseWriter: MLWriter,
                        onWritten: (String, SparkSession) => Unit)
  extends MLWriter with HasFeatures {

  override protected def saveImpl(path: String): Unit = {
    baseWriter.save(path)

    for (feature <- annotatorWithFeatures.features) {
      if (feature.orDefault.isDefined)
        feature.serializeInfer(sparkSession, path, feature.name, feature.getOrDefault)
    }

    onWritten(path, sparkSession)
  }
}

trait ParamsAndFeaturesWritable extends DefaultParamsWritable with Params with HasFeatures {

  protected def onWrite(path: String, spark: SparkSession): Unit = {}

  override def write: MLWriter = {
    new FeaturesWriter(
      this,
      super.write,
      (path: String, spark: SparkSession) => onWrite(path, spark)
    )
  }
}
Example 6
Source File: TrainingHelper.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.util

import java.io.File
import java.nio.file.{Files, Paths, StandardCopyOption}
import java.sql.Timestamp
import java.util.Date

import com.johnsnowlabs.nlp.pretrained.ResourceType.ResourceType
import com.johnsnowlabs.nlp.pretrained.{ResourceMetadata, ResourceType}
import org.apache.commons.io.FileUtils
import org.apache.spark.ml.util.MLWriter

object TrainingHelper {

  def saveModel(name: String,
                language: Option[String],
                libVersion: Option[Version],
                sparkVersion: Option[Version],
                modelWriter: MLWriter,
                folder: String,
                category: Option[ResourceType] = Some(ResourceType.NOT_DEFINED)
               ): Unit = {

    // 1. Get current timestamp
    val timestamp = new Timestamp(new Date().getTime)

    // 2. Save model to file
    val file = Paths.get(folder, timestamp.toString).toString.replaceAllLiterally("\\", "/")
    modelWriter.save(file)

    // 3. Zip file
    val tempzipFile = Paths.get(folder, timestamp + ".zip")
    ZipArchiveUtil.zip(file, tempzipFile.toString)

    // 4. Set checksum
    val checksum = FileHelper.generateChecksum(tempzipFile.toString)

    // 5. Create resource metadata
    val meta = new ResourceMetadata(name, language, libVersion, sparkVersion, true, timestamp, true,
      category = category, checksum)

    val zipfile = Paths.get(meta.fileName)

    // 6. Move the zip
    Files.move(tempzipFile, zipfile, StandardCopyOption.REPLACE_EXISTING)

    // 7. Remove original file
    try {
      FileUtils.deleteDirectory(new File(file))
    } catch {
      case _: java.io.IOException => // file lock may prevent deletion, ignore and continue
    }

    // 8. Add info about the resource to metadata.json
    val metadataFile = Paths.get(folder, "metadata.json").toString
    ResourceMetadata.addMetadataToFile(metadataFile, meta)
  }
}
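Since saveModel only needs an MLWriter, any MLWritable Spark object can be passed through it. A hypothetical call site is sketched below; pipelineModel, the version options, and the output folder are placeholders, not values from the spark-nlp project.

// Hypothetical usage: persist a fitted org.apache.spark.ml.PipelineModel through TrainingHelper.
TrainingHelper.saveModel(
  name = "my_pipeline",
  language = Some("en"),
  libVersion = None,
  sparkVersion = None,
  modelWriter = pipelineModel.write, // any MLWritable exposes a suitable MLWriter
  folder = "/tmp/models"
)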
Example 7
Source File: MultilayerPerceptronClassifierWrapper.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  val weights: Array[Double] = mlpModel.weights.toArray
  val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  // MLWritable contract: delegate to the writer defined in the companion object.
  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"

  // (the fit(...) factory used from SparkR is omitted in this excerpt)

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 8
Source File: DefaultMLWriter.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.serialization

import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import org.apache.spark.ml.param.{ParamPair, Params}
import org.apache.spark.ml.util.MLWriter
import org.json4s.JsonDSL._
import org.json4s._
import org.json4s.jackson.JsonMethods._

import ai.deepsense.deeplang.doperables.Transformer
import ai.deepsense.sparkutils.ML.MLWriterWithSparkContext

class DefaultMLWriter[T <: Params](instance: T) extends MLWriter with MLWriterWithSparkContext {

  def saveImpl(path: String): Unit = {
    val modelPath = Transformer.modelFilePath(path)
    saveMetadata(instance, path, sc)
    CustomPersistence.save(sparkContext, instance, modelPath)
  }

  // Copied from org.apache.spark.ml.util.DefaultParamsWriter.
  // We need to be consistent with the Spark format, but that method is private.
  private def saveMetadata(
      instance: Params,
      path: String,
      sc: SparkContext,
      extraMetadata: Option[JObject] = None,
      paramMap: Option[JValue] = None): Unit = {
    val uid = instance.uid
    val cls = instance.getClass.getName
    val params = instance.extractParamMap().toSeq.asInstanceOf[Seq[ParamPair[Any]]]
    val jsonParams = paramMap.getOrElse(render(params.map { case ParamPair(p, v) =>
      p.name -> parse(p.jsonEncode(v))
    }.toList))
    val basicMetadata = ("class" -> cls) ~
      ("timestamp" -> System.currentTimeMillis()) ~
      ("sparkVersion" -> sc.version) ~
      ("uid" -> uid) ~
      ("paramMap" -> jsonParams)
    val metadata = extraMetadata match {
      case Some(jObject) =>
        basicMetadata ~ jObject
      case None =>
        basicMetadata
    }
    val metadataPath = new Path(path, "metadata").toString
    val metadataJson = compact(render(metadata))
    sc.parallelize(Seq(metadataJson), 1).saveAsTextFile(metadataPath)
  }
}
Example 9
Source File: SerializableSparkEstimator.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
}
Example 10
Source File: SerializableSparkModel.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import ai.deepsense.sparkutils.ML

class SerializableSparkModel[M <: Model[M]](val sparkModel: M)
  extends ML.Model[SerializableSparkModel[M]]
  with MLWritable {

  override def copy(extra: ParamMap): SerializableSparkModel[M] =
    new SerializableSparkModel(sparkModel.copy(extra))

  override def write: MLWriter = {
    sparkModel match {
      case w: MLWritable => w.write
      case _ => new DefaultMLWriter(this)
    }
  }

  override def transformDF(dataset: DataFrame): DataFrame =
    sparkModel.transform(dataset)

  override def transformSchema(schema: StructType): StructType =
    sparkModel.transformSchema(schema)

  override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae"
}

// This class may seem unused, but it is used reflectively by the Spark deserialization mechanism
object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] {

  override def read: MLReader[SerializableSparkModel[_]] = {
    new DefaultMLReader[SerializableSparkModel[_]]()
  }
}
Example 11
Source File: OpPipelineStageWriter.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages

import com.salesforce.op.stages.OpPipelineStageReaderWriter._
import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams
import org.apache.hadoop.fs.Path
import org.apache.spark.ml.util.MLWriter
import org.apache.spark.ml.{Estimator, SparkDefaultParamsReadWrite}
import org.json4s.JsonAST.{JObject, JValue}
import org.json4s.jackson.JsonMethods.{compact, render}

import scala.util.{Failure, Success}

// Only the JSON serialization helper is shown in this excerpt; the class declaration below is
// reconstructed from the file name and the use of `stage`, and the MLWriter.saveImpl override
// is omitted.
class OpPipelineStageWriter(val stage: OpPipelineStageBase) extends MLWriter {

  def writeToJson(path: String): JObject = {
    stage match {
      case _: Estimator[_] => return JObject() // no need to serialize estimators
      case s: SparkWrapperParams[_] =>
        // Set save path for all Spark wrapped stages of type [[SparkWrapperParams]] so they can save
        s.setStageSavePath(path)
      case _ =>
    }
    // We produce stage metadata for all the Spark params
    val metadata = SparkDefaultParamsReadWrite.getMetadataToSave(stage)

    // Write out the stage using the specified writer instance
    val writer = readerWriterFor[OpPipelineStageBase](stage.getClass.asInstanceOf[Class[OpPipelineStageBase]])
    val stageJson: JValue = writer.write(stage) match {
      case Failure(err) => throw new RuntimeException(s"Failed to write out stage '${stage.uid}'", err)
      case Success(json) => json
    }

    // Join metadata with stage ctor args
    val j = metadata.merge(JObject(FieldNames.CtorArgs.entryName -> stageJson))
    render(j).asInstanceOf[JObject]
  }
}
Example 12
Source File: MultilayerPerceptronClassifierWrapper.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  private val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  lazy val weights: Array[Double] = mlpModel.weights.toArray
  lazy val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  // MLWritable contract: delegate to the writer defined in the companion object.
  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"

  // (the fit(...) factory used from SparkR is omitted in this excerpt)

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 13
Source File: DefaultMLWriter.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.serialization

import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import org.apache.spark.ml.param.{ParamPair, Params}
import org.apache.spark.ml.util.MLWriter
import org.json4s.JsonDSL._
import org.json4s._
import org.json4s.jackson.JsonMethods._

import io.deepsense.deeplang.doperables.Transformer
import io.deepsense.sparkutils.ML.MLWriterWithSparkContext

class DefaultMLWriter[T <: Params](instance: T) extends MLWriter with MLWriterWithSparkContext {

  def saveImpl(path: String): Unit = {
    val modelPath = Transformer.modelFilePath(path)
    saveMetadata(instance, path, sc)
    CustomPersistence.save(sparkContext, instance, modelPath)
  }

  // Copied from org.apache.spark.ml.util.DefaultParamsWriter.
  // We need to be consistent with the Spark format, but that method is private.
  private def saveMetadata(
      instance: Params,
      path: String,
      sc: SparkContext,
      extraMetadata: Option[JObject] = None,
      paramMap: Option[JValue] = None): Unit = {
    val uid = instance.uid
    val cls = instance.getClass.getName
    val params = instance.extractParamMap().toSeq.asInstanceOf[Seq[ParamPair[Any]]]
    val jsonParams = paramMap.getOrElse(render(params.map { case ParamPair(p, v) =>
      p.name -> parse(p.jsonEncode(v))
    }.toList))
    val basicMetadata = ("class" -> cls) ~
      ("timestamp" -> System.currentTimeMillis()) ~
      ("sparkVersion" -> sc.version) ~
      ("uid" -> uid) ~
      ("paramMap" -> jsonParams)
    val metadata = extraMetadata match {
      case Some(jObject) =>
        basicMetadata ~ jObject
      case None =>
        basicMetadata
    }
    val metadataPath = new Path(path, "metadata").toString
    val metadataJson = compact(render(metadata))
    sc.parallelize(Seq(metadataJson), 1).saveAsTextFile(metadataPath)
  }
}
Example 14
Source File: SerializableSparkEstimator.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
}
Example 15
Source File: SerializableSparkModel.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkModel[M <: Model[M]](val sparkModel: M)
  extends ML.Model[SerializableSparkModel[M]]
  with MLWritable {

  override def copy(extra: ParamMap): SerializableSparkModel[M] =
    new SerializableSparkModel(sparkModel.copy(extra))

  override def write: MLWriter = {
    sparkModel match {
      case w: MLWritable => w.write
      case _ => new DefaultMLWriter(this)
    }
  }

  override def transformDF(dataset: DataFrame): DataFrame =
    sparkModel.transform(dataset)

  override def transformSchema(schema: StructType): StructType =
    sparkModel.transformSchema(schema)

  override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae"
}

// This class may seem unused, but it is used reflectively by the Spark deserialization mechanism
object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] {

  override def read: MLReader[SerializableSparkModel[_]] = {
    new DefaultMLReader[SerializableSparkModel[_]]()
  }
}