org.apache.spark.ml.util.MLReadable Scala Examples

The following examples show how to use org.apache.spark.ml.util.MLReadable. Each example is taken from an open-source project; the source file, project name, and license appear in the header above the code.
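Every example follows the same pattern: a companion object implementing MLReadable[T] exposes a load(path) method that restores a saved instance (the mmlspark test suites below return such companions from their reader and modelReader overrides, presumably so the serialization tests can round-trip save and load). As a minimal illustrative sketch of that pattern (not taken from any of the projects below; the object name, helper method, and path are assumptions), loading through MLReadable looks like this:

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.util.MLReadable

object MLReadableExample {
  // Generic loader: any companion object extending MLReadable[T]
  // (PipelineModel, a model class, or the wrappers shown below) can be passed in.
  def loadWith[T](readable: MLReadable[T], path: String): T = readable.load(path)

  def main(args: Array[String]): Unit = {
    // Hypothetical path; it must point to a directory previously written with model.write.save(...).
    val model: PipelineModel = loadWith(PipelineModel, "/tmp/my-pipeline-model")
    println(model.stages.length)
  }
}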
Example 1
Source File: MultilayerPerceptronClassifierWrapper.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel,
    val labelCount: Long,
    val layers: Array[Int],
    val weights: Array[Double]
  ) extends MLWritable {

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
  }

  
  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper]{

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val labelCount = (rMetadata \ "labelCount").extract[Long]
      val layers = (rMetadata \ "layers").extract[Array[Int]]
      val weights = (rMetadata \ "weights").extract[Array[Double]]

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline, labelCount, layers, weights)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("labelCount" -> instance.labelCount) ~
        ("layers" -> instance.layers.toSeq) ~
        ("weights" -> instance.weights.toArray.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
} 
Example 2
Source File: KNNTest.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.nn

import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.{DataFrame, Row}
import org.scalactic.Equality
import org.scalatest.Assertion

class KNNTest extends EstimatorFuzzing[KNN] with BallTreeTestBase {

  test("matches non spark result") {
    val results = new KNN().setOutputCol("matches")
      .fit(df).transform(testDF)
      .select("matches").collect()
    val sparkResults = results.map(r =>
      r.getSeq[Row](0).map(mr => mr.getDouble(1))
    )
    val tree = BallTree(uniformData, uniformData.indices)
    val nonSparkResults = uniformData.take(5).map(
      point => tree.findMaximumInnerProducts(point, 5)
    )

    sparkResults.zip(nonSparkResults).foreach { case (sr, nsr) =>
      assert(sr === nsr.map(_.distance))
    }
  }

  override def assertDFEq(df1: DataFrame, df2: DataFrame)(implicit eq: Equality[DataFrame]): Assertion = {
    super.assertDFEq(
      df1.select("features", "values", "matches.distance"),
      df2.select("features", "values", "matches.distance")
    )(eq)
  }

  override def testObjects(): Seq[TestObject[KNN]] =
    List(new TestObject(new KNN().setOutputCol("matches"), df, testDF))

  override def reader: MLReadable[_] = KNN

  override def modelReader: MLReadable[_] = KNNModel
}

class ConditionalKNNTest extends EstimatorFuzzing[ConditionalKNN] with BallTreeTestBase {

  test("matches non spark result") {
    val results = new ConditionalKNN().setOutputCol("matches")
      .fit(df).transform(testDF)
      .select("matches").collect()
    val sparkResults = results.map(r =>
      r.getSeq[Row](0).map(mr => (mr.getDouble(1), mr.getInt(2)))
    )
    val tree = ConditionalBallTree(uniformData, uniformData.indices, uniformLabels)
    val nonSparkResults = uniformData.take(5).map(
      point => tree.findMaximumInnerProducts(point, Set(0, 1), 5)
    )

    sparkResults.zip(nonSparkResults).foreach { case (sr, nsr) =>
      assert(sr.map(_._1) === nsr.map(_.distance))
      assert(sr.forall(p => Set(1, 0)(p._2)))
    }
  }

  override def assertDFEq(df1: DataFrame, df2: DataFrame)(implicit eq: Equality[DataFrame]): Assertion = {
    super.assertDFEq(
      df1.select("features", "values", "matches.distance"),
      df2.select("features", "values", "matches.distance")
    )(eq)
  }

  override def testObjects(): Seq[TestObject[ConditionalKNN]] =
    List(new TestObject(new ConditionalKNN().setOutputCol("matches"), df, testDF))

  override def reader: MLReadable[_] = ConditionalKNN

  override def modelReader: MLReadable[_] = ConditionalKNNModel
} 
Example 3
Source File: LambdaSuite.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.types.StructType

class LambdaSuite extends TestBase with TransformerFuzzing[Lambda] {

  test("basic functionality") {
    val input = makeBasicDF()
    val lt = new Lambda()
      .setTransform(df => df.select("numbers"))
      .setTransformSchema(schema => new StructType(Array(schema("numbers"))))
    val output = lt.transform(input)
    val output2 = makeBasicDF().select("numbers")
    assert(output === output2)
  }

  test("without setting transform schema") {
    val input = makeBasicDF()
    val lt = Lambda(_.select("numbers"))
    val output = lt.transform(input)
    val output2 = makeBasicDF().select("numbers")
    assert(output === output2)
  }

  def testObjects(): Seq[TestObject[Lambda]] = List(new TestObject(Lambda(_.select("numbers")), makeBasicDF()))

  override def reader: MLReadable[_] = Lambda

} 
Example 4
Source File: TimerSuite.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame

class TimerSuite extends EstimatorFuzzing[Timer] {

  lazy val df: DataFrame = session
    .createDataFrame(Seq((0, "Hi I"),
                         (1, "I wish for snow today"),
                         (2, "we Cant go to the park, because of the snow!"),
                         (3, "")))
    .toDF("label", "sentence")

  test("Work with transformers and estimators") {

    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")

    val df2 = new Timer().setStage(tok).fit(df).transform(df)

    val df3 = new HashingTF().setInputCol("tokens").setOutputCol("hash").transform(df2)

    val idf = new IDF().setInputCol("hash").setOutputCol("idf")

    val df4 = new Timer().setStage(idf).fit(df3).transform(df3)

  }

  test("should work within pipelines") {
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")
    val ttok = new Timer().setStage(tok)
    val hash = new HashingTF().setInputCol("tokens").setOutputCol("hash")
    val idf  = new IDF().setInputCol("hash").setOutputCol("idf")
    val tidf = new Timer().setStage(idf)
    val pipe = new Pipeline().setStages(Array(ttok, hash, tidf))
    pipe.fit(df).transform(df)
  }

  test("should be able to turn off timing") {
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")
    val ttok = new Timer().setStage(tok)
    val hash = new HashingTF().setInputCol("tokens").setOutputCol("hash")
    val idf  = new IDF().setInputCol("hash").setOutputCol("idf")
    val tidf = new Timer().setStage(idf)
    val pipe = new Pipeline().setStages(Array(ttok, hash, tidf))
    val model = pipe.fit(df)

    println("Transforming")
    model.stages(0).params.foreach(println(_))
    model.stages(0).asInstanceOf[TimerModel].setDisable(true)
    model.stages(2).asInstanceOf[TimerModel].setDisable(true)

    println("here")
    println(model.stages(0).getParam("disableMaterialization"))

    model.stages(0).params.foreach(p =>println("foo: " + p.toString))

    model.transform(df)
  }

  val reader: MLReadable[_] = Timer
  val modelReader: MLReadable[_] = TimerModel

  override def testObjects(): Seq[TestObject[Timer]] = Seq(new TestObject[Timer]({
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")
    new Timer().setStage(tok)
  }, df))
} 
Example 5
Source File: CacherSuite.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable

class CacherSuite extends TransformerFuzzing[Cacher] {

  import session.implicits._

  lazy val df = Seq((0, "guitars", "drums"),
               (1, "piano", "trumpet"),
               (2, "bass", "cymbals"),
               (3, "guitars", "drums"),
               (4, "piano", "trumpet"),
               (5, "bass", "cymbals"),
               (6, "guitars", "drums"),
               (7, "piano", "trumpet"),
               (8, "bass", "cymbals"),
               (9, "guitars", "drums"),
               (10, "piano", "trumpet"),
               (11, "bass", "cymbals")
    ).toDF("numbers", "words", "more")

  test("Be the identity operation") {
    val df2 = new Cacher().transform(df)
    assert(df2.collect() === df.collect())
  }

  override def testObjects(): Seq[TestObject[Cacher]] = Seq(new TestObject(new Cacher(), df))

  override def reader: MLReadable[_] = Cacher

} 
Example 6
Source File: SelectColumnsSuite.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable

class SelectColumnsSuite extends TestBase with TransformerFuzzing[SelectColumns] {

  import session.implicits._

  test("Select all columns in a data frame") {
    val input = makeBasicDF()
    val result = new SelectColumns()
      .setCols(input.columns)
      .transform(input)
    assert(verifyResult(input, result))
  }

  test("Test: Select two columns in a data frame") {
    val expected = Seq(
      ("guitars", "drums"),
      ("piano", "trumpet"),
      ("bass", "cymbals")
    ).toDF("words", "more")
    val result = new SelectColumns()
      .setCols(Array("words", "more"))
      .transform(makeBasicDF())
    assert(verifyResult(expected, result))
  }

  test("Test: Select columns with spaces") {
    val expected = Seq(
      ("guitars", "drums"),
      ("piano", "trumpet"),
      ("bass", "cymbals")
    ).toDF("words", "Scored Labels")
    val result = new SelectColumns()
      .setCols(Array("words", "Scored Labels"))
      .transform(makeBasicDF().withColumnRenamed("more", "Scored Labels"))
    assert(verifyResult(expected, result))
  }

  test("Test: Select one column from the data frame") {
    val expected = Seq(
      "guitars",
      "piano",
      "bass"
    ).toDF("words")
    val result = new SelectColumns()
      .setCols(Array("words"))
      .transform(makeBasicDF())
    assert(verifyResult(expected, result))
  }

  test("Invalid column specified") {
    try {
      new SelectColumns().setCol("four").transform(makeBasicDF())
      fail()
    } catch {
      case _: NoSuchElementException =>
    }
  }

  def testObjects(): Seq[TestObject[SelectColumns]] = List(new TestObject(
    new SelectColumns().setCol("numbers"), makeBasicDF()))

  override def reader: MLReadable[_] = SelectColumns

} 
Example 7
Source File: MultiColumnAdapterSpec.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.feature.{StringIndexer, Tokenizer}
import org.apache.spark.ml.util.MLReadable

import scala.collection.mutable

class MultiColumnAdapterSpec extends TestBase with EstimatorFuzzing[MultiColumnAdapter] {

  lazy val wordDF = session.createDataFrame(Seq(
    (0, "This is a test", "this is one too"),
    (1, "could be a test", "bar"),
    (2, "foo", "bar"),
    (3, "foo", "maybe not")))
    .toDF("label", "words1", "words2")
  lazy val inputCols  = Array[String]("words1",  "words2")
  lazy val outputCols = Array[String]("output1", "output2")
  lazy val stage = new StringIndexer()
  lazy val adaptedEstimator =
    new MultiColumnAdapter().setBaseStage(stage)
          .setInputCols(inputCols).setOutputCols(outputCols)

  test("parallelize transformers") {
    val stage1 = new Tokenizer()
    val transformer =
      new MultiColumnAdapter().setBaseStage(stage1)
            .setInputCols(inputCols).setOutputCols(outputCols)
    val tokenizedDF = transformer.fit(wordDF).transform(wordDF)
    val lines = tokenizedDF.getColAs[Array[String]]("output2")
    val trueLines = Array(
      Array("this", "is", "one", "too"),
      Array("bar"),
      Array("bar"),
      Array("maybe", "not")
    )
    assert(lines === trueLines)
  }

  test("parallelize estimator") {
    val stringIndexedDF = adaptedEstimator.fit(wordDF).transform(wordDF)
    val lines1 = stringIndexedDF.getColAs[Array[String]]("output1")
    val trueLines1 = mutable.ArraySeq(1, 2, 0, 0)
    assert(lines1 === trueLines1)

    val lines2 = stringIndexedDF.getColAs[Array[String]]("output2")
    val trueLines2 = mutable.ArraySeq(1, 0, 0, 2)
    assert(lines2 === trueLines2)
  }
  def testObjects(): Seq[TestObject[MultiColumnAdapter]] = List(new TestObject(adaptedEstimator, wordDF))

  override def reader: MLReadable[_] = MultiColumnAdapter

  override def modelReader: MLReadable[_] = PipelineModel

} 
Example 8
Source File: VerifyVowpalWabbitInteractions.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.ml.util.MLReadable

class VerifyVowpalWabbitInteractions extends TestBase with TransformerFuzzing[VowpalWabbitInteractions] {

  case class Data(val v1: Vector, val v2: Vector, val v3: Vector)

  lazy val df = session.createDataFrame(Seq(Data(
    Vectors.dense(Array(1.0, 2.0, 3.0)),
    Vectors.sparse(8, Array(5), Array(4.0)),
    Vectors.sparse(11, Array(8, 9), Array(7.0, 8.0))
  )))

  private def featurizeUsing(interactions: VowpalWabbitInteractions) =
    interactions.transform(df).head().getAs[SparseVector]("features")

  private def verifyValues(actual: SparseVector, expected: Array[Double]) = {
    assert(actual.numNonzeros == expected.length)

    assert((actual.values.sorted zip expected.sorted).forall { case (x, y) => x == y })
  }

  test("Verify VowpalWabbit Interactions 3-dense x 1-sparse") {
    val interactions = new VowpalWabbitInteractions()
      .setInputCols(Array("v1", "v2"))
      .setOutputCol("features")

    val v = featurizeUsing(interactions)

    verifyValues(v, Array(4.0, 8, 12.0))
  }

  test("Verify VowpalWabbit Interactions 1-sparse x 2-sparse") {
    val interactions = new VowpalWabbitInteractions()
      .setInputCols(Array("v2", "v3"))
      .setOutputCol("features")

    val v = featurizeUsing(interactions)

    verifyValues(v, Array(28.0, 32.0))
  }

  test("Verify VowpalWabbit Interactions 3-dense x 1-sparse x 2-sparse") {
    val interactions = new VowpalWabbitInteractions()
      .setInputCols(Array("v1", "v2", "v3"))
      .setOutputCol("features")

    val v = featurizeUsing(interactions)

    verifyValues(v, Array(
      1.0 * 5 * 7, 1 * 5 * 8.0,
      2.0 * 5 * 7, 2 * 5 * 8.0,
      3.0 * 5 * 7, 3 * 5 * 8.0
    ))
  }

  def testObjects(): Seq[TestObject[VowpalWabbitInteractions]] = List(new TestObject(
    new VowpalWabbitInteractions().setInputCols(Array("v1")).setOutputCol("out"), df))

  override def reader: MLReadable[_] = VowpalWabbitInteractions
} 
Example 9
Source File: VerifyVowpalWabbitRegressorFuzzing.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.test.benchmarks.DatasetUtils
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.{Column, DataFrame}

class VerifyVowpalWabbitRegressorFuzzing extends EstimatorFuzzing[VowpalWabbitRegressor] {
  val numPartitions = 2

  
  def readCSV(fileName: String, fileLocation: String): DataFrame = {
    session.read
      .option("header", "true").option("inferSchema", "true")
      .option("treatEmptyValuesAsNulls", "false")
      .option("delimiter", if (fileName.endsWith(".csv")) "," else "\t")
      .csv(fileLocation)
  }

  override def reader: MLReadable[_] = VowpalWabbitRegressor

  override def modelReader: MLReadable[_] = VowpalWabbitRegressionModel

  override def testObjects(): Seq[TestObject[VowpalWabbitRegressor]] = {
    val fileName = "energyefficiency2012_data.train.csv"
    val columnsFilter = Some("X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2")
    val labelCol = "Y1"

    val fileLocation = DatasetUtils.regressionTrainFile(fileName).toString
    val readDataset = readCSV(fileName, fileLocation).repartition(numPartitions)
    val dataset =
      if (columnsFilter.isDefined) {
        readDataset.select(columnsFilter.get.split(",").map(new Column(_)): _*)
      } else {
        readDataset
      }

    val featuresColumn = "features"

    val featurizer = new VowpalWabbitFeaturizer()
      .setInputCols(dataset.columns.filter(col => col != labelCol))
      .setOutputCol("features")

    val vw = new VowpalWabbitRegressor()
    val predCol = "pred"
    val trainData = featurizer.transform(dataset)
    val model = vw.setLabelCol(labelCol)
      .setFeaturesCol("features")
      .setPredictionCol(predCol)
      .fit(trainData)

    Seq(new TestObject(
      vw,
      trainData))
  }
} 
Example 10
Source File: StratifiedRepartitionSuite.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.TaskContext
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

class StratifiedRepartitionSuite extends TestBase with TransformerFuzzing[StratifiedRepartition] {

  import session.implicits._

  val values = "values"
  val colors = "colors"
  val const = "const"

  lazy val input = Seq(
    (0, "Blue", 2),
    (0, "Red", 2),
    (0, "Green", 2),
    (1, "Purple", 2),
    (1, "Orange", 2),
    (1, "Indigo", 2),
    (2, "Violet", 2),
    (2, "Black", 2),
    (2, "White", 2),
    (3, "Gray", 2),
    (3, "Yellow", 2),
    (3, "Cerulean", 2)
  ).toDF(values, colors, const)

  test("Assert doing a stratified repartition will ensure all keys exist across all partitions") {
    val inputSchema = new StructType()
      .add(values, IntegerType).add(colors, StringType).add(const, IntegerType)
    val inputEnc = RowEncoder(inputSchema)
    val valuesFieldIndex = inputSchema.fieldIndex(values)
    val numPartitions = 3
    val trainData = input.repartition(numPartitions).select(values, colors, const)
      .mapPartitions(iter => {
        val ctx = TaskContext.get
        val partId = ctx.partitionId
        // Remove all instances of 0 class on partition 1
        if (partId == 1) {
          iter.flatMap(row => {
            if (row.getInt(valuesFieldIndex) <= 0)
              None
            else Some(row)
          })
        } else {
          // Add back at least 3 instances on other partitions
          val oneOfEachExample = List(Row(0, "Blue", 2), Row(1, "Purple", 2), Row(2, "Black", 2), Row(3, "Gray", 2))
          (iter.toList.union(oneOfEachExample).union(oneOfEachExample).union(oneOfEachExample)).toIterator
        }
      })(inputEnc).cache()
    // Some debug to understand what data is on which partition
    trainData.foreachPartition { rows =>
      rows.foreach { row =>
        val ctx = TaskContext.get
        val partId = ctx.partitionId
        println(s"Row: $row partition id: $partId")
      }
    }
    val stratifiedInputData = new StratifiedRepartition().setLabelCol(values)
      .setMode(SPConstants.Equal).transform(trainData)
    // Assert stratified data contains all keys across all partitions, with extra count
    // for it to be evaluated
    stratifiedInputData
      .mapPartitions(iter => {
        val actualLabels = iter.map(row => row.getInt(valuesFieldIndex))
          .toArray.distinct.sorted.toList
        val expectedLabels = (0 to 3).toList
        if (actualLabels != expectedLabels)
          throw new Exception(s"Missing labels, actual: $actualLabels, expected: $expectedLabels")
        iter
      })(inputEnc).count()
    val stratifiedMixedInputData = new StratifiedRepartition().setLabelCol(values)
      .setMode(SPConstants.Mixed).transform(trainData)
    assert(stratifiedMixedInputData.count() >= trainData.count())
    val stratifiedOriginalInputData = new StratifiedRepartition().setLabelCol(values)
      .setMode(SPConstants.Original).transform(trainData)
    assert(stratifiedOriginalInputData.count() == trainData.count())
  }

  def testObjects(): Seq[TestObject[StratifiedRepartition]] = List(new TestObject(
    new StratifiedRepartition().setLabelCol(values).setMode(SPConstants.Equal), input))

  def reader: MLReadable[_] = StratifiedRepartition
} 
Example 11
Source File: SuperpixelTransformerSuite.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.lime

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.image.NetworkUtils
import com.microsoft.ml.spark.io.split1.FileReaderUtils
import org.apache.spark.ml.util.MLReadable

class SuperpixelTransformerSuite extends TransformerFuzzing[SuperpixelTransformer]
  with NetworkUtils with FileReaderUtils {
  lazy val spt: SuperpixelTransformer = new SuperpixelTransformer().setInputCol(inputCol)

  test("basic functionality"){
    val results = spt.transform(images)
    val superpixels = SuperpixelData.fromRow(results.collect()(0).getStruct(1))
    assert(superpixels.clusters.length === 3)
    assert(superpixels.clusters.head.length == 310)
  }

  override def testObjects(): Seq[TestObject[SuperpixelTransformer]] =
    Seq(new TestObject(spt, images))

  override def reader: MLReadable[_] = SuperpixelTransformer
} 
Example 12
Source File: SimpleHTTPTransformerSuite.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.io.split1

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.io.http.{HandlingUtils, JSONOutputParser, SimpleHTTPTransformer}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{StringType, StructType}

class SimpleHTTPTransformerSuite
  extends TransformerFuzzing[SimpleHTTPTransformer] with WithServer {

  import session.implicits._

  lazy val df: DataFrame = sc.parallelize((1 to 10).map(Tuple1(_))).toDF("data")

  def simpleTransformer: SimpleHTTPTransformer =
    new SimpleHTTPTransformer()
      .setInputCol("data")
      .setOutputParser(new JSONOutputParser()
        .setDataType(new StructType().add("blah", StringType)))
      .setUrl(url)
      .setOutputCol("results")

  test("HttpTransformerTest") {
    val results = simpleTransformer.transform(df).collect
    assert(results.length == 10)
    results.foreach(r =>
      assert(r.getStruct(2).getString(0) === "more blah"))
    assert(results(0).schema.fields.length == 3)
  }

  test("HttpTransformerTest with Flaky Connection") {
    lazy val df2: DataFrame = sc.parallelize((1 to 5).map(Tuple1(_))).toDF("data")
    val results = simpleTransformer
      .setUrl(url + "/flaky")
      .setTimeout(1)
      .transform(df2).collect
    assert(results.length == 5)
  }

  test("Basic Handling") {
    val results = simpleTransformer
      .setHandler(HandlingUtils.basic)
      .transform(df).collect
    assert(results.length == 10)
    results.foreach(r =>
      assert(r.getStruct(2).getString(0) === "more blah"))
    assert(results(0).schema.fields.length == 3)
  }

  test("Concurrent HttpTransformerTest") {
    val results =
      new SimpleHTTPTransformer()
        .setInputCol("data")
        .setOutputParser(new JSONOutputParser()
          .setDataType(new StructType().add("blah", StringType)))
        .setUrl(url)
        .setOutputCol("results")
        .setConcurrency(3)
        .transform(df)
        .collect
    assert(results.length == 10)
    assert(results.forall(_.getStruct(2).getString(0) == "more blah"))
    assert(results(0).schema.fields.length == 3)
  }

  override def testObjects(): Seq[TestObject[SimpleHTTPTransformer]] =
    Seq(new TestObject(simpleTransformer, df))

  override def reader: MLReadable[_] = SimpleHTTPTransformer

} 
Example 13
Source File: ParserSuite.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.io.split1

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.io.http._
import org.apache.http.client.methods.HttpPost
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, SparkSession}

trait ParserUtils extends WithServer {

  def sampleDf(spark: SparkSession): DataFrame = {
    val df = spark.createDataFrame((1 to 10).map(Tuple1(_)))
      .toDF("data")
    val df2 = new JSONInputParser().setInputCol("data")
      .setOutputCol("parsedInput").setUrl(url)
      .transform(df)
      .withColumn("unparsedOutput", udf({ x: Int =>
        HTTPResponseData(
          Array(),
          Some(EntityData(
            "{\"foo\": \"here\"}".getBytes, None, None, None, false, false, false)),
          StatusLineData(ProtocolVersionData("foo", 1, 1), 200, "bar"),
          "en")
      }).apply(col("data"))
      )

    new JSONOutputParser()
      .setDataType(new StructType().add("foo", StringType))
      .setInputCol("unparsedOutput")
      .setOutputCol("parsedOutput")
      .transform(df2)
  }

  def makeTestObject[T <: Transformer](t: T, session: SparkSession): Seq[TestObject[T]] = {
    Seq(new TestObject(t, sampleDf(session)))
  }

}

class JsonInputParserSuite extends TransformerFuzzing[JSONInputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[JSONInputParser]] = makeTestObject(
    new JSONInputParser().setInputCol("data").setOutputCol("out")
      .setUrl(url), session)

  override def reader: MLReadable[_] = JSONInputParser
}

class JsonOutputParserSuite extends TransformerFuzzing[JSONOutputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[JSONOutputParser]] = makeTestObject(
    new JSONOutputParser().setInputCol("unparsedOutput").setOutputCol("out")
      .setDataType(new StructType().add("foo", StringType)), session)

  override def reader: MLReadable[_] = JSONOutputParser
}

class StringOutputParserSuite extends TransformerFuzzing[StringOutputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[StringOutputParser]] = makeTestObject(
    new StringOutputParser().setInputCol("unparsedOutput").setOutputCol("out"), session)

  override def reader: MLReadable[_] = StringOutputParser
}

class CustomInputParserSuite extends TransformerFuzzing[CustomInputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[CustomInputParser]] = makeTestObject(
    new CustomInputParser().setInputCol("data").setOutputCol("out")
      .setUDF({ x: Int => new HttpPost(s"http://$x") }), session)

  override def reader: MLReadable[_] = CustomInputParser
}

class CustomOutputParserSuite extends TransformerFuzzing[CustomOutputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[CustomOutputParser]] = makeTestObject(
    new CustomOutputParser().setInputCol("unparsedOutput").setOutputCol("out")
      .setUDF({ x: HTTPResponseData => x.locale }), session)

  override def reader: MLReadable[_] = CustomOutputParser
} 
Example 14
Source File: MultilayerPerceptronClassifierWrapper.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  val weights: Array[Double] = mlpModel.weights.toArray
  val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  
  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"
  val PREDICTED_LABEL_COL = "prediction"

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper]{

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
} 
Example 15
Source File: SerializableSparkModel.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import ai.deepsense.sparkutils.ML

class SerializableSparkModel[M <: Model[M]](val sparkModel: M)
  extends ML.Model[SerializableSparkModel[M]]
  with MLWritable {

  override def copy(extra: ParamMap): SerializableSparkModel[M] =
    new SerializableSparkModel(sparkModel.copy(extra))

  override def write: MLWriter = {
    sparkModel match {
      case w: MLWritable => w.write
      case _ => new DefaultMLWriter(this)
    }
  }

  override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset)

  override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema)

  override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae"
}

// This class may seem unused, but it is used reflectively by spark deserialization mechanism
object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] {
  override def read: MLReader[SerializableSparkModel[_]] = {
    new DefaultMLReader[SerializableSparkModel[_]]()
  }
} 
Example 16
Source File: MultilayerPerceptronClassifierWrapper.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  private val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  lazy val weights: Array[Double] = mlpModel.weights.toArray
  lazy val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  
  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"
  val PREDICTED_LABEL_COL = "prediction"

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper]{

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
} 
Example 17
Source File: ModelUtils.scala    From albedo   with MIT License
package ws.vinta.albedo.utils

import org.apache.hadoop.mapred.InvalidInputException
import org.apache.spark.ml.util.{MLReadable, MLWritable}

object ModelUtils {
  def loadOrCreateModel[T <: MLWritable](ModelClass: MLReadable[T], path: String, createModelFunc: () => T): T = {
    try {
      ModelClass.load(path)
    } catch {
      case e: InvalidInputException => {
        if (e.getMessage.contains("Input path does not exist")) {
          val model = createModelFunc()
          model.write.overwrite().save(path)
          model
        } else {
          throw e
        }
      }
    }
  }
} 
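A possible way to call the helper above (a sketch only; the CountVectorizerModel choice, the toy DataFrame, and the path are illustrative assumptions, not part of the albedo project):

import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.sql.SparkSession
import ws.vinta.albedo.utils.ModelUtils

object LoadOrCreateExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("loadOrCreateModel").getOrCreate()
    import spark.implicits._

    // Toy training data with the "words" column used by the featurizer below.
    val trainingDF = Seq(
      Array("spark", "ml", "readable"),
      Array("spark", "writable")
    ).toDF("words")

    // First run fits and saves the model; later runs load it from the (hypothetical) path.
    val cvModel: CountVectorizerModel = ModelUtils.loadOrCreateModel(
      CountVectorizerModel,
      "/tmp/cv-model",
      () => new CountVectorizer().setInputCol("words").setOutputCol("features").fit(trainingDF)
    )

    println(cvModel.vocabulary.mkString(", "))
  }
}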
Example 18
Source File: SerializableSparkModel.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkModel[M <: Model[M]](val sparkModel: M)
  extends ML.Model[SerializableSparkModel[M]]
  with MLWritable {

  override def copy(extra: ParamMap): SerializableSparkModel[M] =
    new SerializableSparkModel(sparkModel.copy(extra))

  override def write: MLWriter = {
    sparkModel match {
      case w: MLWritable => w.write
      case _ => new DefaultMLWriter(this)
    }
  }

  override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset)

  override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema)

  override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae"
}

// This class may seem unused, but it is used reflectively by spark deserialization mechanism
object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] {
  override def read: MLReader[SerializableSparkModel[_]] = {
    new DefaultMLReader[SerializableSparkModel[_]]()
  }
} 
Example 19
Source File: RankingAdapterSpec.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.recommendation

import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable

class RankingAdapterSpec extends RankingTestBase with EstimatorFuzzing[RankingAdapter] {
  override def testObjects(): Seq[TestObject[RankingAdapter]] = {
    List(new TestObject(adapter, transformedDf))
  }

  override def reader: MLReadable[_] = RankingAdapter

  override def modelReader: MLReadable[_] = RankingAdapterModel
}

class RankingAdapterModelSpec extends RankingTestBase with TransformerFuzzing[RankingAdapterModel] {
  override def testObjects(): Seq[TestObject[RankingAdapterModel]] = {
    val df = transformedDf
    List(new TestObject(adapter.fit(df), df))
  }

  override def reader: MLReadable[_] = RankingAdapterModel
} 
Example 20
Source File: MathUnary.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType}
import org.apache.spark.sql.functions.udf


// Companion object supplying the MLReader used to restore a saved MathUnary.
object MathUnary extends MLReadable[MathUnary] {

  override def read: MLReader[MathUnary] = new MathUnaryReader

  private class MathUnaryReader extends MLReader[MathUnary] {

    private val className = classOf[MathUnary].getName

    override def load(path: String): MathUnary = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      val dataPath = new Path(path, "data").toString

      val data = sparkSession.read.parquet(dataPath).select("operation").head()
      val operation = data.getAs[String](0)

      val model = MathUnaryModel(UnaryOperation.forName(operation))
      val transformer = new MathUnary(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }

} 
Example 21
Source File: MultilayerPerceptronClassifierWrapper.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  val weights: Array[Double] = mlpModel.weights.toArray
  val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  
  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"
  val PREDICTED_LABEL_COL = "prediction"

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper]{

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
} 
Example 22
Source File: SpeechToTextSuite.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.cognitive.split2

import java.net.{URI, URL}

import com.microsoft.ml.spark.Secrets
import com.microsoft.ml.spark.cognitive.{SpeechResponse, SpeechToText}
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.commons.compress.utils.IOUtils
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.{DataFrame, Row}
import org.scalactic.Equality

trait SpeechKey {
  lazy val speechKey = sys.env.getOrElse("SPEECH_API_KEY", Secrets.SpeechApiKey)
}

class SpeechToTextSuite extends TransformerFuzzing[SpeechToText]
  with SpeechKey {

  import session.implicits._

  val region = "eastus"
  val resourcesDir = System.getProperty("user.dir") + "/src/test/resources/"
  val uri = new URI(s"https://$region.api.cognitive.microsoft.com/sts/v1.0/issuetoken")
  val language = "en-us"
  val profanity = "masked"
  val format = "simple"

  lazy val stt = new SpeechToText()
    .setSubscriptionKey(speechKey)
    .setLocation(region)
    .setOutputCol("text")
    .setAudioDataCol("audio")
    .setLanguage("en-US")

  lazy val audioBytes: Array[Byte] = {
    IOUtils.toByteArray(new URL("https://mmlspark.blob.core.windows.net/datasets/Speech/test1.wav").openStream())
  }

  lazy val df: DataFrame = Seq(
    Tuple1(audioBytes)
  ).toDF("audio")

  override lazy val dfEq = new Equality[DataFrame] {
    override def areEqual(a: DataFrame, b: Any): Boolean =
      baseDfEq.areEqual(a.drop("audio"), b.asInstanceOf[DataFrame].drop("audio"))
  }

  override def testSerialization(): Unit = {
    tryWithRetries(Array(0, 100, 100, 100, 100))(super.testSerialization)
  }

  
  def jaccardSimilarity(s1: String, s2: String): Double = {
    val a = Set(s1)
    val b = Set(s2)
    a.intersect(b).size.toDouble / (a | b).size.toDouble
  }

  test("Basic Usage") {
    val toObj: Row => SpeechResponse = SpeechResponse.makeFromRowConverter
    val result = toObj(stt.setFormat("simple")
      .transform(df).select("text")
      .collect().head.getStruct(0))
    result.DisplayText.get.contains("this is a test")
  }

  test("Detailed Usage") {
    val toObj = SpeechResponse.makeFromRowConverter
    val result = toObj(stt.setFormat("detailed")
      .transform(df).select("text")
      .collect().head.getStruct(0))
    result.NBest.get.head.Display.contains("this is a test")
  }

  override def testObjects(): Seq[TestObject[SpeechToText]] =
    Seq(new TestObject(stt, df))

  override def reader: MLReadable[_] = SpeechToText
} 
Example 23
Source File: VerifyIsolationForest.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.isolationforest

import com.microsoft.ml.spark.build.BuildInfo
import com.microsoft.ml.spark.core.env.FileUtilities
import com.microsoft.ml.spark.core.metrics.MetricConstants
import com.microsoft.ml.spark.core.test.benchmarks.Benchmarks
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, Row}
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.scalactic.Tolerance._
import com.microsoft.ml.spark.train.ComputeModelStatistics

case class MammographyRecord(feature0: Double, feature1: Double, feature2: Double, feature3: Double,
                             feature4: Double, feature5: Double, label: Double)
case class ScoringResult(features: Vector, label: Double, predictedLabel: Double, outlierScore: Double)

class VerifyIsolationForest extends Benchmarks with EstimatorFuzzing[IsolationForest] {
  test ("Verify isolationForestMammographyDataTest") {
    import session.implicits._

    val data = loadMammographyData

    // Train a new isolation forest model
    val contamination = 0.02
    val isolationForest = new IsolationForest()
      .setNumEstimators(100)
      .setBootstrap(false)
      .setMaxSamples(256)
      .setMaxFeatures(1.0)
      .setFeaturesCol("features")
      .setPredictionCol("predictedLabel")
      .setScoreCol("outlierScore")
      .setContamination(0.02)
      .setContaminationError(contamination * 0.01)
      .setRandomSeed(1)

    // Score all training data instances using the new model
    val isolationForestModel = isolationForest.fit(data)

    // Calculate area under ROC curve and assert
    val scores = isolationForestModel.transform(data).as[ScoringResult]
    val metrics = new ComputeModelStatistics()
      .setEvaluationMetric(MetricConstants.AucSparkMetric)
      .setLabelCol("label")
      .setScoredLabelsCol("predictedLabel")
      .setScoresCol("outlierScore")
      .transform(scores)

    // Expectation from results in the 2008 "Isolation Forest" paper by F. T. Liu, et al.
    val aurocExpectation = 0.86
    val uncert = 0.02
    val auroc = metrics.first().getDouble(1)
    assert(auroc === aurocExpectation +- uncert, "expected area under ROC =" +
        s" $aurocExpectation +/- $uncert, but observed $auroc")
  }

  def loadMammographyData(): DataFrame = {

    import session.implicits._

    val mammographyRecordSchema = Encoders.product[MammographyRecord].schema

    val fileLocation = FileUtilities.join(BuildInfo.datasetDir,"IsolationForest", "mammography.csv").toString

    // Open source dataset from http://odds.cs.stonybrook.edu/mammography-dataset/
    val rawData = session.read
      .format("csv")
      .option("comment", "#")
      .option("header", "false")
      .schema(mammographyRecordSchema)
      .load(fileLocation)

    val assembler = new VectorAssembler()
      .setInputCols(Array("feature0", "feature1", "feature2", "feature3", "feature4", "feature5"))
      .setOutputCol("features")

    val data = assembler
      .transform(rawData)
      .select("features", "label")

    data
  }

  override def reader: MLReadable[_] = IsolationForest
  override def modelReader: MLReadable[_] = IsolationForestModel

  override def testObjects(): Seq[TestObject[IsolationForest]] = {
    val dataset = loadMammographyData.toDF

    Seq(new TestObject(
      new IsolationForest(),
      dataset))
  }
} 
Example 24
Source File: TextFeaturizerSpec.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.featurize.text

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.feature.{NGram, Tokenizer}
import org.apache.spark.ml.util.MLReadable

class TextFeaturizerSpec extends EstimatorFuzzing[TextFeaturizer]{
  lazy val dfRaw = session
    .createDataFrame(Seq((0, "Hi I"),
                         (1, "I wish for snow today"),
                         (2, "we Cant go to the park, because of the snow!"),
                         (3, "")))
    .toDF("label", "sentence")
  lazy val dfTok = new Tokenizer()
    .setInputCol("sentence")
    .setOutputCol("tokens")
    .transform(dfRaw)
  lazy val dfNgram =
    new NGram().setInputCol("tokens").setOutputCol("ngrams").transform(dfTok)

  test("operate on sentences,tokens,or ngrams") {
    val tfRaw = new TextFeaturizer()
      .setInputCol("sentence")
      .setOutputCol("features")
      .setNumFeatures(20)
    val tfTok = new TextFeaturizer()
      .setUseTokenizer(false)
      .setInputCol("tokens")
      .setOutputCol("features")
      .setNumFeatures(20)
    val tfNgram = new TextFeaturizer()
      .setUseTokenizer(false)
      .setUseNGram(false)
      .setInputCol("ngrams")
      .setOutputCol("features")
      .setNumFeatures(20)

    val dfRaw2 = tfRaw.fit(dfRaw).transform(dfRaw)
    val dfTok2 = tfTok.fit(dfTok).transform(dfTok)
    val dfNgram2 = tfNgram.fit(dfNgram).transform(dfNgram)

    val linesRaw = dfRaw2.getSVCol("features")
    val linesTok = dfTok2.getSVCol("features")
    val linesNgram = dfNgram2.getSVCol("features")

    assert(linesRaw.length == 4)
    assert(linesTok.length == 4)
    assert(linesNgram.length == 4)
    assert(linesRaw(0)(0) == 0.9162907318741551)
    assert(linesTok(1)(9) == 0.5108256237659907)
    assert(linesNgram(2)(7) == 1.8325814637483102)
    assert(linesNgram(3)(1) == 0.0)
  }

  test("throw errors if the schema is incorrect") {
    val tfRaw = new TextFeaturizer()
      .setUseTokenizer(true)
      .setInputCol("sentence")
      .setOutputCol("features")
      .setNumFeatures(20)
    val tfTok = new TextFeaturizer()
      .setUseTokenizer(false)
      .setInputCol("tokens")
      .setOutputCol("features")
      .setNumFeatures(20)
    assertSparkException[IllegalArgumentException](tfRaw.setInputCol("tokens"),           dfTok)
    assertSparkException[IllegalArgumentException](tfRaw.setInputCol("ngrams"),           dfNgram)
    assertSparkException[IllegalArgumentException](tfTok.setInputCol("sentence"),         dfRaw)
    assertSparkException[IllegalArgumentException](tfRaw.setInputCol("tokens_incorrect"), dfTok)
    assertSparkException[IllegalArgumentException](tfRaw.setOutputCol("tokens"),          dfTok)
  }

  override def testObjects(): Seq[TestObject[TextFeaturizer]] =
    List(new TestObject(new TextFeaturizer().setInputCol("sentence"), dfRaw))

  override def reader: MLReadable[_] = TextFeaturizer
  override def modelReader: MLReadable[_] = TextFeaturizerModel
} 
Example 25
Source File: PageSplitterSpec.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.featurize.text

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable

class PageSplitterSpec extends TransformerFuzzing[PageSplitter] {

  import session.implicits._

  lazy val df = Seq(
    "words words  words     wornssaa ehewjkdiw weijnsikjn xnh",
    "s s  s   s     s           s",
    "hsjbhjhnskjhndwjnbvckjbnwkjwenbvfkjhbnwevkjhbnwejhkbnvjkhnbndjkbnd",
    "hsjbhjhnskjhndwjnbvckjbnwkjwenbvfkjhbnwevkjhbnwejhkbnvjkhnbndjkbnd " +
      "190872340870271091309831097813097130i3u709781",
    "",
    null //scalastyle:ignore null
  ).toDF("text")

  lazy val t = new PageSplitter()
    .setInputCol("text")
    .setMaximumPageLength(20)
    .setMinimumPageLength(10)
    .setOutputCol("pages")

  test("Basic usage") {
    val resultList = t.transform(df).collect().toList
    resultList.dropRight(1).foreach { row =>
      val pages = row.getSeq[String](1).toList
      val text = row.getString(0)
      assert(pages.mkString("") === text)
      assert(pages.forall(_.length <= 20))
      assert(pages.dropRight(1).forall(_.length >= 10))
    }
  }

  override def testObjects(): Seq[TestObject[PageSplitter]] =
    List(new TestObject(t, df))

  override def reader: MLReadable[_] = PageSplitter

} 
Example 26
Source File: MultiNGramSpec.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.featurize.text

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.util.MLReadable

import scala.collection.mutable

class MultiNGramSpec extends TransformerFuzzing[MultiNGram] {

  lazy val dfRaw = session
    .createDataFrame(Seq(
      (0, "Hi I"),
      (1, "I wish for snow today"),
      (2, "we Cant go to the park, because of the snow!"),
      (3, ""),
      (4, (1 to 10).map(_.toString).mkString(" "))
    ))
    .toDF("label", "sentence")
  lazy val dfTok = new Tokenizer()
    .setInputCol("sentence")
    .setOutputCol("tokens")
    .transform(dfRaw)

  lazy val t = new MultiNGram()
    .setLengths(Array(1, 3, 4)).setInputCol("tokens").setOutputCol("ngrams")
  lazy val dfNgram = t.transform(dfTok)

  test("operate on tokens ") {
    val grams = dfNgram.collect().last.getAs[Seq[String]]("ngrams").toSet
    assert(grams("1 2 3 4"))
    assert(grams("4"))
    assert(grams("2 3 4"))
    assert(grams.size == 25)
  }

  override def testObjects(): Seq[TestObject[MultiNGram]] =
    List(new TestObject(t, dfTok))

  override def reader: MLReadable[_] = MultiNGram

} 
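MultiNGram produces n-grams of several lengths in a single output column. A minimal standalone sketch (not from the project) of the tokenize-then-ngram flow used in the spec follows; the SparkSession setup and the sample sentence are assumptions, while the Tokenizer and MultiNGram calls mirror the ones above.

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.Tokenizer
import com.microsoft.ml.spark.featurize.text.MultiNGram

object MultiNGramSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("MultiNGramSketch").getOrCreate()
  import spark.implicits._

  // hypothetical labelled sentence
  val dfRaw = Seq((0, "I wish for snow today")).toDF("label", "sentence")
  val dfTok = new Tokenizer().setInputCol("sentence").setOutputCol("tokens").transform(dfRaw)

  // emit unigrams, trigrams and 4-grams together, as in the spec
  val ngrams = new MultiNGram()
    .setLengths(Array(1, 3, 4))
    .setInputCol("tokens")
    .setOutputCol("ngrams")

  ngrams.transform(dfTok).show(truncate = false)
  spark.stop()
}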
Example 27
Source File: PartitionConsolidatorSuite.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.flaky

import com.microsoft.ml.spark.core.test.base.TimeLimitedFlaky
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.io.http.PartitionConsolidator
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{DoubleType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.scalatest.Assertion

class PartitionConsolidatorSuite extends TransformerFuzzing[PartitionConsolidator] with TimeLimitedFlaky {

  import session.implicits._

  override val numCores: Option[Int] = Some(2)

  lazy val df: DataFrame = (1 to 1000).toDF("values")

  override val sortInDataframeEquality: Boolean = true

  override def testObjects(): Seq[TestObject[PartitionConsolidator]] = Seq(
    new TestObject(new PartitionConsolidator(), df))

  override def reader: MLReadable[_] = PartitionConsolidator

  def getPartitionDist(df: DataFrame): List[Int] = {
    df.rdd.mapPartitions(it => Iterator(it.length)).collect().toList
  }

  //TODO figure out what is causing the issue on the build server
  override def testSerialization(): Unit = {}

  override def testExperiments(): Unit = {}

  def basicTest(df: DataFrame): Assertion = {
    val pd1 = getPartitionDist(df)
    val newDF = new PartitionConsolidator().transform(df)
    val pd2 = getPartitionDist(newDF)
    assert(pd1.sum === pd2.sum)
    assert(pd2.max >= pd1.max)
    assert(pd1.length === pd2.length)
  }

  test("basic functionality") {
    basicTest(df)
  }

  test("works with more partitions than cores") {
    basicTest(df.repartition(12))
  }

  test("overheads") {
    val baseDF = (1 to 1000).toDF("values").cache()
    println(baseDF.count())

    def getDF: Dataset[Row] = baseDF.map { x => Thread.sleep(10); x }(
      RowEncoder(new StructType().add("values", DoubleType)))

    val t1 = getTime(3)(
      getDF.foreach(_ => ()))._2
    val t2 = getTime(3)(
      new PartitionConsolidator().transform(getDF).foreach(_ => ()))._2

    println(t2.toDouble / t1.toDouble)
    assert(t2.toDouble / t1.toDouble < 3.0)
  }

  test("works with more partitions than cores2") {
    basicTest(df.repartition(100))
  }

  test("work with 1 partition") {
    basicTest(df.repartition(1))
  }

} 
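The suite above verifies that PartitionConsolidator preserves all rows while concentrating them onto fewer active partitions, even when there are more partitions than cores. The following is a minimal standalone sketch (not from the project) of that check, assuming a local SparkSession with two cores and synthetic data; the transformer call is the same no-argument usage shown in the suite.

import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.io.http.PartitionConsolidator

object PartitionConsolidatorSketch extends App {
  val spark = SparkSession.builder().master("local[2]").appName("PartitionConsolidatorSketch").getOrCreate()
  import spark.implicits._

  // hypothetical data spread over more partitions than cores
  val df = (1 to 1000).toDF("values").repartition(12)

  // funnel the rows onto fewer active partitions; no rows are lost in the process
  val consolidated = new PartitionConsolidator().transform(df)
  println(s"rows preserved: ${consolidated.count() == df.count()}")
  spark.stop()
}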
Example 28
Source File: DropColumnsSuite.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable

class DropColumnsSuite extends TestBase with TransformerFuzzing[DropColumns] {

  import session.implicits._

  test("Drop no columns in a data frame") {
    val input = makeBasicDF()
    val result = new DropColumns()
      .setCols(Array())
      .transform(input)
    assert(verifyResult(input, result))
  }

  test("Drop all but two columns in a data frame") {
    val keep = Set("words", "more")
    val input = makeBasicDF()
    val expected = Seq(
      ("guitars", "drums"),
      ("piano", "trumpet"),
      ("bass", "cymbals")
    ).toDF("words", "more")
    val result = new DropColumns()
      .setCols(input.columns.filterNot(keep.contains))
      .transform(makeBasicDF())
    assert(verifyResult(expected, result))
  }

  test("Drop columns with spaces") {
    val result = new DropColumns()
      .setCols(Array("Scored Labels"))
      .transform(makeBasicDF().withColumnRenamed("more", "Scored Labels"))
    assert(verifyResult(makeBasicDF().drop("more"), result))
  }

  test("Invalid column specified") {
    try {
      new DropColumns().setCol("four").transform(makeBasicDF())
      fail()
    } catch {
      case _: NoSuchElementException =>
    }
  }

  def testObjects(): Seq[TestObject[DropColumns]] = List(new TestObject(
    new DropColumns().setCol("numbers"), makeBasicDF()))

  override def reader: MLReadable[_] = DropColumns

} 
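DropColumns removes columns by name as a pipeline stage. A minimal standalone sketch (not from the project) follows; the SparkSession setup and the small DataFrame are assumptions, and the setCol/setCols calls mirror those in the suite.

import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.stages.DropColumns

object DropColumnsSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("DropColumnsSketch").getOrCreate()
  import spark.implicits._

  // hypothetical input frame
  val df = Seq((0, "guitars", "drums"), (1, "piano", "trumpet")).toDF("numbers", "words", "more")

  // drop a single column by name; setCols(Array(...)) drops several at once
  new DropColumns().setCol("numbers").transform(df).show()
  spark.stop()
}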
Example 29
Source File: RankingTrainValidationSpec.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.recommendation

import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing}
import org.apache.spark.ml.recommendation.ALSModel
import org.apache.spark.ml.util.MLReadable

class RankingTrainValidationSplitSpec extends RankingTestBase with EstimatorFuzzing[RankingTrainValidationSplit] {

  test("testALS") {

    val tvRecommendationSplit = new RankingTrainValidationSplit()
      .setEstimator(als)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setTrainRatio(0.8)
      .setUserCol(recommendationIndexer.getUserOutputCol)
      .setItemCol(recommendationIndexer.getItemOutputCol)
      .setRatingCol("rating")

    val tvModel = tvRecommendationSplit.fit(transformedDf)

    val model = tvModel.getBestModel.asInstanceOf[ALSModel]

    val items = model.recommendForAllUsers(3)
    assert(items.collect()(0)(0) == 1)

    val users = model.recommendForAllItems(3)
    assert(users.collect()(0)(0) == 4)

  }

  override def testObjects(): Seq[TestObject[RankingTrainValidationSplit]] = {
    List(new TestObject(rankingTrainValidationSplit, transformedDf))
  }

  override def reader: MLReadable[_] = RankingTrainValidationSplit

  override def modelReader: MLReadable[_] = RankingTrainValidationSplitModel
}

class RankingTrainValidationSplitModelSpec extends RankingTestBase with
  TransformerFuzzing[RankingTrainValidationSplitModel] {
  override def testObjects(): Seq[TestObject[RankingTrainValidationSplitModel]] = {
    List(new TestObject(rankingTrainValidationSplit.fit(transformedDf), transformedDf))
  }

  override def reader: MLReadable[_] = RankingTrainValidationSplitModel
} 
Example 30
Source File: ClassBalancerSuite.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame

class ClassBalancerSuite extends EstimatorFuzzing[ClassBalancer] {
  lazy val df: DataFrame = session
    .createDataFrame(Seq((0, 1.0, "Hi I"),
                         (1, 1.0, "I wish for snow today"),
                         (2, 2.0, "I wish for snow today"),
                         (3, 2.0, "I wish for snow today"),
                         (4, 2.0, "I wish for snow today"),
                         (5, 2.0, "I wish for snow today"),
                         (6, 0.0, "I wish for snow today"),
                         (7, 1.0, "I wish for snow today"),
                         (8, 0.0, "we Cant go to the park, because of the snow!"),
                         (9, 2.0, "")))
    .toDF("index", "label", "sentence")

  val reader: MLReadable[_] = ClassBalancer
  val modelReader: MLReadable[_] = ClassBalancerModel
  override def testObjects(): Seq[TestObject[ClassBalancer]] = Seq(new TestObject[ClassBalancer](new ClassBalancer()
    .setInputCol("label"), df))

  test("yield proper weights") {
    val model = new ClassBalancer()
      .setInputCol("label").fit(df)
    val df2 = model.transform(df)
    df2.show()

    assert(df2.collect()(8).getDouble(3) == 2.5)
    assert(df2.schema.fields.toSet == model.transformSchema(df.schema).fields.toSet)
  }
} 
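ClassBalancer is an estimator that derives per-row weights from the label distribution, giving rarer classes larger weights. Below is a minimal standalone sketch (not from the project) of the fit/transform pattern used in the suite; the SparkSession setup and the small imbalanced dataset are assumptions for illustration.

import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.stages.ClassBalancer

object ClassBalancerSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("ClassBalancerSketch").getOrCreate()
  import spark.implicits._

  // hypothetical imbalanced labels: class 0.0 is rarest, class 2.0 most common
  val df = Seq((0, 1.0), (1, 1.0), (2, 2.0), (3, 2.0), (4, 2.0), (5, 0.0)).toDF("index", "label")

  val model = new ClassBalancer().setInputCol("label").fit(df)
  model.transform(df).show()   // adds a weight column alongside the inputs
  spark.stop()
}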
Example 31
Source File: ExplodeSuite.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame

class ExplodeSuite extends TransformerFuzzing[Explode] {

  import session.implicits._
  lazy val df: DataFrame = Seq(
    (0, Seq("guitars", "drums")),
    (1, Seq("piano")),
    (2, Seq()))
    .toDF("numbers", "words")

  lazy val t: Explode = new Explode().setInputCol("words").setOutputCol("exploded")

  test("Basic usage") {
    val df2 = t.transform(df)
    df2.show()
    assert(df2.columns.length == 3)
    assert(df2.count() == 3)
    assert(df2.select("exploded").collect().map(_.getString(0))===Array("guitars", "drums", "piano"))
  }

  override def testObjects(): Seq[TestObject[Explode]] = Seq(new TestObject(t, df))

  override def reader: MLReadable[_] = Explode

} 
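Explode turns each element of an array column into its own row. A minimal standalone sketch (not from the project) follows; the SparkSession setup and sample rows are assumptions, and the setters match the ones used in the suite.

import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.stages.Explode

object ExplodeSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("ExplodeSketch").getOrCreate()
  import spark.implicits._

  // hypothetical array-valued column
  val df = Seq((0, Seq("guitars", "drums")), (1, Seq("piano"))).toDF("numbers", "words")

  // each array element becomes a separate row in the "exploded" column
  new Explode().setInputCol("words").setOutputCol("exploded").transform(df).show()
  spark.stop()
}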
Example 32
Source File: SummarizeDataSuite.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import java.io.File

import org.apache.spark.ml.util.MLReadable

class SummarizeDataSuite extends TransformerFuzzing[SummarizeData] {

  test("Smoke test for summarizing basic DF - schema transform") {

    val input = makeBasicDF()
    val summary = new SummarizeData()
    val result = summary.transformSchema(input.schema)
    assert(result.length > 10)
  }

  test("Smoke test for summary params") {
    val s = new SummarizeData()
    assert(s.params.length == 5)
    assert(s.params.map(s.isSet).toSeq == (1 to s.params.length).map(i => false))

    val sNoCounts = s.setCounts(false).setPercentiles(false)
    assert(sNoCounts.params.map(sNoCounts.isSet).toSeq === Seq(false, true, false, true, false))
  }

  test("Smoke test for summarizing basic DF") {
    val input = makeBasicDF()
    val summary = new SummarizeData()
    val result = summary.transform(input)
    assert(result.count === input.columns.length)
    assert(result.columns.length > 18)
  }

  test("Smoke test for summarizing missings DF") {
    val input = makeBasicNullableDF()
    val summary = new SummarizeData()
    val result = summary.transform(input)
    assert(result.count === input.columns.length)
    assert(result.columns.length > 18)
  }

  test("Smoke test for subset summarizing missings DF") {
    val input = makeBasicNullableDF()
    val summary = new SummarizeData().setPercentiles(false).setCounts(false)
    val result = summary.transform(input)
    assert(result.count === input.columns.length)
    assert(result.columns.length < 11)
  }

  override def testObjects(): Seq[TestObject[SummarizeData]] = Seq(new TestObject(
    new SummarizeData(),
    makeBasicDF()
  ))

  override def reader: MLReadable[_] = SummarizeData

} 
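SummarizeData emits one summary row per input column, and the counts/percentiles blocks can be switched off to narrow the output. Below is a minimal standalone sketch (not from the project) of both modes; the SparkSession setup and sample DataFrame are assumptions, while the setCounts/setPercentiles calls come from the suite above.

import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.stages.SummarizeData

object SummarizeDataSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("SummarizeDataSketch").getOrCreate()
  import spark.implicits._

  // hypothetical mixed-type input
  val df = Seq((1, 2.0, "a"), (2, 3.5, "b"), (3, 7.0, "c")).toDF("numbers", "values", "words")

  // full summary: one row per column
  new SummarizeData().transform(df).show(truncate = false)

  // narrower summary with counts and percentiles disabled
  new SummarizeData().setCounts(false).setPercentiles(false).transform(df).show(truncate = false)
  spark.stop()
}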
Example 33
Source File: RenameColumnSuite.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable

class RenameColumnSuite extends TestBase with TransformerFuzzing[RenameColumn] {

  test("Rename columns in a data frame") {
    val base = makeBasicDF()
    val result = new RenameColumn().setInputCol("words").setOutputCol("out").transform(base)
    val expected = base.withColumnRenamed("words", "out")
    assert(verifyResult(expected, result))
  }

  test("Rename columns with outputColumn as existing column") {
    val base = makeBasicDF()
    val result = new RenameColumn().setInputCol("words").setOutputCol("numbers").transform(base)
    val expected = base.withColumnRenamed("words", "numbers")
    assert(verifyResult(expected, result))
  }

  def testObjects(): Seq[TestObject[RenameColumn]] = List(new TestObject(
    new RenameColumn().setInputCol("numbers").setOutputCol("out"), makeBasicDF()))

  override def reader: MLReadable[_] = RenameColumn

} 
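RenameColumn behaves like withColumnRenamed but packaged as a pipeline stage. A minimal standalone sketch (not from the project) follows; the SparkSession setup and sample rows are assumptions, and the setters mirror the suite above.

import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.stages.RenameColumn

object RenameColumnSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("RenameColumnSketch").getOrCreate()
  import spark.implicits._

  // hypothetical input frame
  val df = Seq((0, "guitars"), (1, "piano")).toDF("numbers", "words")

  // equivalent to df.withColumnRenamed("words", "out"), but usable inside an ML Pipeline
  new RenameColumn().setInputCol("words").setOutputCol("out").transform(df).show()
  spark.stop()
}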
Example 34
Source File: RepartitionSuite.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.MLReadable

class RepartitionSuite extends TestBase with TransformerFuzzing[Repartition] {

  import session.implicits._

  lazy val input = Seq(
    (0, "guitars", "drums"),
    (1, "piano", "trumpet"),
    (2, "bass", "cymbals"),
    (3, "guitars", "drums"),
    (4, "piano", "trumpet"),
    (5, "bass", "cymbals"),
    (6, "guitars", "drums"),
    (7, "piano", "trumpet"),
    (8, "bass", "cymbals"),
    (9, "guitars", "drums"),
    (10, "piano", "trumpet"),
    (11, "bass", "cymbals")
  ).toDF("numbers", "words", "more")

  test("Work for several values of n") {

    def test(n: Int): Unit = {
      val result = new Repartition()
        .setN(n)
        .transform(input)
      assert(result.rdd.getNumPartitions == n)
      ()
    }
    List(1, 2, 3, 10).foreach(test)

  }

  test("Should allow a user to set the partitions specifically in pipeline transform") {
    val r = new Repartition().setN(1)
    val pipe = new Pipeline().setStages(Array(r))
    val fitPipe = pipe.fit(input)
    assert(fitPipe.transform(input).rdd.getNumPartitions == 1)
    assert(fitPipe.transform(input, ParamMap(r.n -> 5)).rdd.getNumPartitions == 5)
  }

  def testObjects(): Seq[TestObject[Repartition]] = List(new TestObject(
    new Repartition().setN(1), input))

  def reader: MLReadable[_] = Repartition
} 
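Repartition sets the partition count of a DataFrame as a pipeline stage, and the suite above shows that the count can still be overridden per transform via a ParamMap. Below is a minimal standalone sketch (not from the project) of both usages; the SparkSession setup and the toy data are assumptions.

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.param.ParamMap
import com.microsoft.ml.spark.stages.Repartition

object RepartitionSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("RepartitionSketch").getOrCreate()
  import spark.implicits._

  // hypothetical input
  val df = (1 to 12).toDF("numbers")

  val r = new Repartition().setN(3)
  println(r.transform(df).rdd.getNumPartitions)   // 3

  // inside a pipeline the partition count can be overridden at transform time
  val fitPipe = new Pipeline().setStages(Array(r)).fit(df)
  println(fitPipe.transform(df, ParamMap(r.n -> 5)).rdd.getNumPartitions)   // 5
  spark.stop()
}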
Example 35
Source File: UnicodeNormalizeSuite.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame

class UnicodeNormalizeSuite extends TestBase with TransformerFuzzing[UnicodeNormalize] {
  val inputCol = "words1"
  val outputCol = "norm1"

  //scalastyle:off null
  lazy val wordDF = session.createDataFrame(Seq(
    ("Schön", 1),
    ("Scho\u0308n", 1),
    (null, 1)))
    .toDF(inputCol, "dummy")

  lazy val expectedResultComposed = session.createDataFrame(Seq(
    ("Schön", 1, "schön"),
    ("Scho\u0308n", 1, "schön"),
    (null, 1, null)))
    .toDF(inputCol, "dummy", outputCol)

  lazy val expectedResultDecomposed = session.createDataFrame(Seq(
    ("Schön", 1, "sch\u0308n"),
    ("Scho\u0308n", 1, "sch\u0308n"),
    (null, 1, null)))
    .toDF(inputCol, "dummy", outputCol)
  //scalastyle:on null

  private def testForm(form: String, expected: DataFrame) = {
    val unicodeNormalize = new UnicodeNormalize()
      .setForm(form)
      .setInputCol(inputCol)
      .setOutputCol(outputCol)
    val result = unicodeNormalize.transform(wordDF)
    assert(verifyResult(result, expected))
  }

  test("Check for NFC forms") { testForm("NFC", expectedResultComposed) }

  test("Check for NFKC forms") { testForm("NFKC", expectedResultComposed) }

  test("Check for NFD forms") { testForm("NFD", expectedResultDecomposed) }

  test("Check for NFKD forms") { testForm("NFKD", expectedResultDecomposed) }

  def testObjects(): Seq[TestObject[UnicodeNormalize]] = List(new TestObject(
    new UnicodeNormalize().setInputCol("words").setOutputCol("out"), makeBasicDF()))

  override def reader: MLReadable[_] = UnicodeNormalize

}
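UnicodeNormalize maps both precomposed and combining-character spellings to a single normalization form. The following is a minimal standalone sketch (not from the project) of the NFC case; the SparkSession setup and the two sample spellings are assumptions, and the setters are those used in the suite.

import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.stages.UnicodeNormalize

object UnicodeNormalizeSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("UnicodeNormalizeSketch").getOrCreate()
  import spark.implicits._

  // "Schön" spelled precomposed and with a combining diaeresis normalize to the same NFC string
  val df = Seq(("Schön", 1), ("Scho\u0308n", 1)).toDF("words1", "dummy")

  new UnicodeNormalize()
    .setForm("NFC")
    .setInputCol("words1")
    .setOutputCol("norm1")
    .transform(df)
    .show()
  spark.stop()
}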