org.apache.spark.ml.util.MLReadable Scala Examples
The following examples show how to use org.apache.spark.ml.util.MLReadable.
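All of these classes follow the same save/load contract: an MLWritable instance persists itself with save, and the matching MLReadable companion object restores it with load. A minimal, self-contained round trip of that contract (the pipeline, data, and path below are illustrative and not taken from any of the examples):

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.sql.SparkSession

// Minimal round trip through MLWritable.save and MLReadable.load; the path is illustrative.
val spark = SparkSession.builder().master("local[*]").appName("mlreadable-demo").getOrCreate()
import spark.implicits._

val df = Seq((0, "hello world"), (1, "spark ml readers")).toDF("id", "text")
val pipeline = new Pipeline().setStages(Array(
  new Tokenizer().setInputCol("text").setOutputCol("tokens")))

val model: PipelineModel = pipeline.fit(df)
model.write.overwrite().save("/tmp/tokenizer-pipeline")                       // MLWritable side
val restored: PipelineModel = PipelineModel.load("/tmp/tokenizer-pipeline")   // MLReadable side
restored.transform(df).show()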
Example 1
Source File: MultilayerPerceptronClassifierWrapper.scala From drizzle-spark with Apache License 2.0 | 8 votes |
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel,
    val labelCount: Long,
    val layers: Array[Int],
    val weights: Array[Double]
  ) extends MLWritable {

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
  }

  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val labelCount = (rMetadata \ "labelCount").extract[Long]
      val layers = (rMetadata \ "layers").extract[Array[Int]]
      val weights = (rMetadata \ "weights").extract[Array[Double]]

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline, labelCount, layers, weights)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("labelCount" -> instance.labelCount) ~
        ("layers" -> instance.layers.toSeq) ~
        ("weights" -> instance.weights.toArray.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 2
Source File: KNNTest.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.nn

import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.{DataFrame, Row}
import org.scalactic.Equality
import org.scalatest.Assertion

class KNNTest extends EstimatorFuzzing[KNN] with BallTreeTestBase {

  test("matches non spark result") {
    val results = new KNN().setOutputCol("matches")
      .fit(df).transform(testDF)
      .select("matches").collect()
    val sparkResults = results.map(r =>
      r.getSeq[Row](0).map(mr => mr.getDouble(1))
    )
    val tree = BallTree(uniformData, uniformData.indices)
    val nonSparkResults = uniformData.take(5).map(
      point => tree.findMaximumInnerProducts(point, 5)
    )

    sparkResults.zip(nonSparkResults).foreach { case (sr, nsr) =>
      assert(sr === nsr.map(_.distance))
    }
  }

  override def assertDFEq(df1: DataFrame, df2: DataFrame)(implicit eq: Equality[DataFrame]): Assertion = {
    super.assertDFEq(
      df1.select("features", "values", "matches.distance"),
      df2.select("features", "values", "matches.distance")
    )(eq)
  }

  override def testObjects(): Seq[TestObject[KNN]] =
    List(new TestObject(new KNN().setOutputCol("matches"), df, testDF))

  override def reader: MLReadable[_] = KNN

  override def modelReader: MLReadable[_] = KNNModel
}

class ConditionalKNNTest extends EstimatorFuzzing[ConditionalKNN] with BallTreeTestBase {

  test("matches non spark result") {
    val results = new ConditionalKNN().setOutputCol("matches")
      .fit(df).transform(testDF)
      .select("matches").collect()
    val sparkResults = results.map(r =>
      r.getSeq[Row](0).map(mr => (mr.getDouble(1), mr.getInt(2)))
    )
    val tree = ConditionalBallTree(uniformData, uniformData.indices, uniformLabels)
    val nonSparkResults = uniformData.take(5).map(
      point => tree.findMaximumInnerProducts(point, Set(0, 1), 5)
    )

    sparkResults.zip(nonSparkResults).foreach { case (sr, nsr) =>
      assert(sr.map(_._1) === nsr.map(_.distance))
      assert(sr.forall(p => Set(1, 0)(p._2)))
    }
  }

  override def assertDFEq(df1: DataFrame, df2: DataFrame)(implicit eq: Equality[DataFrame]): Assertion = {
    super.assertDFEq(
      df1.select("features", "values", "matches.distance"),
      df2.select("features", "values", "matches.distance")
    )(eq)
  }

  override def testObjects(): Seq[TestObject[ConditionalKNN]] =
    List(new TestObject(new ConditionalKNN().setOutputCol("matches"), df, testDF))

  override def reader: MLReadable[_] = ConditionalKNN

  override def modelReader: MLReadable[_] = ConditionalKNNModel
}
Example 3
Source File: LambdaSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.types.StructType

class LambdaSuite extends TestBase with TransformerFuzzing[Lambda] {

  test("basic functionality") {
    val input = makeBasicDF()
    val lt = new Lambda()
      .setTransform(df => df.select("numbers"))
      .setTransformSchema(schema => new StructType(Array(schema("numbers"))))
    val output = lt.transform(input)
    val output2 = makeBasicDF().select("numbers")
  }

  test("without setting transform schema") {
    val input = makeBasicDF()
    val lt = Lambda(_.select("numbers"))
    val output = lt.transform(input)
    val output2 = makeBasicDF().select("numbers")
    assert(output === output2)
  }

  def testObjects(): Seq[TestObject[Lambda]] =
    List(new TestObject(Lambda(_.select("numbers")), makeBasicDF()))

  override def reader: MLReadable[_] = Lambda
}
Example 4
Source File: TimerSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame

class TimerSuite extends EstimatorFuzzing[Timer] {

  lazy val df: DataFrame = session
    .createDataFrame(Seq(
      (0, "Hi I"),
      (1, "I wish for snow today"),
      (2, "we Cant go to the park, because of the snow!"),
      (3, "")))
    .toDF("label", "sentence")

  test("Work with transformers and estimators") {
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")
    val df2 = new Timer().setStage(tok).fit(df).transform(df)
    val df3 = new HashingTF().setInputCol("tokens").setOutputCol("hash").transform(df2)
    val idf = new IDF().setInputCol("hash").setOutputCol("idf")
    val df4 = new Timer().setStage(idf).fit(df3).transform(df3)
  }

  test("should work within pipelines") {
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")
    val ttok = new Timer().setStage(tok)
    val hash = new HashingTF().setInputCol("tokens").setOutputCol("hash")
    val idf = new IDF().setInputCol("hash").setOutputCol("idf")
    val tidf = new Timer().setStage(idf)
    val pipe = new Pipeline().setStages(Array(ttok, hash, tidf))
    pipe.fit(df).transform(df)
  }

  test("should be able to turn off timing") {
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")
    val ttok = new Timer().setStage(tok)
    val hash = new HashingTF().setInputCol("tokens").setOutputCol("hash")
    val idf = new IDF().setInputCol("hash").setOutputCol("idf")
    val tidf = new Timer().setStage(idf)
    val pipe = new Pipeline().setStages(Array(ttok, hash, tidf))
    val model = pipe.fit(df)

    println("Transforming")
    println(model.stages(0).params.foreach(println(_)))
    model.stages(0).asInstanceOf[TimerModel].setDisable(true)
    model.stages(2).asInstanceOf[TimerModel].setDisable(true)
    println("here")
    println(model.stages(0).getParam("disableMaterialization"))
    model.stages(0).params.foreach(p => println("foo: " + p.toString))
    model.transform(df)
  }

  val reader: MLReadable[_] = Timer
  val modelReader: MLReadable[_] = TimerModel

  override def testObjects(): Seq[TestObject[Timer]] = Seq(new TestObject[Timer]({
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")
    new Timer().setStage(tok)
  }, df))
}
Example 5
Source File: CacherSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable

class CacherSuite extends TransformerFuzzing[Cacher] {

  import session.implicits._

  lazy val df = Seq((0, "guitars", "drums"),
    (1, "piano", "trumpet"),
    (2, "bass", "cymbals"),
    (3, "guitars", "drums"),
    (4, "piano", "trumpet"),
    (5, "bass", "cymbals"),
    (6, "guitars", "drums"),
    (7, "piano", "trumpet"),
    (8, "bass", "cymbals"),
    (9, "guitars", "drums"),
    (10, "piano", "trumpet"),
    (11, "bass", "cymbals")
  ).toDF("numbers", "words", "more")

  test("Be the identity operation") {
    val df2 = new Cacher().transform(df)
    assert(df2.collect() === df.collect())
  }

  override def testObjects(): Seq[TestObject[Cacher]] = Seq(new TestObject(new Cacher(), df))

  override def reader: MLReadable[_] = Cacher
}
Example 6
Source File: SelectColumnsSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable

class SelectColumnsSuite extends TestBase with TransformerFuzzing[SelectColumns] {

  import session.implicits._

  test("Select all columns in a data frame") {
    val input = makeBasicDF()
    val result = new SelectColumns()
      .setCols(input.columns)
      .transform(input)
    assert(verifyResult(input, result))
  }

  test("Test: Select two columns in a data frame") {
    val expected = Seq(
      ("guitars", "drums"),
      ("piano", "trumpet"),
      ("bass", "cymbals")
    ).toDF("words", "more")
    val result = new SelectColumns()
      .setCols(Array("words", "more"))
      .transform(makeBasicDF())
    assert(verifyResult(expected, result))
  }

  test("Test: Select columns with spaces") {
    val expected = Seq(
      ("guitars", "drums"),
      ("piano", "trumpet"),
      ("bass", "cymbals")
    ).toDF("words", "Scored Labels")
    val result = new SelectColumns()
      .setCols(Array("words", "Scored Labels"))
      .transform(makeBasicDF().withColumnRenamed("more", "Scored Labels"))
    assert(verifyResult(expected, result))
  }

  test("Test: Select one column from the data frame") {
    val expected = Seq(
      "guitars",
      "piano",
      "bass"
    ).toDF("words")
    val result = new SelectColumns()
      .setCols(Array("words"))
      .transform(makeBasicDF())
    assert(verifyResult(expected, result))
  }

  test("Invalid column specified") {
    try {
      new SelectColumns().setCol("four").transform(makeBasicDF())
      fail()
    } catch {
      case _: NoSuchElementException =>
    }
  }

  def testObjects(): Seq[TestObject[SelectColumns]] = List(new TestObject(
    new SelectColumns().setCol("numbers"), makeBasicDF()))

  override def reader: MLReadable[_] = SelectColumns
}
Example 7
Source File: MultiColumnAdapterSpec.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.feature.{StringIndexer, Tokenizer}
import org.apache.spark.ml.util.MLReadable

import scala.collection.mutable

class MultiColumnAdapterSpec extends TestBase with EstimatorFuzzing[MultiColumnAdapter] {

  lazy val wordDF = session.createDataFrame(Seq(
    (0, "This is a test", "this is one too"),
    (1, "could be a test", "bar"),
    (2, "foo", "bar"),
    (3, "foo", "maybe not")))
    .toDF("label", "words1", "words2")
  lazy val inputCols = Array[String]("words1", "words2")
  lazy val outputCols = Array[String]("output1", "output2")
  lazy val stage = new StringIndexer()
  lazy val adaptedEstimator =
    new MultiColumnAdapter().setBaseStage(stage)
      .setInputCols(inputCols).setOutputCols(outputCols)

  test("parallelize transformers") {
    val stage1 = new Tokenizer()
    val transformer =
      new MultiColumnAdapter().setBaseStage(stage1)
        .setInputCols(inputCols).setOutputCols(outputCols)
    val tokenizedDF = transformer.fit(wordDF).transform(wordDF)
    val lines = tokenizedDF.getColAs[Array[String]]("output2")
    val trueLines = Array(
      Array("this", "is", "one", "too"),
      Array("bar"),
      Array("bar"),
      Array("maybe", "not")
    )
    assert(lines === trueLines)
  }

  test("parallelize estimator") {
    val stringIndexedDF = adaptedEstimator.fit(wordDF).transform(wordDF)
    val lines1 = stringIndexedDF.getColAs[Array[String]]("output1")
    val trueLines1 = mutable.ArraySeq(1, 2, 0, 0)
    assert(lines1 === trueLines1)

    val lines2 = stringIndexedDF.getColAs[Array[String]]("output2")
    val trueLines2 = mutable.ArraySeq(1, 0, 0, 2)
    assert(lines2 === trueLines2)
  }

  def testObjects(): Seq[TestObject[MultiColumnAdapter]] =
    List(new TestObject(adaptedEstimator, wordDF))

  override def reader: MLReadable[_] = MultiColumnAdapter

  override def modelReader: MLReadable[_] = PipelineModel
}
Example 8
Source File: VerifyVowpalWabbitInteractions.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.ml.util.MLReadable

class VerifyVowpalWabbitInteractions extends TestBase with TransformerFuzzing[VowpalWabbitInteractions] {

  case class Data(val v1: Vector, val v2: Vector, val v3: Vector)

  lazy val df = session.createDataFrame(Seq(Data(
    Vectors.dense(Array(1.0, 2.0, 3.0)),
    Vectors.sparse(8, Array(5), Array(4.0)),
    Vectors.sparse(11, Array(8, 9), Array(7.0, 8.0))
  )))

  private def featurizeUsing(interactions: VowpalWabbitInteractions) =
    interactions.transform(df).head().getAs[SparseVector]("features")

  private def verifyValues(actual: SparseVector, expected: Array[Double]) = {
    assert(actual.numNonzeros == expected.length)

    (actual.values.sorted zip expected.sorted).forall { case (x, y) => x == y }
  }

  test("Verify VowpalWabbit Interactions 3-dense x 1-sparse") {
    val interactions = new VowpalWabbitInteractions()
      .setInputCols(Array("v1", "v2"))
      .setOutputCol("features")

    val v = featurizeUsing(interactions)

    verifyValues(v, Array(4.0, 8, 12.0))
  }

  test("Verify VowpalWabbit Interactions 1-sparse x 2-sparse") {
    val interactions = new VowpalWabbitInteractions()
      .setInputCols(Array("v2", "v3"))
      .setOutputCol("features")

    val v = featurizeUsing(interactions)

    verifyValues(v, Array(28.0, 32.0))
  }

  test("Verify VowpalWabbit Interactions 3-dense x 1-sparse x 2-sparse") {
    val interactions = new VowpalWabbitInteractions()
      .setInputCols(Array("v1", "v2", "v3"))
      .setOutputCol("features")

    val v = featurizeUsing(interactions)

    verifyValues(v, Array(
      1.0 * 5 * 7, 1 * 5 * 8.0,
      2.0 * 5 * 7, 2 * 5 * 8.0,
      3.0 * 5 * 7, 3 * 5 * 8.0
    ))
  }

  def testObjects(): Seq[TestObject[VowpalWabbitInteractions]] = List(new TestObject(
    new VowpalWabbitInteractions().setInputCols(Array("v1")).setOutputCol("out"), df))

  override def reader: MLReadable[_] = VowpalWabbitInteractions
}
Example 9
Source File: VerifyVowpalWabbitRegressorFuzzing.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.test.benchmarks.DatasetUtils
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.{Column, DataFrame}

class VerifyVowpalWabbitRegressorFuzzing extends EstimatorFuzzing[VowpalWabbitRegressor] {

  val numPartitions = 2

  def readCSV(fileName: String, fileLocation: String): DataFrame = {
    session.read
      .option("header", "true").option("inferSchema", "true")
      .option("treatEmptyValuesAsNulls", "false")
      .option("delimiter", if (fileName.endsWith(".csv")) "," else "\t")
      .csv(fileLocation)
  }

  override def reader: MLReadable[_] = VowpalWabbitRegressor

  override def modelReader: MLReadable[_] = VowpalWabbitRegressionModel

  override def testObjects(): Seq[TestObject[VowpalWabbitRegressor]] = {
    val fileName = "energyefficiency2012_data.train.csv"
    val columnsFilter = Some("X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2")
    val labelCol = "Y1"

    val fileLocation = DatasetUtils.regressionTrainFile(fileName).toString
    val readDataset = readCSV(fileName, fileLocation).repartition(numPartitions)
    val dataset =
      if (columnsFilter.isDefined) {
        readDataset.select(columnsFilter.get.split(",").map(new Column(_)): _*)
      } else {
        readDataset
      }

    val featuresColumn = "features"

    val featurizer = new VowpalWabbitFeaturizer()
      .setInputCols(dataset.columns.filter(col => col != labelCol))
      .setOutputCol("features")

    val vw = new VowpalWabbitRegressor()
    val predCol = "pred"
    val trainData = featurizer.transform(dataset)
    val model = vw.setLabelCol(labelCol)
      .setFeaturesCol("features")
      .setPredictionCol(predCol)
      .fit(trainData)

    Seq(new TestObject(vw, trainData))
  }
}
Example 10
Source File: StratifiedRepartitionSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.TaskContext
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

class StratifiedRepartitionSuite extends TestBase with TransformerFuzzing[StratifiedRepartition] {

  import session.implicits._

  val values = "values"
  val colors = "colors"
  val const = "const"

  lazy val input = Seq(
    (0, "Blue", 2),
    (0, "Red", 2),
    (0, "Green", 2),
    (1, "Purple", 2),
    (1, "Orange", 2),
    (1, "Indigo", 2),
    (2, "Violet", 2),
    (2, "Black", 2),
    (2, "White", 2),
    (3, "Gray", 2),
    (3, "Yellow", 2),
    (3, "Cerulean", 2)
  ).toDF(values, colors, const)

  test("Assert doing a stratified repartition will ensure all keys exist across all partitions") {
    val inputSchema = new StructType()
      .add(values, IntegerType).add(colors, StringType).add(const, IntegerType)
    val inputEnc = RowEncoder(inputSchema)
    val valuesFieldIndex = inputSchema.fieldIndex(values)
    val numPartitions = 3
    val trainData = input.repartition(numPartitions).select(values, colors, const)
      .mapPartitions(iter => {
        val ctx = TaskContext.get
        val partId = ctx.partitionId
        // Remove all instances of 0 class on partition 1
        if (partId == 1) {
          iter.flatMap(row => {
            if (row.getInt(valuesFieldIndex) <= 0) None
            else Some(row)
          })
        } else {
          // Add back at least 3 instances on other partitions
          val oneOfEachExample = List(Row(0, "Blue", 2), Row(1, "Purple", 2), Row(2, "Black", 2), Row(3, "Gray", 2))
          (iter.toList.union(oneOfEachExample).union(oneOfEachExample).union(oneOfEachExample)).toIterator
        }
      })(inputEnc).cache()

    // Some debug to understand what data is on which partition
    trainData.foreachPartition { rows =>
      rows.foreach { row =>
        val ctx = TaskContext.get
        val partId = ctx.partitionId
        println(s"Row: $row partition id: $partId")
      }
    }

    val stratifiedInputData = new StratifiedRepartition().setLabelCol(values)
      .setMode(SPConstants.Equal).transform(trainData)
    // Assert stratified data contains all keys across all partitions, with extra count
    // for it to be evaluated
    stratifiedInputData
      .mapPartitions(iter => {
        val actualLabels = iter.map(row => row.getInt(valuesFieldIndex))
          .toArray.distinct.sorted.toList
        val expectedLabels = (0 to 3).toList
        if (actualLabels != expectedLabels)
          throw new Exception(s"Missing labels, actual: $actualLabels, expected: $expectedLabels")
        iter
      })(inputEnc).count()

    val stratifiedMixedInputData = new StratifiedRepartition().setLabelCol(values)
      .setMode(SPConstants.Mixed).transform(trainData)
    assert(stratifiedMixedInputData.count() >= trainData.count())

    val stratifiedOriginalInputData = new StratifiedRepartition().setLabelCol(values)
      .setMode(SPConstants.Original).transform(trainData)
    assert(stratifiedOriginalInputData.count() == trainData.count())
  }

  def testObjects(): Seq[TestObject[StratifiedRepartition]] = List(new TestObject(
    new StratifiedRepartition().setLabelCol(values).setMode(SPConstants.Equal), input))

  def reader: MLReadable[_] = StratifiedRepartition
}
Example 11
Source File: SuperpixelTransformerSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.lime

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.image.NetworkUtils
import com.microsoft.ml.spark.io.split1.FileReaderUtils
import org.apache.spark.ml.util.MLReadable

class SuperpixelTransformerSuite extends TransformerFuzzing[SuperpixelTransformer]
  with NetworkUtils with FileReaderUtils {

  lazy val spt: SuperpixelTransformer = new SuperpixelTransformer().setInputCol(inputCol)

  test("basic functionality") {
    val results = spt.transform(images)
    val superpixels = SuperpixelData.fromRow(results.collect()(0).getStruct(1))
    assert(superpixels.clusters.length === 3)
    assert(superpixels.clusters.head.length == 310)
  }

  override def testObjects(): Seq[TestObject[SuperpixelTransformer]] = Seq(new TestObject(spt, images))

  override def reader: MLReadable[_] = SuperpixelTransformer
}
Example 12
Source File: SimpleHTTPTransformerSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.io.split1

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.io.http.{HandlingUtils, JSONOutputParser, SimpleHTTPTransformer}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{StringType, StructType}

class SimpleHTTPTransformerSuite extends TransformerFuzzing[SimpleHTTPTransformer] with WithServer {

  import session.implicits._

  lazy val df: DataFrame = sc.parallelize((1 to 10).map(Tuple1(_))).toDF("data")

  def simpleTransformer: SimpleHTTPTransformer =
    new SimpleHTTPTransformer()
      .setInputCol("data")
      .setOutputParser(new JSONOutputParser()
        .setDataType(new StructType().add("blah", StringType)))
      .setUrl(url)
      .setOutputCol("results")

  test("HttpTransformerTest") {
    val results = simpleTransformer.transform(df).collect
    assert(results.length == 10)
    results.foreach(r =>
      assert(r.getStruct(2).getString(0) === "more blah"))
    assert(results(0).schema.fields.length == 3)
  }

  test("HttpTransformerTest with Flaky Connection") {
    lazy val df2: DataFrame = sc.parallelize((1 to 5).map(Tuple1(_))).toDF("data")
    val results = simpleTransformer
      .setUrl(url + "/flaky")
      .setTimeout(1)
      .transform(df2).collect
    assert(results.length == 5)
  }

  test("Basic Handling") {
    val results = simpleTransformer
      .setHandler(HandlingUtils.basic)
      .transform(df).collect
    assert(results.length == 10)
    results.foreach(r =>
      assert(r.getStruct(2).getString(0) === "more blah"))
    assert(results(0).schema.fields.length == 3)
  }

  test("Concurrent HttpTransformerTest") {
    val results = new SimpleHTTPTransformer()
      .setInputCol("data")
      .setOutputParser(new JSONOutputParser()
        .setDataType(new StructType().add("blah", StringType)))
      .setUrl(url)
      .setOutputCol("results")
      .setConcurrency(3)
      .transform(df)
      .collect
    assert(results.length == 10)
    assert(results.forall(_.getStruct(2).getString(0) == "more blah"))
    assert(results(0).schema.fields.length == 3)
  }

  override def testObjects(): Seq[TestObject[SimpleHTTPTransformer]] =
    Seq(new TestObject(simpleTransformer, df))

  override def reader: MLReadable[_] = SimpleHTTPTransformer
}
Example 13
Source File: ParserSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.io.split1

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.io.http._
import org.apache.http.client.methods.HttpPost
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, SparkSession}

trait ParserUtils extends WithServer {

  def sampleDf(spark: SparkSession): DataFrame = {
    val df = spark.createDataFrame((1 to 10).map(Tuple1(_)))
      .toDF("data")
    val df2 = new JSONInputParser().setInputCol("data")
      .setOutputCol("parsedInput").setUrl(url)
      .transform(df)
      .withColumn("unparsedOutput", udf({ x: Int =>
        HTTPResponseData(
          Array(),
          Some(EntityData(
            "{\"foo\": \"here\"}".getBytes, None, None, None, false, false, false)),
          StatusLineData(ProtocolVersionData("foo", 1, 1), 200, "bar"),
          "en")
      }).apply(col("data"))
      )

    new JSONOutputParser()
      .setDataType(new StructType().add("foo", StringType))
      .setInputCol("unparsedOutput")
      .setOutputCol("parsedOutput")
      .transform(df2)
  }

  def makeTestObject[T <: Transformer](t: T, session: SparkSession): Seq[TestObject[T]] = {
    Seq(new TestObject(t, sampleDf(session)))
  }
}

class JsonInputParserSuite extends TransformerFuzzing[JSONInputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[JSONInputParser]] = makeTestObject(
    new JSONInputParser().setInputCol("data").setOutputCol("out")
      .setUrl(url), session)

  override def reader: MLReadable[_] = JSONInputParser
}

class JsonOutputParserSuite extends TransformerFuzzing[JSONOutputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[JSONOutputParser]] = makeTestObject(
    new JSONOutputParser().setInputCol("unparsedOutput").setOutputCol("out")
      .setDataType(new StructType().add("foo", StringType)), session)

  override def reader: MLReadable[_] = JSONOutputParser
}

class StringOutputParserSuite extends TransformerFuzzing[StringOutputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[StringOutputParser]] = makeTestObject(
    new StringOutputParser().setInputCol("unparsedOutput").setOutputCol("out"), session)

  override def reader: MLReadable[_] = StringOutputParser
}

class CustomInputParserSuite extends TransformerFuzzing[CustomInputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[CustomInputParser]] = makeTestObject(
    new CustomInputParser().setInputCol("data").setOutputCol("out")
      .setUDF({ x: Int => new HttpPost(s"http://$x") }), session)

  override def reader: MLReadable[_] = CustomInputParser
}

class CustomOutputParserSuite extends TransformerFuzzing[CustomOutputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[CustomOutputParser]] = makeTestObject(
    new CustomOutputParser().setInputCol("unparsedOutput").setOutputCol("out")
      .setUDF({ x: HTTPResponseData => x.locale }), session)

  override def reader: MLReadable[_] = CustomOutputParser
}
Example 14
Source File: MultilayerPerceptronClassifierWrapper.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  val weights: Array[Double] = mlpModel.weights.toArray
  val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 15
Source File: SerializableSparkModel.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import ai.deepsense.sparkutils.ML

class SerializableSparkModel[M <: Model[M]](val sparkModel: M)
  extends ML.Model[SerializableSparkModel[M]]
  with MLWritable {

  override def copy(extra: ParamMap): SerializableSparkModel[M] =
    new SerializableSparkModel(sparkModel.copy(extra))

  override def write: MLWriter = {
    sparkModel match {
      case w: MLWritable => w.write
      case _ => new DefaultMLWriter(this)
    }
  }

  override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset)

  override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema)

  override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae"
}

// This class may seem unused, but it is used reflectively by spark deserialization mechanism
object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] {
  override def read: MLReader[SerializableSparkModel[_]] = {
    new DefaultMLReader[SerializableSparkModel[_]]()
  }
}
Example 16
Source File: MultilayerPerceptronClassifierWrapper.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  private val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  lazy val weights: Array[Double] = mlpModel.weights.toArray
  lazy val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 17
Source File: ModelUtils.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.utils

import org.apache.hadoop.mapred.InvalidInputException
import org.apache.spark.ml.util.{MLReadable, MLWritable}

object ModelUtils {
  def loadOrCreateModel[T <: MLWritable](ModelClass: MLReadable[T], path: String, createModelFunc: () => T): T = {
    try {
      ModelClass.load(path)
    } catch {
      case e: InvalidInputException => {
        if (e.getMessage.contains("Input path does not exist")) {
          val model = createModelFunc()
          model.write.overwrite().save(path)
          model
        } else {
          throw e
        }
      }
    }
  }
}
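A hypothetical call site for this helper (the ALS estimator, ratings DataFrame, and path below are placeholders for illustration, not part of the albedo source):

import org.apache.spark.ml.recommendation.{ALS, ALSModel}
import ws.vinta.albedo.utils.ModelUtils

// Reuse a previously saved ALSModel when the path exists; otherwise train one and persist it.
val als = new ALS().setUserCol("userId").setItemCol("itemId").setRatingCol("rating")
val model: ALSModel = ModelUtils.loadOrCreateModel[ALSModel](
  ALSModel,                        // companion object, an MLReadable[ALSModel]
  "/tmp/albedo/als-model",         // illustrative path
  () => als.fit(trainingDF))       // trainingDF is a placeholder ratings DataFrame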
Example 18
Source File: SerializableSparkModel.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkModel[M <: Model[M]](val sparkModel: M)
  extends ML.Model[SerializableSparkModel[M]]
  with MLWritable {

  override def copy(extra: ParamMap): SerializableSparkModel[M] =
    new SerializableSparkModel(sparkModel.copy(extra))

  override def write: MLWriter = {
    sparkModel match {
      case w: MLWritable => w.write
      case _ => new DefaultMLWriter(this)
    }
  }

  override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset)

  override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema)

  override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae"
}

// This class may seem unused, but it is used reflectively by spark deserialization mechanism
object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] {
  override def read: MLReader[SerializableSparkModel[_]] = {
    new DefaultMLReader[SerializableSparkModel[_]]()
  }
}
Example 19
Source File: RankingAdapterSpec.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.recommendation

import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable

class RankingAdapterSpec extends RankingTestBase with EstimatorFuzzing[RankingAdapter] {
  override def testObjects(): Seq[TestObject[RankingAdapter]] = {
    List(new TestObject(adapter, transformedDf))
  }

  override def reader: MLReadable[_] = RankingAdapter

  override def modelReader: MLReadable[_] = RankingAdapterModel
}

class RankingAdapterModelSpec extends RankingTestBase with TransformerFuzzing[RankingAdapterModel] {
  override def testObjects(): Seq[TestObject[RankingAdapterModel]] = {
    val df = transformedDf
    List(new TestObject(adapter.fit(df), df))
  }

  override def reader: MLReadable[_] = RankingAdapterModel
}
Example 20
Source File: MathUnary.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType}
import org.apache.spark.sql.functions.udf

// Excerpt: the MathUnary Transformer class itself and its MLWriter are not shown in this listing.
// The reader below rebuilds a MathUnary from the "operation" value persisted alongside its params.
object MathUnary extends MLReadable[MathUnary] {

  override def read: MLReader[MathUnary] = new MathUnaryReader

  override def load(path: String): MathUnary = super.load(path)

  private class MathUnaryReader extends MLReader[MathUnary] {

    private val className = classOf[MathUnary].getName

    override def load(path: String): MathUnary = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
      val dataPath = new Path(path, "data").toString

      val data = sparkSession.read.parquet(dataPath).select("operation").head()
      val operation = data.getAs[String](0)

      val model = MathUnaryModel(UnaryOperation.forName(operation))
      val transformer = new MathUnary(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }
}
Example 21
Source File: MultilayerPerceptronClassifierWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  val weights: Array[Double] = mlpModel.weights.toArray
  val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 22
Source File: SpeechToTextSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.cognitive.split2

import java.net.{URI, URL}

import com.microsoft.ml.spark.Secrets
import com.microsoft.ml.spark.cognitive.{SpeechResponse, SpeechToText}
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.commons.compress.utils.IOUtils
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.{DataFrame, Row}
import org.scalactic.Equality

trait SpeechKey {
  lazy val speechKey = sys.env.getOrElse("SPEECH_API_KEY", Secrets.SpeechApiKey)
}

class SpeechToTextSuite extends TransformerFuzzing[SpeechToText]
  with SpeechKey {

  import session.implicits._

  val region = "eastus"
  val resourcesDir = System.getProperty("user.dir") + "/src/test/resources/"
  val uri = new URI(s"https://$region.api.cognitive.microsoft.com/sts/v1.0/issuetoken")
  val language = "en-us"
  val profanity = "masked"
  val format = "simple"

  lazy val stt = new SpeechToText()
    .setSubscriptionKey(speechKey)
    .setLocation(region)
    .setOutputCol("text")
    .setAudioDataCol("audio")
    .setLanguage("en-US")

  lazy val audioBytes: Array[Byte] = {
    IOUtils.toByteArray(new URL("https://mmlspark.blob.core.windows.net/datasets/Speech/test1.wav").openStream())
  }

  lazy val df: DataFrame = Seq(
    Tuple1(audioBytes)
  ).toDF("audio")

  override lazy val dfEq = new Equality[DataFrame] {
    override def areEqual(a: DataFrame, b: Any): Boolean =
      baseDfEq.areEqual(a.drop("audio"), b.asInstanceOf[DataFrame].drop("audio"))
  }

  override def testSerialization(): Unit = {
    tryWithRetries(Array(0, 100, 100, 100, 100))(super.testSerialization)
  }

  def jaccardSimilarity(s1: String, s2: String): Double = {
    val a = Set(s1)
    val b = Set(s2)
    a.intersect(b).size.toDouble / (a | b).size.toDouble
  }

  test("Basic Usage") {
    val toObj: Row => SpeechResponse = SpeechResponse.makeFromRowConverter
    val result = toObj(stt.setFormat("simple")
      .transform(df).select("text")
      .collect().head.getStruct(0))
    result.DisplayText.get.contains("this is a test")
  }

  test("Detailed Usage") {
    val toObj = SpeechResponse.makeFromRowConverter
    val result = toObj(stt.setFormat("detailed")
      .transform(df).select("text")
      .collect().head.getStruct(0))
    result.NBest.get.head.Display.contains("this is a test")
  }

  override def testObjects(): Seq[TestObject[SpeechToText]] =
    Seq(new TestObject(stt, df))

  override def reader: MLReadable[_] = SpeechToText
}
Example 23
Source File: VerifyIsolationForest.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.isolationforest

import com.microsoft.ml.spark.build.BuildInfo
import com.microsoft.ml.spark.core.env.FileUtilities
import com.microsoft.ml.spark.core.metrics.MetricConstants
import com.microsoft.ml.spark.core.test.benchmarks.Benchmarks
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, Row}
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.scalactic.Tolerance._
import com.microsoft.ml.spark.train.ComputeModelStatistics

case class MammographyRecord(feature0: Double, feature1: Double, feature2: Double,
                             feature3: Double, feature4: Double, feature5: Double, label: Double)
case class ScoringResult(features: Vector, label: Double, predictedLabel: Double, outlierScore: Double)

class VerifyIsolationForest extends Benchmarks with EstimatorFuzzing[IsolationForest] {

  test("Verify isolationForestMammographyDataTest") {
    import session.implicits._

    val data = loadMammographyData

    // Train a new isolation forest model
    val contamination = 0.02
    val isolationForest = new IsolationForest()
      .setNumEstimators(100)
      .setBootstrap(false)
      .setMaxSamples(256)
      .setMaxFeatures(1.0)
      .setFeaturesCol("features")
      .setPredictionCol("predictedLabel")
      .setScoreCol("outlierScore")
      .setContamination(0.02)
      .setContaminationError(contamination * 0.01)
      .setRandomSeed(1)

    // Score all training data instances using the new model
    val isolationForestModel = isolationForest.fit(data)

    // Calculate area under ROC curve and assert
    val scores = isolationForestModel.transform(data).as[ScoringResult]
    val metrics = new ComputeModelStatistics()
      .setEvaluationMetric(MetricConstants.AucSparkMetric)
      .setLabelCol("label")
      .setScoredLabelsCol("predictedLabel")
      .setScoresCol("outlierScore")
      .transform(scores)

    // Expectation from results in the 2008 "Isolation Forest" paper by F. T. Liu, et al.
    val aurocExpectation = 0.86
    val uncert = 0.02
    val auroc = metrics.first().getDouble(1)
    assert(auroc === aurocExpectation +- uncert, "expected area under ROC =" +
      s" $aurocExpectation +/- $uncert, but observed $auroc")
  }

  def loadMammographyData(): DataFrame = {
    import session.implicits._

    val mammographyRecordSchema = Encoders.product[MammographyRecord].schema

    val fileLocation = FileUtilities.join(BuildInfo.datasetDir, "IsolationForest", "mammography.csv").toString

    // Open source dataset from http://odds.cs.stonybrook.edu/mammography-dataset/
    val rawData = session.read
      .format("csv")
      .option("comment", "#")
      .option("header", "false")
      .schema(mammographyRecordSchema)
      .load(fileLocation)

    val assembler = new VectorAssembler()
      .setInputCols(Array("feature0", "feature1", "feature2", "feature3", "feature4", "feature5"))
      .setOutputCol("features")

    val data = assembler
      .transform(rawData)
      .select("features", "label")

    data
  }

  override def reader: MLReadable[_] = IsolationForest
  override def modelReader: MLReadable[_] = IsolationForestModel

  override def testObjects(): Seq[TestObject[IsolationForest]] = {
    val dataset = loadMammographyData.toDF

    Seq(new TestObject(new IsolationForest(), dataset))
  }
}
Example 24
Source File: TextFeaturizerSpec.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.featurize.text

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.feature.{NGram, Tokenizer}
import org.apache.spark.ml.util.MLReadable

class TextFeaturizerSpec extends EstimatorFuzzing[TextFeaturizer] {

  lazy val dfRaw = session
    .createDataFrame(Seq(
      (0, "Hi I"),
      (1, "I wish for snow today"),
      (2, "we Cant go to the park, because of the snow!"),
      (3, "")))
    .toDF("label", "sentence")
  lazy val dfTok = new Tokenizer()
    .setInputCol("sentence")
    .setOutputCol("tokens")
    .transform(dfRaw)
  lazy val dfNgram = new NGram().setInputCol("tokens").setOutputCol("ngrams").transform(dfTok)

  test("operate on sentences,tokens,or ngrams") {
    val tfRaw = new TextFeaturizer()
      .setInputCol("sentence")
      .setOutputCol("features")
      .setNumFeatures(20)
    val tfTok = new TextFeaturizer()
      .setUseTokenizer(false)
      .setInputCol("tokens")
      .setOutputCol("features")
      .setNumFeatures(20)
    val tfNgram = new TextFeaturizer()
      .setUseTokenizer(false)
      .setUseNGram(false)
      .setInputCol("ngrams")
      .setOutputCol("features")
      .setNumFeatures(20)

    val dfRaw2 = tfRaw.fit(dfRaw).transform(dfRaw)
    val dfTok2 = tfTok.fit(dfTok).transform(dfTok)
    val dfNgram2 = tfNgram.fit(dfNgram).transform(dfNgram)

    val linesRaw = dfRaw2.getSVCol("features")
    val linesTok = dfTok2.getSVCol("features")
    val linesNgram = dfNgram2.getSVCol("features")

    assert(linesRaw.length == 4)
    assert(linesTok.length == 4)
    assert(linesNgram.length == 4)
    assert(linesRaw(0)(0) == 0.9162907318741551)
    assert(linesTok(1)(9) == 0.5108256237659907)
    assert(linesNgram(2)(7) == 1.8325814637483102)
    assert(linesNgram(3)(1) == 0.0)
  }

  test("throw errors if the schema is incorrect") {
    val tfRaw = new TextFeaturizer()
      .setUseTokenizer(true)
      .setInputCol("sentence")
      .setOutputCol("features")
      .setNumFeatures(20)
    val tfTok = new TextFeaturizer()
      .setUseTokenizer(false)
      .setInputCol("tokens")
      .setOutputCol("features")
      .setNumFeatures(20)
    assertSparkException[IllegalArgumentException](tfRaw.setInputCol("tokens"), dfTok)
    assertSparkException[IllegalArgumentException](tfRaw.setInputCol("ngrams"), dfNgram)
    assertSparkException[IllegalArgumentException](tfTok.setInputCol("sentence"), dfRaw)
    assertSparkException[IllegalArgumentException](tfRaw.setInputCol("tokens_incorrect"), dfTok)
    assertSparkException[IllegalArgumentException](tfRaw.setOutputCol("tokens"), dfTok)
  }

  override def testObjects(): Seq[TestObject[TextFeaturizer]] =
    List(new TestObject(new TextFeaturizer().setInputCol("sentence"), dfRaw))

  override def reader: MLReadable[_] = TextFeaturizer

  override def modelReader: MLReadable[_] = TextFeaturizerModel
}
Example 25
Source File: PageSplitterSpec.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.featurize.text

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable

class PageSplitterSpec extends TransformerFuzzing[PageSplitter] {

  import session.implicits._

  lazy val df = Seq(
    "words words words wornssaa ehewjkdiw weijnsikjn xnh",
    "s s s s s s",
    "hsjbhjhnskjhndwjnbvckjbnwkjwenbvfkjhbnwevkjhbnwejhkbnvjkhnbndjkbnd",
    "hsjbhjhnskjhndwjnbvckjbnwkjwenbvfkjhbnwevkjhbnwejhkbnvjkhnbndjkbnd " +
      "190872340870271091309831097813097130i3u709781",
    "",
    null //scalastyle:ignore null
  ).toDF("text")

  lazy val t = new PageSplitter()
    .setInputCol("text")
    .setMaximumPageLength(20)
    .setMinimumPageLength(10)
    .setOutputCol("pages")

  test("Basic usage") {
    val resultList = t.transform(df).collect().toList
    resultList.dropRight(1).foreach { row =>
      val pages = row.getSeq[String](1).toList
      val text = row.getString(0)
      assert(pages.mkString("") === text)
      assert(pages.forall(_.length <= 20))
      assert(pages.dropRight(1).forall(_.length >= 10))
    }
  }

  override def testObjects(): Seq[TestObject[PageSplitter]] = List(new TestObject(t, df))

  override def reader: MLReadable[_] = PageSplitter
}
Example 26
Source File: MultiNGramSpec.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.featurize.text

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.util.MLReadable

import scala.collection.mutable

class MultiNGramSpec extends TransformerFuzzing[MultiNGram] {

  lazy val dfRaw = session
    .createDataFrame(Seq(
      (0, "Hi I"),
      (1, "I wish for snow today"),
      (2, "we Cant go to the park, because of the snow!"),
      (3, ""),
      (4, (1 to 10).map(_.toString).mkString(" "))
    ))
    .toDF("label", "sentence")
  lazy val dfTok = new Tokenizer()
    .setInputCol("sentence")
    .setOutputCol("tokens")
    .transform(dfRaw)

  lazy val t = new MultiNGram()
    .setLengths(Array(1, 3, 4)).setInputCol("tokens").setOutputCol("ngrams")
  lazy val dfNgram = t.transform(dfTok)

  test("operate on tokens ") {
    val grams = dfNgram.collect().last.getAs[Seq[String]]("ngrams").toSet
    assert(grams("1 2 3 4"))
    assert(grams("4"))
    assert(grams("2 3 4"))
    assert(grams.size == 25)
  }

  override def testObjects(): Seq[TestObject[MultiNGram]] =
    List(new TestObject(t, dfTok))

  override def reader: MLReadable[_] = MultiNGram
}
Example 27
Source File: PartitionConsolidatorSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.flaky

import com.microsoft.ml.spark.core.test.base.TimeLimitedFlaky
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.io.http.PartitionConsolidator
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{DoubleType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.scalatest.Assertion

class PartitionConsolidatorSuite extends TransformerFuzzing[PartitionConsolidator] with TimeLimitedFlaky {

  import session.implicits._

  override val numCores: Option[Int] = Some(2)

  lazy val df: DataFrame = (1 to 1000).toDF("values")

  override val sortInDataframeEquality: Boolean = true

  override def testObjects(): Seq[TestObject[PartitionConsolidator]] = Seq(
    new TestObject(new PartitionConsolidator(), df))

  override def reader: MLReadable[_] = PartitionConsolidator

  def getPartitionDist(df: DataFrame): List[Int] = {
    df.rdd.mapPartitions(it => Iterator(it.length)).collect().toList
  }

  //TODO figure out what is causing the issue on the build server
  override def testSerialization(): Unit = {}

  override def testExperiments(): Unit = {}

  def basicTest(df: DataFrame): Assertion = {
    val pd1 = getPartitionDist(df)
    val newDF = new PartitionConsolidator().transform(df)
    val pd2 = getPartitionDist(newDF)
    assert(pd1.sum === pd2.sum)
    assert(pd2.max >= pd1.max)
    assert(pd1.length === pd2.length)
  }

  test("basic functionality") {
    basicTest(df)
  }

  test("works with more partitions than cores") {
    basicTest(df.repartition(12))
  }

  test("overheads") {
    val baseDF = (1 to 1000).toDF("values").cache()
    println(baseDF.count())

    def getDF: Dataset[Row] = baseDF.map { x => Thread.sleep(10); x }(
      RowEncoder(new StructType().add("values", DoubleType)))

    val t1 = getTime(3)(
      getDF.foreach(_ => ()))._2
    val t2 = getTime(3)(
      new PartitionConsolidator().transform(getDF).foreach(_ => ()))._2
    println(t2.toDouble / t1.toDouble)
    assert(t2.toDouble / t1.toDouble < 3.0)
  }

  test("works with more partitions than cores2") {
    basicTest(df.repartition(100))
  }

  test("work with 1 partition") {
    basicTest(df.repartition(1))
  }
}
Example 28
Source File: DropColumnsSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable

class DropColumnsSuite extends TestBase with TransformerFuzzing[DropColumns] {

  import session.implicits._

  test("Drop no columns in a data frame") {
    val input = makeBasicDF()
    val result = new DropColumns()
      .setCols(Array())
      .transform(input)
    assert(verifyResult(input, result))
  }

  test("Drop all but two columns in a data frame") {
    val keep = Set("words", "more")
    val input = makeBasicDF()
    val expected = Seq(
      ("guitars", "drums"),
      ("piano", "trumpet"),
      ("bass", "cymbals")
    ).toDF("words", "more")
    val result = new DropColumns()
      .setCols(input.columns.filterNot(keep.contains))
      .transform(makeBasicDF())
    assert(verifyResult(expected, result))
  }

  test("Drop columns with spaces") {
    val result = new DropColumns()
      .setCols(Array("Scored Labels"))
      .transform(makeBasicDF().withColumnRenamed("more", "Scored Labels"))
    assert(verifyResult(makeBasicDF().drop("more"), result))
  }

  test("Invalid column specified") {
    try {
      new DropColumns().setCol("four").transform(makeBasicDF())
      fail()
    } catch {
      case _: NoSuchElementException =>
    }
  }

  def testObjects(): Seq[TestObject[DropColumns]] = List(new TestObject(
    new DropColumns().setCol("numbers"), makeBasicDF()))

  override def reader: MLReadable[_] = DropColumns
}
Example 29
Source File: RankingTrainValidationSpec.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.recommendation

import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing}
import org.apache.spark.ml.recommendation.ALSModel
import org.apache.spark.ml.util.MLReadable

class RankingTrainValidationSplitSpec extends RankingTestBase with EstimatorFuzzing[RankingTrainValidationSplit] {

  test("testALS") {
    val tvRecommendationSplit = new RankingTrainValidationSplit()
      .setEstimator(als)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setTrainRatio(0.8)
      .setUserCol(recommendationIndexer.getUserOutputCol)
      .setItemCol(recommendationIndexer.getItemOutputCol)
      .setRatingCol("rating")

    val tvModel = tvRecommendationSplit.fit(transformedDf)

    val model = tvModel.getBestModel.asInstanceOf[ALSModel]

    val items = model.recommendForAllUsers(3)
    assert(items.collect()(0)(0) == 1)

    val users = model.recommendForAllItems(3)
    assert(users.collect()(0)(0) == 4)
  }

  override def testObjects(): Seq[TestObject[RankingTrainValidationSplit]] = {
    List(new TestObject(rankingTrainValidationSplit, transformedDf))
  }

  override def reader: MLReadable[_] = RankingTrainValidationSplit

  override def modelReader: MLReadable[_] = RankingTrainValidationSplitModel
}

class RankingTrainValidationSplitModelSpec extends RankingTestBase with
  TransformerFuzzing[RankingTrainValidationSplitModel] {

  override def testObjects(): Seq[TestObject[RankingTrainValidationSplitModel]] = {
    List(new TestObject(rankingTrainValidationSplit.fit(transformedDf), transformedDf))
  }

  override def reader: MLReadable[_] = RankingTrainValidationSplitModel
}
Example 30
Source File: ClassBalancerSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame

class ClassBalancerSuite extends EstimatorFuzzing[ClassBalancer] {

  lazy val df: DataFrame = session
    .createDataFrame(Seq(
      (0, 1.0, "Hi I"),
      (1, 1.0, "I wish for snow today"),
      (2, 2.0, "I wish for snow today"),
      (3, 2.0, "I wish for snow today"),
      (4, 2.0, "I wish for snow today"),
      (5, 2.0, "I wish for snow today"),
      (6, 0.0, "I wish for snow today"),
      (7, 1.0, "I wish for snow today"),
      (8, 0.0, "we Cant go to the park, because of the snow!"),
      (9, 2.0, "")))
    .toDF("index", "label", "sentence")

  val reader: MLReadable[_] = ClassBalancer

  val modelReader: MLReadable[_] = ClassBalancerModel

  override def testObjects(): Seq[TestObject[ClassBalancer]] = Seq(new TestObject[ClassBalancer](
    new ClassBalancer().setInputCol("label"), df))

  test("yield proper weights") {
    val model = new ClassBalancer()
      .setInputCol("label").fit(df)
    val df2 = model.transform(df)
    df2.show()
    assert(df2.collect()(8).getDouble(3) == 2.5)
    assert(df2.schema.fields.toSet == model.transformSchema(df.schema).fields.toSet)
  }
}
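As a rough sketch of the estimator outside the test harness: fit on a labeled frame, then transform to append a weight column (the suite reads it as a fourth column and expects rarer labels to receive larger weights). The object name, sample data, and column layout below are assumptions; setInputCol, fit, and transform are taken from the suite.

import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.stages.ClassBalancer

object ClassBalancerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ClassBalancerExample").getOrCreate()
    import spark.implicits._

    // Label 2.0 dominates, so the less frequent labels should be weighted up.
    val df = Seq((0, 1.0), (1, 2.0), (2, 2.0), (3, 2.0), (4, 0.0)).toDF("index", "label")

    val model = new ClassBalancer().setInputCol("label").fit(df)
    model.transform(df).show() // appends a weight column derived from label frequencies
    spark.stop()
  }
}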
Example 31
Source File: ExplodeSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame

class ExplodeSuite extends TransformerFuzzing[Explode] {

  import session.implicits._

  lazy val df: DataFrame = Seq(
    (0, Seq("guitars", "drums")),
    (1, Seq("piano")),
    (2, Seq()))
    .toDF("numbers", "words")

  lazy val t: Explode = new Explode().setInputCol("words").setOutputCol("exploded")

  test("Basic usage") {
    val df2 = t.transform(df)
    df2.show()
    assert(df2.columns.length == 3)
    assert(df2.count() == 3)
    assert(df2.select("exploded").collect().map(_.getString(0)) === Array("guitars", "drums", "piano"))
  }

  override def testObjects(): Seq[TestObject[Explode]] = Seq(new TestObject(t, df))

  override def reader: MLReadable[_] = Explode

}
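A minimal standalone sketch of the same transformer, assuming a local SparkSession; only the setInputCol, setOutputCol, and transform calls come from the suite, the rest is illustrative.

import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.stages.Explode

object ExplodeExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ExplodeExample").getOrCreate()
    import spark.implicits._

    val df = Seq(
      (0, Seq("guitars", "drums")),
      (1, Seq("piano"))
    ).toDF("numbers", "words")

    // One output row per element of the "words" array, written to "exploded".
    val exploded = new Explode()
      .setInputCol("words")
      .setOutputCol("exploded")
      .transform(df)

    exploded.show() // three rows: guitars, drums, piano
    spark.stop()
  }
}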
Example 32
Source File: SummarizeDataSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import java.io.File
import org.apache.spark.ml.util.MLReadable

class SummarizeDataSuite extends TransformerFuzzing[SummarizeData] {

  test("Smoke test for summarizing basic DF - schema transform") {
    val input = makeBasicDF()
    val summary = new SummarizeData()
    val result = summary.transformSchema(input.schema)
    assert(result.length > 10)
  }

  test("Smoke test for summary params") {
    val s = new SummarizeData()
    assert(s.params.length == 5)
    assert(s.params.map(s.isSet).toSeq == (1 to s.params.length).map(i => false))

    val sNoCounts = s.setCounts(false).setPercentiles(false)
    assert(sNoCounts.params.map(sNoCounts.isSet).toSeq === Seq(false, true, false, true, false))
  }

  test("Smoke test for summarizing basic DF") {
    val input = makeBasicDF()
    val summary = new SummarizeData()
    val result = summary.transform(input)
    assert(result.count === input.columns.length)
    assert(result.columns.length > 18)
  }

  test("Smoke test for summarizing missings DF") {
    val input = makeBasicNullableDF()
    val summary = new SummarizeData()
    val result = summary.transform(input)
    assert(result.count === input.columns.length)
    assert(result.columns.length > 18)
  }

  test("Smoke test for subset summarizing missings DF") {
    val input = makeBasicNullableDF()
    val summary = new SummarizeData().setPercentiles(false).setCounts(false)
    val result = summary.transform(input)
    assert(result.count === input.columns.length)
    assert(result.columns.length < 11)
  }

  override def testObjects(): Seq[TestObject[SummarizeData]] = Seq(new TestObject(
    new SummarizeData(), makeBasicDF()
  ))

  override def reader: MLReadable[_] = SummarizeData
}
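The tests assert that the summary frame has one row per input column and that disabling counts and percentiles narrows it. A hedged standalone sketch along those lines, with the object name and sample data as assumptions and setCounts, setPercentiles, and transform taken from the suite:

import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.stages.SummarizeData

object SummarizeDataExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("SummarizeDataExample").getOrCreate()
    import spark.implicits._

    val df = Seq((0, "guitars", 1.5), (1, "piano", 2.0), (2, "bass", 0.5))
      .toDF("numbers", "words", "scores")

    // Full summary: one output row per input column, many statistic columns.
    val full = new SummarizeData().transform(df)

    // Slimmer summary: counts and percentiles switched off.
    val slim = new SummarizeData().setCounts(false).setPercentiles(false).transform(df)

    full.show()
    slim.show()
    spark.stop()
  }
}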
Example 33
Source File: RenameColumnSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable

class RenameColumnSuite extends TestBase with TransformerFuzzing[RenameColumn] {

  test("Rename columns in a data frame") {
    val base = makeBasicDF()
    val result = new RenameColumn().setInputCol("words").setOutputCol("out").transform(base)
    val expected = base.withColumnRenamed("words", "out")
    assert(verifyResult(expected, result))
  }

  test("Rename columns with outputColumn as existing column") {
    val base = makeBasicDF()
    val result = new RenameColumn().setInputCol("words").setOutputCol("numbers").transform(base)
    val expected = base.withColumnRenamed("words", "numbers")
    assert(verifyResult(expected, result))
  }

  def testObjects(): Seq[TestObject[RenameColumn]] = List(new TestObject(
    new RenameColumn().setInputCol("numbers").setOutputCol("out"),
    makeBasicDF()))

  override def reader: MLReadable[_] = RenameColumn

}
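As the tests show, the stage behaves like withColumnRenamed packaged as a pipeline stage. A minimal sketch, assuming a local SparkSession and illustrative data; setInputCol, setOutputCol, and transform are from the suite:

import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.stages.RenameColumn

object RenameColumnExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("RenameColumnExample").getOrCreate()
    import spark.implicits._

    val df = Seq((0, "guitars"), (1, "piano")).toDF("numbers", "words")

    // Equivalent to df.withColumnRenamed("words", "out"), but usable inside an ML Pipeline.
    val renamed = new RenameColumn()
      .setInputCol("words")
      .setOutputCol("out")
      .transform(df)

    renamed.printSchema() // columns: numbers, out
    spark.stop()
  }
}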
Example 34
Source File: RepartitionSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.MLReadable

class RepartitionSuite extends TestBase with TransformerFuzzing[Repartition] {

  import session.implicits._

  lazy val input = Seq(
    (0, "guitars", "drums"),
    (1, "piano", "trumpet"),
    (2, "bass", "cymbals"),
    (3, "guitars", "drums"),
    (4, "piano", "trumpet"),
    (5, "bass", "cymbals"),
    (6, "guitars", "drums"),
    (7, "piano", "trumpet"),
    (8, "bass", "cymbals"),
    (9, "guitars", "drums"),
    (10, "piano", "trumpet"),
    (11, "bass", "cymbals")
  ).toDF("numbers", "words", "more")

  test("Work for several values of n") {
    def test(n: Int): Unit = {
      val result = new Repartition()
        .setN(n)
        .transform(input)
      assert(result.rdd.getNumPartitions == n)
      ()
    }

    List(1, 2, 3, 10).foreach(test)
  }

  test("Should allow a user to set the partitions specifically in pipeline transform") {
    val r = new Repartition().setN(1)
    val pipe = new Pipeline().setStages(Array(r))
    val fitPipe = pipe.fit(input)
    assert(fitPipe.transform(input).rdd.getNumPartitions == 1)
    assert(fitPipe.transform(input, ParamMap(r.n -> 5)).rdd.getNumPartitions == 5)
  }

  def testObjects(): Seq[TestObject[Repartition]] = List(new TestObject(
    new Repartition().setN(1),
    input))

  def reader: MLReadable[_] = Repartition
}
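The second test demonstrates the useful trick here: because the partition count is a Param, it can be overridden per transform call via a ParamMap. A short sketch of both usages, assuming a local SparkSession; setN, transform, and the ParamMap override of r.n are taken from the suite.

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.stages.Repartition

object RepartitionExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("RepartitionExample").getOrCreate()
    import spark.implicits._

    val df = (1 to 100).toDF("values")

    // Standalone use: force the data into 2 partitions.
    val r = new Repartition().setN(2)
    println(r.transform(df).rdd.getNumPartitions) // 2

    // Pipeline use: the partition count can be overridden at transform time.
    val fitPipe = new Pipeline().setStages(Array(r)).fit(df)
    println(fitPipe.transform(df, ParamMap(r.n -> 5)).rdd.getNumPartitions) // 5

    spark.stop()
  }
}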
Example 35
Source File: UnicodeNormalizeSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame

class UnicodeNormalizeSuite extends TestBase with TransformerFuzzing[UnicodeNormalize] {

  val inputCol = "words1"
  val outputCol = "norm1"

  //scalastyle:off null
  lazy val wordDF = session.createDataFrame(Seq(
    ("Schön", 1),
    ("Scho\u0308n", 1),
    (null, 1)))
    .toDF(inputCol, "dummy")

  lazy val expectedResultComposed = session.createDataFrame(Seq(
    ("Schön", 1, "schön"),
    ("Scho\u0308n", 1, "schön"),
    (null, 1, null)))
    .toDF(inputCol, "dummy", outputCol)

  lazy val expectedResultDecomposed = session.createDataFrame(Seq(
    ("Schön", 1, "scho\u0308n"),
    ("Scho\u0308n", 1, "scho\u0308n"),
    (null, 1, null)))
    .toDF(inputCol, "dummy", outputCol)
  //scalastyle:on null

  private def testForm(form: String, expected: DataFrame) = {
    val unicodeNormalize = new UnicodeNormalize()
      .setForm(form)
      .setInputCol(inputCol)
      .setOutputCol(outputCol)
    val result = unicodeNormalize.transform(wordDF)
    assert(verifyResult(result, expected))
  }

  test("Check for NFC forms") { testForm("NFC", expectedResultComposed) }

  test("Check for NFKC forms") { testForm("NFKC", expectedResultComposed) }

  test("Check for NFD forms") { testForm("NFD", expectedResultDecomposed) }

  test("Check for NFKD forms") { testForm("NFKD", expectedResultDecomposed) }

  def testObjects(): Seq[TestObject[UnicodeNormalize]] = List(new TestObject(
    new UnicodeNormalize().setInputCol("words").setOutputCol("out"),
    makeBasicDF()))

  override def reader: MLReadable[_] = UnicodeNormalize

}
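The expected results show that the stage maps both the precomposed and the combining-mark spellings of "Schön" to a single normalized (and, per the fixtures, lower-cased) string. A minimal sketch using only the setForm, setInputCol, setOutputCol, and transform calls from the suite; the object name, local master, and column names outside the suite's fixtures are assumptions.

import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.stages.UnicodeNormalize

object UnicodeNormalizeExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("UnicodeNormalizeExample").getOrCreate()
    import spark.implicits._

    // "Schön" written two ways: precomposed ö and o followed by a combining diaeresis.
    val df = Seq(("Schön", 1), ("Scho\u0308n", 1)).toDF("words1", "dummy")

    // NFC collapses both spellings to the same composed form.
    val normalized = new UnicodeNormalize()
      .setForm("NFC")
      .setInputCol("words1")
      .setOutputCol("norm1")
      .transform(df)

    normalized.show() // both rows normalize to the same string
    spark.stop()
  }
}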