org.apache.spark.ml.Estimator Scala Examples
The following examples show how to use org.apache.spark.ml.Estimator.
Each example is taken from an open-source project; the source file, project name, and license are noted above it.
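Before the project examples, here is a minimal, self-contained sketch of the contract every Estimator implements: fit returns a fitted Model, copy clones the stage with extra params, and transformSchema declares the output schema. This sketch is not taken from any of the projects listed below; the MeanEstimator/MeanModel names and the "x" column are made up for illustration.

import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{avg, col, lit}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

// Hypothetical estimator: learns the mean of column "x" and appends it as "x_mean".
class MeanEstimator(override val uid: String) extends Estimator[MeanModel] {

  def this() = this(Identifiable.randomUID("meanEstimator"))

  // fit computes the statistic and returns a fitted Model
  override def fit(dataset: Dataset[_]): MeanModel = {
    val mean = dataset.select(avg(col("x"))).head.getDouble(0)
    new MeanModel(uid, mean).setParent(this)
  }

  override def copy(extra: ParamMap): MeanEstimator = defaultCopy(extra)

  // declares the schema produced at transform time
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields :+ StructField("x_mean", DoubleType, nullable = false))
}

// The fitted model only carries the learned value and applies it.
class MeanModel(override val uid: String, val mean: Double) extends Model[MeanModel] {

  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.withColumn("x_mean", lit(mean))

  override def copy(extra: ParamMap): MeanModel =
    new MeanModel(uid, mean).setParent(parent)

  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields :+ StructField("x_mean", DoubleType, nullable = false))
}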
Example 1
Source File: TypedEstimator.scala From frameless with Apache License 2.0
package frameless
package ml

import frameless.ops.SmartProject
import org.apache.spark.ml.{Estimator, Model}

trait TypedEstimator[Inputs, Outputs, M <: Model[M]] {
  val estimator: Estimator[M]

  def fit[T, F[_]](ds: TypedDataset[T])(
    implicit
    smartProject: SmartProject[T, Inputs],
    F: SparkDelay[F]
  ): F[AppendTransformer[Inputs, Outputs, M]] = {
    implicit val sparkSession = ds.dataset.sparkSession
    F.delay {
      val inputDs = smartProject.apply(ds)
      val model = estimator.fit(inputDs.dataset)
      new AppendTransformer[Inputs, Outputs, M] {
        val transformer: M = model
      }
    }
  }
}
Example 2
Source File: SerializableSparkEstimator.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
}
Example 3
Source File: FeatureTestBase.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.test

import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.utils.spark.RichDataset.RichDataset
import org.apache.spark.ml.{Estimator, Transformer}
import org.apache.spark.sql.Dataset
import org.scalatest.prop.PropertyChecks
import org.scalatest.{Assertion, Suite}

import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

def testOp[A <: FeatureType : TypeTag,
  B <: FeatureType : TypeTag,
  C <: FeatureType : TypeTag : FeatureTypeSparkConverter : ClassTag]
(
  op: FeatureLike[A] => FeatureLike[B] => FeatureLike[C]
): BinaryTester[A, B, C] = new BinaryTester[A, B, C] {
  def of(v: (A, B)*): Checker[C] = new Checker[C] {
    def expecting(z: C*): Assertion = {
      val (data, f1, f2) = TestFeatureBuilder[A, B](v)
      val f = op(f1)(f2)
      checkFeature(f, data, expected = z, clue = s"Testing ${f.originStage.operationName} on $v: ")
    }
  }
}

sealed abstract class UnaryTester[A <: FeatureType,
  C <: FeatureType : TypeTag : FeatureTypeSparkConverter : ClassTag] {
  def of(x: A*): Checker[C]
}

sealed abstract class BinaryTester[A <: FeatureType, B <: FeatureType,
  C <: FeatureType : TypeTag : FeatureTypeSparkConverter : ClassTag] {
  def of(x: A, y: B): Checker[C] = of((x, y))
  def of(x: (A, B)*): Checker[C]
}

sealed abstract class Checker[C <: FeatureType : TypeTag : FeatureTypeSparkConverter : ClassTag] {
  def expecting(z: C*): Assertion

  protected def checkFeature(f: FeatureLike[C], data: Dataset[_], clue: String, expected: Seq[C]): Assertion = {
    val transformed = f.originStage match {
      case e: Estimator[_] => e.fit(data).transform(data)
      case t: Transformer => t.transform(data)
    }
    withClue(clue)(
      new RichDataset(transformed).collect[C](f) should contain theSameElementsInOrderAs expected
    )
  }
}
Example 4
Source File: IDFTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.types._
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.{Estimator, Transformer}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Assertions, FlatSpec, Matchers}

@RunWith(classOf[JUnitRunner])
class IDFTest extends FlatSpec with TestSparkContext {

  val data = Seq(
    Vectors.sparse(4, Array(1, 3), Array(1.0, 2.0)),
    Vectors.dense(0.0, 1.0, 2.0, 3.0),
    Vectors.sparse(4, Array(1), Array(1.0))
  )

  lazy val (ds, f1) = TestFeatureBuilder(data.map(_.toOPVector))

  Spec[IDF] should "compute inverted document frequency" in {
    val idf = f1.idf()
    val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds)
    val transformedData = model.asInstanceOf[Transformer].transform(ds)
    val results = transformedData.select(idf.name).collect(idf)
    idf.name shouldBe idf.originStage.getOutputFeatureName

    val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((data.length + 1.0) / (x + 1.0))
    })
    val expected = scaleDataWithIDF(data, expectedIdf)

    for {
      (res, exp) <- results.zip(expected)
      (x, y) <- res.value.toArray.zip(exp.toArray)
    } assert(math.abs(x - y) <= 1e-5)
  }

  it should "compute inverted document frequency when minDocFreq is 1" in {
    val idf = f1.idf(minDocFreq = 1)
    val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds)
    val transformedData = model.asInstanceOf[Transformer].transform(ds)
    val results = transformedData.select(idf.name).collect(idf)
    idf.name shouldBe idf.originStage.getOutputFeatureName

    val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((data.length + 1.0) / (x + 1.0)) else 0
    })
    val expected = scaleDataWithIDF(data, expectedIdf)

    for {
      (res, exp) <- results.zip(expected)
      (x, y) <- res.value.toArray.zip(exp.toArray)
    } assert(math.abs(x - y) <= 1e-5)
  }

  private def scaleDataWithIDF(dataSet: Seq[Vector], model: Vector): Seq[Vector] = {
    dataSet.map {
      case data: DenseVector =>
        val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y }
        Vectors.dense(res)
      case data: SparseVector =>
        val res = data.indices.zip(data.values).map { case (id, value) => (id, value * model(id)) }
        Vectors.sparse(data.size, res)
    }
  }
}
Example 5
Source File: DateMapToUnitCircleVectorizerTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.sequence.SequenceModel
import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder}
import com.salesforce.op.utils.spark.OpVectorMetadata
import org.apache.spark.ml.{Estimator, Transformer}
import org.apache.spark.ml.linalg.Vectors
import com.salesforce.op.utils.spark.RichDataset._
import com.salesforce.op.utils.spark.RichMetadata._
import org.joda.time.{DateTime => JDateTime}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class DateMapToUnitCircleVectorizerTest extends OpEstimatorSpec[OPVector,
  SequenceModel[DateMap, OPVector], DateMapToUnitCircleVectorizer[DateMap]] with AttributeAsserts {

  val eps = 1E-4

  val sampleDateTimes = Seq[JDateTime](
    new JDateTime(2018, 2, 11, 0, 0, 0, 0),
    new JDateTime(2018, 11, 28, 6, 0, 0, 0),
    new JDateTime(2018, 2, 17, 12, 0, 0, 0),
    new JDateTime(2017, 4, 17, 18, 0, 0, 0),
    new JDateTime(1918, 2, 13, 3, 0, 0, 0)
  )

  val (inputData, f1) = TestFeatureBuilder(
    sampleDateTimes.map(x => Map("a" -> x.getMillis, "b" -> x.getMillis).toDateMap)
  )

  override val expectedResult: Seq[OPVector] = sampleDateTimes
    .map { v =>
      val rad = DateToUnitCircle.convertToRandians(Option(v.getMillis), TimePeriod.HourOfDay)
      (rad ++ rad).toOPVector
    }

  it should "work with its shortcut as a DateMap" in {
    val output = f1.toUnitCircle(TimePeriod.HourOfDay)
    val transformed = output.originStage.asInstanceOf[DateMapToUnitCircleVectorizer[DateMap]]
      .fit(inputData).transform(inputData)
    val field = transformed.schema(output.name)
    val actual = transformed.collect(output)
    assertNominal(field, Array.fill(actual.head.value.size)(false), actual)
    all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps
  }

  it should "work with its shortcut as a DateTimeMap" in {
    val (inputDataDT, f1DT) = TestFeatureBuilder(
      sampleDateTimes.map(x => Map("a" -> x.getMillis, "b" -> x.getMillis).toDateTimeMap)
    )
    val output = f1DT.toUnitCircle(TimePeriod.HourOfDay)
    val transformed = output.originStage.asInstanceOf[DateMapToUnitCircleVectorizer[DateMap]]
      .fit(inputData).transform(inputData)
    val field = transformed.schema(output.name)
    val actual = transformed.collect(output)
    assertNominal(field, Array.fill(actual.head.value.size)(false), actual)
    all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps
  }

  it should "make the correct metadata" in {
    val fitted = estimator.fit(inputData)
    val meta = OpVectorMetadata(fitted.getOutputFeatureName, fitted.getMetadata())
    meta.columns.length shouldBe 4
    meta.columns.flatMap(_.grouping) shouldEqual Seq("a", "a", "b", "b")
    meta.columns.flatMap(_.descriptorValue) shouldEqual
      Seq("x_HourOfDay", "y_HourOfDay", "x_HourOfDay", "y_HourOfDay")
  }
}
Example 6
Source File: ModelSelectorNames.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.selector

import com.salesforce.op.evaluators.{EvaluationMetrics, _}
import com.salesforce.op.features.types._
import com.salesforce.op.stages._
import com.salesforce.op.stages.base.binary.OpTransformer2
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.{DataFrame, Dataset}

case object ModelSelectorNames {
  val TrainValSplitResults = "trainValidationSplitResults"
  val CrossValResults = "crossValidationResults"
  val TrainingEval = "trainingSetEvaluationResults"
  val HoldOutEval = "testSetEvaluationResults"
  val ResampleValues = "resamplingValues"
  val CuttValues = "cuttValues"
  val PreSplitterDataCount = "preSplitterDataCount"
  val BestModelUid = "bestModelUID"
  val BestModelName = "bestModelName"
  val Positive = "positiveLabels"
  val Negative = "negativeLabels"
  val Desired = "desiredFraction"
  val UpSample = "upSamplingFraction"
  val DownSample = "downSamplingFraction"
  val idColName = "rowId"
  val LabelsKept = "labelsKept"
  val LabelsDropped = "labelsDropped"
  val LabelsDroppedTotal = "labelsDroppedTotal"

  type ModelType = Model[_ <: Model[_]] with OpTransformer2[RealNN, OPVector, Prediction]
  type EstimatorType = Estimator[_ <: Model[_]] with OpPipelineStage2[RealNN, OPVector, Prediction]

  // Stage param names
  val inputParam1Name = "labelCol"
  val inputParam2Name = "featuresCol"
  val outputParamName = "outputFeatureName"
}

private[op] def evaluateModel(data: Dataset[_]): DataFrame = {
  val scored = transform(data)
  val metrics = evaluate(scored)
  val metadata = ModelSelectorSummary.fromMetadata(getMetadata().getSummaryMetadata())
    .copy(holdoutEvaluation = Option(metrics))
  setMetadata(metadata.toMetadata().toSummaryMetadata())
  scored
}
Example 7
Source File: SwUnaryEstimator.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage1
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag

private[stages] final class SwUnaryModel[I <: FeatureType, O <: FeatureType, T <: Model[T] with Params]
(
  val inputParamName: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String
)(
  implicit val tti: TypeTag[I],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends Model[SwUnaryModel[I, O, T]] with SwTransformer1[I, O, T] with SparkWrapperParams[T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 8
Source File: SwSequenceEstimator.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStageN
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag

private[stages] final class SwSequenceModel[I <: FeatureType, O <: FeatureType, T <: Model[T] with Params]
(
  val inputParamName: String,
  val operationName: String,
  val outputParamName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String
)(
  implicit val tti: TypeTag[I],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends Model[SwSequenceModel[I, O, T]] with SwTransformerN[I, O, T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 9
Source File: SwBinaryEstimator.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.sparkwrappers.generic

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.OpPipelineStage2
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset

import scala.reflect.runtime.universe.TypeTag

private[stages] final class SwBinaryModel[I1 <: FeatureType, I2 <: FeatureType,
  O <: FeatureType, T <: Model[T] with Params]
(
  val inputParam1Name: String,
  val inputParam2Name: String,
  val outputParamName: String,
  val operationName: String,
  private val sparkMlStageIn: Option[T],
  val uid: String
)(
  implicit val tti1: TypeTag[I1],
  val tti2: TypeTag[I2],
  val tto: TypeTag[O],
  val ttov: TypeTag[O#Value]
) extends Model[SwBinaryModel[I1, I2, O, T]] with SwTransformer2[I1, I2, O, T] {

  setSparkMlStage(sparkMlStageIn)
}
Example 10
Source File: OpEstimatorSpec.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.test

import java.io.File

import com.salesforce.op.features.types._
import com.salesforce.op.stages._
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql.Dataset
import org.scalactic.Equality
import org.scalatest.events.{Event, TestFailed}
import org.scalatest.{Args, Reporter}

import scala.collection.mutable.ArrayBuffer
import scala.reflect._
import scala.reflect.runtime.universe._

private def modelSpec(): Unit = {
  // Define transformer spec for the fitted model reusing the same inputs & Spark context
  val modelSpec = new OpTransformerSpec[O, ModelType] {
    override implicit val featureTypeEquality: Equality[O] = OpEstimatorSpec.this.featureTypeEquality
    override implicit val seqEquality: Equality[Seq[O]] = OpEstimatorSpec.this.seqEquality
    lazy val transformer: ModelType = OpEstimatorSpec.this.model
    lazy val inputData: Dataset[_] = OpEstimatorSpec.this.inputData
    lazy val expectedResult: Seq[O] = OpEstimatorSpec.this.expectedResult
    override implicit lazy val spark = OpEstimatorSpec.this.spark
    override def specName: String = "model"
    override def tempDir: File = OpEstimatorSpec.this.tempDir
  }

  // Register all model spec tests
  for {
    testName <- modelSpec.testNames
  } registerTest(testName) {
    // Run test & collect failures
    val failures = ArrayBuffer.empty[TestFailed]
    val reporter = new Reporter {
      def apply(event: Event): Unit = event match {
        case f: TestFailed => failures += f
        case _ =>
      }
    }
    // Note: We set 'runTestInNewInstance = true' to avoid restarting Spark context on every test run
    val args = Args(reporter, runTestInNewInstance = true)
    modelSpec.run(testName = Some(testName), args = args)
    // Propagate the failure if any
    for {failure <- failures.headOption} {
      failure.throwable.map(fail(failure.message, _)).getOrElse(fail(failure.message))
    }
  }
}
Example 11
Source File: OpPipelineStageWriter.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages

import com.salesforce.op.stages.OpPipelineStageReaderWriter._
import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams
import org.apache.hadoop.fs.Path
import org.apache.spark.ml.util.MLWriter
import org.apache.spark.ml.{Estimator, SparkDefaultParamsReadWrite}
import org.json4s.JsonAST.{JObject, JValue}
import org.json4s.jackson.JsonMethods.{compact, render}

import scala.util.{Failure, Success}

def writeToJson(path: String): JObject = {
  stage match {
    case _: Estimator[_] => return JObject() // no need to serialize estimators
    case s: SparkWrapperParams[_] =>
      // Set save path for all Spark wrapped stages of type [[SparkWrapperParams]] so they can save
      s.setStageSavePath(path)
    case _ =>
  }
  // We produce stage metadata for all the Spark params
  val metadata = SparkDefaultParamsReadWrite.getMetadataToSave(stage)
  // Write out the stage using the specified writer instance
  val writer = readerWriterFor[OpPipelineStageBase](stage.getClass.asInstanceOf[Class[OpPipelineStageBase]])
  val stageJson: JValue = writer.write(stage) match {
    case Failure(err) => throw new RuntimeException(s"Failed to write out stage '${stage.uid}'", err)
    case Success(json) => json
  }
  // Join metadata & with stage ctor args
  val j = metadata.merge(JObject(FieldNames.CtorArgs.entryName -> stageJson))
  render(j).asInstanceOf[JObject]
}
Example 12
Source File: LocalTreeIntegrationSuite.scala From oraf with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame

private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
  val distribTree = setParams(new DecisionTreeRegressor(), testParams)
  val localTree = setParams(new LocalDecisionTreeRegressor(), testParams)
  val localModel = localTree.fit(train)
  val model = distribTree.fit(train)
  OptimizedTreeTests.checkEqual(model, localModel)
}

test("Local & distributed training produce the same tree on a toy dataset") {
  val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
  val df = spark.createDataFrame(data)
  testEquivalence(df, OptimizedTreeTests.allParamSettings)
}

test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
  val data = sc.parallelize(Range(0, 16).map(x => Instance(x, 1.0, Vectors.dense(x))))
  val df = spark.createDataFrame(data)
  testEquivalence(df, medDepthTreeSettings)
}

test("Local & distributed training produce the same tree on a larger toy dataset") {
  val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
  val df = spark.createDataFrame(data)
  testEquivalence(df, medDepthTreeSettings)
}

test("Local & distributed training produce same tree on a dataset of categorical features") {
  val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
  // Create a map of categorical feature index to arity; each feature has arity nclasses
  val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
  // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
  // categorical features
  val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
  testEquivalence(df, OptimizedTreeTests.allParamSettings)
}

test("Local & distributed training produce the same tree on a dataset of continuous features") {
  val sqlContext = spark.sqlContext
  import sqlContext.implicits._
  // Use maxDepth = 5 and default params
  val params = medDepthTreeSettings
  val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
    nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
    .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
    .toDF().cache()
  testEquivalence(data, params)
}

test("Local & distributed training produce the same tree on a dataset of constant features") {
  // Generate constant, continuous data
  val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
  val df = spark.createDataFrame(data)
  testEquivalence(df, OptimizedTreeTests.allParamSettings)
}
Example 13
Source File: SerializableSparkEstimator.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
}
Example 14
Source File: Featurize.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.featurize

import com.microsoft.ml.spark.core.contracts.Wrappable
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.ml.{Estimator, Pipeline, PipelineModel}
import org.apache.spark.sql._
import org.apache.spark.sql.types._

private[spark] object FeaturizeUtilities {
  // 2^18 features by default
  val NumFeaturesDefault = 262144
  // 2^12 features for tree-based or NN-based learners
  val NumFeaturesTreeOrNNBased = 4096
}

object Featurize extends DefaultParamsReadable[Featurize]

override def fit(dataset: Dataset[_]): PipelineModel = {
  val pipeline = assembleFeaturesEstimators(getFeatureColumns)
  pipeline.fit(dataset)
}

private def assembleFeaturesEstimators(featureColumns: Map[String, Seq[String]]): Pipeline = {
  val assembleFeaturesEstimators = featureColumns.map(newColToFeatures => {
    new AssembleFeatures()
      .setColumnsToFeaturize(newColToFeatures._2.toArray)
      .setFeaturesCol(newColToFeatures._1)
      .setNumberOfFeatures(getNumberOfFeatures)
      .setOneHotEncodeCategoricals(getOneHotEncodeCategoricals)
      .setAllowImages(getAllowImages)
  }).toArray

  new Pipeline().setStages(assembleFeaturesEstimators)
}

override def copy(extra: ParamMap): Estimator[PipelineModel] = {
  new Featurize()
}

@DeveloperApi
override def transformSchema(schema: StructType): StructType =
  assembleFeaturesEstimators(getFeatureColumns).transformSchema(schema)
Example 15
Source File: IsolationForest.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.isolationforest

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util._
import org.apache.spark.ml.{Estimator, Model}
import com.linkedin.relevance.isolationforest.{IsolationForestParams,
  IsolationForest => IsolationForestSource, IsolationForestModel => IsolationForestModelSource}
import com.microsoft.ml.spark.core.contracts.Wrappable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.StructType

object IsolationForest extends DefaultParamsReadable[IsolationForest]

class IsolationForest(override val uid: String, val that: IsolationForestSource)
  extends Estimator[IsolationForestModel]
  with IsolationForestParams with DefaultParamsWritable with Wrappable {

  def this(uid: String) = this(uid, new IsolationForestSource(uid))

  def this() = this(Identifiable.randomUID("IsolationForest"))

  override def copy(extra: ParamMap): IsolationForest =
    new IsolationForest(uid, that.copy(extra))

  override def fit(data: Dataset[_]): IsolationForestModel =
    new IsolationForestModel(uid, that.fit(data))

  override def transformSchema(schema: StructType): StructType =
    that.transformSchema(schema)
}

class IsolationForestModel(override val uid: String, val that: IsolationForestModelSource)
  extends Model[IsolationForestModel] with MLWritable {

  override def copy(extra: ParamMap): IsolationForestModel =
    new IsolationForestModel(uid, that.copy(extra))

  override def transform(data: Dataset[_]): DataFrame = that.transform(data)

  override def transformSchema(schema: StructType): StructType =
    that.transformSchema(schema)

  override def write: MLWriter = that.write
}

class IsolationForestModelReader extends MLReader[IsolationForestModel] with Serializable {
  override def load(path: String): IsolationForestModel = {
    val that = IsolationForestModelSource.load(path)
    new IsolationForestModel(that.uid, that)
  }
}

object IsolationForestModel extends MLReadable[IsolationForestModel] {
  override def read: MLReader[IsolationForestModel] = new IsolationForestModelReader
}
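A minimal usage sketch for this wrapper, based only on the fit/transform signatures shown above. The tiny dataset, the SparkSession setup, and the assumption that the wrapped implementation reads a default "features" vector column are illustrative, not taken from the project:

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.isolationforest.IsolationForest

val spark = SparkSession.builder().appName("isolation-forest-demo").master("local[*]").getOrCreate()
import spark.implicits._

// Made-up data with a vector column; an obvious outlier in the last row.
val data = Seq(
  Vectors.dense(0.0, 0.1),
  Vectors.dense(0.1, 0.0),
  Vectors.dense(9.0, 9.5)
).map(Tuple1.apply).toDF("features")

val forest = new IsolationForest()   // parameters come from the wrapped IsolationForestParams trait
val model = forest.fit(data)         // Estimator.fit delegates to the LinkedIn implementation
model.transform(data).show()         // appends the wrapped model's output columns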
Example 16
Source File: StringToShortIndexer.scala From spark-ext with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkException
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.util.collection.OpenHashMap

class StringToShortIndexer(override val uid: String) extends Estimator[StringToShortIndexerModel]
  with StringIndexerBase {

  def this() = this(Identifiable.randomUID("strShortIdx"))

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def fit(dataset: DataFrame): StringToShortIndexerModel = {
    val counts = dataset.select(col($(inputCol)).cast(StringType))
      .map(_.getString(0))
      .countByValue()
    val labels = counts.toSeq.sortBy(-_._2).map(_._1).toArray
    require(labels.length <= Short.MaxValue,
      s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})")
    copyValues(new StringToShortIndexerModel(uid, labels).setParent(this))
  }

  override def transformSchema(schema: StructType): StructType = {
    validateAndTransformSchema(schema)
  }

  override def copy(extra: ParamMap): StringToShortIndexer = defaultCopy(extra)
}

class StringToShortIndexerModel (
  override val uid: String,
  val labels: Array[String]) extends Model[StringToShortIndexerModel] with StringIndexerBase {

  def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), labels)

  require(labels.length <= Short.MaxValue,
    s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})")

  private val labelToIndex: OpenHashMap[String, Short] = {
    val n = labels.length.toShort
    val map = new OpenHashMap[String, Short](n)
    var i: Short = 0
    while (i < n) {
      map.update(labels(i), i)
      i = (i + 1).toShort
    }
    map
  }

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: DataFrame): DataFrame = {
    if (!dataset.schema.fieldNames.contains($(inputCol))) {
      logInfo(s"Input column ${$(inputCol)} does not exist during transformation. " +
        "Skip StringToShortIndexerModel.")
      return dataset
    }

    val indexer = udf { label: String =>
      if (labelToIndex.contains(label)) {
        labelToIndex(label)
      } else {
        // TODO: handle unseen labels
        throw new SparkException(s"Unseen label: $label.")
      }
    }
    val outputColName = $(outputCol)
    val metadata = NominalAttribute.defaultAttr
      .withName(outputColName).withValues(labels).toMetadata()
    dataset.select(col("*"),
      indexer(dataset($(inputCol)).cast(StringType)).as(outputColName, metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    if (schema.fieldNames.contains($(inputCol))) {
      validateAndTransformSchema(schema)
    } else {
      // If the input column does not exist during transformation, we skip StringToShortIndexerModel.
      schema
    }
  }

  override def copy(extra: ParamMap): StringToShortIndexerModel = {
    val copied = new StringToShortIndexerModel(uid, labels)
    copyValues(copied, extra).setParent(parent)
  }
}
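A minimal usage sketch based on the setters and fit shown above. The example data and column names are made up, and a SparkSession is assumed (the original targets an older DataFrame-based API, so minor adjustments may be needed on newer Spark versions):

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.StringToShortIndexer

val spark = SparkSession.builder().appName("indexer-demo").master("local[*]").getOrCreate()
import spark.implicits._
val df = Seq("a", "b", "a", "c").toDF("category")

val indexer = new StringToShortIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")

// fit learns the label -> short-index mapping (most frequent label gets index 0);
// the fitted model appends the index column with nominal-attribute metadata.
val model = indexer.fit(df)
model.transform(df).show()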
Example 17
Source File: RecursivePipeline.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp

import org.apache.spark.internal.Logging
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset}

import scala.collection.mutable.ListBuffer

class RecursivePipeline(override val uid: String, baseStages: Array[PipelineStage]) extends Pipeline {

  def this() = this(Identifiable.randomUID("RECURSIVE_PIPELINE"), Array.empty)

  def this(uid: String) = this(uid, Array.empty)

  def this(pipeline: Pipeline) = this(pipeline.uid, pipeline.getStages)

  this.setStages(baseStages)

  override def fit(dataset: Dataset[_]): PipelineModel = {
    transformSchema(dataset.schema, logging = true)
    val theStages = $(stages)
    var indexOfLastEstimator = -1
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      stage match {
        case _: Estimator[_] => indexOfLastEstimator = index
        case _ =>
      }
    }
    var curDataset = dataset
    val transformers = ListBuffer.empty[Transformer]
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      if (index <= indexOfLastEstimator) {
        val transformer = stage match {
          case estimator: HasRecursiveFit[_] =>
            estimator.recursiveFit(curDataset, new Pipeline(uid).setStages(transformers.toArray).fit(dataset))
          case estimator: Estimator[_] =>
            estimator.fit(curDataset)
          case t: Transformer =>
            t
          case _ =>
            throw new IllegalArgumentException(
              s"Does not support stage $stage of type ${stage.getClass}")
        }
        if (index < indexOfLastEstimator) {
          curDataset = transformer.transform(curDataset)
        }
        transformers += transformer
      } else {
        transformers += stage.asInstanceOf[Transformer]
      }
    }

    createPipeline(dataset, transformers.toArray)
  }
}

class RecursivePipelineModel(override val uid: String, innerPipeline: PipelineModel)
  extends Model[RecursivePipelineModel] with MLWritable with Logging {

  def this(pipeline: PipelineModel) = this(pipeline.uid, pipeline)

  // drops right at most because is itself included
  private def createRecursiveAnnotators(dataset: Dataset[_]): PipelineModel =
    new Pipeline(uid).setStages(innerPipeline.stages.dropRight(1)).fit(dataset)

  override def copy(extra: ParamMap): RecursivePipelineModel = {
    new RecursivePipelineModel(uid, innerPipeline.copy(extra))
  }

  override def write: MLWriter = {
    innerPipeline.write
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    innerPipeline.stages.foldLeft(dataset.toDF)((cur, transformer) => transformer match {
      case t: HasRecursiveTransform[_] => t.recursiveTransform(cur, createRecursiveAnnotators(dataset))
      case t: AnnotatorModel[_] if t.getLazyAnnotator => cur
      case t: Transformer => t.transform(cur)
    })
  }

  override def transformSchema(schema: StructType): StructType = {
    innerPipeline.transformSchema(schema)
  }
}
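A short usage sketch for the class above. The stages array and the training DataFrame are placeholders (in practice these would be Spark NLP annotators and a real corpus), and the SparkSession setup is illustrative:

import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql.SparkSession
import com.johnsnowlabs.nlp.RecursivePipeline

val spark = SparkSession.builder().appName("recursive-pipeline-demo").master("local[*]").getOrCreate()

// Placeholder stages and data.
val stages: Array[PipelineStage] = Array.empty
val trainingData = spark.emptyDataFrame

// RecursivePipeline extends Pipeline, so it is driven the same way: setStages, then fit.
// Stages implementing HasRecursiveFit additionally receive the pipeline fitted from the
// stages that precede them (see the fit override above).
val recursive = new RecursivePipeline().setStages(stages)
val fitted = recursive.fit(trainingData)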
Example 18
Source File: AnnotatorApproach.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp

import com.johnsnowlabs.storage.HasStorage
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.{Estimator, Model, PipelineModel, Transformer}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.types.{ArrayType, MetadataBuilder, StructField, StructType}
import org.apache.spark.ml.util.DefaultParamsWritable

override final def transformSchema(schema: StructType): StructType = {
  require(validate(schema), s"Wrong or missing inputCols annotators in $uid.\n" +
    msgHelper(schema) +
    s"\nMake sure such annotators exist in your pipeline, " +
    s"with the right output names and that they have following annotator types: " +
    s"${inputAnnotatorTypes.mkString(", ")}")
  val metadataBuilder: MetadataBuilder = new MetadataBuilder()
  metadataBuilder.putString("annotatorType", outputAnnotatorType)
  val outputFields = schema.fields :+
    StructField(getOutputCol, ArrayType(Annotation.dataType), nullable = false, metadataBuilder.build)
  StructType(outputFields)
}
Example 19
Source File: LogisticRegression.scala From spark-sql-perf with Apache License 2.0
package com.databricks.spark.sql.perf.mllib.classification

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.{Estimator, ModelBuilderSSP, PipelineStage, Transformer}
import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vectors

object LogisticRegression extends BenchmarkAlgorithm
  with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {

  override protected def initialData(ctx: MLBenchContext) = {
    import ctx.params._
    DataGenerator.generateContinuousFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      numFeatures)
  }

  override protected def trueModel(ctx: MLBenchContext): Transformer = {
    val rng = ctx.newGenerator()
    val coefficients =
      Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
    // Small intercept to prevent some skew in the data.
    val intercept = 0.01 * (2 * rng.nextDouble - 1)
    ModelBuilderSSP.newLogisticRegressionModel(coefficients, intercept)
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new ml.classification.LogisticRegression()
      .setTol(tol)
      .setMaxIter(maxIter)
      .setRegParam(regParam)
  }

  override protected def evaluator(ctx: MLBenchContext): Evaluator =
    new MulticlassClassificationEvaluator()
}
Example 20
Source File: RandomForestClassification.scala From spark-sql-perf with Apache License 2.0
package com.databricks.spark.sql.perf.mllib.classification

import org.apache.spark.ml.{Estimator, PipelineStage}
import org.apache.spark.ml.classification.RandomForestClassifier

import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.OptionImplicits._

object RandomForestClassification extends BenchmarkAlgorithm with TreeOrForestClassifier {

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    // TODO: subsamplingRate, featureSubsetStrategy
    // TODO: cacheNodeIds, checkpoint?
    new RandomForestClassifier()
      .setMaxDepth(depth)
      .setNumTrees(maxIter)
      .setSeed(ctx.seed())
  }
}
Example 21
Source File: LDA.scala From spark-sql-perf with Apache License 2.0
package com.databricks.spark.sql.perf.mllib.clustering

import scala.collection.mutable.{HashMap => MHashMap}

import org.apache.commons.math3.random.Well19937c
import org.apache.spark.ml.{Estimator, PipelineStage}
import org.apache.spark.ml
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.ml.linalg.{Vector, Vectors}

import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._

object LDA extends BenchmarkAlgorithm with TestFromTraining {
  // The LDA model is package private, no need to expose it.

  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    val rdd = ctx.sqlContext.sparkContext.parallelize(
      0L until numExamples,
      numPartitions
    )
    val seed: Int = randomSeed
    val docLen = docLength.get
    val numVocab = vocabSize.get
    val data: RDD[(Long, Vector)] = rdd.mapPartitionsWithIndex { (idx, partition) =>
      val rng = new Well19937c(seed ^ idx)
      partition.map { docIndex =>
        var currentSize = 0
        val entries = MHashMap[Int, Int]()
        while (currentSize < docLen) {
          val index = rng.nextInt(numVocab)
          entries(index) = entries.getOrElse(index, 0) + 1
          currentSize += 1
        }
        val iter = entries.toSeq.map(v => (v._1, v._2.toDouble))
        (docIndex, Vectors.sparse(numVocab, iter))
      }
    }
    ctx.sqlContext.createDataFrame(data).toDF("docIndex", "features")
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new ml.clustering.LDA()
      .setK(k)
      .setSeed(randomSeed.toLong)
      .setMaxIter(maxIter)
      .setOptimizer(optimizer)
  }

  // TODO(?) add a scoring method here.
}
Example 22
Source File: OptimizedDecisionTreeIntegrationSuite.scala From oraf with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.classification.{DecisionTreeClassifier, OptimizedDecisionTreeClassifier}
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{DecisionTreeRegressor, OptimizedDecisionTreeRegressor}
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame

private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
  val oldTree = setParams(new DecisionTreeRegressor(), testParams)
  val newTree = setParams(new OptimizedDecisionTreeRegressor(), testParams)
  val newModel = newTree.fit(train)
  val oldModel = oldTree.fit(train)
  OptimizedTreeTests.checkEqual(oldModel, newModel)
}

private def testClassifierEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
  val oldTree = setParams(new DecisionTreeClassifier(), testParams)
  val newTree = setParams(new OptimizedDecisionTreeClassifier(), testParams)
  val newModel = newTree.fit(train)
  val model = oldTree.fit(train)
  OptimizedTreeTests.checkEqual(model, newModel)
}

test("Local & distributed training produce the same tree on a toy dataset") {
  val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
  val df = spark.createDataFrame(data)
  testEquivalence(df, OptimizedTreeTests.allParamSettings)
  testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
}

test("Local & distributed training produce the same tree with two feature values") {
  val data = sc.parallelize(Range(0, 8).map(x => {
    if (x > 3) {
      Instance(x, 1.0, Vectors.dense(0.0))
    } else {
      Instance(x, 1.0, Vectors.dense(1.0))
    }
  }))
  val df = spark.createDataFrame(data)
  testEquivalence(df, OptimizedTreeTests.allParamSettings)
  testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
}

test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
  val data = sc.parallelize(Range(0, 10).map(x => Instance(x, 1.0, Vectors.dense(x))))
  val df = spark.createDataFrame(data)
  testEquivalence(df, medDepthTreeSettings)
}

test("Local & distributed training produce the same tree on a larger toy dataset") {
  val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
  val df = spark.createDataFrame(data)
  testEquivalence(df, medDepthTreeSettings)
}

test("Local & distributed training produce same tree on a dataset of categorical features") {
  val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
  // Create a map of categorical feature index to arity; each feature has arity nclasses
  val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
  // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
  // categorical features
  val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
  testEquivalence(df, OptimizedTreeTests.allParamSettings)
}

test("Local & distributed training produce the same tree on a dataset of continuous features") {
  val sqlContext = spark.sqlContext
  import sqlContext.implicits._
  // Use maxDepth = 5 and default params
  val params = medDepthTreeSettings
  val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
    nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
    .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
    .toDF().cache()
  testEquivalence(data, params)
}

test("Local & distributed training produce the same tree on a dataset of constant features") {
  // Generate constant, continuous data
  val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
  val df = spark.createDataFrame(data)
  testEquivalence(df, OptimizedTreeTests.allParamSettings)
}