org.apache.spark.ml.classification.LogisticRegressionModel Scala Examples
The following examples show how to use org.apache.spark.ml.classification.LogisticRegressionModel.
The examples are extracted from open-source projects; the original project, source file, and license are noted above each listing.
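Before the project examples, here is a minimal, self-contained sketch of the typical lifecycle of a LogisticRegressionModel (fit, persist, reload, score). It is not taken from any of the projects below; the input path, output path, and the libsvm-formatted data file are illustrative assumptions.

// Minimal sketch: fit a LogisticRegression, save the resulting LogisticRegressionModel,
// and load it back for scoring. Paths and the sample data file are placeholders.
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.sql.SparkSession

object LogisticRegressionModelQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[*]")
      .appName("LogisticRegressionModelQuickStart")
      .getOrCreate()

    // "label"/"features" columns are the Spark ML defaults expected by LogisticRegression
    val training = spark.read.format("libsvm").load("data/sample_libsvm_data.txt")

    val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
    val model: LogisticRegressionModel = lr.fit(training)
    println(s"Coefficients: ${model.coefficients} Intercept: ${model.intercept}")

    // Persist the fitted model and reload it, as several examples below do
    model.write.overwrite().save("/tmp/lr-model")
    val reloaded = LogisticRegressionModel.load("/tmp/lr-model")
    reloaded.transform(training).select("prediction", "probability").show(5)

    spark.stop()
  }
}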
Example 1
Source File: MLPipelineTest.scala From Scala-for-Machine-Learning-Second-Edition with MIT License
package org.scalaml.spark.mlpipeline

import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.param.ParamMap
import org.scalaml.Logging
import org.scalaml.spark.ResourcesLoader
import org.scalatest.{FlatSpec, Matchers}

final class MLPipelineTest extends FlatSpec with Matchers with Logging {
  protected val name = "Spark ML pipeline"

  final val trainFile = "/data/spark/mlpipeline_training.csv"
  final val testFile = "/data/spark/mlpipeline_test.csv"
  final val columns = Array[String]("date", "asset", "region", "agent")

  it should s"$name simple predictor" in {
    show(s"$name simple predictor")

    (for {
      trainPath <- ResourcesLoader.getPath(trainFile)
      testPath <- ResourcesLoader.getPath(testFile)
    } yield {
      val predictor = new SimplePredictor[LogisticRegressionModel](
        new LogisticRegression().setMaxIter(5).setRegParam(0.1),
        columns,
        trainPath
      )
      (predictor, predictor.classify(predictor(), testPath))
    }).map {
      case (predictor, output) => {
        output.printSchema
        val predictedValues = output.select("prediction").collect.map(_.getDouble(0))
        output.show

        predictor.stop
        predictedValues(0)
      } should be(0.0)
    }
  }

  it should s"$name cross validation" in {
    show(s"$name cross validation")

    (for {
      trainPath <- ResourcesLoader.getPath(trainFile)
      testPath <- ResourcesLoader.getPath(testFile)
    } yield {
      val lr = new LogisticRegression().setMaxIter(5).setRegParam(0.1)
      val paramsMap = new ParamMap().put(lr.maxIter -> 30).put(lr.regParam -> 0.1)

      val validator = new ValidatedPredictor[LogisticRegressionModel](lr, columns, trainPath)

      val (f1, auROC) = validator.trainingWithSummary.getOrElse((Double.NaN, Double.NaN))
      println(s"F1-measure = ${f1} auROC = ${auROC}")
      validator.stop

      f1 should be(0.025 +- 0.005)
      auROC should be(0.600 +- 0.005)
    })
  }
}

// -------------------------------- EOF ---------------------------------------------
Example 2
Source File: LogisticRegressionModel.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.classification

import com.ibm.aardpfark.pfa.document.{PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.dsl._
import com.ibm.aardpfark.spark.ml.PFALinearPredictionModel
import org.apache.avro.SchemaBuilder
import org.apache.spark.ml.classification.LogisticRegressionModel

class PFALogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel)
  extends PFALinearPredictionModel {

  private val rawPredictionCol = sparkTransformer.getRawPredictionCol
  private val probabilityCol = sparkTransformer.getProbabilityCol
  private val isBinary = sparkTransformer.numClasses == 2

  override def outputSchema = SchemaBuilder.record(withUid(outputBaseName)).fields()
    .name(rawPredictionCol).`type`().array().items().doubleType().noDefault()
    .name(predictionCol).`type`.doubleType().noDefault()
    .name(probabilityCol).`type`().array().items().doubleType().noDefault()
    .endRecord()

  private val safeDoubleDiv = NamedFunctionDef("safeDoubleDiv", FunctionDef[Double, Double]("x", "y") {
    case Seq(x, y) =>
      val result = Let("result", core.div(x, y))
      val cond = If (impute.isnan(result.ref)) Then {
        core.addinv(core.pow(10.0, 320.0))
      } Else {
        result.ref
      }
      Seq(
        result,
        cond
      )
  })

  private val rawPredFn = if (isBinary) {
    NewArray[Double](Seq(core.addinv(margin.ref), margin.ref))
  } else {
    margin.ref
  }

  private val probFn = if (isBinary) {
    m.link.logit(rawPredFn)
  } else {
    m.link.softmax(rawPredFn)
  }

  private val rawPred = Let("rawPred", rawPredFn)
  private val prob = Let("prob", probFn)

  private val predFn = if (isBinary) {
    val threshold = sparkTransformer.getThreshold
    val probAttr = Attr(prob.ref, 1)
    If (core.lte(probAttr, threshold)) Then 0.0 Else 1.0
  } else {
    val scaled = if (sparkTransformer.isDefined(sparkTransformer.thresholds)) {
      val thresholds = NewArray[Double](sparkTransformer.getThresholds.map(DoubleLiteral))
      a.zipmap(prob.ref, thresholds, safeDoubleDiv.ref)
    } else {
      prob.ref
    }
    a.argmax(scaled)
  }

  private val pred = Let("pred", predFn)

  override def action = {
    Action(
      margin,
      rawPred,
      prob,
      pred,
      NewRecord(outputSchema, Map(
        probabilityCol -> prob.ref,
        rawPredictionCol -> rawPred.ref,
        predictionCol -> pred.ref)
      )
    )
  }

  override def pfa: PFADocument = {
    val bldr = PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withCell(modelCell)
      .withAction(action)

    if (!isBinary) {
      bldr.withFunction(safeDoubleDiv)
    }
    bldr.pfa
  }
}
Example 3
Source File: TitanicLogisticRegression.scala From spark-spec with MIT License
package com.github.mrpowers.spark.spec.ml.classification

import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.DataFrame

object TitanicLogisticRegression extends SparkSessionWrapper {

  def withVectorizedFeatures(
    featureColNames: Array[String] = Array("Gender", "Age", "SibSp", "Parch", "Fare"),
    outputColName: String = "features"
  )(df: DataFrame): DataFrame = {
    val assembler: VectorAssembler = new VectorAssembler()
      .setInputCols(featureColNames)
      .setOutputCol(outputColName)
    assembler.transform(df)
  }

  def withLabel(
    inputColName: String = "Survived",
    outputColName: String = "label"
  )(df: DataFrame) = {
    val labelIndexer: StringIndexer = new StringIndexer()
      .setInputCol(inputColName)
      .setOutputCol(outputColName)

    labelIndexer
      .fit(df)
      .transform(df)
  }

  def model(df: DataFrame = TitanicData.trainingDF()): LogisticRegressionModel = {
    val trainFeatures: DataFrame = df
      .transform(withVectorizedFeatures())
      .transform(withLabel())
      .select("features", "label") // only uses the features and label columns

    new LogisticRegression()
      .fit(trainFeatures)
  }

  def persistModel(): Unit = {
    model().save("./tmp/titanic_model/")
  }
}
Example 4
Source File: LogisticRegressionRecommender.scala From wordpress-posts-recommender with Apache License 2.0
package wordpressworkshop

import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

case class LogisticRegressionRecommender(training: DataFrame) {

  val lr = new LogisticRegression()
  val paramMap = ParamMap(lr.maxIter -> 20)
    .put(lr.regParam -> 0.01)
    .put(lr.probabilityCol -> "probability")

  val model: LogisticRegressionModel = lr.fit(training, paramMap)

  def metrics(testData: DataFrame) = {
    val predictionAndLabels: RDD[(Double, Double)] =
      model.transform(testData).map(row =>
        row.getAs[Vector]("probability")(1) -> row.getAs[Double]("label"))

    new BinaryClassificationMetrics(predictionAndLabels)
  }

  def likeScores(testData: DataFrame): RDD[(Long, Long, Double)] =
    model.transform(testData)
      .map(row => (row.getAs[Long]("userId"),
        row.getAs[Long]("postId"),
        row.getAs[Vector]("probability")(1)))
}
Example 5
Source File: StringIndexerDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter11.SparkMachineLearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{ OneHotEncoder, StringIndexer }
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.functions.year
import org.apache.spark.ml.{ Pipeline, PipelineStage }
import org.apache.spark.ml.classification.{ LogisticRegression, LogisticRegressionModel }
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.{ DataFrame, SparkSession }
import scala.collection.mutable
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql._
import org.apache.spark.sql.SQLContext

object StringIndexerDemo {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, "Jason", "Germany"),
        (1, "David", "France"),
        (2, "Martin", "Spain"),
        (3, "Jason", "USA"),
        (4, "Daiel", "UK"),
        (5, "Moahmed", "Bangladesh"),
        (6, "David", "Ireland"),
        (7, "Jason", "Netherlands"))).toDF("id", "name", "address")
    df.show(false)

    val indexer = new StringIndexer()
      .setInputCol("name")
      .setOutputCol("label")
      .fit(df)

    val indexed = indexer.transform(df)
    indexed.show(false)

    spark.stop()
  }
}
Example 6
Source File: OneHotEncoderDemo2.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter11.SparkMachineLearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{ OneHotEncoder, StringIndexer }
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.functions.year
import org.apache.spark.ml.{ Pipeline, PipelineStage }
import org.apache.spark.ml.classification.{ LogisticRegression, LogisticRegressionModel }
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.{ DataFrame, SparkSession }
import scala.collection.mutable
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

object OneHotEncoderDemo2 {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, "Jason", "Germany"),
        (1, "David", "France"),
        (2, "Martin", "Spain"),
        (3, "Jason", "USA"),
        (4, "Daiel", "UK"),
        (5, "Moahmed", "Bangladesh"),
        (6, "David", "Ireland"),
        (7, "Jason", "Netherlands"))).toDF("id", "name", "address")
    df.show(false)

    val indexer = new StringIndexer()
      .setInputCol("name")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")
    val encoded = encoder.transform(indexed)
    encoded.show()

    spark.stop()
  }
}
Example 7
Source File: SparkRWrappers.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.api.r

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.DataFrame

private[r] object SparkRWrappers {
  def fitRModelFormula(
      value: String,
      df: DataFrame,
      family: String,
      lambda: Double,
      alpha: Double,
      standardize: Boolean,
      solver: String): PipelineModel = {
    val formula = new RFormula().setFormula(value)
    val estimator = family match {
      case "gaussian" => new LinearRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
        .setStandardization(standardize)
        .setSolver(solver)
      case "binomial" => new LogisticRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
        .setStandardization(standardize)
    }
    val pipeline = new Pipeline().setStages(Array(formula, estimator))
    pipeline.fit(df)
  }

  def getModelCoefficients(model: PipelineModel): Array[Double] = {
    model.stages.last match {
      case m: LinearRegressionModel => {
        val coefficientStandardErrorsR = Array(m.summary.coefficientStandardErrors.last) ++
          m.summary.coefficientStandardErrors.dropRight(1)
        val tValuesR = Array(m.summary.tValues.last) ++ m.summary.tValues.dropRight(1)
        val pValuesR = Array(m.summary.pValues.last) ++ m.summary.pValues.dropRight(1)
        if (m.getFitIntercept) {
          Array(m.intercept) ++ m.coefficients.toArray ++ coefficientStandardErrorsR ++
            tValuesR ++ pValuesR
        } else {
          m.coefficients.toArray ++ coefficientStandardErrorsR ++ tValuesR ++ pValuesR
        }
      }
      case m: LogisticRegressionModel => {
        if (m.getFitIntercept) {
          Array(m.intercept) ++ m.coefficients.toArray
        } else {
          m.coefficients.toArray
        }
      }
    }
  }

  def getModelDevianceResiduals(model: PipelineModel): Array[Double] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        m.summary.devianceResiduals
      case m: LogisticRegressionModel =>
        throw new UnsupportedOperationException(
          "No deviance residuals available for LogisticRegressionModel")
    }
  }

  def getModelFeatures(model: PipelineModel): Array[String] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        val attrs = AttributeGroup.fromStructField(
          m.summary.predictions.schema(m.summary.featuresCol))
        if (m.getFitIntercept) {
          Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
        } else {
          attrs.attributes.get.map(_.name.get)
        }
      case m: LogisticRegressionModel =>
        val attrs = AttributeGroup.fromStructField(
          m.summary.predictions.schema(m.summary.featuresCol))
        if (m.getFitIntercept) {
          Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
        } else {
          attrs.attributes.get.map(_.name.get)
        }
    }
  }

  def getModelName(model: PipelineModel): String = {
    model.stages.last match {
      case m: LinearRegressionModel => "LinearRegressionModel"
      case m: LogisticRegressionModel => "LogisticRegressionModel"
    }
  }
}
Example 8
Source File: SparkRWrappers.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.api.r

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.DataFrame

private[r] object SparkRWrappers {
  def fitRModelFormula(
      value: String,
      df: DataFrame,
      family: String,
      lambda: Double,
      alpha: Double): PipelineModel = {
    val formula = new RFormula().setFormula(value)
    val estimator = family match {
      case "gaussian" => new LinearRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
      case "binomial" => new LogisticRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
    }
    val pipeline = new Pipeline().setStages(Array(formula, estimator))
    pipeline.fit(df)
  }

  def getModelWeights(model: PipelineModel): Array[Double] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        Array(m.intercept) ++ m.weights.toArray
      case _: LogisticRegressionModel =>
        throw new UnsupportedOperationException(
          "No weights available for LogisticRegressionModel") // SPARK-9492
    }
  }

  def getModelFeatures(model: PipelineModel): Array[String] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        val attrs = AttributeGroup.fromStructField(
          m.summary.predictions.schema(m.summary.featuresCol))
        Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
      case _: LogisticRegressionModel =>
        throw new UnsupportedOperationException(
          "No features names available for LogisticRegressionModel") // SPARK-9492
    }
  }
}
Example 9
Source File: OpLogisticRegressionTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.classification

import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.PredictionEquality
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel}
import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.linalg.Vectors
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class OpLogisticRegressionTest extends OpEstimatorSpec[Prediction,
  OpPredictorWrapperModel[LogisticRegressionModel],
  OpPredictorWrapper[LogisticRegression, LogisticRegressionModel]] with PredictionEquality {

  override def specName: String = Spec[OpLogisticRegression]

  val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features",
    Seq[(RealNN, OPVector)](
      1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector,
      0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector,
      0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector,
      1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector,
      1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector,
      0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector,
      1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector,
      0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector
    )
  )
  val feature1 = rawFeature1.copy(isResponse = true)
  val estimator = new OpLogisticRegression().setInput(feature1, feature2)

  val expectedResult = Seq(
    Prediction(1.0, Array(-20.88, 20.88), Array(0.0, 1.0)),
    Prediction(0.0, Array(16.70, -16.7), Array(1.0, 0.0)),
    Prediction(0.0, Array(22.2, -22.2), Array(1.0, 0.0)),
    Prediction(1.0, Array(-18.35, 18.35), Array(0.0, 1.0)),
    Prediction(1.0, Array(-31.46, 31.46), Array(0.0, 1.0)),
    Prediction(0.0, Array(24.67, -24.67), Array(1.0, 0.0)),
    Prediction(1.0, Array(-22.07, 22.07), Array(0.0, 1.0)),
    Prediction(0.0, Array(20.9, -20.9), Array(1.0, 0.0))
  )

  it should "allow the user to set the desired spark parameters" in {
    estimator
      .setRegParam(0.1)
      .setElasticNetParam(0.1)
      .setMaxIter(20)
    estimator.fit(inputData)

    estimator.predictor.getRegParam shouldBe 0.1
    estimator.predictor.getElasticNetParam shouldBe 0.1
    estimator.predictor.getMaxIter shouldBe 20
  }
}
Example 10
Source File: ACMEModel.scala From cdsw-simple-serving with Apache License 2.0
// Don't execute these lines in the workbench -- skip to "Start workbench session"
package acme

import org.apache.spark.ml.PipelineModel
import com.cloudera.datascience.cdsw.acme.ACMEData
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import scala.util.Random

// Read and cache training data prepared from acme-dataeng:
val training = ACMEData.readData()
training.cache()
training.show()

// Build a logistic regression model,
val assembler = new VectorAssembler().
  setInputCols(training.columns.filter(_ != "Occupancy")).
  setOutputCol("featureVec")

val lr = new LogisticRegression().
  setFeaturesCol("featureVec").
  setLabelCol("Occupancy").
  setRawPredictionCol("rawPrediction")

val pipeline = new Pipeline().setStages(Array(assembler, lr))

// and tune that model:
val paramGrid = new ParamGridBuilder().
  addGrid(lr.regParam, Seq(0.00001, 0.001, 0.1)).
  addGrid(lr.elasticNetParam, Seq(1.0)).
  build()

val eval = new BinaryClassificationEvaluator().
  setLabelCol("Occupancy").
  setRawPredictionCol("rawPrediction")

val validator = new TrainValidationSplit().
  setSeed(Random.nextLong()).
  setEstimator(pipeline).
  setEvaluator(eval).
  setEstimatorParamMaps(paramGrid).
  setTrainRatio(0.9)

val validatorModel = validator.fit(training)
val pipelineModel = validatorModel.bestModel.asInstanceOf[PipelineModel]
val lrModel = pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]

// Logistic regression model parameters:
training.columns.zip(lrModel.coefficients.toArray).foreach(println)

// Model hyperparameters:
lrModel.getElasticNetParam
lrModel.getRegParam

// Validation metric (accuracy):
validatorModel.validationMetrics.max

pipelineModel
// End workbench session
Example 11
Source File: LogisticRegressionPrediction.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.ml_classification

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.sql.SparkSession

class LogisticRegressionPrediction extends ConfigurableStop {
  val authorEmail: String = "[email protected]"
  val description: String = "Use an existing logistic regression model to predict"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var test_data_path: String = _
  var model_path: String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()

    // load data stored in libsvm format as a dataframe
    val data = spark.read.format("libsvm").load(test_data_path)
    // data.show()

    // load model
    val model = LogisticRegressionModel.load(model_path)

    val predictions = model.transform(data)
    predictions.show()
    out.write(predictions)
  }

  def initialize(ctx: ProcessContext): Unit = {
  }

  def setProperties(map: Map[String, Any]): Unit = {
    test_data_path = MapUtil.get(map, key = "test_data_path").asInstanceOf[String]
    model_path = MapUtil.get(map, key = "model_path").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val test_data_path = new PropertyDescriptor().name("test_data_path").displayName("TEST_DATA_PATH").defaultValue("").required(true)
    val model_path = new PropertyDescriptor().name("model_path").displayName("MODEL_PATH").defaultValue("").required(true)
    descriptor = test_data_path :: descriptor
    descriptor = model_path :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/ml_classification/LogisticRegressionPrediction.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MLGroup.toString)
  }
}
Example 12
Source File: ModelEstimator.scala From Scala-for-Machine-Learning-Second-Edition with MIT License
package org.scalaml.spark.mlpipeline

import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegressionModel}
import org.apache.spark.ml._
import org.apache.spark.sql
import sql._

// NOTE: the enclosing class declaration is not included in this snippet
  @throws(classOf[IllegalArgumentException])
  final def trainWithSummary(
    trainDf: DataFrame,
    stages: Array[PipelineStage]
  ): Option[(Double, Double)] = {
    require(stages.size > 0, "Cannot process a pipeline without stages")

    // Print the training set data frame
    trainDf.printSchema

    this(trainDf, stages).stages.last match {
      case lrModel: LogisticRegressionModel =>
        val binarySummary = lrModel.summary.asInstanceOf[BinaryLogisticRegressionSummary]

        // Set the model threshold to maximize F-Measure
        val f1: Double = binarySummary.fMeasureByThreshold.select("F-Measure").head.getDouble(0)
        Some(f1, binarySummary.areaUnderROC)
      case _ => None
    }
  }
}

// ------------------------------ EOF --------------------------------------------------------
Example 13
Source File: MultinomialLogisticRegressionParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}

class MultinomialLogisticRegressionParitySpec extends SparkParityBase {

  val labels = Seq(0.0, 1.0, 2.0, 0.0, 1.0, 2.0)
  val ages = Seq(15, 30, 40, 50, 15, 80)
  val heights = Seq(175, 190, 155, 160, 170, 180)
  val weights = Seq(67, 100, 57, 56, 56, 88)

  val rows = spark.sparkContext.parallelize(Seq.tabulate(6) { i =>
    Row(labels(i), ages(i), heights(i), weights(i))
  })

  val schema = new StructType().add("label", DoubleType, nullable = false)
    .add("age", IntegerType, nullable = false)
    .add("height", IntegerType, nullable = false)
    .add("weight", IntegerType, nullable = false)

  override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema)

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("age", "height", "weight")).
      setOutputCol("features"),
    new LogisticRegressionModel(uid = "logr",
      coefficientMatrix = Matrices.dense(3, 3, Array(
        -1.3920551604166562, -0.13119545493644366, 1.5232506153530998,
        0.3129112131192873, -0.21959056436528473, -0.09332064875400257,
        -0.24696506013528507, 0.6122879917796569, -0.36532293164437174)),
      interceptVector = Vectors.dense(0.4965574044951358, -2.1486146169780063, 1.6520572124828703),
      numClasses = 3,
      isMultinomial = true))).fit(dataset)
}
Example 14
Source File: LogisticRegressionParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.linalg.Vectors

class LogisticRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new LogisticRegressionModel(uid = "logr",
      coefficients = Vectors.dense(0.44, 0.77),
      intercept = 0.66).setThreshold(0.7).setFeaturesCol("features"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 15
Source File: LogisticRegressionOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.{Matrices, Vectors}

class LogisticRegressionOp extends SimpleSparkOp[LogisticRegressionModel] {

  private final val LOGISTIC_REGRESSION_DEFAULT_THRESHOLD = 0.5

  override val Model: OpModel[SparkBundleContext, LogisticRegressionModel] =
    new OpModel[SparkBundleContext, LogisticRegressionModel] {
      override val klazz: Class[LogisticRegressionModel] = classOf[LogisticRegressionModel]

      override def opName: String = Bundle.BuiltinOps.classification.logistic_regression

      override def store(model: Model, obj: LogisticRegressionModel)
                        (implicit context: BundleContext[SparkBundleContext]): Model = {
        val m = model.withValue("num_classes", Value.long(obj.numClasses))
        if (obj.numClasses > 2) {
          val cm = obj.coefficientMatrix
          val thresholds = if (obj.isSet(obj.thresholds)) {
            Some(obj.getThresholds)
          } else None
          m.withValue("coefficient_matrix",
              Value.tensor[Double](DenseTensor(cm.toArray, Seq(cm.numRows, cm.numCols)))).
            withValue("intercept_vector", Value.vector(obj.interceptVector.toArray)).
            withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList))
        } else {
          m.withValue("coefficients", Value.vector(obj.coefficients.toArray)).
            withValue("intercept", Value.double(obj.intercept)).
            withValue("threshold", Value.double(obj.getThreshold))
        }
      }

      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): LogisticRegressionModel = {
        val numClasses = model.value("num_classes").getLong
        val r = if (numClasses > 2) {
          val cmTensor = model.value("coefficient_matrix").getTensor[Double]
          val coefficientMatrix = Matrices.dense(cmTensor.dimensions.head, cmTensor.dimensions(1), cmTensor.toArray)
          val lr = new LogisticRegressionModel(uid = "",
            coefficientMatrix = coefficientMatrix,
            interceptVector = Vectors.dense(model.value("intercept_vector").getTensor[Double].toArray),
            numClasses = numClasses.toInt,
            isMultinomial = true)
          model.getValue("thresholds").
            map(t => lr.setThresholds(t.getDoubleList.toArray)).
            getOrElse(lr)
        } else {
          val lr = new LogisticRegressionModel(uid = "",
            coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
            intercept = model.value("intercept").getDouble)

          // default threshold is 0.5 for both Spark and Scikit-learn
          val threshold = model.getValue("threshold")
            .map(value => value.getDouble)
            .getOrElse(LOGISTIC_REGRESSION_DEFAULT_THRESHOLD)
          lr.setThreshold(threshold)
        }
        r
      }
    }

  override def sparkLoad(uid: String, shape: NodeShape, model: LogisticRegressionModel): LogisticRegressionModel = {
    val numClasses = model.numClasses
    val r = if (numClasses > 2) {
      val lr = new LogisticRegressionModel(uid = uid,
        coefficientMatrix = model.coefficientMatrix,
        interceptVector = model.interceptVector,
        numClasses = numClasses,
        isMultinomial = true)
      if (model.isDefined(model.thresholds)) { lr.setThresholds(model.getThresholds) }
      lr
    } else {
      val lr = new LogisticRegressionModel(uid = uid,
        coefficientMatrix = model.coefficientMatrix,
        interceptVector = model.interceptVector,
        numClasses = numClasses,
        isMultinomial = false)
      if (model.isDefined(model.threshold)) { lr.setThreshold(model.getThreshold) }
      lr
    }
    r
  }

  override def sparkInputs(obj: LogisticRegressionModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: LogisticRegressionModel): Seq[SimpleParamSpec] = {
    Seq("raw_prediction" -> obj.rawPredictionCol,
      "probability" -> obj.probabilityCol,
      "prediction" -> obj.predictionCol)
  }
}
Example 16
Source File: ChurnPredictionLR.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

object ChurnPredictionLR {
  def main(args: Array[String]) {
    val spark: SparkSession = SparkSessionCreate.createSession("ChurnPredictionLogisticRegression")
    import spark.implicits._

    val numFolds = 10
    val MaxIter: Seq[Int] = Seq(100)
    val RegParam: Seq[Double] = Seq(1.0) // L2 regularization param, set 0.10 with L1 regularization
    val Tol: Seq[Double] = Seq(1e-8)
    val ElasticNetParam: Seq[Double] = Seq(1.0) // Combination of L1 and L2

    val lr = new LogisticRegression()
      .setLabelCol("label")
      .setFeaturesCol("features")

    // Chain indexers, the assembler and the classifier in a Pipeline.
    val pipeline = new Pipeline()
      .setStages(Array(PipelineConstruction.ipindexer,
        PipelineConstruction.labelindexer,
        PipelineConstruction.assembler,
        lr))

    // Search through the logistic regression's parameter grid for the best model
    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.maxIter, MaxIter)
      .addGrid(lr.regParam, RegParam)
      .addGrid(lr.tol, Tol)
      .addGrid(lr.elasticNetParam, ElasticNetParam)
      .build()

    val evaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("prediction")

    // Set up 10-fold cross validation
    val crossval = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(numFolds)

    val cvModel = crossval.fit(Preprocessing.trainDF)

    val predictions = cvModel.transform(Preprocessing.testSet)
    val result = predictions.select("label", "prediction", "probability")
    val resutDF = result.withColumnRenamed("prediction", "Predicted_label")
    resutDF.show(10)

    val accuracy = evaluator.evaluate(predictions)
    println("Classification accuracy: " + accuracy)

    // Compute other performance metrics
    val predictionAndLabels = predictions
      .select("prediction", "label")
      .rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))

    val metrics = new BinaryClassificationMetrics(predictionAndLabels)
    val areaUnderPR = metrics.areaUnderPR
    println("Area under the precision-recall curve: " + areaUnderPR)

    val areaUnderROC = metrics.areaUnderROC
    println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)

    val lp = predictions.select("label", "prediction")
    val counttotal = predictions.count()
    val correct = lp.filter($"label" === $"prediction").count()
    val wrong = lp.filter(not($"label" === $"prediction")).count()
    val ratioWrong = wrong.toDouble / counttotal.toDouble
    val ratioCorrect = correct.toDouble / counttotal.toDouble

    val truep = lp.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val truen = lp.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val falsep = lp.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble
    val falsen = lp.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble

    println("Total Count: " + counttotal)
    println("Correct: " + correct)
    println("Wrong: " + wrong)
    println("Ratio wrong: " + ratioWrong)
    println("Ratio correct: " + ratioCorrect)
    println("Ratio true positive: " + truep)
    println("Ratio false positive: " + falsep)
    println("Ratio true negative: " + truen)
    println("Ratio false negative: " + falsen)
  }
}
Example 17
Source File: Describe.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel }
import org.apache.spark.sql.functions.max
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset
import org.apache.spark.ml.linalg.{ Matrix, Vectors }
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row

object Describe {

  case class CustomerAccount(state_code: String, account_length: Integer, area_code: String,
    international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
    total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
    total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
    total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
    total_international_mins: Double, total_international_calls: Double,
    total_international_charge: Double, total_international_num_calls: Double,
    churn: String)

  val schema = StructType(Array(
    StructField("state_code", StringType, true),
    StructField("account_length", IntegerType, true),
    StructField("area_code", StringType, true),
    StructField("international_plan", StringType, true),
    StructField("voice_mail_plan", StringType, true),
    StructField("num_voice_mail", DoubleType, true),
    StructField("total_day_mins", DoubleType, true),
    StructField("total_day_calls", DoubleType, true),
    StructField("total_day_charge", DoubleType, true),
    StructField("total_evening_mins", DoubleType, true),
    StructField("total_evening_calls", DoubleType, true),
    StructField("total_evening_charge", DoubleType, true),
    StructField("total_night_mins", DoubleType, true),
    StructField("total_night_calls", DoubleType, true),
    StructField("total_night_charge", DoubleType, true),
    StructField("total_international_mins", DoubleType, true),
    StructField("total_international_calls", DoubleType, true),
    StructField("total_international_charge", DoubleType, true),
    StructField("total_international_num_calls", DoubleType, true),
    StructField("churn", StringType, true)))

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Describe")
      .getOrCreate()

    spark.conf.set("spark.debug.maxToStringFields", 10000)
    val DEFAULT_MAX_TO_STRING_FIELDS = 2500
    if (SparkEnv.get != null) {
      SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS)
    } else {
      DEFAULT_MAX_TO_STRING_FIELDS
    }
    import spark.implicits._

    val trainSet: Dataset[CustomerAccount] = spark.read
      .option("inferSchema", "false")
      .format("com.databricks.spark.csv")
      .schema(schema)
      .load("data/churn-bigml-80.csv")
      .as[CustomerAccount]

    val statsDF = trainSet.describe()
    statsDF.show()

    trainSet.createOrReplaceTempView("UserAccount")
    spark.catalog.cacheTable("UserAccount")

    spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()
    spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()
    trainSet.groupBy("churn").count.show()
    spark.sqlContext.sql("SELECT churn, SUM(total_international_num_calls) FROM UserAccount GROUP BY churn")
  }
}
Example 18
Source File: LocalLogisticRegressionModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.classification

import java.lang.Boolean

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.{Matrix, SparseMatrix, Vector, Vectors}

class LocalLogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel)
  extends LocalProbabilisticClassificationModel[LogisticRegressionModel] {}

object LocalLogisticRegressionModel
  extends SimpleModelLoader[LogisticRegressionModel]
  with TypedTransformerConverter[LogisticRegressionModel] {

  override def build(metadata: Metadata, data: LocalData): LogisticRegressionModel = {
    val constructor = classOf[LogisticRegressionModel].getDeclaredConstructor(
      classOf[String],
      classOf[Matrix],
      classOf[Vector],
      classOf[Int],
      java.lang.Boolean.TYPE
    )
    constructor.setAccessible(true)
    val coefficientMatrixParams =
      data.column("coefficientMatrix").get.data.head.asInstanceOf[Map[String, Any]]
    val coefficientMatrix = DataUtils.constructMatrix(coefficientMatrixParams)
    val interceptVectorParams =
      data.column("interceptVector").get.data.head.asInstanceOf[Map[String, Any]]
    val interceptVector = DataUtils.constructVector(interceptVectorParams)
    constructor
      .newInstance(
        metadata.uid,
        coefficientMatrix,
        interceptVector,
        data.column("numFeatures").get.data.head.asInstanceOf[java.lang.Integer],
        data.column("isMultinomial").get.data.head.asInstanceOf[java.lang.Boolean]
      )
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])
      .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String])
      .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String])
      .setThreshold(metadata.paramMap("threshold").asInstanceOf[Double])
  }

  override implicit def toLocal(
    transformer: LogisticRegressionModel
  ): LocalTransformer[LogisticRegressionModel] = new LocalLogisticRegressionModel(transformer)
}
Example 19
Source File: LocalLogisticRegressionModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.classification

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.{Matrix, Vector}

class LocalLogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel)
  extends LocalProbabilisticClassificationModel[LogisticRegressionModel] {}

object LocalLogisticRegressionModel
  extends SimpleModelLoader[LogisticRegressionModel]
  with TypedTransformerConverter[LogisticRegressionModel] {

  override def build(metadata: Metadata, data: LocalData): LogisticRegressionModel = {
    val constructor = classOf[LogisticRegressionModel].getDeclaredConstructor(
      classOf[String],
      classOf[Matrix],
      classOf[Vector],
      classOf[Int],
      java.lang.Boolean.TYPE
    )
    constructor.setAccessible(true)
    val coefficientMatrixParams =
      data.column("coefficientMatrix").get.data.head.asInstanceOf[Map[String, Any]]
    val coefficientMatrix = DataUtils.constructMatrix(coefficientMatrixParams)
    val interceptVectorParams =
      data.column("interceptVector").get.data.head.asInstanceOf[Map[String, Any]]
    val interceptVector = DataUtils.constructVector(interceptVectorParams)
    constructor
      .newInstance(
        metadata.uid,
        coefficientMatrix,
        interceptVector,
        data.column("numFeatures").get.data.head.asInstanceOf[java.lang.Integer],
        data.column("isMultinomial").get.data.head.asInstanceOf[java.lang.Boolean]
      )
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])
      .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String])
      .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String])
      .setThreshold(metadata.paramMap("threshold").asInstanceOf[Double])
  }

  override implicit def toLocal(
    transformer: LogisticRegressionModel
  ): LocalTransformer[LogisticRegressionModel] = new LocalLogisticRegressionModel(transformer)
}
Example 20
Source File: LocalLogisticRegressionModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.classification

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.Vector

class LocalLogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel)
  extends LocalProbabilisticClassificationModel[LogisticRegressionModel] {}

object LocalLogisticRegressionModel
  extends SimpleModelLoader[LogisticRegressionModel]
  with TypedTransformerConverter[LogisticRegressionModel] {

  override def build(metadata: Metadata, data: LocalData): LogisticRegressionModel = {
    val constructor = classOf[LogisticRegressionModel].getDeclaredConstructor(
      classOf[String],
      classOf[Vector],
      classOf[Double]
    )
    constructor.setAccessible(true)
    val coefficientsParams =
      data.column("coefficients").get.data.head.asInstanceOf[Map[String, Any]]
    val coefficients = DataUtils.constructVector(coefficientsParams)
    constructor
      .newInstance(
        metadata.uid,
        coefficients,
        data.column("intercept").get.data.head.asInstanceOf[java.lang.Double]
      )
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])
      .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String])
      .setThreshold(metadata.paramMap("threshold").asInstanceOf[Double])
  }

  override implicit def toLocal(
    sparkTransformer: LogisticRegressionModel
  ): LocalLogisticRegressionModel = {
    new LocalLogisticRegressionModel(sparkTransformer)
  }
}