org.apache.spark.ml.PipelineModel Scala Examples
The following examples show how to use org.apache.spark.ml.PipelineModel.
Each example is taken from an open-source project; the source file, project, and license are noted above each listing.
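For orientation before the project-specific listings, here is a minimal, self-contained sketch of the typical PipelineModel workflow: fit a Pipeline to obtain a PipelineModel, persist it, load it back, and use it to transform new data. The object name, toy dataset, and save path below are illustrative assumptions, not taken from any of the projects that follow.

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

object PipelineModelQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pipeline-model-quickstart").getOrCreate()
    import spark.implicits._

    // Toy labeled text data: (id, text, label).
    val training = Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0)
    ).toDF("id", "text", "label")

    // A Pipeline is an Estimator; fitting it yields a PipelineModel, which is a Transformer.
    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features")
    val lr = new LogisticRegression().setMaxIter(10)
    val model: PipelineModel = new Pipeline().setStages(Array(tokenizer, hashingTF, lr)).fit(training)

    // Persist the fitted pipeline and reload it; the path is illustrative.
    model.write.overwrite().save("/tmp/pipeline-model-example")
    val reloaded = PipelineModel.load("/tmp/pipeline-model-example")

    // The fitted stages are exposed through `stages`.
    println(reloaded.stages.map(_.getClass.getSimpleName).mkString(", "))

    reloaded.transform(training).select("id", "prediction").show()
    spark.stop()
  }
}

Most of the examples below rely on exactly these pieces: fitting a Pipeline, saving and loading the resulting PipelineModel, and reaching into its stages array for individual fitted models.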
Example 1
Source File: MultilayerPerceptronClassifierWrapper.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel,
    val labelCount: Long,
    val layers: Array[Int],
    val weights: Array[Double]
  ) extends MLWritable {

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
  }

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper]{

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val labelCount = (rMetadata \ "labelCount").extract[Long]
      val layers = (rMetadata \ "layers").extract[Array[Int]]
      val weights = (rMetadata \ "weights").extract[Array[Double]]

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline, labelCount, layers, weights)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("labelCount" -> instance.labelCount) ~
        ("layers" -> instance.layers.toSeq) ~
        ("weights" -> instance.weights.toArray.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 2
Source File: SparkRWrappers.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.api.r

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.DataFrame

private[r] object SparkRWrappers {
  def fitRModelFormula(
      value: String,
      df: DataFrame,
      family: String,
      lambda: Double,
      alpha: Double): PipelineModel = {
    val formula = new RFormula().setFormula(value)
    val estimator = family match {
      case "gaussian" => new LinearRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
      case "binomial" => new LogisticRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
    }
    val pipeline = new Pipeline().setStages(Array(formula, estimator))
    pipeline.fit(df)
  }

  def getModelWeights(model: PipelineModel): Array[Double] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        Array(m.intercept) ++ m.weights.toArray
      case _: LogisticRegressionModel =>
        throw new UnsupportedOperationException(
          "No weights available for LogisticRegressionModel") // SPARK-9492
    }
  }

  def getModelFeatures(model: PipelineModel): Array[String] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        val attrs = AttributeGroup.fromStructField(
          m.summary.predictions.schema(m.summary.featuresCol))
        Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
      case _: LogisticRegressionModel =>
        throw new UnsupportedOperationException(
          "No features names available for LogisticRegressionModel") // SPARK-9492
    }
  }
}
Example 3
Source File: NerCrfCustomCase.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp.annotators.ner.crf

import com.johnsnowlabs.nlp.annotator.PerceptronModel
import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp.embeddings.WordEmbeddings
import com.johnsnowlabs.nlp.util.io.ResourceHelper
import com.johnsnowlabs.nlp.{DocumentAssembler, Finisher, LightPipeline, RecursivePipeline}
import org.apache.spark.ml.PipelineModel
import org.scalatest._

class NerCrfCustomCase extends FlatSpec {

  val spark = ResourceHelper.spark

  import spark.implicits._

  "NerCRF" should "read low trained model" ignore {

    val documentAssembler = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    val sentenceDetector = new SentenceDetector()
      .setInputCols(Array("document"))
      .setOutputCol("sentence")

    val tokenizer = new Tokenizer()
      .setInputCols(Array("sentence"))
      .setOutputCol("token")

    val pos = PerceptronModel.pretrained()
      .setInputCols("sentence", "token")
      .setOutputCol("pos")

    val embeddings = new WordEmbeddings()
      .setInputCols("pos", "token", "sentence")
      .setOutputCol("embeddings")
      .setStoragePath("./emb.bin", "BINARY")
      .setDimension(200)

    val nerCrf = new NerCrfApproach()
      .setInputCols("pos", "token", "sentence", "embeddings")
      .setOutputCol("ner")
      .setMinEpochs(50)
      .setMaxEpochs(80)
      .setLabelColumn("label")

    val finisher = new Finisher()
      .setInputCols("ner")

    val recursivePipeline = new RecursivePipeline()
      .setStages(Array(
        documentAssembler,
        sentenceDetector,
        tokenizer,
        pos,
        embeddings,
        nerCrf,
        finisher
      ))

    val model = recursivePipeline.fit(Seq.empty[String].toDF("text"))

    model.write.overwrite().save("./crfnerconll")
    model.stages(4).asInstanceOf[NerCrfModel].write.overwrite().save("./crfnerconll-single")
  }

  "NerCRF" should "read and predict" ignore {
    val lp = new LightPipeline(PipelineModel.load("./crfnerconll"))

    println(lp.annotate(
      "Lung, right lower lobe, lobectomy: Grade 3"
    ))
  }
}
Example 4
Source File: RecursiveClasses.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp

import com.johnsnowlabs.nlp.annotators.TokenizerModel
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.Dataset

class SomeApproachTest(override val uid: String)
  extends AnnotatorApproach[SomeModelTest] with HasRecursiveFit[SomeModelTest] {

  override val description: String = "Some Approach"

  override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): SomeModelTest = {
    require(recursivePipeline.isDefined, "RecursiveApproach Did not receive any recursive pipelines")
    require(recursivePipeline.get.stages.length == 2, "RecursiveApproach Did not receive exactly two stages in the recursive pipeline")
    require(recursivePipeline.get.stages.last.isInstanceOf[TokenizerModel], "RecursiveApproach Last stage of recursive pipeline is not the last stage of the recursive pipeline")
    new SomeModelTest()
  }

  override val inputAnnotatorTypes: Array[String] = Array(AnnotatorType.TOKEN)
  override val outputAnnotatorType: AnnotatorType = "BAR"
}

class SomeModelTest(override val uid: String)
  extends AnnotatorModel[SomeModelTest] with HasRecursiveTransform[SomeModelTest] {

  def this() = this("bar_uid")

  override def annotate(annotations: Seq[Annotation], recursivePipeline: PipelineModel): Seq[Annotation] = {
    require(recursivePipeline.stages.length == 2, "RecursiveModel Did not receive exactly two stages in the recursive pipeline")
    require(recursivePipeline.stages.last.isInstanceOf[TokenizerModel], "RecursiveModel Last stage of recursive pipeline is not the last stage of the recursive pipeline")
    Seq.empty
  }

  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    throw new IllegalStateException("SomeModelTest does not have an annotate that works without recursion")
  }

  override val inputAnnotatorTypes: Array[String] = Array(AnnotatorType.TOKEN)
  override val outputAnnotatorType: AnnotatorType = "BAR"
}
Example 5
Source File: ACMEModel.scala From cdsw-simple-serving with Apache License 2.0
// Don't execute these lines in the workbench -- skip to "Start workbench session"
package acme

import org.apache.spark.ml.PipelineModel
import com.cloudera.datascience.cdsw.acme.ACMEData
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import scala.util.Random

// Read and cache training data prepared from acme-dataeng:
val training = ACMEData.readData()
training.cache()
training.show()

// Build a logistic regression model,
val assembler = new VectorAssembler().
  setInputCols(training.columns.filter(_ != "Occupancy")).
  setOutputCol("featureVec")

val lr = new LogisticRegression().
  setFeaturesCol("featureVec").
  setLabelCol("Occupancy").
  setRawPredictionCol("rawPrediction")

val pipeline = new Pipeline().setStages(Array(assembler, lr))

// and tune that model:
val paramGrid = new ParamGridBuilder().
  addGrid(lr.regParam, Seq(0.00001, 0.001, 0.1)).
  addGrid(lr.elasticNetParam, Seq(1.0)).
  build()

val eval = new BinaryClassificationEvaluator().
  setLabelCol("Occupancy").
  setRawPredictionCol("rawPrediction")

val validator = new TrainValidationSplit().
  setSeed(Random.nextLong()).
  setEstimator(pipeline).
  setEvaluator(eval).
  setEstimatorParamMaps(paramGrid).
  setTrainRatio(0.9)

val validatorModel = validator.fit(training)
val pipelineModel = validatorModel.bestModel.asInstanceOf[PipelineModel]
val lrModel = pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]

// Logistic regression model parameters:
training.columns.zip(lrModel.coefficients.toArray).foreach(println)

// Model hyperparameters:
lrModel.getElasticNetParam
lrModel.getRegParam

// Validation metric (accuracy):
validatorModel.validationMetrics.max

pipelineModel
// End workbench session
Example 6
Source File: Featurize.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.featurize

import com.microsoft.ml.spark.core.contracts.Wrappable
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.ml.{Estimator, Pipeline, PipelineModel}
import org.apache.spark.sql._
import org.apache.spark.sql.types._

private[spark] object FeaturizeUtilities {
  // 2^18 features by default
  val NumFeaturesDefault = 262144
  // 2^12 features for tree-based or NN-based learners
  val NumFeaturesTreeOrNNBased = 4096
}

object Featurize extends DefaultParamsReadable[Featurize]

  override def fit(dataset: Dataset[_]): PipelineModel = {
    val pipeline = assembleFeaturesEstimators(getFeatureColumns)
    pipeline.fit(dataset)
  }

  private def assembleFeaturesEstimators(featureColumns: Map[String, Seq[String]]): Pipeline = {
    val assembleFeaturesEstimators = featureColumns.map(newColToFeatures => {
      new AssembleFeatures()
        .setColumnsToFeaturize(newColToFeatures._2.toArray)
        .setFeaturesCol(newColToFeatures._1)
        .setNumberOfFeatures(getNumberOfFeatures)
        .setOneHotEncodeCategoricals(getOneHotEncodeCategoricals)
        .setAllowImages(getAllowImages)
    }).toArray

    new Pipeline().setStages(assembleFeaturesEstimators)
  }

  override def copy(extra: ParamMap): Estimator[PipelineModel] = {
    new Featurize()
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    assembleFeaturesEstimators(getFeatureColumns).transformSchema(schema)
}
Example 7
Source File: MultiColumnAdapterSpec.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.feature.{StringIndexer, Tokenizer}
import org.apache.spark.ml.util.MLReadable

import scala.collection.mutable

class MultiColumnAdapterSpec extends TestBase with EstimatorFuzzing[MultiColumnAdapter] {

  lazy val wordDF = session.createDataFrame(Seq(
    (0, "This is a test", "this is one too"),
    (1, "could be a test", "bar"),
    (2, "foo", "bar"),
    (3, "foo", "maybe not")))
    .toDF("label", "words1", "words2")
  lazy val inputCols = Array[String]("words1", "words2")
  lazy val outputCols = Array[String]("output1", "output2")
  lazy val stage = new StringIndexer()
  lazy val adaptedEstimator = new MultiColumnAdapter().setBaseStage(stage)
    .setInputCols(inputCols).setOutputCols(outputCols)

  test("parallelize transformers") {
    val stage1 = new Tokenizer()
    val transformer = new MultiColumnAdapter().setBaseStage(stage1)
      .setInputCols(inputCols).setOutputCols(outputCols)
    val tokenizedDF = transformer.fit(wordDF).transform(wordDF)
    val lines = tokenizedDF.getColAs[Array[String]]("output2")
    val trueLines = Array(
      Array("this", "is", "one", "too"),
      Array("bar"),
      Array("bar"),
      Array("maybe", "not")
    )
    assert(lines === trueLines)
  }

  test("parallelize estimator") {
    val stringIndexedDF = adaptedEstimator.fit(wordDF).transform(wordDF)
    val lines1 = stringIndexedDF.getColAs[Array[String]]("output1")
    val trueLines1 = mutable.ArraySeq(1, 2, 0, 0)
    assert(lines1 === trueLines1)

    val lines2 = stringIndexedDF.getColAs[Array[String]]("output2")
    val trueLines2 = mutable.ArraySeq(1, 0, 0, 2)
    assert(lines2 === trueLines2)
  }

  def testObjects(): Seq[TestObject[MultiColumnAdapter]] =
    List(new TestObject(adaptedEstimator, wordDF))

  override def reader: MLReadable[_] = MultiColumnAdapter

  override def modelReader: MLReadable[_] = PipelineModel
}
Example 8
Source File: IForestExample.scala From spark-iforest with Apache License 2.0
package org.apache.spark.examples.ml

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.iforest.{IForest, IForestModel}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.{Row, SparkSession}

object IForestExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local") // test in local mode
      .appName("iforest example")
      .getOrCreate()

    val startTime = System.currentTimeMillis()

    // Dataset from https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)
    val dataset = spark.read.option("inferSchema", "true")
      .csv("data/anomaly-detection/breastw.csv")

    // Index label values: 2 -> 0, 4 -> 1
    val indexer = new StringIndexer()
      .setInputCol("_c10")
      .setOutputCol("label")

    val assembler = new VectorAssembler()
    assembler.setInputCols(Array("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7", "_c8", "_c9"))
    assembler.setOutputCol("features")

    val iForest = new IForest()
      .setNumTrees(100)
      .setMaxSamples(256)
      .setContamination(0.35)
      .setBootstrap(false)
      .setMaxDepth(100)
      .setSeed(123456L)

    val pipeline = new Pipeline().setStages(Array(indexer, assembler, iForest))
    val model = pipeline.fit(dataset)
    val predictions = model.transform(dataset)

    // Save pipeline model
    model.write.overwrite().save("/tmp/iforest.model")

    // Load pipeline model
    val loadedPipelineModel = PipelineModel.load("/tmp/iforest.model")
    // Get loaded iforest model
    val loadedIforestModel = loadedPipelineModel.stages(2).asInstanceOf[IForestModel]
    println(s"The loaded iforest model has no summary: model.hasSummary = ${loadedIforestModel.hasSummary}")

    val binaryMetrics = new BinaryClassificationMetrics(
      predictions.select("prediction", "label").rdd.map {
        case Row(label: Double, ground: Double) => (label, ground)
      }
    )

    val endTime = System.currentTimeMillis()
    println(s"Training and predicting time: ${(endTime - startTime) / 1000} seconds.")
    println(s"The model's auc: ${binaryMetrics.areaUnderROC()}")
  }
}
// scalastyle:on println
Example 9
Source File: MultilayerPerceptronClassifierWrapper.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  val weights: Array[Double] = mlpModel.weights.toArray
  val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper]{

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 10
Source File: StringIndexingWrapperModel.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.stringindexingwrapper

import org.apache.spark.ml
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.types.StructType

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.doperables.report.Report
import ai.deepsense.deeplang.doperables.{SparkModelWrapper, Transformer}
import ai.deepsense.deeplang.inference.InferContext
import ai.deepsense.deeplang.params.{Param, ParamMap}

abstract class StringIndexingWrapperModel[M <: ml.Model[M], E <: ml.Estimator[M]](
    private var wrappedModel: SparkModelWrapper[M, E]) extends Transformer {

  private var pipelinedModel: PipelineModel = null

  private[stringindexingwrapper] def setPipelinedModel(
      pipelinedModel: PipelineModel): this.type = {
    this.pipelinedModel = pipelinedModel
    this
  }

  private[stringindexingwrapper] def setWrappedModel(
      wrappedModel: SparkModelWrapper[M, E]): this.type = {
    this.wrappedModel = wrappedModel
    this
  }

  override final def replicate(extra: ParamMap): this.type = {
    val newWrappedModel = wrappedModel.replicate(extra)
    // Assumption - spark objects underhood (and pipeline) remains the same
    super.replicate(extra)
      .setPipelinedModel(pipelinedModel)
      .setWrappedModel(newWrappedModel)
      .asInstanceOf[this.type]
  }

  override protected def applyTransform(ctx: ExecutionContext, df: DataFrame): DataFrame = {
    DataFrame.fromSparkDataFrame(pipelinedModel.transform(df.sparkDataFrame))
  }

  override protected def applyTransformSchema(
      schema: StructType, inferContext: InferContext): Option[StructType] =
    wrappedModel._transformSchema(schema, inferContext)

  override protected def applyTransformSchema(schema: StructType): Option[StructType] =
    wrappedModel._transformSchema(schema)

  override def report(extended: Boolean = true): Report = wrappedModel.report(extended)

  override def params: Array[Param[_]] = wrappedModel.params

  override protected def loadTransformer(ctx: ExecutionContext, path: String): this.type = {
    val pipelineModelPath = Transformer.stringIndexerPipelineFilePath(path)
    val wrappedModelPath = Transformer.stringIndexerWrappedModelFilePath(path)
    val loadedPipelineModel = PipelineModel.load(pipelineModelPath)
    val loadedWrappedModel = Transformer.load(ctx, wrappedModelPath)
    this
      .setPipelinedModel(loadedPipelineModel)
      .setWrappedModel(loadedWrappedModel.asInstanceOf[SparkModelWrapper[M, E]])
      .setParamsFromJson(loadedWrappedModel.paramValuesToJson, ctx.inferContext.graphReader)
  }

  override protected def saveTransformer(ctx: ExecutionContext, path: String): Unit = {
    val pipelineModelPath = Transformer.stringIndexerPipelineFilePath(path)
    val wrappedModelPath = Transformer.stringIndexerWrappedModelFilePath(path)
    pipelinedModel.save(pipelineModelPath)
    wrappedModel.save(ctx, wrappedModelPath)
  }

  private[deeplang] override def paramMap: ParamMap = wrappedModel.paramMap

  private[deeplang] override def defaultParamMap: ParamMap = wrappedModel.defaultParamMap
}
Example 11
Source File: LemmatizerTestSpec.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp._
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.types.ArrayType
import org.apache.spark.sql.{Dataset, Row}
import org.scalatest._

class LemmatizerTestSpec extends FlatSpec with LemmatizerBehaviors {

  require(Some(SparkAccessor).isDefined)

  val lemmatizer = new Lemmatizer
  "a lemmatizer" should s"be of type ${AnnotatorType.TOKEN}" in {
    assert(lemmatizer.outputAnnotatorType == AnnotatorType.TOKEN)
  }

  val latinBodyData: Dataset[Row] = DataBuilder.basicDataBuild(ContentProvider.latinBody)

  "A full Normalizer pipeline with latin content" should behave like fullLemmatizerPipeline(latinBodyData)

  "A lemmatizer" should "be readable and writable" taggedAs Tag("LinuxOnly") in {
    val lemmatizer = new Lemmatizer().setDictionary("src/test/resources/lemma-corpus-small/lemmas_small.txt", "->", "\t")
    val path = "./test-output-tmp/lemmatizer"
    try {
      lemmatizer.write.overwrite.save(path)
      val lemmatizerRead = Lemmatizer.read.load(path)
      assert(lemmatizer.getDictionary.path == lemmatizerRead.getDictionary.path)
    } catch {
      case _: java.io.IOException => succeed
    }
  }

  "A lemmatizer" should "work under a pipeline framework" in {

    val data = ContentProvider.parquetData.limit(1000)

    val documentAssembler = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    val sentenceDetector = new SentenceDetector()
      .setInputCols(Array("document"))
      .setOutputCol("sentence")

    val tokenizer = new Tokenizer()
      .setInputCols(Array("sentence"))
      .setOutputCol("token")

    val lemmatizer = new Lemmatizer()
      .setInputCols(Array("token"))
      .setOutputCol("lemma")
      .setDictionary("src/test/resources/lemma-corpus-small/lemmas_small.txt", "->", "\t")

    val finisher = new Finisher()
      .setInputCols("lemma")

    val pipeline = new Pipeline()
      .setStages(Array(
        documentAssembler,
        sentenceDetector,
        tokenizer,
        lemmatizer,
        finisher
      ))

    val recursivePipeline = new RecursivePipeline()
      .setStages(Array(
        documentAssembler,
        sentenceDetector,
        tokenizer,
        lemmatizer,
        finisher
      ))

    val model = pipeline.fit(data)
    model.transform(data).show()

    val PIPE_PATH = "./tmp_pipeline"
    model.write.overwrite().save(PIPE_PATH)
    val loadedPipeline = PipelineModel.read.load(PIPE_PATH)
    loadedPipeline.transform(data).show

    val recursiveModel = recursivePipeline.fit(data)
    recursiveModel.transform(data).show()

    recursiveModel.write.overwrite().save(PIPE_PATH)
    val loadedRecPipeline = PipelineModel.read.load(PIPE_PATH)
    loadedRecPipeline.transform(data).show

    succeed
  }
}
Example 12
Source File: MultilayerPerceptronClassifierWrapper.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  private val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  lazy val weights: Array[Double] = mlpModel.weights.toArray
  lazy val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper]{

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 13
Source File: SparkRWrappers.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.api.r

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.DataFrame

private[r] object SparkRWrappers {
  def fitRModelFormula(
      value: String,
      df: DataFrame,
      family: String,
      lambda: Double,
      alpha: Double,
      standardize: Boolean,
      solver: String): PipelineModel = {
    val formula = new RFormula().setFormula(value)
    val estimator = family match {
      case "gaussian" => new LinearRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
        .setStandardization(standardize)
        .setSolver(solver)
      case "binomial" => new LogisticRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
        .setStandardization(standardize)
    }
    val pipeline = new Pipeline().setStages(Array(formula, estimator))
    pipeline.fit(df)
  }

  def getModelCoefficients(model: PipelineModel): Array[Double] = {
    model.stages.last match {
      case m: LinearRegressionModel => {
        val coefficientStandardErrorsR = Array(m.summary.coefficientStandardErrors.last) ++
          m.summary.coefficientStandardErrors.dropRight(1)
        val tValuesR = Array(m.summary.tValues.last) ++ m.summary.tValues.dropRight(1)
        val pValuesR = Array(m.summary.pValues.last) ++ m.summary.pValues.dropRight(1)
        if (m.getFitIntercept) {
          Array(m.intercept) ++ m.coefficients.toArray ++ coefficientStandardErrorsR ++
            tValuesR ++ pValuesR
        } else {
          m.coefficients.toArray ++ coefficientStandardErrorsR ++ tValuesR ++ pValuesR
        }
      }
      case m: LogisticRegressionModel => {
        if (m.getFitIntercept) {
          Array(m.intercept) ++ m.coefficients.toArray
        } else {
          m.coefficients.toArray
        }
      }
    }
  }

  def getModelDevianceResiduals(model: PipelineModel): Array[Double] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        m.summary.devianceResiduals
      case m: LogisticRegressionModel =>
        throw new UnsupportedOperationException(
          "No deviance residuals available for LogisticRegressionModel")
    }
  }

  def getModelFeatures(model: PipelineModel): Array[String] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        val attrs = AttributeGroup.fromStructField(
          m.summary.predictions.schema(m.summary.featuresCol))
        if (m.getFitIntercept) {
          Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
        } else {
          attrs.attributes.get.map(_.name.get)
        }
      case m: LogisticRegressionModel =>
        val attrs = AttributeGroup.fromStructField(
          m.summary.predictions.schema(m.summary.featuresCol))
        if (m.getFitIntercept) {
          Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
        } else {
          attrs.attributes.get.map(_.name.get)
        }
    }
  }

  def getModelName(model: PipelineModel): String = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        "LinearRegressionModel"
      case m: LogisticRegressionModel =>
        "LogisticRegressionModel"
    }
  }
}
Example 14
Source File: BaseTransformerConverter.scala From mleap with Apache License 2.0
package org.apache.spark.ml.mleap.converter.runtime

import com.truecar.mleap.runtime.transformer
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.classification.RandomForestClassificationModel
import org.apache.spark.ml.feature.{IndexToString, StandardScalerModel, StringIndexerModel, VectorAssembler}
import org.apache.spark.ml.mleap.classification.SVMModel
import org.apache.spark.ml.mleap.converter.runtime.classification.{RandomForestClassificationModelToMleap, SupportVectorMachineModelToMleap}
import org.apache.spark.ml.mleap.converter.runtime.feature.{IndexToStringToMleap, StandardScalerModelToMleap, StringIndexerModelToMleap, VectorAssemblerModelToMleap}
import org.apache.spark.ml.mleap.converter.runtime.regression.{LinearRegressionModelToMleap, RandomForestRegressionModelToMleap}
import org.apache.spark.ml.regression.{LinearRegressionModel, RandomForestRegressionModel}

trait BaseTransformerConverter extends SparkTransformerConverter {
  // regression
  implicit val mleapLinearRegressionModelToMleap: TransformerToMleap[LinearRegressionModel, transformer.LinearRegressionModel] =
    addConverter(LinearRegressionModelToMleap)
  implicit val mleapRandomForestRegressionModelToMleap: TransformerToMleap[RandomForestRegressionModel, transformer.RandomForestRegressionModel] =
    addConverter(RandomForestRegressionModelToMleap)

  // classification
  implicit val mleapRandomForestClassificationModelToMleap: TransformerToMleap[RandomForestClassificationModel, transformer.RandomForestClassificationModel] =
    addConverter(RandomForestClassificationModelToMleap)
  implicit val mleapSupportVectorMachineModelToMleap: TransformerToMleap[SVMModel, transformer.SupportVectorMachineModel] =
    addConverter(SupportVectorMachineModelToMleap)

  // feature
  implicit val mleapIndexToStringToMleap: TransformerToMleap[IndexToString, transformer.ReverseStringIndexerModel] =
    addConverter(IndexToStringToMleap)
  implicit val mleapStandardScalerModelToMleap: TransformerToMleap[StandardScalerModel, transformer.StandardScalerModel] =
    addConverter(StandardScalerModelToMleap)
  implicit val mleapStringIndexerModelToMleap: TransformerToMleap[StringIndexerModel, transformer.StringIndexerModel] =
    addConverter(StringIndexerModelToMleap)
  implicit val mleapVectorAssemblerToMleap: TransformerToMleap[VectorAssembler, transformer.VectorAssemblerModel] =
    addConverter(VectorAssemblerModelToMleap)

  // other
  implicit val mleapPipelineModelToMleap: TransformerToMleap[PipelineModel, transformer.PipelineModel] =
    addConverter(PipelineModelToMleap(this))
}

object BaseTransformerConverter extends BaseTransformerConverter
Example 15
Source File: XGBoostInference.scala From xgbspark-text-classification with Apache License 2.0
package com.lenovo.ml

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.StructType
import DataPreprocess.segWords
import org.apache.spark.ml.PipelineModel

object XGBoostInference {
  def main(args:Array[String]): Unit = {
    // 1. Create the Spark session entry point
    val sparkSession = SparkSession.builder().appName("XGBoostInference").enableHiveSupport().getOrCreate()

    // 2. Read the training data, preprocess the text and segment it into words
    val tableName = args(0)
    val matrix = sparkSession.sql("SELECT * FROM " + tableName)
    val words = segWords(sparkSession, args(1), args(2), args(3), args(4), matrix.select("text"))

    // 3. Join the original data with the word-segmentation results
    val rows = matrix.rdd.zip(words.rdd).map{
      case (rowLeft, rowRight) => Row.fromSeq(rowLeft.toSeq ++ rowRight.toSeq)
    }
    val schema = StructType(matrix.schema.fields ++ words.schema.fields)
    val matrixMerge = sparkSession.createDataFrame(rows, schema)

    // 4. Build the feature vectors
    val featuredModelTrained = sparkSession.sparkContext.broadcast(PipelineModel.read.load(args(5)))
    val dataPrepared = featuredModelTrained.value.transform(matrixMerge).repartition(18).cache()

    // 5. Load the classification model and produce the failure predictions
    val xgbModelTrained = sparkSession.sparkContext.broadcast(PipelineModel.read.load(args(6)))
    val prediction = xgbModelTrained.value.transform(dataPrepared)

    // 6. Write the predictions to HDFS
    prediction.select("text", "predictedLabel", "probabilities").rdd.coalesce(1).saveAsTextFile(args(7))

    sparkSession.stop()
  }
}
Example 16
Source File: Merge.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml

import com.ibm.aardpfark.pfa.document.Cell
import com.ibm.aardpfark.pfa.expression._
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.spark.ml.PipelineModel

  val first = docs.head
  val last = docs.last

  var name = "merged"
  var version = 0L
  val inputSchema = is
  val outputSchema = last.output
  var meta: Map[String, String] = Map()
  var cells: Map[String, Cell[_]] = Map()
  var action: PFAExpression = StringExpr("input")
  var fcns: Map[String, FunctionDef] = Map()

  var currentSchema = inputSchema

  docs.zipWithIndex.foreach { case (doc, idx) =>
    val inputParam = Param("input", currentSchema)
    val inputFields = currentSchema.getFields.toSeq
    val newFields = doc.output.getFields.toSeq
    val outputFields = inputFields ++ newFields

    val bldr = SchemaBuilder.record(s"Stage_${idx + 1}_output_schema").fields()
    outputFields.foreach { field =>
      bldr
        .name(field.name())
        .`type`(field.schema())
        .noDefault()
    }
    currentSchema = bldr.endRecord()

    val let = Let(s"Stage_${idx + 1}_action_output", Do(doc.action))

    val inputExprs = inputFields.map { field =>
      field.name -> StringExpr(s"input.${field.name}")
    }
    val newExprs = newFields.map { field =>
      field.name -> StringExpr(s"${let.x}.${field.name}")
    }
    val exprs = inputExprs ++ newExprs
    val stageOutput = NewRecord(currentSchema, exprs.toMap)

    val le = new LetExpr(Seq((let.x, let.`type`, let.expr)))
    val stageActionFn = NamedFunctionDef(s"Stage_${idx + 1}_action", FunctionDef(
      Seq(inputParam), currentSchema, Seq(le, stageOutput)
    ))

    fcns = fcns ++ doc.fcns + (stageActionFn.name -> stageActionFn.fn)
    cells = cells ++ doc.cells
    meta = meta ++ doc.metadata
    action = stageActionFn.call(action)
  }

  first.copy(
    name = Some(name),
    version = Some(version),
    metadata = meta,
    cells = cells,
    fcns = fcns,
    action = action,
    input = inputSchema,
    output = currentSchema
  )
  }
}
Example 17
Source File: SparkSupport.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml

import com.ibm.aardpfark.avro.SchemaConverters
import com.ibm.aardpfark.pfa.document.{PFADocument, ToPFA}
import org.apache.avro.SchemaBuilder
import org.apache.spark.ml.{PipelineModel, Transformer}
import org.apache.spark.sql.types.StructType

object SparkSupport {

  def toPFA(t: Transformer, pretty: Boolean): String = {
    toPFATransformer(t).pfa.toJSON(pretty)
  }

  def toPFA(p: PipelineModel, s: StructType, pretty: Boolean): String = {
    val inputFields = s.map { f => f.copy(nullable = false) }
    val inputSchema = StructType(inputFields)
    val pipelineInput = SchemaBuilder.record(s"Input_${p.uid}")
    val inputAvroSchema = SchemaConverters.convertStructToAvro(inputSchema, pipelineInput, "")
    Merge.mergePipeline(p, inputAvroSchema).toJSON(pretty)
  }

  // testing implicit conversions for Spark ML PipelineModel and Transformer to PFA / JSON
  implicit private[aardpfark] def toPFATransformer(transformer: org.apache.spark.ml.Transformer): ToPFA = {
    val pkg = transformer.getClass.getPackage.getName
    val name = transformer.getClass.getSimpleName
    val pfaPkg = pkg.replace("org.apache", "com.ibm.aardpfark")
    val pfaClass = Class.forName(s"$pfaPkg.PFA$name")

    val ctor = pfaClass.getConstructors()(0)
    ctor.newInstance(transformer).asInstanceOf[ToPFA]
  }
}
Example 18
Source File: SparkFeaturePFASuiteBase.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.pfa

import com.opendatagroup.hadrian.jvmcompiler.PFAEngine
import org.json4s.DefaultFormats

import org.apache.spark.ml.{PipelineModel, Transformer}
import org.apache.spark.sql.types.StructType

abstract class SparkPipelinePFASuiteBase[A <: Result](implicit m: Manifest[A])
  extends SparkPredictorPFASuiteBase[A] {
  import com.ibm.aardpfark.spark.ml.SparkSupport._

  protected val schema: StructType

  override protected def transformerToPFA(t: Transformer, pretty: Boolean): String = {
    toPFA(t.asInstanceOf[PipelineModel], schema, pretty)
  }
}

abstract class SparkFeaturePFASuiteBase[A <: Result](implicit m: Manifest[A])
  extends SparkPFASuiteBase {

  implicit val formats = DefaultFormats

  protected var isDebug = false

  import com.ibm.aardpfark.spark.ml.SparkSupport._
  import org.json4s._
  import org.json4s.native.JsonMethods._

  test("PFA transformer produces the same results as Spark transformer") {
    parityTest(sparkTransformer, input, expectedOutput)
  }

  protected def transformerToPFA(t: Transformer, pretty: Boolean): String = {
    toPFA(t, pretty)
  }

  protected def testInputVsExpected(
      engine: PFAEngine[AnyRef, AnyRef],
      input: Array[String],
      expectedOutput: Array[String]) = {
    import ApproxEquality._
    input.zip(expectedOutput).foreach { case (in, out) =>
      val pfaResult = engine.action(engine.jsonInput(in))
      val actual = parse(pfaResult.toString).extract[A]
      val expected = parse(out).extract[A]
      (actual, expected) match {
        case (a: ScalerResult, e: ScalerResult) => assert(a.scaled === e.scaled)
        case (a: Result, e: Result) => assert(a === e)
      }
    }
  }

  def parityTest(
      sparkTransformer: Transformer,
      input: Array[String],
      expectedOutput: Array[String]): Unit = {
    val PFAJson = transformerToPFA(sparkTransformer, pretty = true)
    if (isDebug) {
      println(PFAJson)
    }
    val engine = getPFAEngine(PFAJson)
    testInputVsExpected(engine, input, expectedOutput)
  }
}

case class ScalerResult(scaled: Seq[Double]) extends Result
Example 19
Source File: PipelineExampleTest.scala From apache-spark-test with Apache License 2.0
package com.github.dnvriend.spark.ml

import com.github.dnvriend.TestSpec
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{ HashingTF, Tokenizer }
import org.apache.spark.ml.{ Pipeline, PipelineModel }
import org.apache.spark.sql.Row

class PipelineExampleTest extends TestSpec {

  it should "PipelineExample" in withSparkSession { spark =>
    import spark.implicits._

    // Prepare training documents from a list of (id, text, label) tuples.
    val training = Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    ).toDF("id", "text", "label")

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.01)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training)

    // Now we can optionally save the fitted pipeline to disk
    model.write.overwrite().save("/tmp/spark-logistic-regression-model")

    // We can also save this unfit pipeline to disk
    pipeline.write.overwrite().save("/tmp/unfit-lr-model")

    // And load it back in during production
    val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model")

    // Prepare test documents, which are unlabeled (id, text) tuples.
    val test = Seq(
      (4L, "spark i j k"),
      (5L, "l m n"),
      (6L, "mapreduce spark"),
      (7L, "apache hadoop"),
      (8L, "spark f g h"),
      (9L, "d e f spark a b c"),
      (10L, "spark baz bar a b c"),
      (11L, "foo bar a b c spark"),
      (12L, "a b c scala d e f"),
      (13L, "spark mapreduce")
    ).toDF("id", "text")

    // Make predictions on test documents.
    model.transform(test)
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }
  }
}
Example 20
Source File: StringIndexingWrapperModel.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.stringindexingwrapper

import org.apache.spark.ml
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.types.StructType

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.doperables.report.Report
import io.deepsense.deeplang.doperables.{SparkModelWrapper, Transformer}
import io.deepsense.deeplang.inference.InferContext
import io.deepsense.deeplang.params.{Param, ParamMap}

abstract class StringIndexingWrapperModel[M <: ml.Model[M], E <: ml.Estimator[M]](
    private var wrappedModel: SparkModelWrapper[M, E]) extends Transformer {

  private var pipelinedModel: PipelineModel = null

  private[stringindexingwrapper] def setPipelinedModel(
      pipelinedModel: PipelineModel): this.type = {
    this.pipelinedModel = pipelinedModel
    this
  }

  private[stringindexingwrapper] def setWrappedModel(
      wrappedModel: SparkModelWrapper[M, E]): this.type = {
    this.wrappedModel = wrappedModel
    this
  }

  override final def replicate(extra: ParamMap): this.type = {
    val newWrappedModel = wrappedModel.replicate(extra)
    // Assumption - spark objects underhood (and pipeline) remains the same
    super.replicate(extra)
      .setPipelinedModel(pipelinedModel)
      .setWrappedModel(newWrappedModel)
      .asInstanceOf[this.type]
  }

  override private[deeplang] def _transform(ctx: ExecutionContext, df: DataFrame): DataFrame = {
    DataFrame.fromSparkDataFrame(pipelinedModel.transform(df.sparkDataFrame))
  }

  override private[deeplang] def _transformSchema(
      schema: StructType, inferContext: InferContext): Option[StructType] =
    wrappedModel._transformSchema(schema, inferContext)

  override private[deeplang] def _transformSchema(schema: StructType): Option[StructType] =
    wrappedModel._transformSchema(schema)

  override def report: Report = wrappedModel.report

  override def params: Array[Param[_]] = wrappedModel.params

  override protected def loadTransformer(ctx: ExecutionContext, path: String): this.type = {
    val pipelineModelPath = Transformer.stringIndexerPipelineFilePath(path)
    val wrappedModelPath = Transformer.stringIndexerWrappedModelFilePath(path)
    val loadedPipelineModel = PipelineModel.load(pipelineModelPath)
    val loadedWrappedModel = Transformer.load(ctx, wrappedModelPath)
    this
      .setPipelinedModel(loadedPipelineModel)
      .setWrappedModel(loadedWrappedModel.asInstanceOf[SparkModelWrapper[M, E]])
      .setParamsFromJson(loadedWrappedModel.paramValuesToJson)
  }

  override protected def saveTransformer(ctx: ExecutionContext, path: String): Unit = {
    val pipelineModelPath = Transformer.stringIndexerPipelineFilePath(path)
    val wrappedModelPath = Transformer.stringIndexerWrappedModelFilePath(path)
    pipelinedModel.save(pipelineModelPath)
    wrappedModel.save(ctx, wrappedModelPath)
  }

  private[deeplang] override def paramMap: ParamMap = wrappedModel.paramMap

  private[deeplang] override def defaultParamMap: ParamMap = wrappedModel.defaultParamMap
}
Example 21
Source File: LightPipeline.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp

import org.apache.spark.ml.{PipelineModel, Transformer}
import org.apache.spark.sql.{DataFrame, Dataset}

import scala.collection.JavaConverters._

class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddingsVectors: Boolean = false) {

  private var ignoreUnsupported = false

  def setIgnoreUnsupported(v: Boolean): Unit = ignoreUnsupported = v
  def getIgnoreUnsupported: Boolean = ignoreUnsupported

  def getStages: Array[Transformer] = pipelineModel.stages

  def transform(dataFrame: Dataset[_]): DataFrame = pipelineModel.transform(dataFrame)

  def fullAnnotate(target: String, startWith: Map[String, Seq[Annotation]] = Map.empty[String, Seq[Annotation]]): Map[String, Seq[Annotation]] = {
    getStages.foldLeft(startWith)((annotations, transformer) => {
      transformer match {
        case documentAssembler: DocumentAssembler =>
          annotations.updated(documentAssembler.getOutputCol, documentAssembler.assemble(target, Map.empty[String, String]))
        case lazyAnnotator: AnnotatorModel[_] if lazyAnnotator.getLazyAnnotator => annotations
        case recursiveAnnotator: HasRecursiveTransform[_] with AnnotatorModel[_] =>
          val combinedAnnotations =
            recursiveAnnotator.getInputCols.foldLeft(Seq.empty[Annotation])((inputs, name) => inputs ++ annotations.getOrElse(name, Nil))
          annotations.updated(recursiveAnnotator.getOutputCol, recursiveAnnotator.annotate(combinedAnnotations, pipelineModel))
        case annotator: AnnotatorModel[_] =>
          val combinedAnnotations =
            annotator.getInputCols.foldLeft(Seq.empty[Annotation])((inputs, name) => inputs ++ annotations.getOrElse(name, Nil))
          annotations.updated(annotator.getOutputCol, annotator.annotate(combinedAnnotations))
        case finisher: Finisher =>
          annotations.filterKeys(finisher.getInputCols.contains)
        case rawModel: RawAnnotator[_] =>
          if (ignoreUnsupported) annotations
          else throw new IllegalArgumentException(s"model ${rawModel.uid} does not support LightPipeline." +
            s" Call setIgnoreUnsupported(boolean) on LightPipeline to ignore")
        case pipeline: PipelineModel =>
          new LightPipeline(pipeline, parseEmbeddingsVectors).fullAnnotate(target, annotations)
        case _ => annotations
      }
    })
  }

  def fullAnnotate(targets: Array[String]): Array[Map[String, Seq[Annotation]]] = {
    targets.par.map(target => {
      fullAnnotate(target)
    }).toArray
  }

  def fullAnnotateJava(target: String): java.util.Map[String, java.util.List[JavaAnnotation]] = {
    fullAnnotate(target).mapValues(_.map(aa =>
      JavaAnnotation(aa.annotatorType, aa.begin, aa.end, aa.result, aa.metadata.asJava)).asJava).asJava
  }

  def fullAnnotateJava(targets: java.util.ArrayList[String]): java.util.List[java.util.Map[String, java.util.List[JavaAnnotation]]] = {
    targets.asScala.par.map(target => {
      fullAnnotateJava(target)
    }).toList.asJava
  }

  def annotate(target: String): Map[String, Seq[String]] = {
    fullAnnotate(target).mapValues(_.map(a => {
      a.annotatorType match {
        case (AnnotatorType.WORD_EMBEDDINGS | AnnotatorType.SENTENCE_EMBEDDINGS) if (parseEmbeddingsVectors) =>
          a.embeddings.mkString(" ")
        case _ => a.result
      }
    }))
  }

  def annotate(targets: Array[String]): Array[Map[String, Seq[String]]] = {
    targets.par.map(target => {
      annotate(target)
    }).toArray
  }

  def annotateJava(target: String): java.util.Map[String, java.util.List[String]] = {
    annotate(target).mapValues(_.asJava).asJava
  }

  def annotateJava(targets: java.util.ArrayList[String]): java.util.List[java.util.Map[String, java.util.List[String]]] = {
    targets.asScala.par.map(target => {
      annotateJava(target)
    }).toList.asJava
  }
}
Example 22
Source File: CommonLoaderConversions.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving

import io.hydrosphere.spark_ml_serving.classification._
import io.hydrosphere.spark_ml_serving.clustering._
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.preprocessors._
import io.hydrosphere.spark_ml_serving.regression._
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.classification._
import org.apache.spark.ml.clustering.{GaussianMixtureModel, KMeansModel, LocalLDAModel => SparkLocalLDAModel}
import org.apache.spark.ml.feature._
import org.apache.spark.ml.regression._

object CommonLoaderConversions extends DynamicLoaderConverter {
  implicit def sparkToLocal(m: Any): ModelLoader[_] = {
    m match {
      case _: PipelineModel.type => LocalPipelineModel
      case x: ModelLoader[_] => x
      // Classification models
      case _: DecisionTreeClassificationModel.type => LocalDecisionTreeClassificationModel
      case _: MultilayerPerceptronClassificationModel.type => LocalMultilayerPerceptronClassificationModel
      case _: NaiveBayesModel.type => LocalNaiveBayes
      case _: RandomForestClassificationModel.type => LocalRandomForestClassificationModel
      case _: GBTClassificationModel.type => LocalGBTClassificationModel
      // Clustering models
      case _: GaussianMixtureModel.type => LocalGaussianMixtureModel
      case _: KMeansModel.type => LocalKMeansModel
      case _: SparkLocalLDAModel.type => LocalLDAModel
      // Preprocessing
      case _: Binarizer.type => LocalBinarizer
      case _: CountVectorizerModel.type => LocalCountVectorizerModel
      case _: DCT.type => LocalDCT
      case _: HashingTF.type => LocalHashingTF
      case _: IndexToString.type => LocalIndexToString
      case _: MaxAbsScalerModel.type => LocalMaxAbsScalerModel
      case _: MinMaxScalerModel.type => LocalMinMaxScalerModel
      case _: NGram.type => LocalNGram
      case _: Normalizer.type => LocalNormalizer
      case _: OneHotEncoder.type => LocalOneHotEncoder
      case _: PCAModel.type => LocalPCAModel
      case _: PolynomialExpansion.type => LocalPolynomialExpansion
      case _: StandardScalerModel.type => LocalStandardScalerModel
      case _: StopWordsRemover.type => LocalStopWordsRemover
      case _: StringIndexerModel.type => LocalStringIndexerModel
      case _: Tokenizer.type => LocalTokenizer
      case _: VectorIndexerModel.type => LocalVectorIndexerModel
      case _: IDFModel.type => LocalIDF
      case _: ChiSqSelectorModel.type => LocalChiSqSelectorModel
      case _: RegexTokenizer.type => LocalRegexTokenizer
      case _: VectorAssembler.type => LocalVectorAssembler
      // Regression
      case _: DecisionTreeRegressionModel.type => LocalDecisionTreeRegressionModel
      case _: LinearRegressionModel.type => LocalLinearRegressionModel
      case _: RandomForestRegressionModel.type => LocalRandomForestRegressionModel
      case _: GBTRegressionModel.type => LocalGBTRegressor
      case x => SpecificLoaderConversions.sparkToLocal(x)
    }
  }
}
Example 23
Source File: LocalPipelineModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving

import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.{PipelineModel, Transformer}
import io.hydrosphere.spark_ml_serving.common.utils.PumpedClass

class LocalPipelineModel(override val sparkTransformer: PipelineModel) extends LocalTransformer[PipelineModel] {

  def transform(localData: LocalData): LocalData = {
    import CommonTransormerConversions._

    sparkTransformer.stages.foldLeft(localData) {
      case (data, transformer) => transformer.transform(data)
    }
  }
}

object LocalPipelineModel
  extends ModelLoader[PipelineModel]
  with TypedTransformerConverter[PipelineModel] {

  import CommonLoaderConversions._

  def getStages(pipelineParameters: Metadata, source: ModelSource): Array[Transformer] = {
    pipelineParameters.paramMap("stageUids").asInstanceOf[List[String]].zipWithIndex.toArray.map {
      case (uid: String, index: Int) =>
        val currentStage = s"stages/${index}_$uid"
        val modelMetadata = source.readFile(s"$currentStage/metadata/part-00000")
        val stageParameters = Metadata.fromJson(modelMetadata)
        val companion = PumpedClass.companionFromClassName(stageParameters.`class`)
        companion.load(s"${source.root}/$currentStage").asInstanceOf[Transformer]
    }
  }

  override def load(source: ModelSource): PipelineModel = {
    val metadata = source.readFile("metadata/part-00000")
    val pipelineParameters = Metadata.fromJson(metadata)
    val stages: Array[Transformer] = getStages(pipelineParameters, source)

    val cstr = classOf[PipelineModel].getDeclaredConstructor(
      classOf[String],
      classOf[Array[Transformer]]
    )
    cstr.setAccessible(true)
    cstr
      .newInstance(
        pipelineParameters.uid,
        stages
      )
  }

  implicit def toLocal(sparkTransformer: PipelineModel) = new LocalPipelineModel(sparkTransformer)
}
Example 24
Source File: CommonTransormerConversions.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving

import io.hydrosphere.spark_ml_serving.classification._
import io.hydrosphere.spark_ml_serving.clustering._
import io.hydrosphere.spark_ml_serving.common.LocalTransformer
import io.hydrosphere.spark_ml_serving.preprocessors._
import io.hydrosphere.spark_ml_serving.regression._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.clustering.{GaussianMixtureModel, KMeansModel, LocalLDAModel => SparkLocalLDAModel}
import org.apache.spark.ml.feature._
import org.apache.spark.ml.regression._
import org.apache.spark.ml.{PipelineModel, Transformer}

object CommonTransormerConversions extends DynamicTransformerConverter {

  implicit def transformerToLocal(transformer: Transformer): LocalTransformer[_] = {
    transformer match {
      case x: PipelineModel => new LocalPipelineModel(x)
      // Classification models
      case x: DecisionTreeClassificationModel => new LocalDecisionTreeClassificationModel(x)
      case x: MultilayerPerceptronClassificationModel => new LocalMultilayerPerceptronClassificationModel(x)
      case x: NaiveBayesModel => new LocalNaiveBayes(x)
      case x: RandomForestClassificationModel => new LocalRandomForestClassificationModel(x)
      case x: GBTClassificationModel => new LocalGBTClassificationModel(x)
      // Clustering models
      case x: GaussianMixtureModel => new LocalGaussianMixtureModel(x)
      case x: KMeansModel => new LocalKMeansModel(x)
      case x: SparkLocalLDAModel => new LocalLDAModel(x)
      // Preprocessing
      case x: Binarizer => new LocalBinarizer(x)
      case x: CountVectorizerModel => new LocalCountVectorizerModel(x)
      case x: DCT => new LocalDCT(x)
      case x: HashingTF => new LocalHashingTF(x)
      case x: IndexToString => new LocalIndexToString(x)
      case x: MaxAbsScalerModel => new LocalMaxAbsScalerModel(x)
      case x: MinMaxScalerModel => new LocalMinMaxScalerModel(x)
      case x: NGram => new LocalNGram(x)
      case x: Normalizer => new LocalNormalizer(x)
      case x: OneHotEncoder => new LocalOneHotEncoder(x)
      case x: PCAModel => new LocalPCAModel(x)
      case x: PolynomialExpansion => new LocalPolynomialExpansion(x)
      case x: StandardScalerModel => new LocalStandardScalerModel(x)
      case x: StopWordsRemover => new LocalStopWordsRemover(x)
      case x: StringIndexerModel => new LocalStringIndexerModel(x)
      case x: Tokenizer => new LocalTokenizer(x)
      case x: VectorIndexerModel => new LocalVectorIndexerModel(x)
      case x: IDFModel => new LocalIDF(x)
      case x: ChiSqSelectorModel => new LocalChiSqSelectorModel(x)
      case x: RegexTokenizer => new LocalRegexTokenizer(x)
      case x: VectorAssembler => new LocalVectorAssembler(x)
      // Regression
      case x: DecisionTreeRegressionModel => new LocalDecisionTreeRegressionModel(x)
      case x: LinearRegressionModel => new LocalLinearRegressionModel(x)
      case x: RandomForestRegressionModel => new LocalRandomForestRegressionModel(x)
      case x: GBTRegressionModel => new LocalGBTRegressor(x)
      case x => SpecificTransformerConversions.transformerToLocal(x)
    }
  }
}
Example 25
Source File: PipelineWrapper.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature

import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.sql.{DataFrame, Dataset}

class PipelineWrapper() {

  var pipeline = new Pipeline()

  var transformers: Array[TransformerWrapper] = Array()

  def setTransformers(value: Array[TransformerWrapper]): this.type = {
    transformers = value
    setStages(PipelineBuilder.build(transformers))
    this
  }

  def setStages(value: Array[_ <: PipelineStage]): Unit = {
    pipeline = pipeline.setStages(value)
  }

  def fit(dataset: Dataset[_]): PipelineModelWrapper = {
    new PipelineModelWrapper(pipeline.fit(dataset), transformers)
  }
}

class PipelineModelWrapper(val model: PipelineModel,
                           val transformers: Array[TransformerWrapper]) {

  def transform(dataset: Dataset[_]): DataFrame = {
    var df = model.transform(dataset)
    if (transformers.length >= 2) {
      (0 until transformers.length - 1).foreach { i =>
        val outCols = transformers(i).getOutputCols
        for (col <- outCols) {
          df = df.drop(col)
        }
      }
    }
    df
  }
}
Example 26
Source File: PipelineOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.serializer.GraphSerializer
import ml.combust.bundle.dsl._
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.{PipelineModel, Transformer}

class PipelineOp extends SimpleSparkOp[PipelineModel] {

  override val Model: OpModel[SparkBundleContext, PipelineModel] = new OpModel[SparkBundleContext, PipelineModel] {
    override val klazz: Class[PipelineModel] = classOf[PipelineModel]

    override def opName: String = Bundle.BuiltinOps.pipeline

    override def store(model: Model, obj: PipelineModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val nodes = GraphSerializer(context).write(obj.stages).get
      model.withValue("nodes", Value.stringList(nodes))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): PipelineModel = {
      val nodes = GraphSerializer(context).read(model.value("nodes").getStringList).
        map(_.map(_.asInstanceOf[Transformer])).get.toArray
      new PipelineModel(uid = "", stages = nodes)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: PipelineModel): PipelineModel = {
    new PipelineModel(uid = uid, stages = model.stages)
  }

  override def sparkInputs(obj: PipelineModel): Seq[ParamSpec] = Seq()

  override def sparkOutputs(obj: PipelineModel): Seq[SimpleParamSpec] = Seq()

  override def load(node: Node, model: PipelineModel)(implicit context: BundleContext[SparkBundleContext]): PipelineModel = {
    new PipelineModel(uid = node.name, stages = model.stages)
  }
}
Example 27
Source File: TextClassificationPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.textclassifier

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
import org.utils.StandaloneSpark

object TextClassificationPipeline {

  def main(args: Array[String]): Unit = {
    val spark = StandaloneSpark.getSparkInstance()

    // Prepare training documents from a list of (id, text, label) tuples.
    val training = spark.createDataFrame(Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    )).toDF("id", "text", "label")

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training)

    // Now we can optionally save the fitted pipeline to disk
    model.write.overwrite().save("/tmp/spark-logistic-regression-model")

    // We can also save this unfit pipeline to disk
    pipeline.write.overwrite().save("/tmp/unfit-lr-model")

    // And load it back in during production
    val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model")

    // Prepare test documents, which are unlabeled (id, text) tuples.
    val test = spark.createDataFrame(Seq(
      (4L, "spark i j k"),
      (5L, "l m n"),
      (6L, "spark hadoop spark"),
      (7L, "apache hadoop")
    )).toDF("id", "text")

    // Make predictions on test documents.
    model.transform(test)
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }
  }
}
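As the comments above note, both the unfit Pipeline and the fitted PipelineModel are written to disk. For completeness, a small sketch of reading them back in a separate job, assuming the two /tmp paths from the example already exist; the object name is illustrative.

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.SparkSession

object ReloadPipelinesSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("reload-pipelines").getOrCreate()

    // Counterparts to the save calls above; the unfit Pipeline can be refit on
    // new data, while the fitted PipelineModel can transform immediately.
    val unfit = Pipeline.load("/tmp/unfit-lr-model")
    val fitted = PipelineModel.load("/tmp/spark-logistic-regression-model")

    println(s"unfit stages: ${unfit.getStages.length}, fitted stages: ${fitted.stages.length}")
    spark.stop()
  }
}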
Example 28
Source File: MultilayerPerceptronClassifierWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  val weights: Array[Double] = mlpModel.weights.toArray
  val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 29
Source File: PredictNewsClassDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.mining

import algorithms.evaluation.MultiClassEvaluation
import config.paramconf.ClassParams
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.{Row, SparkSession}

object PredictNewsClassDemo extends Serializable {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("predict news multi class demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/predict", "lr")
    val filePath = args(0)
    val modelType = args(1)

    var modelPath = ""
    val params = new ClassParams

    modelType match {
      case "lr" => modelPath = params.LRModelPath
      case "dt" => modelPath = params.DTModelPath
      case _ =>
        println("Invalid model type!")
        System.exit(1)
    }

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    // Load the model and transform the data
    val model = PipelineModel.load(modelPath)
    val predictions = model.transform(data)

    // === Model evaluation
    val resultRDD = predictions.select("prediction", "indexedLabel").rdd.map {
      case Row(prediction: Double, label: Double) => (prediction, label)
    }
    val (precision, recall, f1) = MultiClassEvaluation.multiClassEvaluate(resultRDD)
    println("\n\n========= Evaluation results =========")
    println(s"\nWeighted precision: $precision")
    println(s"Weighted recall: $recall")
    println(s"F1 score: $f1")

    // predictions.select("label", "predictedLabel", "content").show(100, truncate = false)
    data.unpersist()
    spark.stop()
  }
}
Example 30
Source File: recursive.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp

import org.apache.spark.ml.{Pipeline, PipelineModel}

package object recursive {

  implicit def p2recursive(pipeline: Pipeline): RecursivePipeline =
    new RecursivePipeline(pipeline)

  implicit def pm2recursive(pipelineModel: PipelineModel): RecursivePipelineModel =
    new RecursivePipelineModel(pipelineModel.uid, pipelineModel)

  implicit def pm2light(pipelineModel: PipelineModel): LightPipeline =
    new LightPipeline(pipelineModel)

  implicit class Recursive(p: Pipeline) {
    def recursive: RecursivePipeline = {
      new RecursivePipeline(p)
    }
  }

  implicit class RecursiveModel(p: PipelineModel) {
    def recursive: RecursivePipelineModel = {
      new RecursivePipelineModel(p.uid, p)
    }
  }
}
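The package object above exists so callers can switch to the recursive variants either through an implicit conversion or through the `.recursive` extension method. A sketch of that syntax, assuming spark-nlp is on the classpath and `training` is a DataFrame with a "text" column; the object and method names below are illustrative.

import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.recursive._ // p2recursive, pm2recursive, pm2light, Recursive, RecursiveModel
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.DataFrame

object RecursiveSyntaxSketch {
  def build(training: DataFrame): Unit = {
    val documentAssembler = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    val pipeline = new Pipeline().setStages(Array(documentAssembler))

    val recursivePipeline = pipeline.recursive   // via the Recursive implicit class
    val fitted = pipeline.fit(training)
    val recursiveModel = fitted.recursive        // via the RecursiveModel implicit class
    val light = pm2light(fitted)                 // explicit use of the PipelineModel -> LightPipeline conversion

    println(s"${recursivePipeline.uid} ${recursiveModel.uid} ${light.getClass.getName}")
  }
}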
Example 31
Source File: LanguageDetectorDLTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.ld.dl

import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.util.io.ResourceHelper
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.scalatest._

class LanguageDetectorDLTestSpec extends FlatSpec {

  "LanguageDetectorDL" should "correctly load saved model" in {

    val smallCorpus = ResourceHelper.spark.read
      .option("header", true)
      .option("delimiter", "|")
      .csv("src/test/resources/language-detector/multilingual_sample.txt")

    val documentAssembler = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    val sentence = new SentenceDetector()
      .setInputCols(Array("document"))
      .setOutputCol("sentence")

    val languageDetector = LanguageDetectorDL.pretrained("ld_wiki_20")
      .setInputCols("sentence")
      .setOutputCol("language")
      .setThreshold(0.3f)
      .setCoalesceSentences(true)

    val pipeline = new Pipeline()
      .setStages(Array(
        documentAssembler,
        sentence,
        languageDetector
      ))

    val pipelineDF = pipeline.fit(smallCorpus).transform(smallCorpus)

    println(pipelineDF.count())
    smallCorpus.show(2)
    pipelineDF.show(2)
    pipelineDF.select("sentence").show(4, false)
    pipelineDF.select("language.metadata").show(20, false)
    pipelineDF.select("language.result", "lang").show(20, false)

    pipeline.fit(smallCorpus).write.overwrite().save("./tmp_ld_pipeline")
    val pipelineModel = PipelineModel.load("./tmp_ld_pipeline")
    pipelineModel.transform(smallCorpus).select("language.result", "lang").show(20, false)
  }
}
Example 32
Source File: PretrainedPipeline.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.pretrained

import com.johnsnowlabs.nlp.LightPipeline
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.DataFrame

case class PretrainedPipeline(
    downloadName: String,
    lang: String = "en",
    source: String = ResourceDownloader.publicLoc,
    parseEmbeddingsVectors: Boolean = false,
    diskLocation: Option[String] = None
  ) {

  def this(downloadName: String) {
    this(downloadName, "en", ResourceDownloader.publicLoc)
  }

  def this(downloadName: String, lang: String) {
    this(downloadName, lang, ResourceDownloader.publicLoc)
  }

  val model: PipelineModel = if (diskLocation.isEmpty) {
    ResourceDownloader
      .downloadPipeline(downloadName, Option(lang), source)
  } else {
    PipelineModel.load(diskLocation.get)
  }

  lazy val lightModel = new LightPipeline(model, parseEmbeddingsVectors)

  def annotate(dataset: DataFrame, inputColumn: String): DataFrame = {
    model
      .transform(dataset.withColumnRenamed(inputColumn, "text"))
  }

  def annotate(target: String): Map[String, Seq[String]] = lightModel.annotate(target)

  def annotate(target: Array[String]): Array[Map[String, Seq[String]]] = lightModel.annotate(target)

  def transform(dataFrame: DataFrame): DataFrame = model.transform(dataFrame)
}

object PretrainedPipeline {

  def fromDisk(path: String, parseEmbeddings: Boolean = false): PretrainedPipeline = {
    PretrainedPipeline(null, null, null, parseEmbeddings, Some(path))
  }

  def fromDisk(path: String): PretrainedPipeline = {
    fromDisk(path, false)
  }
}
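For completeness, a usage sketch of the fromDisk factory and the String-based annotate method defined above. The path and object name are illustrative, and a SparkSession must already be active so that PipelineModel.load can run.

import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline

object PretrainedPipelineSketch {
  def main(args: Array[String]): Unit = {
    // Loads a pipeline previously saved with PipelineModel.write; path is a placeholder.
    val pipeline = PretrainedPipeline.fromDisk("/tmp/my_saved_pipeline")

    // LightPipeline-backed annotation of a single string, as defined above.
    val annotations: Map[String, Seq[String]] = pipeline.annotate("Spark NLP makes pipelines easy")
    annotations.foreach { case (column, values) => println(s"$column -> ${values.mkString(", ")}") }
  }
}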
Example 33
Source File: AnnotatorApproach.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp

import com.johnsnowlabs.storage.HasStorage
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.{Estimator, Model, PipelineModel, Transformer}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.types.{ArrayType, MetadataBuilder, StructField, StructType}
import org.apache.spark.ml.util.DefaultParamsWritable

  override final def transformSchema(schema: StructType): StructType = {
    require(validate(schema), s"Wrong or missing inputCols annotators in $uid.\n" +
      msgHelper(schema) +
      s"\nMake sure such annotators exist in your pipeline, " +
      s"with the right output names and that they have following annotator types: " +
      s"${inputAnnotatorTypes.mkString(", ")}")
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", outputAnnotatorType)
    val outputFields = schema.fields :+
      StructField(getOutputCol, ArrayType(Annotation.dataType), nullable = false, metadataBuilder.build)
    StructType(outputFields)
  }
}
Example 34
Source File: RecursivePipeline.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp

import org.apache.spark.internal.Logging
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset}

import scala.collection.mutable.ListBuffer

class RecursivePipeline(override val uid: String, baseStages: Array[PipelineStage]) extends Pipeline {

  def this() = this(Identifiable.randomUID("RECURSIVE_PIPELINE"), Array.empty)

  def this(uid: String) = this(uid, Array.empty)

  def this(pipeline: Pipeline) = this(pipeline.uid, pipeline.getStages)

  this.setStages(baseStages)

  override def fit(dataset: Dataset[_]): PipelineModel = {
    transformSchema(dataset.schema, logging = true)
    val theStages = $(stages)
    var indexOfLastEstimator = -1
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      stage match {
        case _: Estimator[_] =>
          indexOfLastEstimator = index
        case _ =>
      }
    }
    var curDataset = dataset
    val transformers = ListBuffer.empty[Transformer]
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      if (index <= indexOfLastEstimator) {
        val transformer = stage match {
          case estimator: HasRecursiveFit[_] =>
            estimator.recursiveFit(curDataset, new Pipeline(uid).setStages(transformers.toArray).fit(dataset))
          case estimator: Estimator[_] =>
            estimator.fit(curDataset)
          case t: Transformer =>
            t
          case _ =>
            throw new IllegalArgumentException(
              s"Does not support stage $stage of type ${stage.getClass}")
        }
        if (index < indexOfLastEstimator) {
          curDataset = transformer.transform(curDataset)
        }
        transformers += transformer
      } else {
        transformers += stage.asInstanceOf[Transformer]
      }
    }

    createPipeline(dataset, transformers.toArray)
  }
}

class RecursivePipelineModel(override val uid: String, innerPipeline: PipelineModel)
  extends Model[RecursivePipelineModel] with MLWritable with Logging {

  def this(pipeline: PipelineModel) = this(pipeline.uid, pipeline)

  // drops right at most because is itself included
  private def createRecursiveAnnotators(dataset: Dataset[_]): PipelineModel =
    new Pipeline(uid).setStages(innerPipeline.stages.dropRight(1)).fit(dataset)

  override def copy(extra: ParamMap): RecursivePipelineModel = {
    new RecursivePipelineModel(uid, innerPipeline.copy(extra))
  }

  override def write: MLWriter = {
    innerPipeline.write
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    innerPipeline.stages.foldLeft(dataset.toDF)((cur, transformer) => transformer match {
      case t: HasRecursiveTransform[_] => t.recursiveTransform(cur, createRecursiveAnnotators(dataset))
      case t: AnnotatorModel[_] if t.getLazyAnnotator => cur
      case t: Transformer => t.transform(cur)
    })
  }

  override def transformSchema(schema: StructType): StructType = {
    innerPipeline.transformSchema(schema)
  }
}
Example 35
Source File: BigTextMatcher.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.btm

import com.johnsnowlabs.collections.StorageSearchTrie
import com.johnsnowlabs.nlp.AnnotatorType.{TOKEN, DOCUMENT, CHUNK}
import com.johnsnowlabs.nlp.annotators.TokenizerModel
import com.johnsnowlabs.nlp.serialization.StructFeature
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import com.johnsnowlabs.nlp.AnnotatorApproach
import com.johnsnowlabs.storage.Database.Name
import com.johnsnowlabs.storage.{Database, HasStorage, RocksDBConnection, StorageWriter}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.Dataset

class BigTextMatcher(override val uid: String) extends AnnotatorApproach[BigTextMatcherModel] with HasStorage {

  def this() = this(Identifiable.randomUID("ENTITY_EXTRACTOR"))

  override val inputAnnotatorTypes = Array(DOCUMENT, TOKEN)

  override val outputAnnotatorType: AnnotatorType = CHUNK

  override val description: String = "Extracts entities from target dataset given in a text file"

  val mergeOverlapping = new BooleanParam(this, "mergeOverlapping",
    "whether to merge overlapping matched chunks. Defaults false")

  val tokenizer = new StructFeature[TokenizerModel](this, "tokenizer")

  setDefault(inputCols, Array(TOKEN))
  setDefault(caseSensitive, true)
  setDefault(mergeOverlapping, false)

  def setTokenizer(tokenizer: TokenizerModel): this.type = set(this.tokenizer, tokenizer)

  def getTokenizer: TokenizerModel = $$(tokenizer)

  def setMergeOverlapping(v: Boolean): this.type = set(mergeOverlapping, v)

  def getMergeOverlapping: Boolean = $(mergeOverlapping)

  private def loadEntities(path: String, writers: Map[Database.Name, StorageWriter[_]]): Unit = {
    val inputFiles: Seq[Iterator[String]] =
      ResourceHelper.parseLinesIterator(ExternalResource(path, ReadAs.TEXT, Map()))
    inputFiles.foreach { inputFile => {
      StorageSearchTrie.load(inputFile, writers, get(tokenizer))
    }}
  }

  override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): BigTextMatcherModel = {
    new BigTextMatcherModel()
      .setInputCols($(inputCols))
      .setOutputCol($(outputCol))
      .setCaseSensitive($(caseSensitive))
      .setStorageRef($(storageRef))
      .setMergeOverlapping($(mergeOverlapping))
  }

  override protected def createWriter(database: Name, connection: RocksDBConnection): StorageWriter[_] = {
    database match {
      case Database.TMVOCAB => new TMVocabReadWriter(connection, $(caseSensitive))
      case Database.TMEDGES => new TMEdgesReadWriter(connection, $(caseSensitive))
      case Database.TMNODES => new TMNodesWriter(connection)
    }
  }

  override protected def index(
                                fitDataset: Dataset[_],
                                storageSourcePath: Option[String],
                                readAs: Option[ReadAs.Value],
                                writers: Map[Database.Name, StorageWriter[_]],
                                readOptions: Option[Map[String, String]]
                              ): Unit = {
    require(readAs.get == ReadAs.TEXT, "BigTextMatcher only supports TEXT input formats at the moment.")
    loadEntities(storageSourcePath.get, writers)
  }

  override protected val databases: Array[Name] = BigTextMatcherModel.databases
}

object BigTextMatcher extends DefaultParamsReadable[BigTextMatcher]
Example 36
Source File: ChunkTokenizer.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, TOKEN}
import com.johnsnowlabs.nlp.util.io.ResourceHelper
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.Dataset

  override val outputAnnotatorType: AnnotatorType = TOKEN

  override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): TokenizerModel = {
    val ruleFactory = buildRuleFactory

    val processedExceptions = get(exceptionsPath)
      .map(er => ResourceHelper.parseLines(er))
      .getOrElse(Array.empty[String]) ++ get(exceptions).getOrElse(Array.empty[String])

    val raw = new ChunkTokenizerModel()
      .setCaseSensitiveExceptions($(caseSensitiveExceptions))
      .setTargetPattern($(targetPattern))
      .setRules(ruleFactory)

    if (processedExceptions.nonEmpty)
      raw.setExceptions(processedExceptions)
    else
      raw
  }
}

object ChunkTokenizer extends DefaultParamsReadable[ChunkTokenizer]
Example 37
Source File: CoNLLGenerator.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.util

import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

import scala.collection.mutable.ArrayBuffer
import scala.util.Try

object CoNLLGenerator {

  def exportConllFiles(spark: SparkSession, filesPath: String, pipelineModel: PipelineModel, outputPath: String): Unit = {
    import spark.implicits._ // for toDS and toDF
    val data = spark.sparkContext.wholeTextFiles(filesPath).toDS.toDF("filename", "text")
    exportConllFiles(data, pipelineModel, outputPath)
  }

  def exportConllFiles(spark: SparkSession, filesPath: String, pipelinePath: String, outputPath: String): Unit = {
    val model = PipelineModel.load(pipelinePath)
    exportConllFiles(spark, filesPath, model, outputPath)
  }

  def exportConllFiles(data: DataFrame, pipelineModel: PipelineModel, outputPath: String): Unit = {
    val POSdataset = pipelineModel.transform(data)
    exportConllFiles(POSdataset, outputPath)
  }

  def exportConllFiles(data: DataFrame, pipelinePath: String, outputPath: String): Unit = {
    val model = PipelineModel.load(pipelinePath)
    exportConllFiles(data, model, outputPath)
  }

  def exportConllFiles(data: DataFrame, outputPath: String): Unit = {
    import data.sparkSession.implicits._ // for udf
    var dfWithNER = data
    // if data does not contain ner column, add "O" as default
    if (Try(data("finished_ner")).isFailure) {
      def OArray = (len: Int) => { // create array of $len "O"s
        var z = new Array[String](len)
        for (i <- 0 until z.length) {
          z(i) = "O"
        }
        z
      }
      val makeOArray = data.sparkSession.udf.register("finished_pos", OArray)
      dfWithNER = data.withColumn("finished_ner", makeOArray(size(col("finished_pos"))))
    }

    val newPOSDataset = dfWithNER.select("finished_token", "finished_pos", "finished_token_metadata", "finished_ner").
      as[(Array[String], Array[String], Array[(String, String)], Array[String])]
    val CoNLLDataset = makeConLLFormat(newPOSDataset)
    CoNLLDataset.coalesce(1).write.format("com.databricks.spark.csv").
      options(scala.collection.Map("delimiter" -> " ", "emptyValue" -> "")).
      save(outputPath)
  }

  def makeConLLFormat(newPOSDataset: Dataset[(Array[String], Array[String], Array[(String, String)], Array[String])]) = {
    import newPOSDataset.sparkSession.implicits._ // for row casting
    newPOSDataset.flatMap(row => {
      val newColumns: ArrayBuffer[(String, String, String, String)] = ArrayBuffer()
      val columns = ((row._1 zip row._2), row._3.map(_._2.toInt), row._4).zipped.map { case (a, b, c) => (a._1, a._2, b, c) }
      var sentenceId = 1
      newColumns.append(("", "", "", ""))
      newColumns.append(("-DOCSTART-", "-X-", "-X-", "O"))
      newColumns.append(("", "", "", ""))
      columns.foreach(a => {
        if (a._3 != sentenceId) {
          newColumns.append(("", "", "", ""))
          sentenceId = a._3
        }
        newColumns.append((a._1, a._2, a._2, a._4))
      })
      newColumns
    })
  }
}
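A sketch of the most common call path above: load a fitted PipelineModel and export CoNLL files from a directory of raw text. The paths and object name are placeholders, and the pipeline is assumed to produce the finished_token / finished_pos (and optionally finished_ner) columns that exportConllFiles expects, typically via a Finisher stage.

import com.johnsnowlabs.util.CoNLLGenerator
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.SparkSession

object ExportConllSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("conll-export").getOrCreate()

    // Illustrative paths: a saved POS pipeline, a directory of raw text files,
    // and an output directory for the generated CoNLL file.
    val pipeline = PipelineModel.load("/tmp/pos_pipeline")
    CoNLLGenerator.exportConllFiles(spark, "/data/raw_texts/*.txt", pipeline, "/data/conll_out")

    spark.stop()
  }
}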
Example 38
Source File: NerHelper.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.benchmarks.spark

import java.io.{BufferedWriter, File, FileWriter}

import com.johnsnowlabs.nlp.annotators.common.NerTagged
import com.johnsnowlabs.nlp.training.CoNLL
import com.johnsnowlabs.nlp.{Annotation, SparkAccessor}
import com.johnsnowlabs.nlp.util.io.ExternalResource
import org.apache.spark.ml.PipelineModel

import scala.collection.mutable

object NerHelper {

  def saveNerSpanTags(annotations: Array[Array[Annotation]], file: String): Unit = {
    val bw = new BufferedWriter(new FileWriter(new File(file)))

    bw.write(s"start\tend\ttag\ttext\n")
    for (i <- 0 until annotations.length) {
      for (a <- annotations(i))
        bw.write(s"${a.begin}\t${a.end}\t${a.result}\t${a.metadata("entity").replace("\n", " ")}\n")
    }
    bw.close()
  }

  def calcStat(correct: Int, predicted: Int, predictedCorrect: Int): (Float, Float, Float) = {
    // prec = (predicted & correct) / predicted
    // rec = (predicted & correct) / correct
    val prec = predictedCorrect.toFloat / predicted
    val rec = predictedCorrect.toFloat / correct
    val f1 = 2 * prec * rec / (prec + rec)

    (prec, rec, f1)
  }

  def measureExact(nerReader: CoNLL, model: PipelineModel, file: ExternalResource, printErrors: Int = 0): Unit = {
    val df = nerReader.readDataset(SparkAccessor.benchmarkSpark, file.path).toDF()
    val transformed = model.transform(df)
    val rows = transformed.select("ner_span", "label_span").collect()

    val correctPredicted = mutable.Map[String, Int]()
    val predicted = mutable.Map[String, Int]()
    val correct = mutable.Map[String, Int]()
    var toPrintErrors = printErrors

    for (row <- rows) {
      val predictions = NerTagged.getAnnotations(row, 0).filter(a => a.result != "O")
      val labels = NerTagged.getAnnotations(row, 1).filter(a => a.result != "O")

      for (p <- predictions) {
        val tag = p.metadata("entity")
        predicted(tag) = predicted.getOrElse(tag, 0) + 1
      }

      for (l <- labels) {
        val tag = l.metadata("entity")
        correct(tag) = correct.getOrElse(tag, 0) + 1
      }

      val correctPredictions = labels.toSet.intersect(predictions.toSet)

      for (a <- correctPredictions) {
        val tag = a.metadata("entity")
        correctPredicted(tag) = correctPredicted.getOrElse(tag, 0) + 1
      }

      if (toPrintErrors > 0) {
        for (p <- predictions) {
          if (toPrintErrors > 0 && !correctPredictions.contains(p)) {
            System.out.println(s"Predicted\t${p.result}\t${p.begin}\t${p.end}\t${p.metadata("text")}")
            toPrintErrors -= 1
          }
        }

        for (p <- labels) {
          if (toPrintErrors > 0 && !correctPredictions.contains(p)) {
            System.out.println(s"Correct\t${p.result}\t${p.begin}\t${p.end}\t${p.metadata("text")}")
            toPrintErrors -= 1
          }
        }
      }
    }

    val (prec, rec, f1) = calcStat(correct.values.sum, predicted.values.sum, correctPredicted.values.sum)
    System.out.println(s"$prec\t$rec\t$f1")

    val tags = (correct.keys ++ predicted.keys ++ correctPredicted.keys).toList.distinct

    for (tag <- tags) {
      val (prec, rec, f1) = calcStat(correct.getOrElse(tag, 0), predicted.getOrElse(tag, 0), correctPredicted.getOrElse(tag, 0))
      System.out.println(s"$tag\t$prec\t$rec\t$f1")
    }
  }
}
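calcStat is plain precision/recall/F1 arithmetic over entity counts. A tiny worked example with made-up counts (8 gold entities, 10 predicted, 6 of them correct); the object name is illustrative.

object CalcStatSketch {
  def main(args: Array[String]): Unit = {
    // precision = 6/10 = 0.600, recall = 6/8 = 0.750, F1 = 2*0.6*0.75/1.35 = 0.667
    val (prec, rec, f1) =
      com.johnsnowlabs.benchmarks.spark.NerHelper.calcStat(correct = 8, predicted = 10, predictedCorrect = 6)
    println(f"precision=$prec%.3f recall=$rec%.3f f1=$f1%.3f")
  }
}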
Example 39
Source File: WordEmbeddingsTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.base.{DocumentAssembler, RecursivePipeline}
import com.johnsnowlabs.nlp.util.io.{ReadAs, ResourceHelper}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.scalatest._

class WordEmbeddingsTestSpec extends FlatSpec {

  "Word Embeddings" should "correctly embed clinical words not embed non-existent words" ignore {

    val words = ResourceHelper.spark.read.option("header", "true").csv("src/test/resources/embeddings/clinical_words.txt")
    val notWords = ResourceHelper.spark.read.option("header", "true").csv("src/test/resources/embeddings/not_words.txt")

    val documentAssembler = new DocumentAssembler()
      .setInputCol("word")
      .setOutputCol("document")

    val tokenizer = new Tokenizer()
      .setInputCols(Array("document"))
      .setOutputCol("token")

    val embeddings = WordEmbeddingsModel.pretrained()
      .setInputCols("document", "token")
      .setOutputCol("embeddings")
      .setCaseSensitive(false)

    val pipeline = new RecursivePipeline()
      .setStages(Array(
        documentAssembler,
        tokenizer,
        embeddings
      ))

    val wordsP = pipeline.fit(words).transform(words).cache()
    val notWordsP = pipeline.fit(notWords).transform(notWords).cache()

    val wordsCoverage = WordEmbeddingsModel.withCoverageColumn(wordsP, "embeddings", "cov_embeddings")
    val notWordsCoverage = WordEmbeddingsModel.withCoverageColumn(notWordsP, "embeddings", "cov_embeddings")

    wordsCoverage.select("word", "cov_embeddings").show()
    notWordsCoverage.select("word", "cov_embeddings").show()

    val wordsOverallCoverage = WordEmbeddingsModel.overallCoverage(wordsCoverage, "embeddings").percentage
    val notWordsOverallCoverage = WordEmbeddingsModel.overallCoverage(notWordsCoverage, "embeddings").percentage

    ResourceHelper.spark.createDataFrame(
      Seq(
        ("Words", wordsOverallCoverage), ("Not Words", notWordsOverallCoverage)
      )
    ).toDF("Dataset", "OverallCoverage").show()

    assert(wordsOverallCoverage == 1)
    assert(notWordsOverallCoverage == 0)
  }

  "Word Embeddings" should "store and load from disk" in {

    val data = ResourceHelper.spark.read.option("header", "true").csv("src/test/resources/embeddings/clinical_words.txt")

    val documentAssembler = new DocumentAssembler()
      .setInputCol("word")
      .setOutputCol("document")

    val tokenizer = new Tokenizer()
      .setInputCols(Array("document"))
      .setOutputCol("token")

    val embeddings = new WordEmbeddings()
      .setStoragePath("src/test/resources/random_embeddings_dim4.txt", ReadAs.TEXT)
      .setDimension(4)
      .setStorageRef("glove_4d")
      .setInputCols("document", "token")
      .setOutputCol("embeddings")

    val pipeline = new Pipeline()
      .setStages(Array(
        documentAssembler,
        tokenizer,
        embeddings
      ))

    val model = pipeline.fit(data)

    model.write.overwrite().save("./tmp_embeddings_pipeline")
    model.transform(data).show(5)

    val loadedPipeline1 = PipelineModel.load("./tmp_embeddings_pipeline")
    loadedPipeline1.transform(data).show(5)

    val loadedPipeline2 = PipelineModel.load("./tmp_embeddings_pipeline")
    loadedPipeline2.transform(data).show(5)
  }
}