org.apache.spark.ml.util.Identifiable Scala Examples
The following examples show how to use org.apache.spark.ml.util.Identifiable.
Each example is taken from an open-source project; the source file and license are noted above each listing.
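Before diving in, here is a minimal sketch of the pattern the examples share: Identifiable.randomUID("prefix") returns a unique uid such as "prefix_a1b2c3d4e5f6", and a custom pipeline stage typically takes the uid in its primary constructor while an auxiliary no-arg constructor supplies a random one. The UpperCaser class and the "text" column below are hypothetical, used only to illustrate the idiom; they do not come from any of the projects listed here.

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.upper
import org.apache.spark.sql.types.StructType

// Hypothetical transformer: upper-cases the "text" column in place.
class UpperCaser(override val uid: String) extends Transformer {

  // The usual pattern: the primary constructor takes a uid so copies can share it,
  // while the no-arg constructor asks Identifiable for a fresh one.
  def this() = this(Identifiable.randomUID("upperCaser"))

  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.withColumn("text", upper(dataset.col("text")))

  override def transformSchema(schema: StructType): StructType = schema

  override def copy(extra: ParamMap): UpperCaser = defaultCopy(extra)
}

The def this() = this(Identifiable.randomUID(...)) idiom in this sketch appears in almost every example below.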
Example 1
Source File: MNISTBenchmark.scala From spark-knn with Apache License 2.0
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable

object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val ns = if(args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if(args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if(args.length >= 3) args(2).toInt else 10
    val models = if(args.length >= 4) args(3).split(',') else Array("tree", "naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    // read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset = MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() // force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if(models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if(models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }
    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
}
Example 2
Source File: Binarizer.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.BinaryAttribute
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, StructType}

def setOutputCol(value: String): this.type = set(outputCol, value)

override def transform(dataset: DataFrame): DataFrame = {
  transformSchema(dataset.schema, logging = true)
  val td = $(threshold)
  val binarizer = udf { in: Double => if (in > td) 1.0 else 0.0 }
  val outputColName = $(outputCol)
  val metadata = BinaryAttribute.defaultAttr.withName(outputColName).toMetadata()
  dataset.select(col("*"), binarizer(col($(inputCol))).as(outputColName, metadata))
}

override def transformSchema(schema: StructType): StructType = {
  SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType)
  val inputFields = schema.fields
  val outputColName = $(outputCol)
  require(inputFields.forall(_.name != outputColName),
    s"Output column $outputColName already exists.")
  val attr = BinaryAttribute.defaultAttr.withName(outputColName)
  val outputFields = inputFields :+ attr.toStructField()
  StructType(outputFields)
}

override def copy(extra: ParamMap): Binarizer = defaultCopy(extra)
}
Example 3
Source File: URLElimminator.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, Params}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}

def setInputCol(value: String): this.type = set(inputCol, value)

def this() = this(Identifiable.randomUID("URLEliminator"))

override def transform(dataset: Dataset[_]): DataFrame = {
  dataset.withColumn($(outputCol), filterTextUDF(dataset.col($(inputCol))))
}

override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

@DeveloperApi
override def transformSchema(schema: StructType): StructType = {
  if ($(inputCol) != $(outputCol)) {
    schema.add($(outputCol), StringType)
  } else {
    schema
  }
}
}

object URLElimminator extends DefaultParamsReadable[URLElimminator] {
  override def load(path: String): URLElimminator = super.load(path)
}
Example 4
Source File: RandomProjectionsHasher.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import java.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vector}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{LongType, StructType}

def setDim(value: Long): this.type = set(dim, value)

def this() = this(Identifiable.randomUID("randomProjectionsHasher"))

override def transform(dataset: Dataset[_]): DataFrame = {
  val dimensity = {
    if (!isSet(dim)) {
      // If dim is not set, look up the AttributeGroup in metadata, as it comes from OdklCountVectorizer
      val vectorsIndex = dataset.schema.fieldIndex($(inputCol))
      AttributeGroup.fromStructField(dataset.schema.fields(vectorsIndex)).size
    } else {
      $(dim).toInt
    }
  }
  // the matrix of random vectors used to construct the hash
  val projectionMatrix = dataset.sqlContext.sparkContext.broadcast(
    Matrices.sprandn($(basisSize).toInt, dimensity, $(sparsity), new Random($(seed)))
      .asInstanceOf[SparseMatrix])
  val binHashSparseVectorColumn = udf((vector: Vector) => {
    projectionMatrix.value.multiply(vector).values
      .map(f => if (f > 0) 1L else 0L)
      .view.zipWithIndex
      .foldLeft(0L) { case (acc, (v, i)) => acc | (v << i) }
  })
  dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol))))
}

override def copy(extra: ParamMap): Transformer = {
  defaultCopy(extra)
}

@DeveloperApi
override def transformSchema(schema: StructType): StructType = {
  SchemaUtils.appendColumn(schema, $(outputCol), LongType)
}
}
Example 5
Source File: RegexpReplaceTransformer.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StringType, StructType}

def setInputCol(value: String): this.type = set(inputCol, value)

def this() = this(Identifiable.randomUID("RegexpReplaceTransformer"))

override def transform(dataset: Dataset[_]): DataFrame = {
  dataset.withColumn($(outputCol), regexp_replace(dataset.col($(inputCol)), $(regexpPattern), $(regexpReplacement)))
}

override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

@DeveloperApi
override def transformSchema(schema: StructType): StructType = {
  if ($(inputCol) equals $(outputCol)) {
    val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputCol)))
    SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), StringType)
  } else {
    SchemaUtils.appendColumn(schema, $(outputCol), StringType)
  }
}
}

object RegexpReplaceTransformer extends DefaultParamsReadable[RegexpReplaceTransformer] {
  override def load(path: String): RegexpReplaceTransformer = super.load(path)
}
Example 6
Source File: NGramExtractor.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamPair, ParamValidators, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}

def setOutputCol(value: String): this.type = set(outputCol, value)

setDefault(new ParamPair[Int](upperN, 2), new ParamPair[Int](lowerN, 1))

override def transform(dataset: Dataset[_]): DataFrame = {
  val lowerBound = $(lowerN)
  val upperBound = $(upperN)
  val nGramUDF = udf[Seq[String], Seq[String]](NGramUtils.nGramFun(_, lowerBound, upperBound))
  dataset.withColumn($(outputCol), nGramUDF(dataset.col($(inputCol))))
}

override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

@DeveloperApi
override def transformSchema(schema: StructType): StructType = {
  if ($(inputCol) != $(outputCol)) {
    schema.add($(outputCol), new ArrayType(StringType, true))
  } else {
    schema
  }
}
}

object NGramExtractor extends DefaultParamsReadable[NGramExtractor] {
  override def load(path: String): NGramExtractor = super.load(path)
}
Example 7
Source File: HashBasedDeduplicator.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import odkl.analysis.spark.util.Logging
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.linalg.Vectors.norm
import org.apache.spark.ml.linalg.{BLAS, Vector}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, Row}

import scala.collection.mutable.ArrayBuffer

def setSimilarityTreshold(value: Double): this.type = set(similarityThreshold, value)

setDefault(new ParamPair[String](inputColHash, "hash"), new ParamPair[Double](similarityThreshold, 0.9))

def this() = this(Identifiable.randomUID("hashBasedDeduplication"))

override def transform(dataset: Dataset[_]): DataFrame = {
  dataset.sqlContext.createDataFrame(
    dataset.toDF
      .repartition(dataset.col($(inputColHash)))
      .sortWithinPartitions($(inputColHash))
      .rdd
      .mapPartitions((f: Iterator[Row]) => {
        if (f.hasNext) {
          var curHash: Long = -1L
          val vectorsBuffer = new ArrayBuffer[Vector](0) // unique vectors buffer for this bucket
          for (it <- f) yield {
            val newHash = it.getAs[Long]($(inputColHash))
            if (newHash == curHash) {
              val currentVector = it.getAs[Vector]($(inputColVector))
              val isUnique = vectorsBuffer.forall(storedVector => {
                // is this vector sufficiently "different" from the others in the buffer (cosine similarity below the threshold)?
                (BLAS.dot(storedVector, currentVector) / (norm(storedVector, 2) * norm(currentVector, 2))) < $(similarityThreshold)
              })
              if (isUnique) {
                vectorsBuffer.append(currentVector)
                it
              } else {
                Row.empty // dummy Row
              }
            } else {
              vectorsBuffer.clear()
              vectorsBuffer.append(it.getAs[Vector]($(inputColVector)))
              curHash = newHash
              it
            }
          }
        } else {
          new Array[Row](0).toIterator // empty partition
        }
      }).filter(!_.equals(Row.empty)), // filter out dummy rows
    transformSchema(dataset.schema))
}

@DeveloperApi
override def transformSchema(schema: StructType): StructType = {
  schema
}

override def copy(extra: ParamMap): Transformer = defaultCopy(extra)
}
Example 8
Source File: LanguageDetectorTransformer.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import com.google.common.base.Optional
import com.optimaize.langdetect.LanguageDetector
import com.optimaize.langdetect.i18n.LdLocale
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}

import scala.collection.Map

def setOutputCol(value: String): this.type = set(outputCol, value)

def this() = this(Identifiable.randomUID("languageDetector"))

override def transform(dataset: Dataset[_]): DataFrame = {
  dataset.withColumn($(outputCol), languageDetection(dataset.col($(inputCol))))
}

override def copy(extra: ParamMap): Transformer = {
  defaultCopy(extra)
}

@DeveloperApi
override def transformSchema(schema: StructType): StructType = {
  SchemaUtils.appendColumn(schema, $(outputCol), StringType)
}

@transient object languageDetectorWrapped extends Serializable {
  val languageDetector: LanguageDetector =
    LanguageDetectorUtils.buildLanguageDetector(
      LanguageDetectorUtils.readListLangsBuiltIn(),
      $(minimalConfidence),
      $(languagePriors).toMap)
}
}
Example 9
Source File: LanguageAwareAnalyzer.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.util.StopwordAnalyzerBase
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.HasOutputCol
import org.apache.spark.ml.param.{Param, ParamMap, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}

def setOutputCol(value: String): this.type = set(outputCol, value)

override def copy(extra: ParamMap): Transformer = {
  defaultCopy(extra)
}

def this() = this(Identifiable.randomUID("languageAnalyzer"))

override def transform(dataset: Dataset[_]): DataFrame = {
  dataset.withColumn($(outputCol), stemmTextUDF(dataset.col($(inputColLang)), dataset.col($(inputColText)))).toDF
}

@DeveloperApi
override def transformSchema(schema: StructType): StructType = {
  if ($(inputColText) equals $(outputCol)) {
    val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputColText)))
    SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), ArrayType(StringType, true))
  } else {
    SchemaUtils.appendColumn(schema, $(outputCol), ArrayType(StringType, true))
  }
}
}

object LanguageAwareAnalyzer extends DefaultParamsReadable[LanguageAwareAnalyzer] {
  override def load(path: String): LanguageAwareAnalyzer = super.load(path)
}
Example 10
Source File: RegressionEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

def setLabelCol(value: String): this.type = set(labelCol, value)

// default metric: root mean squared error
setDefault(metricName -> "rmse")

override def evaluate(dataset: DataFrame): Double = {
  val schema = dataset.schema
  SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
  SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
  val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
    .map { case Row(prediction: Double, label: Double) => (prediction, label) }
  val metrics = new RegressionMetrics(predictionAndLabels)
  val metric = $(metricName) match {
    case "rmse" => metrics.rootMeanSquaredError // root mean squared error
    case "mse" => metrics.meanSquaredError // mean squared error
    case "r2" => metrics.r2
    case "mae" => metrics.meanAbsoluteError // mean absolute error
  }
  metric
}

override def isLargerBetter: Boolean = $(metricName) match {
  case "rmse" => false // root mean squared error
  case "mse" => false // mean squared error
  case "r2" => true // R-squared (coefficient of determination)
  case "mae" => false // mean absolute error
}

override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}
Example 11
Source File: MulticlassClassificationEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{SchemaUtils, Identifiable}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.types.DoubleType

def setLabelCol(value: String): this.type = set(labelCol, value)

// default metric: F1, a combined measure derived from both precision and recall
setDefault(metricName -> "f1")

override def evaluate(dataset: DataFrame): Double = {
  val schema = dataset.schema
  SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
  SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
  val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
    .map { case Row(prediction: Double, label: Double) => (prediction, label) }
  val metrics = new MulticlassMetrics(predictionAndLabels)
  val metric = $(metricName) match {
    case "f1" => metrics.weightedFMeasure // F1, combining precision and recall
    case "precision" => metrics.precision // precision
    case "recall" => metrics.recall // recall
    case "weightedPrecision" => metrics.weightedPrecision // weighted precision
    case "weightedRecall" => metrics.weightedRecall // weighted recall
  }
  metric
}

override def isLargerBetter: Boolean = $(metricName) match {
  case "f1" => true // F1
  case "precision" => true // precision
  case "recall" => true // recall
  case "weightedPrecision" => true // weighted precision
  case "weightedRecall" => true // weighted recall
}

override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}
Example 12
Source File: BinaryClassificationEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

def setLabelCol(value: String): this.type = set(labelCol, value)

// default metric: area under the ROC curve
setDefault(metricName -> "areaUnderROC")

override def evaluate(dataset: DataFrame): Double = {
  val schema = dataset.schema
  SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT)
  SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
  // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
  val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol))
    .map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) }
  val metrics = new BinaryClassificationMetrics(scoreAndLabels)
  val metric = $(metricName) match {
    // an area under the ROC curve of 1.0 indicates a perfect classifier
    case "areaUnderROC" => metrics.areaUnderROC()
    // area under the precision-recall curve
    case "areaUnderPR" => metrics.areaUnderPR()
  }
  metrics.unpersist()
  metric
}

override def isLargerBetter: Boolean = $(metricName) match {
  case "areaUnderROC" => true // 1.0 is a perfect classifier, 0.5 is random performance
  case "areaUnderPR" => true // area under the precision-recall curve
}

override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}
Example 13
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml._
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StructField, StructType}

override def transform(dataset: DataFrame): DataFrame = {
  transformSchema(dataset.schema, logging = true)
  val pcaOp = udf { pcaModel.transform _ }
  dataset.withColumn($(outputCol), pcaOp(col($(inputCol))))
}

override def transformSchema(schema: StructType): StructType = {
  val inputType = schema($(inputCol)).dataType
  require(inputType.isInstanceOf[VectorUDT],
    s"Input column ${$(inputCol)} must be a vector column")
  require(!schema.fieldNames.contains($(outputCol)),
    s"Output column ${$(outputCol)} already exists.")
  val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false)
  StructType(outputFields)
}

override def copy(extra: ParamMap): PCAModel = {
  val copied = new PCAModel(uid, pcaModel)
  copyValues(copied, extra).setParent(parent)
}
}
Example 14
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._
import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.sql.types.DataType

def getInverse: Boolean = $(inverse)

setDefault(inverse -> false)

override protected def createTransformFunc: Vector => Vector = { vec =>
  val result = vec.toArray
  val jTransformer = new DoubleDCT_1D(result.length)
  if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
  Vectors.dense(result)
}

override protected def validateInputType(inputType: DataType): Unit = {
  require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
}

override protected def outputDataType: DataType = new VectorUDT
}
Example 15
Source File: NameAssigner.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasInputCols
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Dataset, functions}
import org.apache.spark.sql.types.{Metadata, StringType, StructField, StructType}

class NameAssigner(override val uid: String) extends Transformer with HasInputCols {

  def setInputCols(column: String*): this.type = set(inputCols, column.toArray)

  def this() = this(Identifiable.randomUID("NameAssigner"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    $(inputCols)

    $(inputCols).foldLeft(dataset.toDF)((data, column) => {
      val metadata: Metadata = dataset.schema(column).metadata
      val attributes = AttributeGroup.fromStructField(
        StructField(column, new VectorUDT, nullable = false, metadata = metadata))
      val map = attributes.attributes
        .map(arr => arr.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
        .getOrElse(Map())

      val func = functions.udf[String, Number](x => if (x == null) {
        null
      } else {
        val i = x.intValue()
        map.getOrElse(i, i.toString)
      })

      data.withColumn(column, func(data(column)).as(column, metadata))
    }).toDF
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.map(f =>
      if ($(inputCols).contains(f.name)) {
        StructField(f.name, StringType, f.nullable, f.metadata)
      } else {
        f
      }))
}
Example 16
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml._
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType

def setOutputCol(value: String): this.type = set(outputCol, value)

override def transform(dataset: DataFrame): DataFrame = {
  transformSchema(dataset.schema, logging = true)
  val idf = udf { vec: Vector => idfModel.transform(vec) }
  dataset.withColumn($(outputCol), idf(col($(inputCol))))
}

override def transformSchema(schema: StructType): StructType = {
  validateAndTransformSchema(schema)
}

override def copy(extra: ParamMap): IDFModel = {
  val copied = new IDFModel(uid, idfModel)
  copyValues(copied, extra).setParent(parent)
}
}
Example 17
Source File: HashingTF.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

def setNumFeatures(value: Int): this.type = set(numFeatures, value)

override def transform(dataset: DataFrame): DataFrame = {
  val outputSchema = transformSchema(dataset.schema)
  val hashingTF = new feature.HashingTF($(numFeatures))
  val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
  val metadata = outputSchema($(outputCol)).metadata
  dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
}

override def transformSchema(schema: StructType): StructType = {
  val inputType = schema($(inputCol)).dataType
  require(inputType.isInstanceOf[ArrayType],
    s"The input column must be ArrayType, but got $inputType.")
  val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
  SchemaUtils.appendColumn(schema, attrGroup.toStructField())
}

override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
Example 18
Source File: Tokenizer.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

def getPattern: String = $(pattern)

setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+")

override protected def createTransformFunc: String => Seq[String] = { str =>
  val re = $(pattern).r
  val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq
  val minLength = $(minTokenLength)
  tokens.filter(_.length >= minLength)
}

override protected def validateInputType(inputType: DataType): Unit = {
  require(inputType == StringType, s"Input type must be string type but got $inputType.")
}

override protected def outputDataType: DataType = new ArrayType(StringType, true)

override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra)
}
Example 19
Source File: TransientFeatureArrayParam.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages

import com.salesforce.op.features._
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JArray, JValue}
import org.json4s.jackson.JsonMethods.{compact, parse, render}

import scala.util.{Failure, Success}

override def w(value: Array[TransientFeature]): ParamPair[Array[TransientFeature]] = super.w(value)

override def jsonEncode(value: Array[TransientFeature]): String = {
  compact(render(JArray(value.map(_.toJson).toList)))
}

override def jsonDecode(json: String): Array[TransientFeature] = {
  parse(json).extract[Array[JValue]].map(obj => {
    TransientFeature(obj) match {
      case Failure(e) => throw new RuntimeException("Failed to parse TransientFeature", e)
      case Success(v) => v
    }
  })
}
}
Example 20
Source File: SparkStageParam.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages

import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams
import org.apache.hadoop.fs.Path
import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.param.{Param, ParamPair, Params}
import org.apache.spark.ml.util.{Identifiable, MLReader, MLWritable}
import org.apache.spark.util.SparkUtils
import org.json4s.JsonAST.{JObject, JValue}
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import org.json4s.{DefaultFormats, Formats, JString}

class SparkStageParam[S <: PipelineStage with Params]
(
  parent: String,
  name: String,
  doc: String,
  isValid: Option[S] => Boolean
) extends Param[Option[S]](parent, name, doc, isValid) {

  import SparkStageParam._

  override def jsonDecode(jsonStr: String): Option[S] = {
    val json = parse(jsonStr)
    val uid = (json \ "uid").extractOpt[String]
    val path = (json \ "path").extractOpt[String]

    path -> uid match {
      case (None, _) | (_, None) | (_, Some(NoUID)) =>
        savePath = None
        None
      case (Some(p), Some(stageUid)) =>
        savePath = Option(p)
        val stagePath = new Path(p, stageUid).toString
        val className = (json \ "className").extract[String]
        val cls = SparkUtils.classForName(className)
        val stage = cls.getMethod("read").invoke(null).asInstanceOf[MLReader[PipelineStage]].load(stagePath)
        Option(stage).map(_.asInstanceOf[S])
    }
  }
}

object SparkStageParam {
  implicit val formats: Formats = DefaultFormats
  val NoClass = ""
  val NoUID = ""

  def updateParamsMetadataWithPath(jValue: JValue, path: String): JValue = jValue match {
    case JObject(pairs) => JObject(
      pairs.map {
        case (SparkWrapperParams.SparkStageParamName, j) =>
          SparkWrapperParams.SparkStageParamName -> j.merge(JObject("path" -> JString(path)))
        case param => param
      }
    )
    case j => throw new IllegalArgumentException(s"Cannot recognize JSON Spark params metadata: $j")
  }
}
Example 21
Source File: EstimatorWrapper.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.StructType
import org.apache.spark.{ml, sql}

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.{Transformer, Estimator}
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class EstimatorWrapper(
    executionContext: ExecutionContext,
    estimator: Estimator[Transformer])
  extends ML.Estimator[TransformerWrapper] {

  override def fitDF(dataset: sql.DataFrame): TransformerWrapper = {
    new TransformerWrapper(
      executionContext,
      estimator._fit(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF())))
  }

  override def copy(extra: ParamMap): EstimatorWrapper = {
    val params = ParamTransformer.transform(extra)
    val estimatorCopy = estimator.replicate().set(params: _*)
    new EstimatorWrapper(executionContext, estimatorCopy)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override lazy val params: Array[ml.param.Param[_]] = {
    estimator.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("EstimatorWrapper")
}
Example 22
Source File: EvaluatorWrapper.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.evaluation
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.Evaluator
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class EvaluatorWrapper(
    context: ExecutionContext,
    evaluator: Evaluator)
  extends ML.Evaluator {

  override def evaluateDF(dataset: sql.DataFrame): Double = {
    evaluator.evaluate(context)(())(DataFrame.fromSparkDataFrame(dataset.toDF())).value
  }

  override def copy(extra: ParamMap): evaluation.Evaluator = {
    val params = ParamTransformer.transform(extra)
    val evaluatorCopy = evaluator.replicate().set(params: _*)
    new EvaluatorWrapper(context, evaluatorCopy)
  }

  override lazy val params: Array[Param[_]] = {
    evaluator.params.map(new ParamWrapper(uid, _))
  }

  override def isLargerBetter: Boolean = evaluator.isLargerBetter

  override val uid: String = Identifiable.randomUID("EvaluatorWrapper")
}
Example 23
Source File: TransformerWrapper.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.Transformer
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class TransformerWrapper(
    executionContext: ExecutionContext,
    transformer: Transformer)
  extends ML.Model[TransformerWrapper] {

  override def copy(extra: ParamMap): TransformerWrapper = {
    val params = ParamTransformer.transform(extra)
    val transformerCopy = transformer.replicate().set(params: _*)
    new TransformerWrapper(executionContext, transformerCopy)
  }

  override def transformDF(dataset: sql.DataFrame): sql.DataFrame = {
    transformer._transform(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF()))
      .sparkDataFrame
  }

  override def transformSchema(schema: StructType): StructType = {
    transformer._transformSchema(schema).get
  }

  override lazy val params: Array[Param[_]] = {
    transformer.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("TransformerWrapper")
}
Example 24
Source File: RegressionEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

def setLabelCol(value: String): this.type = set(labelCol, value)

setDefault(metricName -> "rmse")

override def evaluate(dataset: DataFrame): Double = {
  val schema = dataset.schema
  SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
  SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
  val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
    .map { case Row(prediction: Double, label: Double) => (prediction, label) }
  val metrics = new RegressionMetrics(predictionAndLabels)
  val metric = $(metricName) match {
    case "rmse" => -metrics.rootMeanSquaredError
    case "mse" => -metrics.meanSquaredError
    case "r2" => metrics.r2
    case "mae" => -metrics.meanAbsoluteError
  }
  metric
}

override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}
Example 25
Source File: BinaryClassificationEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

def setLabelCol(value: String): this.type = set(labelCol, value)

setDefault(metricName -> "areaUnderROC")

override def evaluate(dataset: DataFrame): Double = {
  val schema = dataset.schema
  SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT)
  SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
  // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
  val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol))
    .map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) }
  val metrics = new BinaryClassificationMetrics(scoreAndLabels)
  val metric = $(metricName) match {
    case "areaUnderROC" => metrics.areaUnderROC()
    case "areaUnderPR" => metrics.areaUnderPR()
    case other => throw new IllegalArgumentException(s"Does not support metric $other.")
  }
  metrics.unpersist()
  metric
}

override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}
Example 26
Source File: Binarizer.scala From iolap with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.BinaryAttribute
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, StructType}

def setOutputCol(value: String): this.type = set(outputCol, value)

override def transform(dataset: DataFrame): DataFrame = {
  transformSchema(dataset.schema, logging = true)
  val td = $(threshold)
  val binarizer = udf { in: Double => if (in > td) 1.0 else 0.0 }
  val outputColName = $(outputCol)
  val metadata = BinaryAttribute.defaultAttr.withName(outputColName).toMetadata()
  dataset.select(col("*"), binarizer(col($(inputCol))).as(outputColName, metadata))
}

override def transformSchema(schema: StructType): StructType = {
  SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType)
  val inputFields = schema.fields
  val outputColName = $(outputCol)
  require(inputFields.forall(_.name != outputColName),
    s"Output column $outputColName already exists.")
  val attr = BinaryAttribute.defaultAttr.withName(outputColName)
  val outputFields = inputFields :+ attr.toStructField()
  StructType(outputFields)
}

override def copy(extra: ParamMap): Binarizer = defaultCopy(extra)
}
Example 27
Source File: IntermediateCacher.scala From albedo with MIT License
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

class IntermediateCacher(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("intermediateCacher"))
  }

  val inputCols = new StringArrayParam(this, "inputCols", "Input column names")

  def getInputCols: Array[String] = $(inputCols)

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)

  setDefault(inputCols -> Array.empty[String])

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    val intermediateDF = if ($(inputCols).isEmpty) dataset.toDF() else dataset.select($(inputCols).map(col(_)): _*)
    intermediateDF.cache()
  }

  override def copy(extra: ParamMap): IntermediateCacher = {
    defaultCopy(extra)
  }
}

object IntermediateCacher extends DefaultParamsReadable[IntermediateCacher]
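A hedged usage sketch for the IntermediateCacher above: wiring it into a Pipeline between two standard feature stages so the intermediate columns are cached. The SparkSession named spark, the toy DataFrame, and the surrounding stages are assumptions for illustration, not part of the albedo project.

// Assumes a SparkSession named `spark` and the IntermediateCacher class above on the classpath.
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

val df = spark.createDataFrame(Seq(
  (0, "spark ml pipelines"),
  (1, "identifiable random uid")
)).toDF("id", "text")

val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val cacher = new IntermediateCacher().setInputCols(Array("id", "words"))
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features")

// The cacher keeps only "id" and "words" and caches them before HashingTF runs.
val pipeline = new Pipeline().setStages(Array(tokenizer, cacher, hashingTF))
pipeline.fit(df).transform(df).show()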
Example 28
Source File: EstimatorWrapper.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.StructType
import org.apache.spark.{ml, sql}

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.{Transformer, Estimator}
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class EstimatorWrapper(
    executionContext: ExecutionContext,
    estimator: Estimator[Transformer])
  extends ML.Estimator[TransformerWrapper] {

  override def fitDF(dataset: sql.DataFrame): TransformerWrapper = {
    new TransformerWrapper(
      executionContext,
      estimator._fit(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF())))
  }

  override def copy(extra: ParamMap): EstimatorWrapper = {
    val params = ParamTransformer.transform(extra)
    val estimatorCopy = estimator.replicate().set(params: _*)
    new EstimatorWrapper(executionContext, estimatorCopy)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override lazy val params: Array[ml.param.Param[_]] = {
    estimator.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("EstimatorWrapper")
}
Example 29
Source File: EvaluatorWrapper.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.evaluation
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.Evaluator
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class EvaluatorWrapper(
    context: ExecutionContext,
    evaluator: Evaluator)
  extends ML.Evaluator {

  override def evaluateDF(dataset: sql.DataFrame): Double = {
    evaluator.evaluate(context)(())(DataFrame.fromSparkDataFrame(dataset.toDF())).value
  }

  override def copy(extra: ParamMap): evaluation.Evaluator = {
    val params = ParamTransformer.transform(extra)
    val evaluatorCopy = evaluator.replicate().set(params: _*)
    new EvaluatorWrapper(context, evaluatorCopy)
  }

  override lazy val params: Array[Param[_]] = {
    evaluator.params.map(new ParamWrapper(uid, _))
  }

  override def isLargerBetter: Boolean = evaluator.isLargerBetter

  override val uid: String = Identifiable.randomUID("EvaluatorWrapper")
}
Example 30
Source File: TransformerWrapper.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.Transformer
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class TransformerWrapper(
    executionContext: ExecutionContext,
    transformer: Transformer)
  extends ML.Model[TransformerWrapper] {

  override def copy(extra: ParamMap): TransformerWrapper = {
    val params = ParamTransformer.transform(extra)
    val transformerCopy = transformer.replicate().set(params: _*)
    new TransformerWrapper(executionContext, transformerCopy)
  }

  override def transformDF(dataset: sql.DataFrame): sql.DataFrame = {
    transformer._transform(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF()))
      .sparkDataFrame
  }

  override def transformSchema(schema: StructType): StructType = {
    transformer._transformSchema(schema).get
  }

  override lazy val params: Array[Param[_]] = {
    transformer.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("TransformerWrapper")
}
Example 31
Source File: ALSRecommender.scala From albedo with MIT License
package ws.vinta.albedo.recommenders

import com.github.fommil.netlib.F2jBLAS
import org.apache.spark.ml.recommendation.ALSModel
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.settings

class ALSRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("alsRecommender"))
  }

  private def alsModel: ALSModel = {
    val alsModelPath = s"${settings.dataDir}/${settings.today}/alsModel.parquet"
    ALSModel.load(alsModelPath)
  }

  def blockify(factors: Dataset[(Int, Array[Float])], blockSize: Int = 4096): Dataset[Seq[(Int, Array[Float])]] = {
    import factors.sparkSession.implicits._
    factors.mapPartitions(_.grouped(blockSize))
  }

  override def source = "als"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    import userDF.sparkSession.implicits._

    val activeUsers = userDF.select(col($(userCol)).alias("id"))
    val userFactors = alsModel.userFactors.join(activeUsers, Seq("id"))
    val itemFactors = alsModel.itemFactors
    val rank = alsModel.rank
    val num = $(topK)

    val userFactorsBlocked = blockify(userFactors.as[(Int, Array[Float])])
    val itemFactorsBlocked = blockify(itemFactors.as[(Int, Array[Float])])
    val ratings = userFactorsBlocked.crossJoin(itemFactorsBlocked)
      .as[(Seq[(Int, Array[Float])], Seq[(Int, Array[Float])])]
      .flatMap { case (srcIter, dstIter) =>
        val m = srcIter.size
        val n = math.min(dstIter.size, num)
        val output = new Array[(Int, Int, Float)](m * n)
        var i = 0
        val pq = new BoundedPriorityQueue[(Int, Float)](num)(Ordering.by(_._2))
        srcIter.foreach { case (srcId, srcFactor) =>
          dstIter.foreach { case (dstId, dstFactor) =>
            val score = new F2jBLAS().sdot(rank, srcFactor, 1, dstFactor, 1)
            pq += dstId -> score
          }
          pq.foreach { case (dstId, score) =>
            output(i) = (srcId, dstId, score)
            i += 1
          }
          pq.clear()
        }
        output.toSeq
      }

    ratings
      .toDF($(userCol), $(itemCol), $(scoreCol))
      .withColumn($(sourceCol), lit(source))
  }
}
Example 32
Source File: CurationRecommender.scala From albedo with MIT License
package ws.vinta.albedo.recommenders

import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import ws.vinta.albedo.utils.DatasetUtils._

class CurationRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("curationRecommender"))
  }

  override def source = "curation"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    implicit val spark: SparkSession = userDF.sparkSession
    import spark.implicits._

    val rawStarringDS = loadRawStarringDS().cache()

    val curatorIds = Array(652070, 1912583, 59990, 646843, 28702) // vinta, saiday, tzangms, fukuball, wancw

    val curatedRepoDF = rawStarringDS
      .select($"repo_id", $"starred_at")
      .where($"user_id".isin(curatorIds: _*))
      .groupBy($"repo_id")
      .agg(max($"starred_at").alias("starred_at"))
      .orderBy($"starred_at".desc)
      .limit($(topK))
      .cache()

    def calculateScoreUDF = udf((starred_at: java.sql.Timestamp) => {
      starred_at.getTime / 1000.0
    })

    userDF
      .select($(userCol))
      .crossJoin(curatedRepoDF)
      .select(col($(userCol)), $"repo_id".alias($(itemCol)), calculateScoreUDF($"starred_at").alias($(scoreCol)))
      .withColumn($(sourceCol), lit(source))
  }
}
Example 33
Source File: PopularityRecommender.scala From albedo with MIT License
package ws.vinta.albedo.recommenders

import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import ws.vinta.albedo.utils.DatasetUtils._

class PopularityRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("popularityRecommender"))
  }

  override def source = "popularity"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    implicit val spark: SparkSession = userDF.sparkSession
    import spark.implicits._

    val popularRepoDF = loadPopularRepoDF()
      .limit($(topK))
      .cache()

    def calculateScoreUDF = udf((stargazers_count: Int, created_at: java.sql.Timestamp) => {
      val valueScore = math.round(math.log10(stargazers_count) * 1000.0) / 1000.0
      val timeScore = (created_at.getTime / 1000.0) / (60 * 60 * 24 * 30 * 12) / 5.0
      valueScore + timeScore
    })

    userDF
      .select($(userCol))
      .crossJoin(popularRepoDF)
      .select(col($(userCol)), $"repo_id".alias($(itemCol)), calculateScoreUDF($"repo_stargazers_count", $"repo_created_at").alias($(scoreCol)))
      .withColumn($(sourceCol), lit(source))
  }
}
Example 34
Source File: ContentRecommender.scala From albedo with MIT License
package ws.vinta.albedo.recommenders

import org.apache.http.HttpHost
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.elasticsearch.action.search.SearchRequest
import org.elasticsearch.client.{RestClient, RestHighLevelClient}
import org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item
import org.elasticsearch.index.query.QueryBuilders._
import org.elasticsearch.search.SearchHit
import org.elasticsearch.search.builder.SearchSourceBuilder
import ws.vinta.albedo.closures.DBFunctions._

class ContentRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("contentRecommender"))
  }

  val enableEvaluationMode = new Param[Boolean](this, "enableEvaluationMode", "Should be enabled for evaluation only")

  def getEnableEvaluationMode: Boolean = $(enableEvaluationMode)

  def setEnableEvaluationMode(value: Boolean): this.type = set(enableEvaluationMode, value)
  setDefault(enableEvaluationMode -> false)

  override def source = "content"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    import userDF.sparkSession.implicits._

    val userRecommendedItemDF = userDF
      .as[Int]
      .flatMap {
        case (userId) => {
          // A More Like This query by document id excludes the documents used as query
          // conditions from its results, which is not appropriate when evaluating,
          // so we use the user's last k starred repos as the query conditions instead.
          val limit = $(topK)
          val offset = if ($(enableEvaluationMode)) $(topK) else 0
          val repoIds = selectUserStarredRepos(userId, limit, offset)

          val lowClient = RestClient.builder(new HttpHost("127.0.0.1", 9200, "http")).build()
          val highClient = new RestHighLevelClient(lowClient)

          val fields = Array("description", "full_name", "language", "topics")
          val texts = Array("")
          val items = repoIds.map((itemId: Int) => new Item("repo", "repo_info_doc", itemId.toString))
          val queryBuilder = moreLikeThisQuery(fields, texts, items)
            .minTermFreq(2)
            .maxQueryTerms(50)

          val searchSourceBuilder = new SearchSourceBuilder()
          searchSourceBuilder.query(queryBuilder)
          searchSourceBuilder.size($(topK))
          searchSourceBuilder.from(0)

          val searchRequest = new SearchRequest()
          searchRequest.indices("repo")
          searchRequest.types("repo_info_doc")
          searchRequest.source(searchSourceBuilder)

          val searchResponse = highClient.search(searchRequest)
          val hits = searchResponse.getHits
          val searchHits = hits.getHits

          val userItemScoreTuples = searchHits.map((searchHit: SearchHit) => {
            val itemId = searchHit.getId.toInt
            val score = searchHit.getScore
            (userId, itemId, score)
          })

          lowClient.close()

          userItemScoreTuples
        }
      }
      .toDF($(userCol), $(itemCol), $(scoreCol))
      .withColumn($(sourceCol), lit(source))

    userRecommendedItemDF
  }
}
Example 35
Source File: UserRepoTransformer.scala From albedo with MIT License
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._

class UserRepoTransformer(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("userRepoTransformer"))
  }

  val inputCols: StringArrayParam = new StringArrayParam(this, "inputCols", "Input column names")

  def getInputCols: Array[String] = $(inputCols)

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)

  override def transformSchema(schema: StructType): StructType = {
    $(inputCols).foreach((inputColName: String) => {
      require(schema.fieldNames.contains(inputColName), s"Input column $inputColName must exist.")
    })

    val newFields: Array[StructField] = Array(
      StructField("repo_language_index_in_user_recent_repo_languages", IntegerType, nullable = false),
      StructField("repo_language_count_in_user_recent_repo_languages", IntegerType, nullable = false)
    )
    StructType(schema.fields ++ newFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    import dataset.sparkSession.implicits._

    dataset
      .withColumn("repo_language_index_in_user_recent_repo_languages",
        repoLanguageIndexInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages"))
      .withColumn("repo_language_count_in_user_recent_repo_languages",
        repoLanguageCountInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages"))
  }

  override def copy(extra: ParamMap): UserRepoTransformer = {
    defaultCopy(extra)
  }
}

object UserRepoTransformer extends DefaultParamsReadable[UserRepoTransformer]
Example 36
Source File: HanLPTokenizer.scala From albedo with MIT License
package ws.vinta.albedo.transformers

import java.util

import com.hankcs.hanlp.HanLP
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary
import com.hankcs.hanlp.seg.common.Term
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.{BooleanParam, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types._

import scala.collection.JavaConverters._

class HanLPTokenizer(override val uid: String)
  extends UnaryTransformer[String, Seq[String], HanLPTokenizer] with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("hanLPTokenizer"))
  }

  val shouldRemoveStopWords = new BooleanParam(this, "shouldRemoveStopWords", "Whether to remove stop words")

  def getShouldRemoveStopWords: Boolean = $(shouldRemoveStopWords)

  def setShouldRemoveStopWords(value: Boolean): this.type = set(shouldRemoveStopWords, value)
  setDefault(shouldRemoveStopWords -> true)

  override def createTransformFunc: String => Seq[String] = { originStr =>
    HanLP.Config.ShowTermNature = false
    HanLP.Config.Normalization = false
    val segment = HanLP.newSegment()
    val termList: util.List[Term] = segment.seg(HanLP.convertToSimplifiedChinese(originStr.toLowerCase))

    if ($(shouldRemoveStopWords)) {
      CoreStopWordDictionary.apply(termList)
    }

    val LanguageRE = """(c|r|c\+\+|c#|f#)""".r
    val OneCharExceptCJKRE = """([^\p{InHiragana}\p{InKatakana}\p{InBopomofo}\p{InCJKCompatibilityIdeographs}\p{InCJKUnifiedIdeographs}])""".r

    termList
      .asScala
      .flatMap((term: Term) => {
        val word = term.word
        word match {
          case LanguageRE(language) => Array(language)
          case OneCharExceptCJKRE(_) => Array.empty[String]
          case _ =>
            """([\w\.\-_\p{InHiragana}\p{InKatakana}\p{InBopomofo}\p{InCJKCompatibilityIdeographs}\p{InCJKUnifiedIdeographs}]+)""".r.findAllIn(word).toList
        }
      })
  }

  override def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }

  override def outputDataType: DataType = {
    new ArrayType(StringType, false)
  }

  override def copy(extra: ParamMap): HanLPTokenizer = {
    defaultCopy(extra)
  }
}

object HanLPTokenizer extends DefaultParamsReadable[HanLPTokenizer]
Example 37
Source File: SnowballStemmer.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
import org.tartarus.snowball.ext.EnglishStemmer

class SnowballStemmer(override val uid: String)
  extends UnaryTransformer[Seq[String], Seq[String], SnowballStemmer] with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("snowballStemmer"))
  }

  override def createTransformFunc: Seq[String] => Seq[String] = { strings =>
    val stemmer = new EnglishStemmer()

    strings.map((str: String) => {
      try {
        stemmer.setCurrent(str)
        stemmer.stem()
        stemmer.getCurrent()
      } catch {
        case _: Exception => str
      }
    })
  }

  override def validateInputType(inputType: DataType): Unit = {
    require(inputType == ArrayType(StringType), s"Input type must be ArrayType(StringType) but got $inputType.")
  }

  override def outputDataType: DataType = {
    ArrayType(StringType)
  }

  override def copy(extra: ParamMap): SnowballStemmer = {
    defaultCopy(extra)
  }
}

object SnowballStemmer extends DefaultParamsReadable[SnowballStemmer]
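A usage sketch chaining Spark's built-in Tokenizer with the stemmer, since the stemmer expects an ArrayType(StringType) input column; the data and the spark session are assumed.

import org.apache.spark.ml.feature.Tokenizer
import spark.implicits._

val df = Seq("cats are running faster than dogs").toDF("text")

val tokenizer = new Tokenizer()
  .setInputCol("text")
  .setOutputCol("words")

val stemmer = new SnowballStemmer()
  .setInputCol("words")
  .setOutputCol("stemmed_words")

stemmer.transform(tokenizer.transform(df)).select("stemmed_words").show(truncate = false)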
Example 38
Source File: RankingMetricFormatter.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} import ws.vinta.albedo.closures.UDFs._ import ws.vinta.albedo.evaluators.RankingEvaluator._ class RankingMetricFormatter(override val uid: String, val sourceType: String) extends Transformer with DefaultParamsWritable { def this(sourceType: String) = { this(Identifiable.randomUID("rankingMetricFormatter"), sourceType) } val userCol = new Param[String](this, "userCol", "User column name") def getUserCol: String = $(userCol) def setUserCol(value: String): this.type = set(userCol, value) setDefault(userCol -> "user") val itemCol = new Param[String](this, "itemCol", "Item column name") def getItemCol: String = $(itemCol) def setItemCol(value: String): this.type = set(itemCol, value) setDefault(itemCol -> "item") val predictionCol = new Param[String](this, "predictionCol", "Prediction column name") def getPredictionCol: String = $(predictionCol) def setPredictionCol(value: String): this.type = set(predictionCol, value) setDefault(predictionCol -> "prediction") val topK = new IntParam(this, "topK", "Recommend top-k items for every user") def getTopK: Int = $(topK) def setTopK(value: Int): this.type = set(topK, value) setDefault(topK -> 15) override def transformSchema(schema: StructType): StructType = { Map($(userCol) -> IntegerType, $(itemCol) -> IntegerType) .foreach{ case(columnName: String, expectedDataType: DataType) => { val actualDataType = schema(columnName).dataType require(actualDataType.equals(expectedDataType), s"Column $columnName must be of type $expectedDataType but was actually $actualDataType.") } } schema } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema) sourceType match { case "als" => dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), col($(predictionCol)).desc, $(topK))) case "lr" => dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), toArrayUDF(col($(predictionCol))).getItem(1).desc, $(topK))) } } override def copy(extra: ParamMap): RankingMetricFormatter = { val copied = new RankingMetricFormatter(uid, sourceType) copyValues(copied, extra) } } object RankingMetricFormatter extends DefaultParamsReadable[RankingMetricFormatter]
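A sketch of feeding ALS-style predictions through the formatter; the tiny DataFrame stands in for ALSModel output and is purely illustrative, while the setters and defaults come from the class above.

import spark.implicits._

// user and item must be integers (see transformSchema above); prediction is the ALS score
val alsPredictionDF = Seq(
  (1, 100, 0.9f),
  (1, 101, 0.4f),
  (2, 100, 0.7f)
).toDF("user", "item", "prediction")

val formatter = new RankingMetricFormatter("als")
  .setUserCol("user")
  .setItemCol("item")
  .setPredictionCol("prediction")
  .setTopK(30)

val userPredictedItemsDF = formatter.transform(alsPredictionDF)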
Example 39
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml._
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType

// Class declaration restored from Spark's IDF.scala so the fragment reads in context.
// IDFBase (defined in the same file, not shown here) provides inputCol, outputCol and
// validateAndTransformSchema.
@Experimental
class IDFModel private[ml] (
    override val uid: String,
    idfModel: feature.IDFModel)
  extends Model[IDFModel] with IDFBase {

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: DataFrame): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val idf = udf { vec: Vector => idfModel.transform(vec) }
    dataset.withColumn($(outputCol), idf(col($(inputCol))))
  }

  override def transformSchema(schema: StructType): StructType = {
    validateAndTransformSchema(schema)
  }

  override def copy(extra: ParamMap): IDFModel = {
    val copied = new IDFModel(uid, idfModel)
    copyValues(copied, extra)
  }
}
Example 40
Source File: RegressionEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("1.4.0") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema val predictionColName = $(predictionCol) val predictionType = schema($(predictionCol)).dataType require(predictionType == FloatType || predictionType == DoubleType, s"Prediction column $predictionColName must be of type float or double, " + s" but not $predictionType") val labelColName = $(labelCol) val labelType = schema($(labelCol)).dataType require(labelType == FloatType || labelType == DoubleType, s"Label column $labelColName must be of type float or double, but not $labelType") val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
Example 41
Source File: MulticlassClassificationEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, SchemaUtils, Identifiable} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("1.5.0") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) val predictionAndLabels = dataset.select($(predictionCol), $(labelCol)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "precision" => metrics.precision case "recall" => metrics.recall case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall } metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "f1" => true case "precision" => true case "recall" => true case "weightedPrecision" => true case "weightedRecall" => true } @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 42
Source File: BinaryClassificationEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("1.2.0") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol)) .map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
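Typical usage against a binary classifier's output (this is Spark 1.x, so the raw prediction is an mllib Vector); the tiny DataFrame below is a stand-in for real model output, and sqlContext is an existing SQLContext.

import org.apache.spark.mllib.linalg.Vectors

val predictions = sqlContext.createDataFrame(Seq(
  (Vectors.dense(0.8, 0.2), 0.0),
  (Vectors.dense(0.1, 0.9), 1.0),
  (Vectors.dense(0.4, 0.6), 1.0)
)).toDF("rawPrediction", "label")

val evaluator = new BinaryClassificationEvaluator()
  .setRawPredictionCol("rawPrediction")
  .setLabelCol("label")
  .setMetricName("areaUnderROC")

println(s"Area under ROC = ${evaluator.evaluate(predictions)}")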
Example 43
Source File: RWrapperUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.spark.internal.Logging import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute} import org.apache.spark.ml.feature.{RFormula, RFormulaModel} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.Dataset private[r] object RWrapperUtils extends Logging { def getFeaturesAndLabels( rFormulaModel: RFormulaModel, data: Dataset[_]): (Array[String], Array[String]) = { val schema = rFormulaModel.transform(data).schema val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol)) .attributes.get val features = featureAttrs.map(_.name.get) val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol)) .asInstanceOf[NominalAttribute] val labels = labelAttr.values.get (features, labels) } }
Example 44
Source File: RegressionEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
Example 45
Source File: MulticlassClassificationEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 46
Source File: BinaryClassificationEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
Example 47
Source File: ElementwiseProduct.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.sql.types.DataType @Since("2.0.0") def getScalingVec: Vector = getOrDefault(scalingVec) override protected def createTransformFunc: Vector => Vector = { require(params.contains(scalingVec), s"transformation requires a weight vector") val elemScaler = new feature.ElementwiseProduct($(scalingVec)) v => elemScaler.transform(v) } override protected def outputDataType: DataType = new VectorUDT() } @Since("2.0.0") object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { @Since("2.0.0") override def load(path: String): ElementwiseProduct = super.load(path) }
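The usual calling pattern, adapted from the Spark documentation example: each row vector is multiplied element-wise by a fixed scaling vector. A SparkSession named spark is assumed.

import org.apache.spark.ml.linalg.Vectors

val dataFrame = spark.createDataFrame(Seq(
  ("a", Vectors.dense(1.0, 2.0, 3.0)),
  ("b", Vectors.dense(4.0, 5.0, 6.0))
)).toDF("id", "vector")

val transformer = new ElementwiseProduct()
  .setScalingVec(Vectors.dense(0.0, 1.0, 2.0))
  .setInputCol("vector")
  .setOutputCol("transformedVector")

transformer.transform(dataFrame).show()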
Example 48
Source File: UnaryTransformerExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.DoubleParam
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{DataType, DataTypes}
import org.apache.spark.util.Utils
// $example off$

object UnaryTransformerExample {

  // $example on$
  // MyTransformer, restored here from Spark's UnaryTransformerExample: a UnaryTransformer
  // that adds a constant `shift` to a Double input column.
  class MyTransformer(override val uid: String)
    extends UnaryTransformer[Double, Double, MyTransformer] with DefaultParamsWritable {

    final val shift: DoubleParam = new DoubleParam(this, "shift", "Value added to input")

    def getShift: Double = $(shift)

    def setShift(value: Double): this.type = set(shift, value)

    def this() = this(Identifiable.randomUID("myT"))

    override protected def createTransformFunc: Double => Double = (input: Double) => {
      input + $(shift)
    }

    override protected def validateInputType(inputType: DataType): Unit = {
      require(inputType == DataTypes.DoubleType, s"Bad input type: $inputType. Requires Double.")
    }

    override protected def outputDataType: DataType = DataTypes.DoubleType
  }

  object MyTransformer extends DefaultParamsReadable[MyTransformer]
  // $example off$

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder()
      .appName("UnaryTransformerExample")
      .getOrCreate()

    // $example on$
    val myTransformer = new MyTransformer()
      .setShift(0.5)
      .setInputCol("input")
      .setOutputCol("output")

    // Create data, transform, and display it.
    val data = spark.range(0, 5).toDF("input")
      .select(col("input").cast("double").as("input"))
    val result = myTransformer.transform(data)
    println("Transformed by adding constant value")
    result.show()

    // Save and load the Transformer.
    val tmpDir = Utils.createTempDir()
    val dirName = tmpDir.getCanonicalPath
    myTransformer.write.overwrite().save(dirName)
    val sameTransformer = MyTransformer.load(dirName)

    // Transform the data to show the results are identical.
    println("Same transform applied from loaded model")
    val sameResult = sameTransformer.transform(data)
    sameResult.show()

    Utils.deleteRecursively(tmpDir)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 49
Source File: VectorExplode.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl import odkl.analysis.spark.util.collection.OpenHashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.odkl.SparkSqlUtils import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row, functions} class VectorExplode(override val uid: String) extends Transformer with DefaultParamsWritable { val valueCol = new Param[String](this, "valueCol", "Name of the column to store value name.") def setValueCol(value: String) : this.type = set(valueCol, value) setDefault(valueCol -> "value") def this() = this(Identifiable.randomUID("vectorExplode")) override def transform(dataset: Dataset[_]): DataFrame = { val vectors: Array[StructField] = dataset.schema.fields.filter(_.dataType.isInstanceOf[VectorUDT]) val resultSchema = StructType(Seq( StructField($(valueCol), StringType, nullable = false)) ++ vectors.map(f => StructField(f.name, DoubleType, nullable = true)) ) val arraySize = resultSchema.size - 1 val names: Array[Map[Int, String]] = vectors.map( f => { AttributeGroup.fromStructField(f).attributes .map(attributes => attributes.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap) .getOrElse(Map()) }) val maxCapacity = names.map(_.size).max val explodeVectors : (Row => Array[Row]) = (r: Row ) => { val accumulator = new OpenHashMap[String,Array[Double]](maxCapacity) for(i <- 0 until r.length) { val vector = r.getAs[Vector](i) vector.foreachActive((index, value) => { val name = names(i).getOrElse(index, s"${vectors(i).name}_$index") accumulator.changeValue( name, Array.tabulate(arraySize) {ind => if(i == ind) value else Double.NaN}, v => {v(i) = value; v}) }) } accumulator.map(x => new GenericRowWithSchema( (Seq(x._1) ++ x._2.toSeq.map(v => if (v.isNaN) null else v)).toArray, resultSchema)).toArray } val vectorsStruct = functions.struct(vectors.map(f => dataset(f.name)): _*) val explodeUDF = SparkSqlUtils.customUDF(explodeVectors, ArrayType(resultSchema), Some(Seq(vectorsStruct.expr.dataType))) val expression = functions.explode(explodeUDF(vectorsStruct)) dataset .withColumn(uid, expression) .select( dataset.schema.fields.filterNot(_.dataType.isInstanceOf[VectorUDT]).map(f => dataset(f.name)) ++ resultSchema.fields.map(f => functions.expr(s"$uid.${f.name}").as(f.name)) :_*) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(schema.fields.map(x => x.dataType match { case vector: VectorUDT => StructField(x.name, typeFromVector(x)) case _ => x } )) def typeFromVector(field: StructField): StructType = { val attributes = AttributeGroup.fromStructField(field) StructType(attributes.attributes .map(_.map(a => a.name.getOrElse(s"_${a.index.get}"))) .getOrElse(Array.tabulate(attributes.size) { i => s"_$i" }) .map(name => StructField(name, DoubleType, nullable = false))) } }
Example 50
Source File: RegressionEvaluator.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl

// Imports were omitted in the listing; the ones below are restored so the class compiles.
// Evaluator is pravda-ml's own base class in this package, and SparkSqlUtils comes from
// the same project (see the VectorExplode example above).
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.{BooleanParam, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.odkl.SparkSqlUtils
import org.apache.spark.sql.types.{DoubleType, StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row}

import scala.util.Try

class RegressionEvaluator(override val uid: String)
  extends Evaluator[RegressionEvaluator](uid) {

  val throughOrigin = new BooleanParam(this, "throughOrigin",
    "True if the regression is through the origin. For example, in " +
      "linear regression, it will be true without fitting intercept.")

  def setThroughOrigin(value: Boolean): this.type = set(throughOrigin, value)

  def getThroughOrigin: Boolean = $(throughOrigin)

  def this() = this(Identifiable.randomUID("regressionEvaluator"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    try {
      val predictions: RDD[(Double, Double)] = dataset.select($(predictionCol), $(labelCol))
        .rdd.map { case Row(score: Double, label: Double) => (score, label) }

      val metrics = Try(new RegressionMetrics(predictions))

      val rows = metrics.toOption.map(m => Seq(
        "r2" -> m.r2,
        "rmse" -> m.rootMeanSquaredError,
        "explainedVariance" -> m.explainedVariance,
        "meanAbsoluteError" -> m.meanAbsoluteError,
        "meanSquaredError" -> m.meanSquaredError
      ).map(Row.fromTuple)).getOrElse(Seq())

      SparkSqlUtils.reflectionLock.synchronized(
        dataset.sqlContext.createDataFrame(
          dataset.sparkSession.sparkContext.parallelize(rows, 1),
          transformSchema(dataset.schema)))
    } catch {
      // Most probably evaluation dataset is empty
      case e: Exception =>
        logWarning("Failed to calculate metrics due to " + e.getMessage)
        SparkSqlUtils.reflectionLock.synchronized(
          dataset.sqlContext.createDataFrame(
            dataset.sparkSession.sparkContext.emptyRDD[Row],
            transformSchema(dataset.schema)))
    }
  }

  override def copy(extra: ParamMap): RegressionEvaluator = {
    copyValues(new RegressionEvaluator(), extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    new StructType()
      .add("metric", StringType, nullable = false)
      .add("value", DoubleType, nullable = false)
  }
}
Example 51
Source File: BinaryClassificationEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
Example 52
Source File: Chunk2Doc.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT} import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} class Chunk2Doc(override val uid: String) extends AnnotatorModel[Chunk2Doc] { def this() = this(Identifiable.randomUID("CHUNK2DOC")) override val outputAnnotatorType: AnnotatorType = DOCUMENT override val inputAnnotatorTypes: Array[String] = Array(CHUNK) override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = { annotations.map(annotation => { Annotation( outputAnnotatorType, annotation.begin, annotation.end, annotation.result, annotation.metadata ) }) } } object Chunk2Doc extends DefaultParamsReadable[Chunk2Doc]
Example 53
Source File: Cleaner.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions.clean import com.hankcs.hanlp.HanLP import config.paramconf.{HasOutputCol, HasInputCol} import functions.MySchemaUtils import functions.clean.chinese.BCConvert import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, Dataset} setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1) override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema, logging = true) val cleanFunc = udf {line: String => var cleaned = "" getFanJian match { case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line) case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line) case _ => cleaned = line } getQuanBan match { case "q2b" => cleaned = BCConvert.qj2bj(cleaned) case "b2q" => cleaned = BCConvert.bj2qj(cleaned) case _ => cleaned = cleaned } cleaned } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record => val outputIndex = record.fieldIndex($(outputCol)) record.getString(outputIndex).length >= getMinLineLen } } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.typeName.equals(StringType.typeName), s"Input type must be StringType but got $inputType.") MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable) } } object Cleaner extends DefaultParamsReadable[Cleaner] { override def load(path: String): Cleaner = super.load(path) }
Example 54
Source File: HoltWintersBestModelFinder.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.UberHoltWintersModel import org.apache.spark.ml.evaluation.TimeSeriesEvaluator import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasGroupByCol import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import scala.reflect.ClassTag class HoltWintersBestModelFinder[G]( override val uid: String )(implicit kt: ClassTag[G]) extends HoltWintersBestModelEvaluation[G, HoltWintersModel[G]] with DefaultParamsWritable with HasGroupByCol with TimeSeriesBestModelFinder { def setTimeSeriesEvaluator(eval: TimeSeriesEvaluator[G]): this.type = set(timeSeriesEvaluator, eval) def setEstimatorParamMaps(value: Array[ParamMap]): this.type = set(estimatorParamMaps, value) def setNFutures(value: Int): this.type = set(nFutures, value) override def setValidationCol(value: String): this.type = set(validationCol, value) def setLabelCol(label: String): this.type = set(labelCol, label) def setGroupByCol(groupBy: String): this.type = set(groupByCol, Some(groupBy)) def this()(implicit kt: ClassTag[G]) = this(Identifiable.randomUID("arima")) def modelEvaluation( idModels: RDD[(G, Row, Option[UberHoltWintersModel])] ): RDD[(G, (UberHoltWintersModel, ModelParamEvaluation[G]))] = { val eval = $(timeSeriesEvaluator) val broadcastEvaluator = idModels.context.broadcast(eval) idModels.filter(_._3.isDefined).map { case (id, row, models) => val evaluatedModels = models.map { model => holtWintersEvaluation(row, model, broadcastEvaluator, id) }.head log.warn(s"best model reach ${evaluatedModels._2.metricResult}") (id, evaluatedModels) } } override protected def train(dataSet: Dataset[_]): HoltWintersModel[G] = { val splitDs = split(dataSet, $(nFutures)) val idModels = splitDs.rdd.map(train) new HoltWintersModel[G](uid, modelEvaluation(idModels)) .setValidationCol($(validationCol)) .asInstanceOf[HoltWintersModel[G]] } def train(row: Row): (G, Row, Option[UberHoltWintersModel]) = { val id = row.getAs[G]($(groupByCol).get) val result = try { val dense = row.getAs[org.apache.spark.ml.linalg.DenseVector]($(featuresCol)) val ts:org.apache.spark.mllib.linalg.Vector = org.apache.spark.mllib.linalg.Vectors.dense(dense.toArray); Some( UberHoltWintersModel.fitModelWithBOBYQA(ts, $(nFutures)) ) } catch { case e: Exception => log.error( s"Got the following Exception ${e.getLocalizedMessage} in id $id" ) None } (id, row, result) } } object HoltWintersBestModelFinder extends DefaultParamsReadable[HoltWintersBestModelFinder[_]] { override def load(path: String): HoltWintersBestModelFinder[_] = super.load(path) }
Example 55
Source File: AllColumnsTimeSeriesGenerator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import scala.reflect.ClassTag def setOutputCol(value: String): this.type = set(outputCol, value) // override def transform(dataSet: DataFrame): DataFrame = { override def transform(dataSet: Dataset[_] ): DataFrame = { val rdd = dataSet.rdd val sparkContext = dataSet.sqlContext.sparkContext val labelColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(labelCol))) val keyValueDataSet = rdd.map { case (row: Row) => Row( row.getAs[T](labelColIndex.value), row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol)) ) } val trainSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(keyValueDataSet, trainSchema) } override def transformSchema(schema: StructType): StructType = { StructType( schema.filter(_.name == $(labelCol)).head +: Seq( StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT) ) ) } override def copy(extra: ParamMap): AllColumnsTimeSeriesGenerator[T, U] = defaultCopy(extra) } object AllColumnsTimeSeriesGenerator extends DefaultParamsReadable[AllColumnsTimeSeriesGenerator[_, _]] { override def load(path: String): AllColumnsTimeSeriesGenerator[_, _] = super.load(path) }
Example 56
Source File: VectorizeEncoder.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.core.data.DataTransformer import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, StructType} class VectorizeEncoder(override val uid: String) extends Transformer with HasIdCol with HasTimeCol with HasInputCols with HasLabelCol with HasGroupByCol with HasOutputCol with DefaultParamsWritable { def this() = this(Identifiable.randomUID("vectorizer")) def setIdCol(input: String) = set(idCol, input) def setLabelCol(input: String) = set(labelCol, input) def setGroupByCol(toGroupBy: String) = set(groupByCol, Some(toGroupBy)) def setInputCol(input: Array[String]) = set(inputCols, input) def setTimeCol(time: String) = set(timeCol, Some(time)) def setOutputCol(output: String) = set(outputCol, output) override def transform(dataSet: Dataset[_]): DataFrame = { val context = dataSet.sqlContext.sparkContext val input = context.broadcast($(inputCols)) val allColumnNames = dataSet.schema.map(_.name) val nonInputColumnIndexes = context.broadcast( allColumnNames.zipWithIndex.filter( f => !$(inputCols).contains(f._1) || f._1 == $(groupByCol).get || f._1 == $(idCol) || f._1 == $(timeCol).getOrElse(""))) val result = dataSet.rdd.map { case (row: Row) => val rowSeq = row.toSeq val nonInputColumns = nonInputColumnIndexes.value.map { case (_, index) => rowSeq(index) } val size = input.value.length val (values, indices) = input.value .filter(col => row.getAs(col) != null) .map { column => DataTransformer.toDouble(row.getAs(column)) } .zipWithIndex .filter(f => f._1 != 0d) .unzip Row( nonInputColumns :+ org.apache.spark.ml.linalg.Vectors .sparse(size, indices.toArray, values.toArray): _* ) } val newSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(result, newSchema) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType( schema.filter( col => !$(inputCols).contains(col.name) || col.name == $(groupByCol).getOrElse("") || col.name == $(idCol) || col.name == $(labelCol) || col.name == $(timeCol).getOrElse("") ) ).add(StructField($(outputCol), new VectorUDT)) }
Example 57
Source File: MovingAverage.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.ml.param.{IntParam, ParamMap} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types._ def setOutputCol(value: String): this.type = set(outputCol, value) setDefault(windowSize -> 3) override def transform(dataSet: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataSet.schema) val sparkContext = dataSet.sqlContext.sparkContext val inputType = outputSchema($(inputCol)).dataType val inputTypeBr = sparkContext.broadcast(inputType) val dataSetRdd = dataSet.rdd val inputColName = sparkContext.broadcast($(inputCol)) val inputColIndex = dataSet.columns.indexOf($(inputCol)) val inputColIndexBr = sparkContext.broadcast(inputColIndex) val windowSizeBr = sparkContext.broadcast($(windowSize)) val maRdd = dataSetRdd.map { case (row: Row) => val (array, rawValue) = if (inputTypeBr.value.isInstanceOf[VectorUDT]) { val vector = row.getAs[org.apache.spark.ml.linalg.Vector](inputColName.value) (vector.toArray, Vectors.dense(vector.toArray.drop(windowSizeBr.value - 1))) } else { val iterable = row.getAs[Iterable[Double]](inputColName.value) (iterable.toArray, Vectors.dense(iterable.toArray.drop(windowSizeBr.value - 1))) } val (before, after) = row.toSeq.splitAt(inputColIndexBr.value) Row( (before :+ rawValue) ++ after.tail :+ MovingAverageCalc .simpleMovingAverageArray(array, windowSizeBr.value): _* ) } dataSet.sqlContext.createDataFrame(maRdd, outputSchema) } override def transformSchema(schema: StructType): StructType = { schema.add(StructField($(outputCol), ArrayType(DoubleType))) } override def copy(extra: ParamMap): MovingAverage[T] = defaultCopy(extra) } object MovingAverageCalc { private[ml] def simpleMovingAverageArray(values: Array[Double], period: Int): Array[Double] = { (for (i <- 1 to values.length) yield //TODO rollback this comment with the right size of features to make the meanaverage return // the features values for the first values of the calc if (i < period) 0d //values(i) else values.slice(i - period, i).sum / period).toArray.dropWhile(_ == 0d) } } object MovingAverage extends DefaultParamsReadable[MovingAverage[_]] { override def load(path: String): MovingAverage[_] = super.load(path) }
Example 58
Source File: TimeSeriesGenerator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasGroupByCol import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, StructType} import scala.reflect.ClassTag def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataSet: Dataset[_]): DataFrame = { val rdd = dataSet.rdd val sparkContext = dataSet.sqlContext.sparkContext val index = sparkContext.broadcast(dataSet.schema.fieldIndex($(timeCol).get)) val labelColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(groupByCol).get)) val featuresColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(featuresCol))) val grouped = rdd.map { case (row: Row) => val timeColRow = IUberdataForecastUtil.convertColumnToLong(row, index.value) convertColumnToDouble(timeColRow, featuresColIndex) }.groupBy { row => row.getAs[L](labelColIndex.value) }.map { case (key, values) => val toBeUsed = values.toArray.sortBy(row => row.getAs[Long](index.value)) (key, toBeUsed) } val toBeTrained = grouped.map { case (key, values) => org.apache.spark.sql.Row( key, Vectors.dense(values.map(_.getAs[Double](featuresColIndex.value))) ) } val trainSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(toBeTrained, trainSchema) } override def transformSchema(schema: StructType): StructType = { val labelIndex = schema.fieldIndex($(groupByCol).get) StructType( Seq( schema.fields(labelIndex), StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT) ) ) } override def copy(extra: ParamMap): TimeSeriesGenerator[L] = defaultCopy(extra) } object TimeSeriesGenerator extends DefaultParamsReadable[TimeSeriesGenerator[_]] { override def load(path: String): TimeSeriesGenerator[_] = super.load(path) }
Example 59
Source File: XGBoost.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import eleflow.uberdata.models.UberXGBOOSTModel import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.scala.DMatrix import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{ArrayType, DoubleType, StructField, StructType} import scala.reflect.ClassTag class XGBoost[I](override val uid: String, val models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))])( implicit kt: ClassTag[I], ord: Ordering[I] = null) extends ForecastBaseModel[XGBoostSmallModel[I]] with HasInputCol with HasOutputCol with DefaultParamsWritable with HasFeaturesCol with HasNFutures with HasGroupByCol { def this( models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))] )(implicit kt: ClassTag[I], ord: Ordering[I] ) = this(Identifiable.randomUID("xgboost"), models) override def transform(dataSet: Dataset[_]): DataFrame = { val schema = dataSet.schema val predSchema = transformSchema(schema) val joined = models.join(dataSet.rdd.map{case (r: Row) => (r.getAs[I]($(groupByCol).get), r)}) val predictions = joined.map { case (id, ((bestModel, metrics), row)) => val features = row.getAs[Array[org.apache.spark.ml.linalg.Vector]]( IUberdataForecastUtil.FEATURES_COL_NAME ) val label = DataTransformer.toFloat(row.getAs($(featuresCol))) val labelPoint = features.map { vec => val array = vec.toArray.map(_.toFloat) LabeledPoint(label, null, array) } val matrix = new DMatrix(labelPoint.toIterator) val (ownFeaturesPrediction, forecast) = bestModel.boosterInstance .predict(matrix) .flatMap(_.map(_.toDouble)) .splitAt(features.length) Row( row.toSeq :+ Vectors .dense(forecast) :+ SupportedAlgorithm.XGBoostAlgorithm.toString :+ bestModel.params .map(f => f._1 -> f._2.toString) :+ Vectors.dense(ownFeaturesPrediction): _* ) } dataSet.sqlContext.createDataFrame(predictions, predSchema) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { schema.add(StructField($(outputCol), ArrayType(DoubleType))) } override def copy(extra: ParamMap): XGBoostSmallModel[I] = defaultCopy(extra) }
Example 60
Source File: RWrapperUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.spark.internal.Logging import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute} import org.apache.spark.ml.feature.{RFormula, RFormulaModel} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.Dataset private[r] object RWrapperUtils extends Logging { def getFeaturesAndLabels( rFormulaModel: RFormulaModel, data: Dataset[_]): (Array[String], Array[String]) = { val schema = rFormulaModel.transform(data).schema val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol)) .attributes.get val features = featureAttrs.map(_.name.get) val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol)) .asInstanceOf[NominalAttribute] val labels = labelAttr.values.get (features, labels) } }
Example 61
Source File: RegressionEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
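Typical usage on a regression model's predictions; the inline DataFrame simply stands in for the output of model.transform, and a SparkSession named spark is assumed.

val predictions = spark.createDataFrame(Seq(
  (2.5, 3.0),
  (0.0, -0.5),
  (2.0, 2.0)
)).toDF("prediction", "label")

val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("rmse")

println(s"RMSE = ${evaluator.evaluate(predictions)}")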
Example 62
Source File: MulticlassClassificationEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 63
Source File: RecursivePipeline.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import org.apache.spark.internal.Logging import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter} import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset} import scala.collection.mutable.ListBuffer class RecursivePipeline(override val uid: String, baseStages: Array[PipelineStage]) extends Pipeline { def this() = this(Identifiable.randomUID("RECURSIVE_PIPELINE"), Array.empty) def this(uid: String) = this(uid, Array.empty) def this(pipeline: Pipeline) = this(pipeline.uid, pipeline.getStages) this.setStages(baseStages) override def fit(dataset: Dataset[_]): PipelineModel = { transformSchema(dataset.schema, logging = true) val theStages = $(stages) var indexOfLastEstimator = -1 theStages.view.zipWithIndex.foreach { case (stage, index) => stage match { case _: Estimator[_] => indexOfLastEstimator = index case _ => } } var curDataset = dataset val transformers = ListBuffer.empty[Transformer] theStages.view.zipWithIndex.foreach { case (stage, index) => if (index <= indexOfLastEstimator) { val transformer = stage match { case estimator: HasRecursiveFit[_] => estimator.recursiveFit(curDataset, new Pipeline(uid).setStages(transformers.toArray).fit(dataset)) case estimator: Estimator[_] => estimator.fit(curDataset) case t: Transformer => t case _ => throw new IllegalArgumentException( s"Does not support stage $stage of type ${stage.getClass}") } if (index < indexOfLastEstimator) { curDataset = transformer.transform(curDataset) } transformers += transformer } else { transformers += stage.asInstanceOf[Transformer] } } createPipeline(dataset, transformers.toArray) } } class RecursivePipelineModel(override val uid: String, innerPipeline: PipelineModel) extends Model[RecursivePipelineModel] with MLWritable with Logging { def this(pipeline: PipelineModel) = this(pipeline.uid, pipeline) // drops right at most because is itself included private def createRecursiveAnnotators(dataset: Dataset[_]): PipelineModel = new Pipeline(uid).setStages(innerPipeline.stages.dropRight(1)).fit(dataset) override def copy(extra: ParamMap): RecursivePipelineModel = { new RecursivePipelineModel(uid, innerPipeline.copy(extra)) } override def write: MLWriter = { innerPipeline.write } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) innerPipeline.stages.foldLeft(dataset.toDF)((cur, transformer) => transformer match { case t: HasRecursiveTransform[_] => t.recursiveTransform(cur, createRecursiveAnnotators(dataset)) case t: AnnotatorModel[_] if t.getLazyAnnotator => cur case t: Transformer => t.transform(cur) }) } override def transformSchema(schema: StructType): StructType = { innerPipeline.transformSchema(schema) } }
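A sketch of assembling a RecursivePipeline: it is a drop-in replacement for Pipeline, so ordinary spark-nlp stages can be passed to setStages. The DocumentAssembler/Tokenizer stages and the sample data are illustrative assumptions, and a SparkSession named spark is assumed.

import com.johnsnowlabs.nlp.DocumentAssembler
import com.johnsnowlabs.nlp.annotators.Tokenizer
import spark.implicits._

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val tokenizer = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")

val pipeline = new RecursivePipeline()
  .setStages(Array(documentAssembler, tokenizer))

val data = Seq("Spark NLP annotators can run inside a recursive pipeline.").toDF("text")
val model = pipeline.fit(data)
model.transform(data).show()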
Example 64
Source File: ElementwiseProduct.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.sql.types.DataType @Since("2.0.0") def getScalingVec: Vector = getOrDefault(scalingVec) override protected def createTransformFunc: Vector => Vector = { require(params.contains(scalingVec), s"transformation requires a weight vector") val elemScaler = new feature.ElementwiseProduct($(scalingVec)) v => elemScaler.transform(v) } override protected def outputDataType: DataType = new VectorUDT() } @Since("2.0.0") object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { @Since("2.0.0") override def load(path: String): ElementwiseProduct = super.load(path) }
Example 65
Source File: GaussianProcessRegression.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression import breeze.linalg.{DenseVector => BDV, _} import org.apache.spark.internal.Logging import org.apache.spark.ml.commons._ import org.apache.spark.ml.commons.kernel.Kernel import org.apache.spark.ml.commons.util._ import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{Identifiable, Instrumentation} import org.apache.spark.rdd.RDD import org.apache.spark.sql.Dataset class GaussianProcessRegression(override val uid: String) extends Regressor[Vector, GaussianProcessRegression, GaussianProcessRegressionModel] with GaussianProcessParams with GaussianProcessCommons[Vector, GaussianProcessRegression, GaussianProcessRegressionModel] with Logging { def this() = this(Identifiable.randomUID("gaussProcessReg")) override protected def train(dataset: Dataset[_]): GaussianProcessRegressionModel = { val instr = Instrumentation.create(this, dataset) val points: RDD[LabeledPoint] = getPoints(dataset).cache() val expertLabelsAndKernels: RDD[(BDV[Double], Kernel)] = getExpertLabelsAndKernels(points).cache() val optimalHyperparameters = optimizeHypers(instr, expertLabelsAndKernels, likelihoodAndGradient) expertLabelsAndKernels.foreach(_._2.setHyperparameters(optimalHyperparameters)) produceModel(instr, points, expertLabelsAndKernels, optimalHyperparameters) } private def likelihoodAndGradient(yAndK : (BDV[Double], Kernel), x : BDV[Double]) = { val (y: BDV[Double], kernel : Kernel) = yAndK kernel.setHyperparameters(x) val (k, derivative) = kernel.trainingKernelAndDerivative() val (_, logdet, kinv) = logDetAndInv(k) val alpha = kinv * y val likelihood = 0.5 * (y.t * alpha) + 0.5 * logdet val alphaAlphaTMinusKinv = alpha * alpha.t alphaAlphaTMinusKinv -= kinv val gradient = derivative.map(derivative => -0.5 * sum(derivative *= alphaAlphaTMinusKinv)) (likelihood, BDV(gradient:_*)) } override def copy(extra: ParamMap): GaussianProcessRegression = defaultCopy(extra) override protected def createModel(uid: String, rawPredictor: GaussianProjectedProcessRawPredictor): GaussianProcessRegressionModel = new GaussianProcessRegressionModel(uid, rawPredictor) } class GaussianProcessRegressionModel private[regression](override val uid: String, private val gaussianProjectedProcessRawPredictor: GaussianProjectedProcessRawPredictor) extends RegressionModel[Vector, GaussianProcessRegressionModel] { override protected def predict(features: Vector): Double = { gaussianProjectedProcessRawPredictor.predict(features)._1 } override def copy(extra: ParamMap): GaussianProcessRegressionModel = { val newModel = copyValues(new GaussianProcessRegressionModel(uid, gaussianProjectedProcessRawPredictor), extra) newModel.setParent(parent) } }
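Because GaussianProcessRegression extends Regressor, the standard Predictor API applies; the train and test DataFrames (with "features" and "label" columns) and any kernel settings from GaussianProcessParams are assumptions here, not shown in the listing.

// train, test: DataFrames with a Vector "features" column and a Double "label" column (assumed)
val gpr = new GaussianProcessRegression()
  .setFeaturesCol("features")
  .setLabelCol("label")

val model = gpr.fit(train)
val predictions = model.transform(test)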
Example 66
Source File: MultinomialLabeler.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.MultinomialLabelerModel import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasFeaturesCol import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types._ import org.apache.spark.sql.functions.{udf, col} import ml.combust.mleap.core.util.VectorConverters._ class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"), val model: MultinomialLabelerModel) extends Transformer with HasFeaturesCol with HasProbabilitiesCol with HasLabelsCol { def setFeaturesCol(value: String): this.type = set(featuresCol, value) def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value) def setLabelsCol(value: String): this.type = set(labelsCol, value) @org.apache.spark.annotation.Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val probabilitiesUdf = udf { (vector: Vector) => model.top(vector).map(_._1).toArray } val labelsUdf = udf { (vector: Vector) => model.topLabels(vector).toArray } dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))). withColumn($(labelsCol), labelsUdf(col($(featuresCol)))) } override def copy(extra: ParamMap): Transformer = copyValues(new MultinomialLabeler(uid, model), extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT], s"Features column must be of type NumericType but got ${schema($(featuresCol)).dataType}") val inputFields = schema.fields require(!inputFields.exists(_.name == $(probabilitiesCol)), s"Output column ${$(probabilitiesCol)} already exists.") require(!inputFields.exists(_.name == $(labelsCol)), s"Output column ${$(labelsCol)} already exists.") StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)), StructField($(labelsCol), ArrayType(StringType)))) } }
Example 67
Source File: MathUnary.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation} import org.apache.hadoop.fs.Path import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType} import org.apache.spark.sql.functions.udf private val className = classOf[MathUnary].getName override def load(path: String): MathUnary = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath).select("operation").head() val operation = data.getAs[String](0) val model = MathUnaryModel(UnaryOperation.forName(operation)) val transformer = new MathUnary(metadata.uid, model) metadata.getAndSetParams(transformer) transformer } } }
Example 68
Source File: Sampler.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature.preprocess import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.Identifiable import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import scala.util.Random class Sampler(fraction: Double, override val uid: String, seed: Int = Random.nextInt) extends Transformer { def this(fraction: Double) = this(fraction, Identifiable.randomUID("sampler")) final def getOutputCol: String = $(inputCol) override def transform(dataset: Dataset[_]): DataFrame = { dataset.sample(false, fraction, seed).toDF } override def transformSchema(schema: StructType): StructType = { schema } override def copy(extra: ParamMap): Sampler = defaultCopy(extra) } object Sampler { def main(args: Array[String]): Unit = { val ss = SparkSession .builder .master("local") .appName("preprocess") .getOrCreate() val training = ss.read.format("libsvm") .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt") println(training.count) val sampler = new Sampler(0.5) .setInputCol("features") val pipeline = new Pipeline() .setStages(Array(sampler)) val model = pipeline.fit(training) val test = ss.read.format("libsvm") .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt") model.transform(test).select("*") .collect() .foreach { case Row(label: Double, vector: Vector) => println(s"($label, " + s"${vector.toSparse.indices.mkString("[", ",", "]")}, " + s"${vector.toSparse.values.mkString("[", ",", "]")}") } ss.stop() } }
Example 69
Source File: DLImageTransformer.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dlframes import com.intel.analytics.bigdl.dataset.Transformer import com.intel.analytics.bigdl.transform.vision.image.{FeatureTransformer, ImageFeature, MatToTensor} import org.apache.spark.ml.DLTransformerBase import org.apache.spark.ml.adapter.{HasInputCol, HasOutputCol, SchemaUtils} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row} class DLImageTransformer ( val transformer: Transformer[ImageFeature, ImageFeature], override val uid: String) extends DLTransformerBase with HasInputCol with HasOutputCol { def this(transformer: FeatureTransformer) = this(transformer, Identifiable.randomUID("DLImageTransformer")) setDefault(inputCol -> "image") def setInputCol(value: String): this.type = set(inputCol, value) setDefault(outputCol -> "output") def setOutputCol(value: String): this.type = set(outputCol, value) protected def validateInputType(inputType: DataType): Unit = { val validTypes = Array(DLImageSchema.floatSchema, DLImageSchema.byteSchema) require(validTypes.exists(t => SchemaUtils.sameType(inputType, t)), s"Bad input type: $inputType. Requires ${validTypes.mkString(", ")}") } override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType validateInputType(inputType) if (schema.fieldNames.contains($(outputCol))) { throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.") } val outputFields = schema.fields :+ StructField($(outputCol), DLImageSchema.floatSchema, nullable = false) StructType(outputFields) } protected override def internalTransform(dataFrame: DataFrame): DataFrame = { transformSchema(dataFrame.schema, logging = true) val sc = dataFrame.sqlContext.sparkContext val localTransformer = this.transformer val transformerBC = sc.broadcast(localTransformer) val toTensorBC = sc.broadcast(MatToTensor[Float](shareBuffer = true)) val inputColIndex = dataFrame.schema.fieldIndex($(inputCol)) val resultRDD = dataFrame.rdd.mapPartitions { rowIter => val localTransformer = transformerBC.value.cloneTransformer() val toTensorTransformer = toTensorBC.value.cloneTransformer().asInstanceOf[MatToTensor[Float]] rowIter.map { row => val imf = DLImageSchema.row2IMF(row.getAs[Row](inputColIndex)) val output = localTransformer.apply(Iterator(imf)).toArray.head if (!output.contains(ImageFeature.imageTensor)) { toTensorTransformer.transform(output) } Row.fromSeq(row.toSeq ++ Seq(DLImageSchema.imf2Row(output))) } } val resultSchema = transformSchema(dataFrame.schema) dataFrame.sqlContext.createDataFrame(resultRDD, resultSchema) } }
Example 70
Source File: DLClassifier.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dlframes

import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.bigdl.{Criterion, Module}
import org.apache.spark.ml.adapter.SchemaUtils
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types._

import scala.reflect.ClassTag

@deprecated("`DLClassifierModel` is deprecated." +
  "com.intel.analytics.bigdl.dlframes is deprecated in BigDL 0.11, " +
  "and will be removed in future releases", "0.10.0")
class DLClassifierModel[T: ClassTag](
    @transient override val model: Module[T],
    featureSize: Array[Int],
    override val uid: String = "DLClassifierModel"
  )(implicit ev: TensorNumeric[T]) extends DLModel[T](model, featureSize) {

  protected override def outputToPrediction(output: Tensor[T]): Any = {
    if (output.size().deep == Array(1).deep) {
      val raw = ev.toType[Double](output.toArray().head)
      if (raw > 0.5) 1.0 else 0.0
    } else {
      ev.toType[Double](output.max(1)._2.valueAt(1))
    }
  }

  override def transformSchema(schema: StructType): StructType = {
    validateDataType(schema, $(featuresCol))
    SchemaUtils.appendColumn(schema, $(predictionCol), DoubleType)
  }
}
Example 71
Source File: RWrapperUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r

import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.Dataset

object RWrapperUtils extends Logging {

  def checkDataColumns(rFormula: RFormula, data: Dataset[_]): Unit = {
    if (data.schema.fieldNames.contains(rFormula.getFeaturesCol)) {
      val newFeaturesName = s"${Identifiable.randomUID(rFormula.getFeaturesCol)}"
      logWarning(s"data containing ${rFormula.getFeaturesCol} column, " +
        s"using new name $newFeaturesName instead")
      rFormula.setFeaturesCol(newFeaturesName)
    }
  }
}
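A small, hedged sketch of the rename behavior: if the input already has a column named "features", checkDataColumns points the RFormula at a freshly generated output column instead. The data and formula below are illustrative.

import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.r.RWrapperUtils
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("rwrapper-sketch").master("local[*]").getOrCreate()
import spark.implicits._

// The input already contains a "features" column, which would clash with RFormula's default output
val df = Seq((1.0, 2.0, 0.0), (0.0, 1.0, 1.0)).toDF("features", "x", "label")

val formula = new RFormula().setFormula("label ~ x")
RWrapperUtils.checkDataColumns(formula, df)

// After the check, RFormula writes to a randomUID-based column name instead of "features"
println(formula.getFeaturesCol)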
Example 72
Source File: RegressionEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
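A minimal usage sketch on a hand-made predictions DataFrame; "prediction" and "label" are the evaluator's default column names, and the values are arbitrary.

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("regression-evaluator-sketch").master("local[*]").getOrCreate()
import spark.implicits._

// (prediction, label) pairs
val predictions = Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0))
  .toDF("prediction", "label")

val evaluator = new RegressionEvaluator().setMetricName("rmse")
println(s"RMSE = ${evaluator.evaluate(predictions)}")

// Larger is not better for RMSE, which matters when this evaluator drives model selection
println(s"isLargerBetter = ${evaluator.isLargerBetter}")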
Example 73
Source File: MulticlassClassificationEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 74
Source File: BinaryClassificationEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
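A hedged usage sketch with a toy scored DataFrame; as the pattern match above shows, rawPrediction may be a two-element score vector (the usual classifier output) or a plain Double.

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("binary-evaluator-sketch").master("local[*]").getOrCreate()
import spark.implicits._

val scored = Seq(
  (Vectors.dense(0.1, 0.9), 1.0),
  (Vectors.dense(0.8, 0.2), 0.0),
  (Vectors.dense(0.4, 0.6), 1.0),
  (Vectors.dense(0.7, 0.3), 1.0)
).toDF("rawPrediction", "label")

val evaluator = new BinaryClassificationEvaluator().setMetricName("areaUnderROC")
println(s"AUC = ${evaluator.evaluate(scored)}")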
Example 75
Source File: UDFTransformer.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasInputCols, HasOutputCol, Wrappable} import com.microsoft.ml.spark.core.env.InternalWrapper import com.microsoft.ml.spark.core.serialize.ComplexParam import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param.{ParamMap, UDFParam, UDPyFParam} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.execution.python.UserDefinedPythonFunction import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.types.{DataType, StructField, StructType} import org.apache.spark.sql.{Column, DataFrame, Dataset} import org.apache.spark.sql.functions.col object UDFTransformer extends ComplexParamsReadable[UDFTransformer] override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) if (isSet(inputCol)) { dataset.withColumn(getOutputCol, applyUDF(dataset.col(getInputCol))) } else { dataset.withColumn(getOutputCol, applyUDFOnCols(getInputCols.map(col): _*)) } } def validateAndTransformSchema(schema: StructType): StructType = { if (isSet(inputCol)) schema(getInputCol) else schema(Set(getInputCols: _*)) schema.add(StructField(getOutputCol, getDataType)) } def transformSchema(schema: StructType): StructType = validateAndTransformSchema(schema) def copy(extra: ParamMap): UDFTransformer = defaultCopy(extra) }
Example 76
Source File: HashingTF.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.feature import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} def setNumFeatures(value: Int): this.type = set(numFeatures, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)) val t = udf { terms: Seq[_] => hashingTF.transform(terms) } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) }
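A usage sketch pairing HashingTF with Spark's Tokenizer. It is written against a recent Spark build, but the setters match the iolap fork above; the 1024-feature setting is arbitrary.

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("hashing-tf-sketch").master("local[*]").getOrCreate()
import spark.implicits._

val sentences = Seq("spark ml hashing tf", "hashing trick maps terms to indices").toDF("text")

val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("tf")
  .setNumFeatures(1 << 10)

// Hash each token into one of 1024 buckets and count occurrences per bucket
val tf = hashingTF.transform(tokenizer.transform(sentences))
tf.select("words", "tf").show(truncate = false)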
Example 77
Source File: Tokenizer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types.{ArrayType, DataType, StringType} def getPattern: String = $(pattern) setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+") override protected def createTransformFunc: String => Seq[String] = { str => val re = $(pattern).r val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq val minLength = $(minTokenLength) tokens.filter(_.length >= minLength) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType == StringType, s"Input type must be string type but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra) }
Example 78
Source File: RWrapperUtils.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r

import org.apache.spark.internal.Logging
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute}
import org.apache.spark.ml.feature.{RFormula, RFormulaModel}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.Dataset

private[r] object RWrapperUtils extends Logging {

  def getFeaturesAndLabels(
      rFormulaModel: RFormulaModel,
      data: Dataset[_]): (Array[String], Array[String]) = {
    val schema = rFormulaModel.transform(data).schema
    val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol))
      .attributes.get
    val features = featureAttrs.map(_.name.get)
    val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol))
      .asInstanceOf[NominalAttribute]
    val labels = labelAttr.values.get
    (features, labels)
  }
}
Example 79
Source File: RegressionEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
Example 80
Source File: MulticlassClassificationEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 81
Source File: BinaryClassificationEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
Example 82
Source File: ElementwiseProduct.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.sql.types.DataType @Since("2.0.0") def getScalingVec: Vector = getOrDefault(scalingVec) override protected def createTransformFunc: Vector => Vector = { require(params.contains(scalingVec), s"transformation requires a weight vector") val elemScaler = new feature.ElementwiseProduct($(scalingVec)) v => elemScaler.transform(v) } override protected def outputDataType: DataType = new VectorUDT() } @Since("2.0.0") object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { @Since("2.0.0") override def load(path: String): ElementwiseProduct = super.load(path) }
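A short usage sketch: each input vector is multiplied component-wise by the supplied scaling vector. The data and weights are illustrative.

import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("elementwise-product-sketch").master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq(
  (Vectors.dense(1.0, 2.0, 3.0), "a"),
  (Vectors.dense(4.0, 5.0, 6.0), "b")
).toDF("features", "id")

// The weight vector zeroes the first component, keeps the second, doubles the third
val scaler = new ElementwiseProduct()
  .setScalingVec(Vectors.dense(0.0, 1.0, 2.0))
  .setInputCol("features")
  .setOutputCol("scaled")

scaler.transform(df).select("features", "scaled").show(truncate = false)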
Example 83
Source File: HTTPTransformer.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.io.http import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol, Wrappable} import com.microsoft.ml.spark.io.http.HandlingUtils.HandlerFunc import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import scala.concurrent.ExecutionContext import scala.concurrent.duration.Duration trait HasHandler extends Params { val handler: UDFParam = new UDFParam( this, "handler", "Which strategy to use when handling requests") override def transform(dataset: Dataset[_]): DataFrame = { val df = dataset.toDF() val enc = RowEncoder(transformSchema(df.schema)) val colIndex = df.schema.fieldNames.indexOf(getInputCol) val fromRow = HTTPRequestData.makeFromRowConverter val toRow = HTTPResponseData.makeToRowConverter df.mapPartitions { it => if (!it.hasNext) { Iterator() }else{ val c = clientHolder.get val responsesWithContext = c.sendRequestsWithContext(it.map{row => c.RequestWithContext(Option(row.getStruct(colIndex)).map(fromRow), Some(row)) }) responsesWithContext.map { rwc => Row.merge(rwc.context.get.asInstanceOf[Row], Row(rwc.response.flatMap(Option(_)).map(toRow).orNull)) } } }(enc) } def copy(extra: ParamMap): HTTPTransformer = defaultCopy(extra) def transformSchema(schema: StructType): StructType = { assert(schema(getInputCol).dataType == HTTPSchema.Request) schema.add(getOutputCol, HTTPSchema.Response, nullable=true) } }
Example 84
Source File: VowpalWabbitInteractions.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.contracts.{HasInputCols, HasOutputCol, Wrappable}
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions.{col, struct, udf}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType

object VowpalWabbitInteractions extends ComplexParamsReadable[VowpalWabbitInteractions]

class VowpalWabbitInteractions(override val uid: String) extends Transformer
  with HasInputCols with HasOutputCol with HasNumBits with HasSumCollisions
  with Wrappable with ComplexParamsWritable {

  def this() = this(Identifiable.randomUID("VowpalWabbitInteractions"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val fieldSubset = dataset.schema.fields
      .filter(f => getInputCols.contains(f.name))

    val mask = getMask

    val mode = udf((r: Row) => {
      // compute the final number of features
      val numElems = (0 until r.length)
        .map(r.getAs[Vector](_).numNonzeros).product

      val newIndices = new Array[Int](numElems)
      val newValues = new Array[Double](numElems)

      // build interaction features using FNV-1
      val fnvPrime = 16777619
      var i = 0

      def interact(idx: Int, value: Double, ns: Int): Unit = {
        if (ns == r.size) {
          newIndices(i) += mask & idx
          newValues(i) += value
          i += 1
        } else {
          val idx1 = idx * fnvPrime

          r.getAs[Vector](ns).foreachActive { case (idx2, value2) =>
            interact(idx1 ^ idx2, value * value2, ns + 1)
          }
        }
      }

      // start the recursion
      interact(0, 1, 0)

      val (indicesSorted, valuesSorted) = VectorUtils.sortAndDistinct(newIndices, newValues, getSumCollisions)

      Vectors.sparse(1 << getNumBits, indicesSorted, valuesSorted)
    })

    dataset.toDF.withColumn(getOutputCol, mode.apply(struct(fieldSubset.map(f => col(f.name)): _*)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val fieldNames = schema.fields.map(_.name)
    for (f <- getInputCols)
      if (!fieldNames.contains(f))
        throw new IllegalArgumentException("missing input column " + f)
      else {
        val fieldType = schema.fields(schema.fieldIndex(f)).dataType

        if (fieldType != VectorType)
          throw new IllegalArgumentException("column " + f + " must be of type Vector but is " + fieldType.typeName)
      }

    schema.add(StructField(getOutputCol, VectorType, true))
  }

  override def copy(extra: ParamMap): VowpalWabbitInteractions = defaultCopy(extra)
}
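A hedged usage sketch. It assumes the mmlspark VW module is on the classpath and that the setInputCols/setOutputCol setters are supplied by the HasInputCols/HasOutputCol contracts mixed in above; the namespaces below would normally be produced by a featurizer such as VectorAssembler or VowpalWabbitFeaturizer.

import com.microsoft.ml.spark.vw.VowpalWabbitInteractions
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("vw-interactions-sketch").master("local[*]").getOrCreate()
import spark.implicits._

// Two namespaces, each already featurized into a Vector column
val df = Seq(
  (Vectors.dense(1.0, 0.0, 2.0), Vectors.dense(0.5, 3.0)),
  (Vectors.dense(0.0, 4.0, 1.0), Vectors.dense(1.0, 0.0))
).toDF("userFeatures", "itemFeatures")

// setInputCols/setOutputCol are assumed to come from the mixed-in contracts; numBits/sumCollisions stay at defaults
val interactions = new VowpalWabbitInteractions()
  .setInputCols(Array("userFeatures", "itemFeatures"))
  .setOutputCol("interactions")

interactions.transform(df).show(truncate = false)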
Example 85
Source File: Lambda.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.Wrappable import org.apache.spark.SparkContext import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param.{ParamMap, UDFParam} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object Lambda extends ComplexParamsReadable[Lambda] { def apply(f: Dataset[_] => DataFrame): Lambda = { new Lambda().setTransform(f) } } class Lambda(val uid: String) extends Transformer with Wrappable with ComplexParamsWritable { def this() = this(Identifiable.randomUID("Lambda")) val transformFunc = new UDFParam(this, "transformFunc", "holder for dataframe function") def setTransform(f: Dataset[_] => DataFrame): this.type = { set(transformFunc, udf(f, StringType)) } def getTransform: Dataset[_] => DataFrame = { $(transformFunc).f.asInstanceOf[Dataset[_] => DataFrame] } val transformSchemaFunc = new UDFParam(this, "transformSchemaFunc", "the output schema after the transformation") def setTransformSchema(f: StructType => StructType): this.type = { set(transformSchemaFunc, udf(f, StringType)) } def getTransformSchema: StructType => StructType = { $(transformSchemaFunc).f.asInstanceOf[StructType => StructType] } override def transform(dataset: Dataset[_]): DataFrame = { getTransform(dataset) } def transformSchema(schema: StructType): StructType = { if (get(transformSchemaFunc).isEmpty) { val sc = SparkContext.getOrCreate() val df = SparkSession.builder().getOrCreate().createDataFrame(sc.emptyRDD[Row], schema) transform(df).schema } else { getTransformSchema(schema) } } def copy(extra: ParamMap): Lambda = defaultCopy(extra) }
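A short usage sketch of the companion `apply`: it wraps an arbitrary DataFrame function so it can sit in a Pipeline next to ordinary stages. The data and filter are illustrative.

import com.microsoft.ml.spark.stages.Lambda
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

val spark = SparkSession.builder().appName("lambda-sketch").master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq((1, "a"), (2, "b"), (3, "c")).toDF("id", "value")

// Wrap an arbitrary DataFrame -> DataFrame function as a Transformer
val dropSmallIds = Lambda(ds => ds.toDF().filter(col("id") > 1))

dropSmallIds.transform(df).show()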
Example 86
Source File: Repartition.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.Wrappable import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.ml.Transformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.types._ object Repartition extends DefaultParamsReadable[Repartition] override def transform(dataset: Dataset[_]): DataFrame = { if (getDisable) dataset.toDF else if (getN < dataset.rdd.getNumPartitions) dataset.coalesce(getN).toDF() else dataset.sqlContext.createDataFrame( dataset.rdd.repartition(getN).asInstanceOf[RDD[Row]], dataset.schema) } def transformSchema(schema: StructType): StructType = { schema } def copy(extra: ParamMap): this.type = defaultCopy(extra) }
Example 87
Source File: ElementwiseProduct.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.sql.types.DataType @Since("2.0.0") def getScalingVec: Vector = getOrDefault(scalingVec) override protected def createTransformFunc: Vector => Vector = { require(params.contains(scalingVec), s"transformation requires a weight vector") val elemScaler = new feature.ElementwiseProduct($(scalingVec)) v => elemScaler.transform(v) } override protected def outputDataType: DataType = new VectorUDT() } @Since("2.0.0") object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { @Since("2.0.0") override def load(path: String): ElementwiseProduct = super.load(path) }
Example 88
Source File: ArrayMapParam.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package org.apache.spark.ml.param import org.apache.spark.ml.util.Identifiable import spray.json.{DefaultJsonProtocol, _} import scala.collection.immutable.Map object ArrayMapJsonProtocol extends DefaultJsonProtocol { implicit object MapJsonFormat extends JsonFormat[Map[String, Any]] { def write(m: Map[String, Any]): JsValue = { JsObject(m.mapValues { case v: Int => JsNumber(v) case v: Double => JsNumber(v) case v: String => JsString(v) case true => JsTrue case false => JsFalse case v: Map[_, _] => write(v.asInstanceOf[Map[String, Any]]) case default => serializationError(s"Unable to serialize $default") }) } def read(value: JsValue): Map[String, Any] = value.asInstanceOf[JsObject].fields.map(kvp => { val convValue = kvp._2 match { case JsNumber(n) => if (n.isValidInt) n.intValue().asInstanceOf[Any] else n.toDouble.asInstanceOf[Any] case JsString(s) => s case JsTrue => true case JsFalse => false case v: JsValue => read(v) case default => deserializationError(s"Unable to deserialize $default") } (kvp._1, convValue) }) } } override def w(value: Array[Map[String, Any]]): ParamPair[Array[Map[String, Any]]] = super.w(value) override def jsonEncode(value: Array[Map[String, Any]]): String = { val json = value.toSeq.toJson json.prettyPrint } override def jsonDecode(json: String): Array[Map[String, Any]] = { val jsonValue = json.parseJson jsonValue.convertTo[Seq[Map[String, Any]]].toArray } }
Example 89
Source File: S2CellTransformer.scala From spark-ext with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import com.google.common.geometry.{S2LatLng, S2CellId} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} class S2CellTransformer(override val uid: String) extends Transformer { def this() = this(Identifiable.randomUID("S2CellTransformer")) // Input/Output column names val latCol: Param[String] = new Param[String](this, "latCol", "latitude column") val lonCol: Param[String] = new Param[String](this, "lonCol", "longitude column") val cellCol: Param[String] = new Param[String](this, "cellCol", "S2 Cell Id column") val level: Param[Int] = new IntParam(this, "level", "S2 Level [0, 30]", (i: Int) => ParamValidators.gtEq(0)(i) && ParamValidators.ltEq(30)(i)) // Default parameters setDefault( latCol -> "lat", lonCol -> "lon", cellCol -> "cell", level -> 10 ) def getLatCol: String = $(latCol) def getLonCol: String = $(lonCol) def getCellCol: String = $(cellCol) def getLevel: Int = $(level) def setLatCol(value: String): this.type = set(latCol, value) def setLonCol(value: String): this.type = set(lonCol, value) def setCellCol(value: String): this.type = set(cellCol, value) def setLevel(value: Int): this.type = set(level, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val currentLevel = $(level) val t = udf { (lat: Double, lon: Double) => val cellId = S2CellId.fromLatLng(S2LatLng.fromDegrees(lat, lon)) cellId.parent(currentLevel).toToken } val metadata = outputSchema($(cellCol)).metadata dataset.select(col("*"), t(col($(latCol)), col($(lonCol))).as($(cellCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val latColumnName = $(latCol) val latDataType = schema(latColumnName).dataType require(latDataType == DoubleType, s"The latitude column $latColumnName must be Double type, " + s"but got $latDataType.") val lonColumnName = $(lonCol) val lonDataType = schema(lonColumnName).dataType require(lonDataType == DoubleType, s"The longitude column $lonColumnName must be Double type, " + s"but got $lonDataType.") val inputFields = schema.fields val outputColName = $(cellCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") val attr = NominalAttribute.defaultAttr.withName($(cellCol)) val outputFields = inputFields :+ attr.toStructField() StructType(outputFields) } override def copy(extra: ParamMap): S2CellTransformer = defaultCopy(extra) }
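A minimal usage sketch on a toy check-in DataFrame; the default input column names "lat" and "lon" are kept, and the level choice is illustrative (higher levels give finer cells).

import org.apache.spark.ml.feature.S2CellTransformer
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("s2-cell-sketch").master("local[*]").getOrCreate()
import spark.implicits._

val checkins = Seq(
  ("a", 52.3760, 4.8655),   // Amsterdam
  ("b", 40.7483, -73.9857)  // New York
).toDF("id", "lat", "lon")

// Emit the level-10 S2 cell token for each coordinate pair
val s2 = new S2CellTransformer().setLevel(10).setCellCol("cell_l10")

s2.transform(checkins).show()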
Example 90
Source File: StringToShortIndexer.scala From spark-ext with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkException import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.util.collection.OpenHashMap class StringToShortIndexer(override val uid: String) extends Estimator[StringToShortIndexerModel] with StringIndexerBase { def this() = this(Identifiable.randomUID("strShortIdx")) def setInputCol(value: String): this.type = set(inputCol, value) def setOutputCol(value: String): this.type = set(outputCol, value) override def fit(dataset: DataFrame): StringToShortIndexerModel = { val counts = dataset.select(col($(inputCol)).cast(StringType)) .map(_.getString(0)) .countByValue() val labels = counts.toSeq.sortBy(-_._2).map(_._1).toArray require(labels.length <= Short.MaxValue, s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})") copyValues(new StringToShortIndexerModel(uid, labels).setParent(this)) } override def transformSchema(schema: StructType): StructType = { validateAndTransformSchema(schema) } override def copy(extra: ParamMap): StringToShortIndexer = defaultCopy(extra) } class StringToShortIndexerModel ( override val uid: String, val labels: Array[String]) extends Model[StringToShortIndexerModel] with StringIndexerBase { def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), labels) require(labels.length <= Short.MaxValue, s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})") private val labelToIndex: OpenHashMap[String, Short] = { val n = labels.length.toShort val map = new OpenHashMap[String, Short](n) var i: Short = 0 while (i < n) { map.update(labels(i), i) i = (i + 1).toShort } map } def setInputCol(value: String): this.type = set(inputCol, value) def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame): DataFrame = { if (!dataset.schema.fieldNames.contains($(inputCol))) { logInfo(s"Input column ${$(inputCol)} does not exist during transformation. " + "Skip StringToShortIndexerModel.") return dataset } val indexer = udf { label: String => if (labelToIndex.contains(label)) { labelToIndex(label) } else { // TODO: handle unseen labels throw new SparkException(s"Unseen label: $label.") } } val outputColName = $(outputCol) val metadata = NominalAttribute.defaultAttr .withName(outputColName).withValues(labels).toMetadata() dataset.select(col("*"), indexer(dataset($(inputCol)).cast(StringType)).as(outputColName, metadata)) } override def transformSchema(schema: StructType): StructType = { if (schema.fieldNames.contains($(inputCol))) { validateAndTransformSchema(schema) } else { // If the input column does not exist during transformation, we skip StringToShortIndexerModel. schema } } override def copy(extra: ParamMap): StringToShortIndexerModel = { val copied = new StringToShortIndexerModel(uid, labels) copyValues(copied, extra).setParent(parent) } }
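A brief usage sketch: like StringIndexer, the most frequent label gets index 0, but the index is encoded as a Short. The data is illustrative.

import org.apache.spark.ml.feature.StringToShortIndexer
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("short-indexer-sketch").master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq("a", "b", "a", "c", "a", "b").toDF("category")

val indexer = new StringToShortIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")

// "a" is most frequent, so it maps to index 0
val model = indexer.fit(df)
model.transform(df).show()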
Example 91
Source File: Gather.scala From spark-ext with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.HasOutputCol
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.ext.functions._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

private[feature] trait GatherParams extends Params with HasKeyCol with HasValueCol with HasOutputCol {

  val primaryKeyCols: Param[Array[String]] = new StringArrayParam(this, "primaryKeyCols",
    "Primary key column names", ParamValidators.arrayLengthGt(0))

  val valueAgg: Param[String] = new Param[String](this, "valueAgg",
    "Aggregate function applied to valueCol: 'sum' or 'count'", ParamValidators.inArray(Array("sum", "count")))

  def getPrimaryKeyCols: Array[String] = $(primaryKeyCols)

  def getValueAgg: String = $(valueAgg)
}

class Gather(override val uid: String) extends Transformer with GatherParams {

  def this() = this(Identifiable.randomUID("gather"))

  def setPrimaryKeyCols(value: String*): this.type = set(primaryKeyCols, value.toArray)

  def setKeyCol(value: String): this.type = set(keyCol, value)

  def setValueCol(value: String): this.type = set(valueCol, value)

  def setValueAgg(value: String): this.type = set(valueAgg, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(
    valueAgg -> "sum"
  )

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)

    val pkCols = $(primaryKeyCols).map(col)

    val grouped = dataset.groupBy(pkCols :+ col($(keyCol)): _*)

    val aggregateCol = s"${uid}_value_aggregate"
    val aggregated = $(valueAgg) match {
      case "sum" => grouped.agg(sum($(valueCol)) as aggregateCol)
      case "count" => grouped.agg(count($(valueCol)) as aggregateCol)
    }

    val metadata = outputSchema($(outputCol)).metadata

    aggregated
      .groupBy(pkCols: _*)
      .agg(collectArray(struct(
        col($(keyCol)),
        col(aggregateCol).cast(DoubleType).as($(valueCol))
      )).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val valueFunName = $(valueAgg)

    val keyColName = $(keyCol)
    val keyColDataType = schema(keyColName).dataType
    keyColDataType match {
      case _: NumericType =>
      case _: StringType =>
      case other =>
        throw new IllegalArgumentException(s"Key column data type $other is not supported.")
    }

    val valueColName = $(valueCol)
    val valueColDataType = schema(valueColName).dataType
    valueColDataType match {
      case _: NumericType =>
      case _: StringType if valueFunName == "count" =>
      case other =>
        throw new IllegalArgumentException(s"Value data type $other is not supported with value aggregate $valueFunName.")
    }

    val pkFields = $(primaryKeyCols).map(schema.apply)
    val rollupType = StructType(Array(
      StructField($(keyCol), keyColDataType),
      StructField($(valueCol), DoubleType)
    ))
    val rollupField = StructField($(outputCol), ArrayType(rollupType), nullable = false)

    StructType(pkFields :+ rollupField)
  }

  override def copy(extra: ParamMap): Gather = defaultCopy(extra)
}
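A hedged usage sketch, assuming the spark-ext library (which supplies the key/value column params and collectArray) is on the classpath. It collapses a tall (user, site, duration) table into one row per user holding an array of (site, summed duration) structs; the data is illustrative.

import org.apache.spark.ml.feature.Gather
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("gather-sketch").master("local[*]").getOrCreate()
import spark.implicits._

// Tall format: one row per (user, site) visit with a duration
val visits = Seq(
  ("u1", "news.com", 10.0),
  ("u1", "sports.com", 5.0),
  ("u1", "news.com", 2.0),
  ("u2", "sports.com", 7.0)
).toDF("userId", "site", "duration")

val gather = new Gather()
  .setPrimaryKeyCols("userId")
  .setKeyCol("site")
  .setValueCol("duration")
  .setValueAgg("sum")
  .setOutputCol("siteDurations")

gather.transform(visits).show(truncate = false)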
Example 92
Source File: TokenAssembler.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.annotators.common.NerTagged import com.johnsnowlabs.nlp.annotators.ner.NerTagsEncoding import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import scala.collection.mutable.ArrayBuffer class TokenAssembler(override val uid: String) extends AnnotatorModel[TokenAssembler] { import com.johnsnowlabs.nlp.AnnotatorType._ override val outputAnnotatorType: AnnotatorType = DOCUMENT override val inputAnnotatorTypes: Array[String] = Array(DOCUMENT, TOKEN) val preservePosition: BooleanParam = new BooleanParam(this, "preservePosition", "Whether to preserve the actual position of the tokens or reduce them to one space") def setPreservePosition(value: Boolean): this.type = set(preservePosition, value) setDefault( preservePosition -> false ) def this() = this(Identifiable.randomUID("TOKEN_ASSEMBLER")) override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = { val result = ArrayBuffer[Annotation]() val sentences_init = annotations.filter(_.annotatorType == AnnotatorType.DOCUMENT) sentences_init.zipWithIndex.foreach { case (sentence, sentenceIndex) => val tokens = annotations.filter(token => token.annotatorType == AnnotatorType.TOKEN && token.begin >= sentence.begin && token.end <= sentence.end) var fullSentence: String = "" var lastEnding: Int = 0 tokens.foreach { case (token) => if (token.begin > lastEnding && token.begin - lastEnding != 1 && lastEnding != 0) { if ($(preservePosition)) { val tokenBreaks = sentence.result.substring(lastEnding + 1 - sentence.begin, token.begin - sentence.begin) val matches = ("[\\r\\t\\f\\v\\n ]+".r).findAllIn(tokenBreaks).mkString if (matches.length > 0) { fullSentence = fullSentence ++ matches ++ token.result } else { fullSentence = fullSentence ++ " " ++ token.result } } else { fullSentence = fullSentence ++ " " ++ token.result } } else { fullSentence = fullSentence ++ token.result } lastEnding = token.end fullSentence } val beginIndex = sentence.begin val endIndex = fullSentence.length - 1 val annotation = Annotation( DOCUMENT, beginIndex, beginIndex + endIndex, fullSentence, Map("sentence" -> sentenceIndex.toString) ) result.append(annotation) } result } } object TokenAssembler extends DefaultParamsReadable[TokenAssembler]
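A hedged pipeline sketch, assuming the standard spark-nlp DocumentAssembler and Tokenizer annotators: the TokenAssembler re-assembles the (possibly cleaned) tokens back into a DOCUMENT annotation. The column names are illustrative.

import com.johnsnowlabs.nlp.{DocumentAssembler, TokenAssembler}
import com.johnsnowlabs.nlp.annotators.Tokenizer
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("token-assembler-sketch").master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq("Spark NLP token assembly example.").toDF("text")

val documentAssembler = new DocumentAssembler().setInputCol("text").setOutputCol("document")
val tokenizer = new Tokenizer().setInputCols("document").setOutputCol("token")

// Reassemble tokens into a single DOCUMENT annotation per sentence
val tokenAssembler = new TokenAssembler()
  .setInputCols("document", "token")
  .setOutputCol("assembled")

val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, tokenAssembler))
pipeline.fit(df).transform(df).select("assembled.result").show(truncate = false)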
Example 93
Source File: NerConverter.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.ner import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT, NAMED_ENTITY, TOKEN} import com.johnsnowlabs.nlp.annotators.common.NerTagged import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, ParamsAndFeaturesReadable} import org.apache.spark.ml.param.{BooleanParam, StringArrayParam} import org.apache.spark.ml.util.Identifiable import scala.collection.Map def setPreservePosition(value: Boolean): this.type = set(preservePosition, value) setDefault( preservePosition -> true ) override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = { val sentences = NerTagged.unpack(annotations) val docs = annotations.filter(a => a.annotatorType == AnnotatorType.DOCUMENT) val entities = sentences.zip(docs.zipWithIndex).flatMap { case (sentence, doc) => NerTagsEncoding.fromIOB(sentence, doc._1, sentenceIndex=doc._2, $(preservePosition)) } entities.filter(entity => get(whiteList).forall(validEntity => validEntity.contains(entity.entity))). zipWithIndex.map{case (entity, idx) => Annotation( outputAnnotatorType, entity.start, entity.end, entity.text, Map("entity" -> entity.entity, "sentence" -> entity.sentenceId, "chunk" -> idx.toString) ) } } } object NerConverter extends ParamsAndFeaturesReadable[NerConverter]
Example 94
Source File: NerOverwriter.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.ner import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel} import org.apache.spark.ml.param.{Param, StringArrayParam} import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} def getNewResult: String = $(newResult) setDefault( newResult -> "I-OVERWRITE" ) override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = { var annotationsOverwritten = annotations annotationsOverwritten.map { tokenAnnotation => val stopWordsSet = $(stopWords).toSet if (stopWordsSet.contains(tokenAnnotation.metadata("word"))) { Annotation( outputAnnotatorType, tokenAnnotation.begin, tokenAnnotation.end, $(newResult), tokenAnnotation.metadata ) } else { Annotation( outputAnnotatorType, tokenAnnotation.begin, tokenAnnotation.end, tokenAnnotation.result, tokenAnnotation.metadata ) } } } } object NerOverwriter extends DefaultParamsReadable[NerOverwriter]
Example 95
Source File: ChunkTokenizer.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, TOKEN} import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.ml.PipelineModel import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.sql.Dataset override val outputAnnotatorType: AnnotatorType = TOKEN override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): TokenizerModel = { val ruleFactory = buildRuleFactory val processedExceptions = get(exceptionsPath) .map(er => ResourceHelper.parseLines(er)) .getOrElse(Array.empty[String]) ++ get(exceptions).getOrElse(Array.empty[String]) val raw = new ChunkTokenizerModel() .setCaseSensitiveExceptions($(caseSensitiveExceptions)) .setTargetPattern($(targetPattern)) .setRules(ruleFactory) if (processedExceptions.nonEmpty) raw.setExceptions(processedExceptions) else raw } } object ChunkTokenizer extends DefaultParamsReadable[ChunkTokenizer]
Example 96
Source File: AnnotatorParam.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.param import java.util.{Date, TimeZone} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.Identifiable import org.json4s._ import org.json4s.jackson.JsonMethods._ import org.json4s.jackson.Serialization.write object SerializableFormat extends Formats with Serializable { class SerializableDateFormat extends DateFormat { def timezone: TimeZone = throw new Exception("SerializableFormat does not implement dateformat") override def format(d: Date): String = throw new Exception("SerializableFormat does not implement dateformat") override def parse(s: String): Option[Date] = throw new Exception("SerializableFormat does not implement dateformat") } override def dateFormat: DateFormat = new SerializableDateFormat } implicit val formats = SerializableFormat override def jsonEncode(value: A): String = write(value.serialize) override def jsonDecode(json: String): A = parse(json).extract[B].deserialize }
Example 97
Source File: Token2Chunk.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.AnnotatorType._ import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel} import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} override val inputAnnotatorTypes: Array[String] = Array(TOKEN) def this() = this(Identifiable.randomUID("TOKEN2CHUNK")) override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = { annotations.map { token => Annotation( CHUNK, token.begin, token.end, token.result, token.metadata ) } } } object Token2Chunk extends DefaultParamsReadable[Token2Chunk]
Example 98
Source File: BigTextMatcher.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.btm import com.johnsnowlabs.collections.StorageSearchTrie import com.johnsnowlabs.nlp.AnnotatorType.{TOKEN, DOCUMENT, CHUNK} import com.johnsnowlabs.nlp.annotators.TokenizerModel import com.johnsnowlabs.nlp.serialization.StructFeature import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper} import com.johnsnowlabs.nlp.AnnotatorApproach import com.johnsnowlabs.storage.Database.Name import com.johnsnowlabs.storage.{Database, HasStorage, RocksDBConnection, StorageWriter} import org.apache.spark.ml.PipelineModel import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.sql.Dataset class BigTextMatcher(override val uid: String) extends AnnotatorApproach[BigTextMatcherModel] with HasStorage { def this() = this(Identifiable.randomUID("ENTITY_EXTRACTOR")) override val inputAnnotatorTypes = Array(DOCUMENT, TOKEN) override val outputAnnotatorType: AnnotatorType = CHUNK override val description: String = "Extracts entities from target dataset given in a text file" val mergeOverlapping = new BooleanParam(this, "mergeOverlapping", "whether to merge overlapping matched chunks. Defaults false") val tokenizer = new StructFeature[TokenizerModel](this, "tokenizer") setDefault(inputCols,Array(TOKEN)) setDefault(caseSensitive, true) setDefault(mergeOverlapping, false) def setTokenizer(tokenizer: TokenizerModel): this.type = set(this.tokenizer, tokenizer) def getTokenizer: TokenizerModel = $$(tokenizer) def setMergeOverlapping(v: Boolean): this.type = set(mergeOverlapping, v) def getMergeOverlapping: Boolean = $(mergeOverlapping) private def loadEntities(path: String, writers: Map[Database.Name, StorageWriter[_]]): Unit = { val inputFiles: Seq[Iterator[String]] = ResourceHelper.parseLinesIterator(ExternalResource(path, ReadAs.TEXT, Map())) inputFiles.foreach { inputFile => { StorageSearchTrie.load(inputFile, writers, get(tokenizer)) }} } override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): BigTextMatcherModel = { new BigTextMatcherModel() .setInputCols($(inputCols)) .setOutputCol($(outputCol)) .setCaseSensitive($(caseSensitive)) .setStorageRef($(storageRef)) .setMergeOverlapping($(mergeOverlapping)) } override protected def createWriter(database: Name, connection: RocksDBConnection): StorageWriter[_] = { database match { case Database.TMVOCAB => new TMVocabReadWriter(connection, $(caseSensitive)) case Database.TMEDGES => new TMEdgesReadWriter(connection, $(caseSensitive)) case Database.TMNODES => new TMNodesWriter(connection) } } override protected def index( fitDataset: Dataset[_], storageSourcePath: Option[String], readAs: Option[ReadAs.Value], writers: Map[Database.Name, StorageWriter[_]], readOptions: Option[Map[String, String]] ): Unit = { require(readAs.get == ReadAs.TEXT, "BigTextMatcher only supports TEXT input formats at the moment.") loadEntities(storageSourcePath.get, writers) } override protected val databases: Array[Name] = BigTextMatcherModel.databases } object BigTextMatcher extends DefaultParamsReadable[BigTextMatcher]