org.apache.spark.ml.Transformer Scala Examples
The following examples show how to use org.apache.spark.ml.Transformer.
Each snippet is taken from an open-source project; the source file, project, and license are noted above it.
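At its core, a Transformer maps an input DataFrame to an output DataFrame via transform, declares the resulting schema via transformSchema, and supports copy for parameter copying. As a minimal sketch (the class and column names here are illustrative, not taken from any of the projects below), a custom Transformer looks like this:

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StructType

// Hypothetical example: copies an existing column "input" under the new name "copied".
class ColumnCopier(override val uid: String) extends Transformer {

  def this() = this(Identifiable.randomUID("columnCopier"))

  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.withColumn("copied", col("input"))

  override def transformSchema(schema: StructType): StructType =
    schema.add("copied", schema("input").dataType)

  override def copy(extra: ParamMap): ColumnCopier = defaultCopy(extra)
}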
Example 1
Source File: SparkPFASuiteBase.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.pfa

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.SparkConf
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.scalactic.Equality
import org.scalatest.FunSuite

abstract class SparkPFASuiteBase extends FunSuite with DataFrameSuiteBase with PFATestUtils {

  val sparkTransformer: Transformer
  val input: Array[String]
  val expectedOutput: Array[String]

  val sparkConf = new SparkConf().
    setMaster("local[*]").
    setAppName("test").
    set("spark.ui.enabled", "false").
    set("spark.app.id", appID).
    set("spark.driver.host", "localhost")
  override lazy val spark = SparkSession.builder().config(sparkConf).getOrCreate()
  override val reuseContextIfPossible = true

  // Converts column containing a vector to an array
  def withColumnAsArray(df: DataFrame, colName: String) = {
    val vecToArray = udf { v: Vector => v.toArray }
    df.withColumn(colName, vecToArray(df(colName)))
  }

  def withColumnAsArray(df: DataFrame, first: String, others: String*) = {
    val vecToArray = udf { v: Vector => v.toArray }
    var result = df.withColumn(first, vecToArray(df(first)))
    others.foreach(c => result = result.withColumn(c, vecToArray(df(c))))
    result
  }

  // Converts column containing a vector to a sparse vector represented as a map
  def getColumnAsSparseVectorMap(df: DataFrame, colName: String) = {
    val vecToMap = udf { v: Vector => v.toSparse.indices.map(i => (i.toString, v(i))).toMap }
    df.withColumn(colName, vecToMap(df(colName)))
  }
}

abstract class Result

object ApproxEquality extends ApproxEquality

trait ApproxEquality {

  import org.scalactic.Tolerance._
  import org.scalactic.TripleEquals._

  implicit val seqApproxEq: Equality[Seq[Double]] = new Equality[Seq[Double]] {
    override def areEqual(a: Seq[Double], b: Any): Boolean = {
      b match {
        case d: Seq[Double] => a.zip(d).forall { case (l, r) => l === r +- 0.001 }
        case _ => false
      }
    }
  }

  implicit val vectorApproxEq: Equality[Vector] = new Equality[Vector] {
    override def areEqual(a: Vector, b: Any): Boolean = {
      b match {
        case v: Vector => a.toArray.zip(v.toArray).forall { case (l, r) => l === r +- 0.001 }
        case _ => false
      }
    }
  }
}
Example 2
Source File: MNISTBenchmark.scala From spark-knn with Apache License 2.0
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable

object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val ns = if (args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if (args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if (args.length >= 3) args(2).toInt else 10
    val models = if (args.length >= 4) args(3).split(',') else Array("tree", "naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    // read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset = MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() // force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if (models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if (models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }
    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
}
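The Limiter above is a self-contained Transformer; as a hedged usage sketch (assuming the dataset from main is in scope), it can also be applied on its own:

// Hypothetical usage of the Limiter defined above: cap the DataFrame at 1000 rows
// while keeping its original number of partitions.
val limited = new Limiter()
  .setN(1000)
  .transform(dataset)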
Example 3
Source File: HashingTF.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

@Since("1.2.0")
class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String)
  extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable {

  @Since("1.2.0")
  def this() = this(Identifiable.randomUID("hashingTF"))

  @Since("1.4.0")
  def setInputCol(value: String): this.type = set(inputCol, value)

  @Since("1.4.0")
  def setOutputCol(value: String): this.type = set(outputCol, value)

  // Number of hash buckets and the binary-toggle parameter used by transform below.
  @Since("1.2.0")
  val numFeatures = new IntParam(this, "numFeatures", "number of features (> 0)",
    ParamValidators.gt(0))

  @Since("2.0.0")
  val binary = new BooleanParam(this, "binary",
    "If true, all non zero counts are set to 1.")

  setDefault(numFeatures -> (1 << 18), binary -> false)

  @Since("1.2.0")
  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
}
Example 4
Source File: SQLTransformer.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType

@Since("1.6.0")
class SQLTransformer @Since("1.6.0") (@Since("1.6.0") override val uid: String)
  extends Transformer with DefaultParamsWritable {

  @Since("1.6.0")
  def this() = this(Identifiable.randomUID("sql"))

  // SQL statement parameter; the placeholder "__THIS__" stands for the input dataset.
  @Since("1.6.0")
  final val statement: Param[String] = new Param[String](this, "statement", "SQL statement")

  @Since("1.6.0")
  def setStatement(value: String): this.type = set(statement, value)

  @Since("1.6.0")
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val tableName = Identifiable.randomUID(uid)
    dataset.createOrReplaceTempView(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val result = dataset.sparkSession.sql(realStatement)
    dataset.sparkSession.catalog.dropTempView(tableName)
    result
  }

  @Since("1.6.0")
  override def transformSchema(schema: StructType): StructType = {
    val spark = SparkSession.builder().getOrCreate()
    val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty))
    val dummyDF = spark.createDataFrame(dummyRDD, schema)
    val tableName = Identifiable.randomUID(uid)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    dummyDF.createOrReplaceTempView(tableName)
    val outputSchema = spark.sql(realStatement).schema
    spark.catalog.dropTempView(tableName)
    outputSchema
  }

  @Since("1.6.0")
  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
}
Example 5
Source File: Binarizer.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature

import scala.collection.mutable.ArrayBuilder

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.BinaryAttribute
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

@Since("1.4.0")
final class Binarizer @Since("1.4.0") (@Since("1.4.0") override val uid: String)
  extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable {

  @Since("1.4.0")
  def this() = this(Identifiable.randomUID("binarizer"))

  // Threshold used to binarize continuous features: values > threshold map to 1.0, else 0.0.
  @Since("1.4.0")
  val threshold: DoubleParam =
    new DoubleParam(this, "threshold", "threshold used to binarize continuous features")

  setDefault(threshold -> 0.0)

  @Since("1.4.0")
  def setThreshold(value: Double): this.type = set(threshold, value)

  @Since("1.4.0")
  def setInputCol(value: String): this.type = set(inputCol, value)

  @Since("1.4.0")
  def setOutputCol(value: String): this.type = set(outputCol, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema, logging = true)
    val schema = dataset.schema
    val inputType = schema($(inputCol)).dataType
    val td = $(threshold)

    val binarizerDouble = udf { in: Double => if (in > td) 1.0 else 0.0 }
    val binarizerVector = udf { (data: Vector) =>
      val indices = ArrayBuilder.make[Int]
      val values = ArrayBuilder.make[Double]

      data.foreachActive { (index, value) =>
        if (value > td) {
          indices += index
          values += 1.0
        }
      }

      Vectors.sparse(data.size, indices.result(), values.result()).compressed
    }

    val metadata = outputSchema($(outputCol)).metadata

    inputType match {
      case DoubleType =>
        dataset.select(col("*"), binarizerDouble(col($(inputCol))).as($(outputCol), metadata))
      case _: VectorUDT =>
        dataset.select(col("*"), binarizerVector(col($(inputCol))).as($(outputCol), metadata))
    }
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    val outputColName = $(outputCol)

    val outCol: StructField = inputType match {
      case DoubleType =>
        BinaryAttribute.defaultAttr.withName(outputColName).toStructField()
      case _: VectorUDT =>
        StructField(outputColName, new VectorUDT)
      case _ =>
        throw new IllegalArgumentException(s"Data type $inputType is not supported.")
    }

    if (schema.fieldNames.contains(outputColName)) {
      throw new IllegalArgumentException(s"Output column $outputColName already exists.")
    }
    StructType(schema.fields :+ outCol)
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): Binarizer = defaultCopy(extra)
}

@Since("1.6.0")
object Binarizer extends DefaultParamsReadable[Binarizer] {

  @Since("1.6.0")
  override def load(path: String): Binarizer = super.load(path)
}
Example 6
Source File: SpecificTransformerConversions.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving

import io.hydrosphere.spark_ml_serving.classification._
import io.hydrosphere.spark_ml_serving.common.LocalTransformer
import io.hydrosphere.spark_ml_serving.preprocessors.{LocalImputerModel, LocalWord2VecModel}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.classification._
import org.apache.spark.ml.feature.{ImputerModel, Word2VecModel}

object SpecificTransformerConversions extends DynamicTransformerConverter {

  implicit def transformerToLocal(transformer: Transformer): LocalTransformer[_] = {
    transformer match {
      case x: LogisticRegressionModel => new LocalLogisticRegressionModel(x)
      case x: LinearSVCModel => new LocalLinearSVCModel(x)
      case x: Word2VecModel => new LocalWord2VecModel(x)
      case x: ImputerModel => new LocalImputerModel(x)
      case x => throw new Exception(s"Unknown model: ${x.getClass}")
    }
  }
}
Example 7
Source File: LocalPipelineModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving

import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.{PipelineModel, Transformer}
import io.hydrosphere.spark_ml_serving.common.utils.PumpedClass

class LocalPipelineModel(override val sparkTransformer: PipelineModel)
  extends LocalTransformer[PipelineModel] {

  def transform(localData: LocalData): LocalData = {
    import CommonTransormerConversions._

    sparkTransformer.stages.foldLeft(localData) {
      case (data, transformer) => transformer.transform(data)
    }
  }
}

object LocalPipelineModel
  extends ModelLoader[PipelineModel]
  with TypedTransformerConverter[PipelineModel] {

  import CommonLoaderConversions._

  def getStages(pipelineParameters: Metadata, source: ModelSource): Array[Transformer] = {
    pipelineParameters.paramMap("stageUids").asInstanceOf[List[String]].zipWithIndex.toArray.map {
      case (uid: String, index: Int) =>
        val currentStage = s"stages/${index}_$uid"
        val modelMetadata = source.readFile(s"$currentStage/metadata/part-00000")
        val stageParameters = Metadata.fromJson(modelMetadata)
        val companion = PumpedClass.companionFromClassName(stageParameters.`class`)
        companion.load(s"${source.root}/$currentStage").asInstanceOf[Transformer]
    }
  }

  override def load(source: ModelSource): PipelineModel = {
    val metadata = source.readFile("metadata/part-00000")
    val pipelineParameters = Metadata.fromJson(metadata)
    val stages: Array[Transformer] = getStages(pipelineParameters, source)

    val cstr = classOf[PipelineModel].getDeclaredConstructor(
      classOf[String],
      classOf[Array[Transformer]]
    )
    cstr.setAccessible(true)
    cstr.newInstance(
      pipelineParameters.uid,
      stages
    )
  }

  implicit def toLocal(sparkTransformer: PipelineModel) =
    new LocalPipelineModel(sparkTransformer)
}
Example 8
Source File: ModelLoader.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.common

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.spark.ml.Transformer

trait ModelLoader[T <: Transformer] {

  def load(source: ModelSource): T

  final def load(path: String): T = {
    val source = if (path.startsWith("hdfs://")) {
      val uri = new URI(path)
      val p = uri.getPath
      ModelSource.hadoop(p, new Configuration())
    } else {
      ModelSource.local(path)
    }
    load(source)
  }
}
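Because concrete objects such as LocalPipelineModel (Example 7) mix in ModelLoader, a saved model directory can be loaded directly by path. A hedged usage sketch with a hypothetical path:

// Hypothetical usage: the path must point to a pipeline saved via Spark ML's model.write.save(...)
val pipeline = LocalPipelineModel.load("/models/my_pipeline")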
Example 9
Source File: SimpleModelLoader.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.common

import org.apache.spark.ml.Transformer

trait SimpleModelLoader[SparkConverter <: Transformer] extends ModelLoader[SparkConverter] {

  def build(metadata: Metadata, data: LocalData): SparkConverter

  def getData(source: ModelSource, metadata: Metadata): LocalData = {
    ModelDataReader.parse(source, "data/")
  }

  def load(source: ModelSource): SparkConverter = {
    val metadataRaw = source.readFile("metadata/part-00000")
    val metadata = Metadata.fromJson(metadataRaw)
    val data = getData(source, metadata)
    build(metadata, data)
  }
}
Example 10
Source File: PumpedClass.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.common.utils

import org.apache.spark.ml.Transformer

import scala.reflect.runtime.universe

class PumpedClass(classz: Class[_]) {
  def companion: Any = {
    val companionClassName = classz.getName + "$"
    val companionClass = Class.forName(companionClassName)
    val moduleField = companionClass.getField("MODULE$")
    moduleField.get(null)
  }
}

object PumpedClass {
  def companionFromClassName(className: String): Any = {
    val runtimeMirror = universe.runtimeMirror(this.getClass.getClassLoader)
    val module = runtimeMirror.staticModule(className + "$")
    val obj = runtimeMirror.reflectModule(module)
    obj.instance
  }
}
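A hedged usage sketch of PumpedClass (the class name passed in is just an example of a Spark ML model with a companion object, as LocalPipelineModel.getStages does for each pipeline stage):

// Hypothetical usage: resolve the companion object of a Spark ML class by its fully qualified name.
val companion = PumpedClass.companionFromClassName("org.apache.spark.ml.feature.StringIndexerModel")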
Example 11
Source File: ParamUtils.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.common.utils

import io.hydrosphere.spark_ml_serving.common.Metadata
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.Param

object ParamUtils {

  def set[TransformerT <: Transformer, ParamT](
    transformer: TransformerT,
    param: Param[ParamT],
    metadata: Metadata
  ): TransformerT = {
    transformer.set(param, extract(param, metadata))
  }

  def extract[T](param: Param[T], metadata: Metadata): T = {
    metadata.getAs[Any](param.name).getOrElse(throw new IllegalArgumentException(param.name)) match {
      case p: BigInt => p.intValue().asInstanceOf[T]
      case p => p.asInstanceOf[T]
    }
  }
}
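A hedged usage sketch of ParamUtils (the Tokenizer and the metadata value are assumptions for illustration, not taken from the project):

// Hypothetical usage: copy the "inputCol" value stored in a model's metadata onto a Tokenizer.
// `metadata` is assumed to be a Metadata instance parsed from a saved model.
import org.apache.spark.ml.feature.Tokenizer

val tokenizer = new Tokenizer()
ParamUtils.set(tokenizer, tokenizer.inputCol, metadata)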
Example 12
Source File: TreeModelLoader.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.common

import org.apache.spark.ml.Transformer

trait TreeModelLoader[SparkConverter <: Transformer] extends ModelLoader[SparkConverter] {

  def build(metadata: Metadata, data: LocalData, treeMetadata: LocalData): SparkConverter

  def loadData(source: ModelSource, metadata: Metadata): LocalData = {
    ModelDataReader.parse(source, "data/")
  }

  def loadTreeMetadata(source: ModelSource): LocalData = {
    ModelDataReader.parse(source, "treesMetadata/")
  }

  def load(source: ModelSource): SparkConverter = {
    val trees = loadTreeMetadata(source)
    val metadataRaw = source.readFile("metadata/part-00000")
    val metadata = Metadata.fromJson(metadataRaw)
    val data = loadData(source, metadata)
    build(metadata, data, trees)
  }
}
Example 13
Source File: CommonTransormerConversions.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving

import io.hydrosphere.spark_ml_serving.classification._
import io.hydrosphere.spark_ml_serving.clustering._
import io.hydrosphere.spark_ml_serving.common.LocalTransformer
import io.hydrosphere.spark_ml_serving.preprocessors._
import io.hydrosphere.spark_ml_serving.regression._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.clustering.{GaussianMixtureModel, KMeansModel, LocalLDAModel => SparkLocalLDAModel}
import org.apache.spark.ml.feature._
import org.apache.spark.ml.regression._
import org.apache.spark.ml.{PipelineModel, Transformer}

object CommonTransormerConversions extends DynamicTransformerConverter {

  implicit def transformerToLocal(transformer: Transformer): LocalTransformer[_] = {
    transformer match {
      case x: PipelineModel => new LocalPipelineModel(x)

      // Classification models
      case x: DecisionTreeClassificationModel => new LocalDecisionTreeClassificationModel(x)
      case x: MultilayerPerceptronClassificationModel => new LocalMultilayerPerceptronClassificationModel(x)
      case x: NaiveBayesModel => new LocalNaiveBayes(x)
      case x: RandomForestClassificationModel => new LocalRandomForestClassificationModel(x)
      case x: GBTClassificationModel => new LocalGBTClassificationModel(x)

      // Clustering models
      case x: GaussianMixtureModel => new LocalGaussianMixtureModel(x)
      case x: KMeansModel => new LocalKMeansModel(x)
      case x: SparkLocalLDAModel => new LocalLDAModel(x)

      // Preprocessing
      case x: Binarizer => new LocalBinarizer(x)
      case x: CountVectorizerModel => new LocalCountVectorizerModel(x)
      case x: DCT => new LocalDCT(x)
      case x: HashingTF => new LocalHashingTF(x)
      case x: IndexToString => new LocalIndexToString(x)
      case x: MaxAbsScalerModel => new LocalMaxAbsScalerModel(x)
      case x: MinMaxScalerModel => new LocalMinMaxScalerModel(x)
      case x: NGram => new LocalNGram(x)
      case x: Normalizer => new LocalNormalizer(x)
      case x: OneHotEncoder => new LocalOneHotEncoder(x)
      case x: PCAModel => new LocalPCAModel(x)
      case x: PolynomialExpansion => new LocalPolynomialExpansion(x)
      case x: StandardScalerModel => new LocalStandardScalerModel(x)
      case x: StopWordsRemover => new LocalStopWordsRemover(x)
      case x: StringIndexerModel => new LocalStringIndexerModel(x)
      case x: Tokenizer => new LocalTokenizer(x)
      case x: VectorIndexerModel => new LocalVectorIndexerModel(x)
      case x: IDFModel => new LocalIDF(x)
      case x: ChiSqSelectorModel => new LocalChiSqSelectorModel(x)
      case x: RegexTokenizer => new LocalRegexTokenizer(x)
      case x: VectorAssembler => new LocalVectorAssembler(x)

      // Regression
      case x: DecisionTreeRegressionModel => new LocalDecisionTreeRegressionModel(x)
      case x: LinearRegressionModel => new LocalLinearRegressionModel(x)
      case x: RandomForestRegressionModel => new LocalRandomForestRegressionModel(x)
      case x: GBTRegressionModel => new LocalGBTRegressor(x)

      case x => SpecificTransformerConversions.transformerToLocal(x)
    }
  }
}
Example 14
Source File: Sampler.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

import scala.util.Random

class Sampler(fraction: Double,
              override val uid: String,
              seed: Int = Random.nextInt)
  extends Transformer {

  def this(fraction: Double) = this(fraction, Identifiable.randomUID("sampler"))

  // Input column parameter, referenced by $(inputCol) and by setInputCol in main below.
  final val inputCol = new Param[String](this, "inputCol", "The input column")

  final def setInputCol(value: String): this.type = set(inputCol, value)

  final def getOutputCol: String = $(inputCol)

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.sample(false, fraction, seed).toDF
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): Sampler = defaultCopy(extra)
}

object Sampler {

  def main(args: Array[String]): Unit = {
    val ss = SparkSession
      .builder
      .master("local")
      .appName("preprocess")
      .getOrCreate()

    val training = ss.read.format("libsvm")
      .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")

    println(training.count)

    val sampler = new Sampler(0.5)
      .setInputCol("features")

    val pipeline = new Pipeline()
      .setStages(Array(sampler))

    val model = pipeline.fit(training)

    val test = ss.read.format("libsvm")
      .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")

    model.transform(test).select("*")
      .collect()
      .foreach { case Row(label: Double, vector: Vector) =>
        println(s"($label, " +
          s"${vector.toSparse.indices.mkString("[", ",", "]")}, " +
          s"${vector.toSparse.values.mkString("[", ",", "]")}")
      }

    ss.stop()
  }
}
Example 15
Source File: HashingTFWrapper.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne}
import com.tencent.angel.spark.automl.feature.TransformerWrapper
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.feature.HashingTF

class HashingTFWrapper(numFeatures: Int) extends TransformerWrapper {

  override val transformer: Transformer = new HashingTF().setNumFeatures(numFeatures)
  override var parent: TransformerWrapper = _

  override val hasMultiInputs: Boolean = false
  override val hasMultiOutputs: Boolean = false
  override val needAncestorInputs: Boolean = false

  override val relation: InToOutRelation = OneToOne

  override val requiredInputCols: Array[String] = Array("words")
  override val requiredOutputCols: Array[String] = Array("outHashingTF")

  override def declareInAndOut(): this.type = {
    transformer.asInstanceOf[HashingTF].setInputCol(getInputCols(0))
    transformer.asInstanceOf[HashingTF].setOutputCol(getOutputCols(0))
    this
  }
}
Example 16
Source File: StopWordsRemoverWrapper.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne}
import com.tencent.angel.spark.automl.feature.TransformerWrapper
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.feature.StopWordsRemover

class StopWordsRemoverWrapper extends TransformerWrapper {

  override val transformer: Transformer = new StopWordsRemover()
  override var parent: TransformerWrapper = _

  override val hasMultiInputs: Boolean = false
  override val hasMultiOutputs: Boolean = false
  override val needAncestorInputs: Boolean = false

  override val relation: InToOutRelation = OneToOne

  override val requiredInputCols: Array[String] = Array("words")
  override val requiredOutputCols: Array[String] = Array("stopwords")

  override def declareInAndOut(): this.type = {
    transformer.asInstanceOf[StopWordsRemover].setInputCol(getInputCols(0))
    transformer.asInstanceOf[StopWordsRemover].setOutputCol(getOutputCols(0))
    this
  }
}
Example 17
Source File: SamplerWrapper.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import com.tencent.angel.spark.automl.feature.InToOutRelation.{InPlace, InToOutRelation}
import com.tencent.angel.spark.automl.feature.TransformerWrapper
import org.apache.spark.ml.Transformer

class SamplerWrapper(fraction: Double) extends TransformerWrapper {

  override val transformer: Transformer = new Sampler(fraction)
  override var parent: TransformerWrapper = _

  override val hasMultiInputs: Boolean = false
  override val hasMultiOutputs: Boolean = false
  override val needAncestorInputs: Boolean = false

  override val relation: InToOutRelation = InPlace

  override val requiredInputCols: Array[String] = null
  override val requiredOutputCols: Array[String] = null

  override def declareInAndOut(): this.type = {
    transformer.asInstanceOf[Sampler].setInputCol(getInputCols(0))
    this
  }
}
Example 18
Source File: PipelineOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.serializer.GraphSerializer
import ml.combust.bundle.dsl._
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.{PipelineModel, Transformer}

class PipelineOp extends SimpleSparkOp[PipelineModel] {

  override val Model: OpModel[SparkBundleContext, PipelineModel] =
    new OpModel[SparkBundleContext, PipelineModel] {
      override val klazz: Class[PipelineModel] = classOf[PipelineModel]

      override def opName: String = Bundle.BuiltinOps.pipeline

      override def store(model: Model, obj: PipelineModel)
                        (implicit context: BundleContext[SparkBundleContext]): Model = {
        val nodes = GraphSerializer(context).write(obj.stages).get
        model.withValue("nodes", Value.stringList(nodes))
      }

      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): PipelineModel = {
        val nodes = GraphSerializer(context).read(model.value("nodes").getStringList).
          map(_.map(_.asInstanceOf[Transformer])).get.toArray
        new PipelineModel(uid = "", stages = nodes)
      }
    }

  override def sparkLoad(uid: String, shape: NodeShape, model: PipelineModel): PipelineModel = {
    new PipelineModel(uid = uid, stages = model.stages)
  }

  override def sparkInputs(obj: PipelineModel): Seq[ParamSpec] = Seq()

  override def sparkOutputs(obj: PipelineModel): Seq[SimpleParamSpec] = Seq()

  override def load(node: Node, model: PipelineModel)
                   (implicit context: BundleContext[SparkBundleContext]): PipelineModel = {
    new PipelineModel(uid = node.name, stages = model.stages)
  }
}
Example 19
Source File: LinearSVCParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.classification.parity

import org.apache.spark.ml.classification.LinearSVCModel
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class LinearSVCParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti")

  override val sparkTransformer: Transformer = new Pipeline()
    .setStages(Array(
      new StringIndexer().
        setInputCol("fico_score_group_fnl").
        setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new LinearSVCModel("linear_svc",
        Vectors.dense(0.44, 0.77),
        0.66).setThreshold(0.5).setFeaturesCol("features")))
    .fit(dataset)

  // The string order type is ignored, because once the transformer is built based on some order
  // type, we need to serialize only the string-to-index map but not the order in which it has
  // to index. This value we can ignore while we check the transformer values.
  override val unserializedParams: Set[String] = Set("stringOrderType")
}
Example 20
Source File: CrossValidatorParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.validation

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.{DecisionTreeRegressor, RandomForestRegressor}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.DataFrame

class CrossValidatorParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = {
    val regressor = new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction")

    val paramGrid = new ParamGridBuilder()
      .addGrid(regressor.numTrees, Array(2, 3, 4))
      .build()

    new Pipeline().setStages(Array(
      new StringIndexer().
        setInputCol("fico_score_group_fnl").
        setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new CrossValidator().
        setEvaluator(new RegressionEvaluator().
          setLabelCol("loan_amount").
          setPredictionCol("prediction")).
        setEstimator(regressor).
        setEstimatorParamMaps(paramGrid))).fit(dataset)
  }

  override val ignoreSerializationTest = true
}
Example 21
Source File: TrainValidationSplitParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.validation

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.sql.DataFrame

class TrainValidationSplitParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = {
    val regressor = new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction")

    val paramGrid = new ParamGridBuilder()
      .addGrid(regressor.numTrees, Array(2, 3, 4))
      .build()

    new Pipeline().setStages(Array(
      new StringIndexer().
        setInputCol("fico_score_group_fnl").
        setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new TrainValidationSplit().
        setEvaluator(new RegressionEvaluator().
          setLabelCol("loan_amount").
          setPredictionCol("prediction")).
        setEstimator(regressor).
        setEstimatorParamMaps(paramGrid))).fit(dataset)
  }

  override val ignoreSerializationTest = true
}
Example 22
Source File: MinMaxScalerWithNonDefaultsParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{MinMaxScaler, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class MinMaxScalerWithNonDefaultsParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("dti", "loan_amount")).
      setOutputCol("features"),
    new MinMaxScaler().
      setInputCol("features").
      setOutputCol("scaled_features").
      setMin(2.0).
      setMax(4.0))).fit(dataset)
}
Example 23
Source File: HashingTermFrequencyParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame
import ml.combust.mleap.spark.SparkSupport._

class HashingTermFrequencyParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("loan_title")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer().
      setInputCol("loan_title").
      setOutputCol("loan_title_tokens"),
    new HashingTF().
      setNumFeatures(1 << 17).
      setInputCol("loan_title_tokens").
      setOutputCol("loan_title_tf"))).fit(dataset)
}
Example 24
Source File: BinarizerParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{Binarizer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame

class BinarizerParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("dti")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("dti")).
      setOutputCol("features"),
    new Binarizer().
      setThreshold(0.12).
      setInputCol("dti").
      setOutputCol("thresholded_features_double"),
    new Binarizer().
      setThreshold(0.12).
      setInputCol("features").
      setOutputCol("thresholded_features"))).fit(dataset)
}
Example 25
Source File: PcaParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{PCA, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class PcaParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("dti", "loan_amount")).
      setOutputCol("features"),
    new PCA().
      setInputCol("features").
      setOutputCol("pca_features").
      setK(2))).fit(dataset)

  override val unserializedParams = Set("k")
}
Example 26
Source File: OneHotEncoderParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class OneHotEncoderParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("state")

  override val sparkTransformer: Transformer = new Pipeline()
    .setStages(Array(
      new StringIndexer().setInputCol("state").setOutputCol("state_index"),
      new StringIndexer().setInputCol("state").setOutputCol("state_index2"),
      new OneHotEncoderEstimator()
        .setInputCols(Array("state_index", "state_index2"))
        .setOutputCols(Array("state_oh", "state_oh2"))
    ))
    .fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 27
Source File: PolynomialExpansionParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{PolynomialExpansion, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class PolynomialExpansionParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("dti", "loan_amount")).
      setOutputCol("features"),
    new PolynomialExpansion().
      setInputCol("features").
      setOutputCol("poly").
      setDegree(3))).fit(dataset)
}
Example 28
Source File: RegexTokenizerParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.sql.DataFrame

class RegexTokenizerParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("loan_title")

  override val sparkTransformer: Transformer = new RegexTokenizer()
    .setInputCol("loan_title")
    .setOutputCol("loan_title_tokens")
    .setGaps(true)
    .setToLowercase(true)
    .setMinTokenLength(2)
    .setPattern("\\s")
}
Example 29
Source File: NGramsParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{NGram, Tokenizer}
import org.apache.spark.sql.DataFrame

class NGramsParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("loan_title")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer().
      setInputCol("loan_title").
      setOutputCol("loan_title_tokens"),
    new NGram().
      setInputCol("loan_title_tokens").
      setOutputCol("loan_title_ngram").
      setN(3))).fit(dataset)
}
Example 30
Source File: ReverseStringIndexerParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class ReverseStringIndexerParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("state")

  override val sparkTransformer: Transformer = {
    val stringIndexer = new StringIndexer().
      setInputCol("state").
      setOutputCol("state_index").
      fit(dataset)
    val reverseStringIndexer = new IndexToString().
      setInputCol("state_index").
      setOutputCol("state_reverse").
      setLabels(stringIndexer.labels)
    new Pipeline().setStages(Array(stringIndexer, reverseStringIndexer)).fit(dataset)
  }

  override val unserializedParams = Set("stringOrderType")
}
Example 31
Source File: CountVectorizerParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{CountVectorizer, Tokenizer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class CountVectorizerParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("loan_title")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer().
      setInputCol("loan_title").
      setOutputCol("loan_title_tokens"),
    new CountVectorizer().
      setInputCol("loan_title_tokens").
      setOutputCol("loan_title_token_counts").
      setMinTF(2))).fit(dataset)
}
Example 32
Source File: NormalizerParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{Normalizer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class NormalizerParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("dti", "loan_amount")).
      setOutputCol("features"),
    new Normalizer().
      setP(3d).
      setInputCol("features").
      setOutputCol("scaled_features"))).fit(dataset)
}
Example 33
Source File: WordToVectorParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{Tokenizer, Word2Vec}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class WordToVectorParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("loan_title")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer().
      setInputCol("loan_title").
      setOutputCol("loan_title_tokens"),
    new Word2Vec(uid = "words").
      setInputCol("loan_title_tokens").
      setOutputCol("loan_title_token_counts"))).fit(dataset)

  override val unserializedParams = Set("seed")
}
Example 34
Source File: VectorSlicerParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{VectorAssembler, VectorSlicer}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class VectorSlicerParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("dti", "loan_amount")).
      setOutputCol("features"),
    new VectorSlicer().
      setIndices(Array(1)).
      setNames(Array("dti")).
      setInputCol("features").
      setOutputCol("scaled_features"))).fit(dataset)
}
Example 35
Source File: BucketedRandomProjectionLSHParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{BucketedRandomProjectionLSH, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class BucketedRandomProjectionLSHParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("dti", "loan_amount")).
      setOutputCol("features"),
    new BucketedRandomProjectionLSH().
      setInputCol("features").
      setBucketLength(2).
      setOutputCol("lsh_features"))).fit(dataset)
}
Example 36
Source File: StopWordsRemoverParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{StopWordsRemover, Tokenizer}
import org.apache.spark.sql.DataFrame

class StopWordsRemoverParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("loan_title")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer().
      setInputCol("loan_title").
      setOutputCol("loan_title_tokens"),
    new StopWordsRemover().
      setInputCol("loan_title_tokens").
      setOutputCol("loan_title_stop").
      setStopWords(Array("loan")))).fit(dataset)
}
Example 37
Source File: DCTParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{DCT, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class DCTParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("dti", "loan_amount")).
      setOutputCol("features"),
    new DCT(uid = "dct").
      setInverse(true).
      setInputCol("features").
      setOutputCol("filter_features"))).fit(dataset)
}
Example 38
Source File: FeatureHasherParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.feature.FeatureHasher
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types._

import scala.util.Random

class FeatureHasherParitySpec extends SparkParityBase {

  val categories = Seq(
    "spark",
    "and",
    "mleap",
    "are",
    "super",
    "dope",
    "together"
  )

  def randomRow(): Row = Row(
    Random.nextDouble(),
    Random.nextBoolean(),
    Random.nextInt(20),
    Random.nextInt(20).toString,
    Random.shuffle(categories).head)

  val rows = spark.sparkContext.parallelize(Seq.tabulate(100) { _ => randomRow() })

  val schema = new StructType()
    .add("real", DoubleType, nullable = false)
    .add("bool", BooleanType, nullable = false)
    .add("int", IntegerType, nullable = false)
    .add("stringNum", StringType, nullable = true)
    .add("string", StringType, nullable = true)

  override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema)

  override val sparkTransformer: Transformer = new FeatureHasher()
    .setInputCols("real", "bool", "int", "stringNum", "string")
    .setOutputCol("features")
    .setNumFeatures(1 << 17)
    .setCategoricalCols(Array("int"))
}
Example 39
Source File: VectorIndexerParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, VectorIndexer}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class VectorIndexerParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "state")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("state").
      setOutputCol("state_index"),
    new VectorAssembler().
      setInputCols(Array("dti", "loan_amount", "state_index")).
      setOutputCol("features"),
    new VectorIndexer().
      setInputCol("features").
      setOutputCol("scaled_features"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 40
Source File: BisectingKMeansParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.clustering.BisectingKMeans
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class BisectingKMeansParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "fico_score_group_fnl")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new BisectingKMeans().
      setFeaturesCol("features").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "k", "maxIter", "seed", "minDivisibleClusterSize")
}
Example 41
Source File: LDAParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.feature.{CountVectorizer, StopWordsRemover, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame
import org.scalatest.Ignore

@Ignore
class LDAParitySpec extends SparkParityBase {

  override val dataset: DataFrame = textDataset.select("text")

  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")

  val remover = new StopWordsRemover()
    .setInputCol(tokenizer.getOutputCol)
    .setOutputCol("words_filtered")

  val cv = new CountVectorizer().setInputCol("words_filtered").setOutputCol("features").setVocabSize(50000)

  val lda = new LDA().setK(5).setMaxIter(2)

  override val sparkTransformer: Transformer =
    new Pipeline().setStages(Array(tokenizer, remover, cv, lda)).fit(dataset)

  override def equalityTest(sparkDataset: DataFrame, mleapDataset: DataFrame): Unit = {
    val sparkPredictionCol = sparkDataset.schema.fieldIndex("topicDistribution")
    val mleapPredictionCol = mleapDataset.schema.fieldIndex("topicDistribution")

    sparkDataset.collect().zip(mleapDataset.collect()).foreach {
      case (sv, mv) =>
        val sparkPrediction = sv.getAs[Vector](sparkPredictionCol)
        val mleapPrediction = mv.getAs[Vector](mleapPredictionCol)

        sparkPrediction.toArray.zip(mleapPrediction.toArray).foreach {
          case (s, m) => assert(Math.abs(m - s) < 0.001)
        }
    }
  }
}
Example 42
Source File: KMeansParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class KMeansParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "fico_score_group_fnl")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new KMeans().
      setFeaturesCol("features").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "initMode", "initSteps", "maxIter", "tol", "k", "seed")
}
Example 43
Source File: GaussianMixtureParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.clustering.GaussianMixture
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class GaussianMixtureParitySpec extends SparkParityBase {

  override val dataset: DataFrame = {
    baseDataset.select("dti", "loan_amount", "fico_score_group_fnl")
  }

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new GaussianMixture().
      setFeaturesCol("features").
      setPredictionCol("prediction").
      setProbabilityCol("probability"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "k", "maxIter", "seed", "tol")
}
Example 44
Source File: ALSParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.recommendation

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class ALSParitySpec extends SparkParityBase {

  override val dataset: DataFrame = recommendationDataset

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")
  )).fit(dataset)

  override def equalityTest(sparkDataset: DataFrame, mleapDataset: DataFrame): Unit =
    super.equalityTest(sparkDataset.orderBy("userId", "movieId"), mleapDataset.orderBy("userId", "movieId"))

  // TODO: maybe coldStartStrategy should be serialized
  override val unserializedParams = Set("coldStartStrategy")
}
Example 45
Source File: MultiLayerPerceptronClassifierParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class MultiLayerPerceptronClassifierParitySpec extends SparkParityBase {

  override val dataset: DataFrame = multiClassClassificationDataset

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new MultilayerPerceptronClassifier(uid = "mlp").
      setThresholds(Array(0.1, 0.2, 0.3)).
      // specify layers for the neural network:
      // input layer of size 4 (features), two intermediate of size 5 and 4
      // and output of size 3 (classes)
      setLayers(Array(4, 5, 4, 3)).
      setFeaturesCol("features").
      setPredictionCol("prediction"))).fit(dataset)
}
Example 46
Source File: DecisionTreeClassifierParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class DecisionTreeClassifierParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new StringIndexer().
      setInputCol("approved").
      setOutputCol("label"),
    new DecisionTreeClassifier().
      setThresholds(Array(0.4)).
      setFeaturesCol("features").
      setLabelCol("label"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
Example 47
Source File: OneVsRestParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame

class OneVsRestParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new OneVsRest().setClassifier(new LogisticRegression()).
      setLabelCol("fico_index").
      setFeaturesCol("features").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "classifier", "labelCol")
}
Example 48
Source File: LogisticRegressionParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.linalg.Vectors

class LogisticRegressionParitySpec extends SparkParityBase {

  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new LogisticRegressionModel(uid = "logr",
      coefficients = Vectors.dense(0.44, 0.77),
      intercept = 0.66).setThreshold(0.7).setFeaturesCol("features"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 49
Source File: MultinomialLogisticRegressionParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}

class MultinomialLogisticRegressionParitySpec extends SparkParityBase {

  val labels = Seq(0.0, 1.0, 2.0, 0.0, 1.0, 2.0)
  val ages = Seq(15, 30, 40, 50, 15, 80)
  val heights = Seq(175, 190, 155, 160, 170, 180)
  val weights = Seq(67, 100, 57, 56, 56, 88)

  val rows = spark.sparkContext.parallelize(Seq.tabulate(6) { i =>
    Row(labels(i), ages(i), heights(i), weights(i))
  })

  val schema = new StructType()
    .add("label", DoubleType, nullable = false)
    .add("age", IntegerType, nullable = false)
    .add("height", IntegerType, nullable = false)
    .add("weight", IntegerType, nullable = false)

  override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema)

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("age", "height", "weight")).
      setOutputCol("features"),
    new LogisticRegressionModel(uid = "logr",
      coefficientMatrix = Matrices.dense(3, 3,
        Array(-1.3920551604166562, -0.13119545493644366, 1.5232506153530998,
          0.3129112131192873, -0.21959056436528473, -0.09332064875400257,
          -0.24696506013528507, 0.6122879917796569, -0.36532293164437174)),
      interceptVector = Vectors.dense(0.4965574044951358, -2.1486146169780063, 1.6520572124828703),
      numClasses = 3,
      isMultinomial = true))).fit(dataset)
}
Example 50
Source File: GBTClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql._ class GBTClassifierParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new StringIndexer(). setInputCol("approved"). setOutputCol("label"), new GBTClassifier(). setFeaturesCol("features"). setLabelCol("label"). setThresholds(Array(1.0, 1.0)). setProbabilityCol("myProbability"). setPredictionCol("myPrediction"). setRawPredictionCol("myRawPrediction") )).fit(dataset) override val unserializedParams = Set("stringOrderType", "labelCol", "seed") }
Example 51
Source File: RandomForestClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.classification.RandomForestClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class RandomForestClassifierParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new StringIndexer(). setInputCol("approved"). setOutputCol("label"), new RandomForestClassifier(). setThresholds(Array(0.4)). setFeaturesCol("features"). setLabelCol("label"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "seed") }
Example 52
Source File: NaiveBayesClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class NaiveBayesClassifierParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index")). setOutputCol("features"), new StringIndexer(). setInputCol("approved"). setOutputCol("label"), new NaiveBayes(uid = "nb"). setModelType("multinomial"). setThresholds(Array(0.4)). setFeaturesCol("features"). setLabelCol("label"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "labelCol", "smoothing") }
Example 53
Source File: GBTRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.regression.GBTRegressor import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql._ class GBTRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new GBTRegressor(). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "labelCol", "seed") }
Example 54
Source File: AFTSurvivalRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.regression.AFTSurvivalRegression import org.apache.spark.sql._ import org.apache.spark.sql.functions.lit class AFTSurvivalRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount").withColumn("censor", lit(1.0)) override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new OneHotEncoderEstimator(). setInputCols(Array("fico_index")). setOutputCols(Array("fico")), new VectorAssembler(). setInputCols(Array("fico", "dti")). setOutputCol("features"), new AFTSurvivalRegression(). setQuantileProbabilities(Array(0.5)). setFeaturesCol("features"). setLabelCol("loan_amount"). setQuantilesCol("quant"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("labelCol", "stringOrderType", "maxIter", "tol") }
Example 55
Source File: LinearRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler} import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class LinearRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new OneHotEncoderEstimator(). setInputCols(Array("fico_index")). setOutputCols(Array("fico")), new VectorAssembler(). setInputCols(Array("fico", "dti")). setOutputCol("features"), new LinearRegression(). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "elasticNetParam", "maxIter", "tol", "epsilon", "labelCol", "loss", "regParam", "solver") }
Example 56
Source File: RandomForestRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.regression.RandomForestRegressor import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql._ class RandomForestRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new RandomForestRegressor(). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "labelCol", "seed") }
Example 57
Source File: DecisionTreeRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.regression.DecisionTreeRegressor import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql._ class DecisionTreeRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new DecisionTreeRegressor(). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "labelCol", "seed") }
Example 58
Source File: GeneralizedLinearRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.regression.GeneralizedLinearRegression import org.apache.spark.sql._ class GeneralizedLinearRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new OneHotEncoderEstimator(). setInputCols(Array("fico_index")). setOutputCols(Array("fico")), new VectorAssembler(). setInputCols(Array("fico", "dti")). setOutputCol("features"), new GeneralizedLinearRegression(). setFamily("gaussian"). setLink("log"). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "labelCol", "maxIter", "tol", "regParam", "solver", "variancePower") }
Example 59
Source File: IsotonicRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.regression.IsotonicRegression import org.apache.spark.sql._ class IsotonicRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount").sample(withReplacement = true, 0.05) override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler(). setInputCols(Array("dti")). setOutputCol("features"), new IsotonicRegression(). setFeaturesCol("dti"). setLabelCol("loan_amount"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("labelCol") }
Example 60
Source File: SimpleSparkSerializer.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.spark import ml.combust.bundle.BundleFile import ml.combust.mleap.spark.SparkSupport._ import ml.combust.bundle.serializer.SerializationFormat import org.apache.spark.ml.Transformer import org.apache.spark.ml.bundle.SparkBundleContext import org.apache.spark.sql.DataFrame import resource._ class SimpleSparkSerializer() { def serializeToBundle(transformer: Transformer, path: String, dataset: DataFrame): Unit = { serializeToBundleWithFormat(transformer = transformer, path = path, dataset = dataset, format = SerializationFormat.Json) } def serializeToBundleWithFormat(transformer: Transformer, path: String, dataset: DataFrame, format: SerializationFormat = SerializationFormat.Json): Unit = { implicit val context: SparkBundleContext = Option(dataset). map(d => SparkBundleContext.defaultContext.withDataset(d)). getOrElse(SparkBundleContext.defaultContext) (for(file <- managed(BundleFile.load(path))) yield { transformer.writeBundle.format(format).save(file).get }).tried.get } def deserializeFromBundle(path: String): Transformer = { implicit val context: SparkBundleContext = SparkBundleContext.defaultContext (for(file <- managed(BundleFile.load(path))) yield { file.loadSparkBundle().get.root }).tried.get } }
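The SimpleSparkSerializer above hides bundle serialization behind two plain method calls. A minimal usage sketch follows; the fitted pipeline, the transformed DataFrame and the "jar:file:..." path are illustrative assumptions, not part of the original source.

import org.apache.spark.ml.Transformer

val serializer = new SimpleSparkSerializer()

// fittedPipeline is assumed to be a fitted PipelineModel and transformedDf the
// DataFrame produced by fittedPipeline.transform(trainingDf).
serializer.serializeToBundle(
  transformer = fittedPipeline,
  path = "jar:file:/tmp/simple-pipeline.zip",
  dataset = transformedDf)

// Load the bundle back as a plain Spark Transformer.
val restored: Transformer = serializer.deserializeFromBundle("jar:file:/tmp/simple-pipeline.zip")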
Example 61
Source File: SparkSupport.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.spark import java.net.URI import ml.combust.bundle.dsl.Bundle import ml.combust.bundle.{BundleFile, BundleWriter} import ml.combust.mleap.core.types import ml.combust.mleap.runtime.frame import ml.combust.mleap.runtime.frame.Row import org.apache.spark.ml.Transformer import org.apache.spark.ml.bundle.SparkBundleContext import org.apache.spark.sql.DataFrame import org.apache.spark.sql.mleap.TypeConverters import org.apache.spark.sql.types.StructType import resource._ import scala.util.Try trait SparkSupport { implicit class SparkTransformerOps(transformer: Transformer) { def writeBundle: BundleWriter[SparkBundleContext, Transformer] = BundleWriter(transformer) } implicit class SparkBundleFileOps(file: BundleFile) { def loadSparkBundle() (implicit context: SparkBundleContext): Try[Bundle[Transformer]] = file.load() } implicit class URIBundleFileOps(uri: URI) { def loadMleapBundle() (implicit context: SparkBundleContext): Try[Bundle[Transformer]] = { (for (bf <- managed(BundleFile.load(uri))) yield { bf.load[SparkBundleContext, Transformer]().get }).tried } } implicit class MleapSparkTransformerOps[T <: frame.Transformer](transformer: T) { def sparkTransform(dataset: DataFrame): DataFrame = { transformer.transform(dataset.toSparkLeapFrame).get.toSpark } } implicit class SparkDataFrameOps(dataset: DataFrame) { def toSparkLeapFrame: SparkLeapFrame = { val spec = dataset.schema.fields. map(f => TypeConverters.sparkToMleapConverter(dataset, f)) val schema = types.StructType(spec.map(_._1)).get val converters = spec.map(_._2) val data = dataset.rdd.map(r => { val values = r.toSeq.zip(converters).map { case (v, c) => c(v) } Row(values: _*) }) SparkLeapFrame(schema, data, dataset.sqlContext) } def mleapSchema: types.StructType = TypeConverters.sparkSchemaToMleapSchema(dataset) } implicit class MleapSchemaOps(schema: types.StructType) { def toSpark: StructType = TypeConverters.mleapSchemaToSparkSchema(schema) } } object SparkSupport extends SparkSupport
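The same round trip can be written against the implicit ops that SparkSupport adds, mirroring what SimpleSparkSerializer does internally. This is only a sketch under the assumption that fittedPipeline and transformedDf already exist; the bundle path is a placeholder.

import ml.combust.bundle.BundleFile
import ml.combust.bundle.serializer.SerializationFormat
import ml.combust.mleap.spark.SparkSupport._
import org.apache.spark.ml.bundle.SparkBundleContext
import resource._

// The context needs a transformed DataFrame so MLeap can capture schemas and metadata.
implicit val sbc: SparkBundleContext =
  SparkBundleContext.defaultContext.withDataset(transformedDf)

// Write via the SparkTransformerOps implicit.
(for (bf <- managed(BundleFile.load("jar:file:/tmp/pipeline.zip"))) yield {
  fittedPipeline.writeBundle.format(SerializationFormat.Json).save(bf).get
}).tried.get

// Read back via the SparkBundleFileOps implicit.
val restored = (for (bf <- managed(BundleFile.load("jar:file:/tmp/pipeline.zip"))) yield {
  bf.loadSparkBundle().get.root
}).tried.get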
Example 62
Source File: SimpleSparkOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl.{Node, NodeShape} import ml.combust.bundle.op.OpNode import org.apache.spark.ml.Transformer import scala.reflect.ClassTag abstract class SimpleSparkOp[N <: Transformer](implicit ct: ClassTag[N]) extends OpNode[SparkBundleContext, N, N] { override val klazz: Class[N] = ct.runtimeClass.asInstanceOf[Class[N]] def sparkInputs(obj: N): Seq[ParamSpec] def sparkOutputs(obj: N): Seq[ParamSpec] override def name(node: N): String = node.uid override def model(node: N): N = node def sparkLoad(uid: String, shape: NodeShape, model: N): N override def load(node: Node, model: N) (implicit context: BundleContext[SparkBundleContext]): N = { val n = sparkLoad(node.name, node.shape, model) SparkShapeLoader(node.shape, n, sparkInputs(n), sparkOutputs(n)).loadShape() n } override def shape(node: N) (implicit context: BundleContext[SparkBundleContext]): NodeShape = { val dataset = context.context.dataset.getOrElse { throw new IllegalArgumentException( """ |Must provide a transformed data frame to MLeap for serializing a pipeline. |The transformed data frame is used to extract data types and other metadata |required for execution. | |Example usage: |``` |val sparkTransformer: org.apache.spark.ml.Transformer |val transformedDataset = sparkTransformer.transform(trainingDataset) | |implicit val sbc = SparkBundleContext().withDataset(transformedDataset) | |for(bf <- managed(BundleFile(file))) { | sparkTransformer.writeBundle.format(SerializationFormat.Json).save(bf).get |} |``` """.stripMargin) } SparkShapeSaver(dataset, node, sparkInputs(node), sparkOutputs(node)).asNodeShape } }
Example 63
Source File: StringMap.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.{HandleInvalid, StringMapModel} import org.apache.hadoop.fs.Path import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types._ private val className = classOf[StringMap].getName override def load(path: String): StringMap = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath).select("labels", "handleInvalid", "defaultValue").head() val labels = data.getAs[Map[String, Double]](0) val handleInvalid = HandleInvalid.fromString(data.getAs[String](1)) val defaultValue = data.getAs[Double](2) val model = new StringMapModel(labels, handleInvalid = handleInvalid, defaultValue = defaultValue) val transformer = new StringMap(metadata.uid, model) metadata.getAndSetParams(transformer) transformer } } }
Example 64
Source File: MathUnary.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation} import org.apache.hadoop.fs.Path import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType} import org.apache.spark.sql.functions.udf private val className = classOf[MathUnary].getName override def load(path: String): MathUnary = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath).select("operation").head() val operation = data.getAs[String](0) val model = MathUnaryModel(UnaryOperation.forName(operation)) val transformer = new MathUnary(metadata.uid, model) metadata.getAndSetParams(transformer) transformer } } }
Example 65
Source File: WordLengthFilter.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.WordLengthFilterModel import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators, Params} import org.apache.spark.ml.util._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} final def getWordLength: Int = $(wordLength) } class WordLengthFilter(override val uid: String) extends Transformer with WordLengthFilterParams with DefaultParamsWritable { val defaultLength = 3 var model: WordLengthFilterModel = new WordLengthFilterModel(defaultLength) //Initialize with default filter length 3 def this(model: WordLengthFilterModel) = this(uid = Identifiable.randomUID("filter_words")) def this() = this(new WordLengthFilterModel) def setInputCol(value: String): this.type = set(inputCol, value) def setOutputCol(value: String): this.type = set(outputCol, value) def setWordLength(value: Int = defaultLength): this.type = set(wordLength, value) override def transform(dataset: Dataset[_]): DataFrame = { if(defaultLength != getWordLength) model = new WordLengthFilterModel(getWordLength) val filterWordsUdf = udf { (words: Seq[String]) => model(words) } dataset.withColumn($(outputCol), filterWordsUdf(dataset($(inputCol)))) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { require(schema($(inputCol)).dataType.isInstanceOf[ArrayType], s"Input column must be of type ArrayType(StringType,true) but got ${schema($(inputCol)).dataType}") val inputFields = schema.fields require(!inputFields.exists(_.name == $(outputCol)), s"Output column ${$(outputCol)} already exists.") StructType(schema.fields :+ StructField($(outputCol), ArrayType(StringType, true))) } } object WordLengthFilter extends DefaultParamsReadable[WordLengthFilter] { override def load(path: String): WordLengthFilter = super.load(path) }
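A short, hypothetical pipeline showing how the WordLengthFilter above is typically wired behind a Tokenizer; the docsDf DataFrame and its text column are assumptions made for the example.

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.mleap.feature.WordLengthFilter

val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val wordFilter = new WordLengthFilter()
  .setInputCol("words")
  .setOutputCol("filtered_words")
  .setWordLength(3)  // tokens are filtered against this length by WordLengthFilterModel

val pipeline = new Pipeline().setStages(Array(tokenizer, wordFilter))
val filtered = pipeline.fit(docsDf).transform(docsDf)  // docsDf("text"): StringType, assumed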
Example 66
Source File: MultinomialLabeler.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.MultinomialLabelerModel import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasFeaturesCol import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types._ import org.apache.spark.sql.functions.{udf, col} import ml.combust.mleap.core.util.VectorConverters._ class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"), val model: MultinomialLabelerModel) extends Transformer with HasFeaturesCol with HasProbabilitiesCol with HasLabelsCol { def setFeaturesCol(value: String): this.type = set(featuresCol, value) def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value) def setLabelsCol(value: String): this.type = set(labelsCol, value) @org.apache.spark.annotation.Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val probabilitiesUdf = udf { (vector: Vector) => model.top(vector).map(_._1).toArray } val labelsUdf = udf { (vector: Vector) => model.topLabels(vector).toArray } dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))). withColumn($(labelsCol), labelsUdf(col($(featuresCol)))) } override def copy(extra: ParamMap): Transformer = copyValues(new MultinomialLabeler(uid, model), extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT], s"Features column must be of type NumericType but got ${schema($(featuresCol)).dataType}") val inputFields = schema.fields require(!inputFields.exists(_.name == $(probabilitiesCol)), s"Output column ${$(probabilitiesCol)} already exists.") require(!inputFields.exists(_.name == $(labelsCol)), s"Output column ${$(labelsCol)} already exists.") StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)), StructField($(labelsCol), ArrayType(StringType)))) } }
Example 67
Source File: ImputerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature import org.apache.spark.ml.Transformer import org.apache.spark.ml.mleap.feature.Imputer import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ import org.apache.spark.sql.types.{DoubleType, StructType} import scala.util.Random class ImputerParitySpec extends SparkParityBase { def randomRow(): Row = { if(Random.nextBoolean()) { if(Random.nextBoolean()) { Row(23.4) } else { Row(Random.nextDouble()) } } else { Row(33.2) } } val rows = spark.sparkContext.parallelize(Seq.tabulate(100) { i => randomRow() }) val schema = new StructType().add("mv", DoubleType, nullable = true) override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema) override val sparkTransformer: Transformer = new Imputer(uid = "imputer"). setInputCol("mv"). setOutputCol("mv_imputed"). setMissingValue(23.4). setStrategy("mean").fit(dataset) }
Example 68
Source File: MathUnaryParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature import ml.combust.mleap.core.feature.MathUnaryModel import ml.combust.mleap.core.feature.UnaryOperation.Tan import org.apache.spark.ml.feature.StringIndexer import org.apache.spark.ml.mleap.feature.MathUnary import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class MathUnaryParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new MathUnary(uid = "math_unary", model = MathUnaryModel(Tan)). setInputCol("dti"). setOutputCol("dti_tan") )).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 69
Source File: MultinomialLabelerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature import ml.combust.mleap.core.feature.{MultinomialLabelerModel, ReverseStringIndexerModel} import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.mleap.feature.MultinomialLabeler import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class MultinomialLabelerParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new MultinomialLabeler(uid = "multinomial_labeler", model = MultinomialLabelerModel(threshold = 0.1, indexer = ReverseStringIndexerModel(Seq("fico", "dtizy")))). setFeaturesCol("features"). setProbabilitiesCol("probabilities"). setLabelsCol("labels"))).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 70
Source File: StringMapParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature import ml.combust.mleap.core.feature.{HandleInvalid, StringMapModel} import org.apache.spark.ml.mleap.feature.StringMap import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql._ import org.apache.spark.sql.types.{StringType, StructType} class StringMapParitySpec extends SparkParityBase { val names = Seq("alice", "andy", "kevin") val rows = spark.sparkContext.parallelize(Seq.tabulate(3) { i => Row(names(i)) }) val schema = new StructType().add("name", StringType, nullable = false) override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema) override val sparkTransformer: Transformer = new Pipeline().setStages(Array( new StringMap(uid = "string_map", model = new StringMapModel( Map("alice" -> 0, "andy" -> 1, "kevin" -> 2) )).setInputCol("name").setOutputCol("index"), new StringMap(uid = "string_map2", model = new StringMapModel( // This map is missing the label "kevin". Exception is thrown unless HandleInvalid.Keep is set. Map("alice" -> 0, "andy" -> 1), handleInvalid = HandleInvalid.Keep, defaultValue = 1.0 )).setInputCol("name").setOutputCol("index2") )).fit(dataset) }
Example 71
Source File: MathBinaryParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature import ml.combust.mleap.core.feature.BinaryOperation.Multiply import ml.combust.mleap.core.feature.MathBinaryModel import org.apache.spark.ml.feature.StringIndexer import org.apache.spark.ml.mleap.feature.MathBinary import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class MathBinaryParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new MathBinary(uid = "math_bin", model = MathBinaryModel(Multiply)). setInputA("fico_index"). setInputB("dti"). setOutputCol("bin_out") )).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 72
Source File: SupportVectorMachineParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.classification import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.mleap.classification.SVMModel import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.mllib import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql._ class SupportVectorMachineParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new SVMModel(uid = "svm", model = new mllib.classification.SVMModel(weights = Vectors.dense(0.53, 0.67), intercept = 0.77)). setRawPredictionCol("raw_prediction"). setProbabilityCol("probability"))).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 73
Source File: HashingTF.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
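For context, this is the usual way the transformer above is driven from user code; sentencesDf and the column names are illustrative.

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("rawFeatures")
  .setNumFeatures(1 << 18)  // number of hash buckets
  .setBinary(false)         // term-frequency counts rather than 0/1 indicators

val featurized = hashingTF.transform(tokenizer.transform(sentencesDf))  // sentencesDf assumed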
Example 74
Source File: SQLTransformer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.Transformer import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.types.StructType @Since("1.6.0") def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val tableName = Identifiable.randomUID(uid) dataset.createOrReplaceTempView(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val result = dataset.sparkSession.sql(realStatement) dataset.sparkSession.catalog.dropTempView(tableName) result } @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { val spark = SparkSession.builder().getOrCreate() val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty)) val dummyDF = spark.createDataFrame(dummyRDD, schema) val tableName = Identifiable.randomUID(uid) val realStatement = $(statement).replace(tableIdentifier, tableName) dummyDF.createOrReplaceTempView(tableName) val outputSchema = spark.sql(realStatement).schema spark.catalog.dropTempView(tableName) outputSchema } @Since("1.6.0") override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } @Since("1.6.0") object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { @Since("1.6.0") override def load(path: String): SQLTransformer = super.load(path) }
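A small usage sketch: the statement runs against a temporary view of the input Dataset, with __THIS__ standing in for that view exactly as the transform() implementation above shows. The input df and its columns are assumptions.

import org.apache.spark.ml.feature.SQLTransformer

val sqlTrans = new SQLTransformer()
  .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

// df is assumed to have numeric columns v1 and v2.
val withDerived = sqlTrans.transform(df)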
Example 75
Source File: Binarizer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.collection.mutable.ArrayBuilder import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.BinaryAttribute import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ @Since("1.4.0") def setOutputCol(value: String): this.type = set(outputCol, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema, logging = true) val schema = dataset.schema val inputType = schema($(inputCol)).dataType val td = $(threshold) val binarizerDouble = udf { in: Double => if (in > td) 1.0 else 0.0 } val binarizerVector = udf { (data: Vector) => val indices = ArrayBuilder.make[Int] val values = ArrayBuilder.make[Double] data.foreachActive { (index, value) => if (value > td) { indices += index values += 1.0 } } Vectors.sparse(data.size, indices.result(), values.result()).compressed } val metadata = outputSchema($(outputCol)).metadata inputType match { case DoubleType => dataset.select(col("*"), binarizerDouble(col($(inputCol))).as($(outputCol), metadata)) case _: VectorUDT => dataset.select(col("*"), binarizerVector(col($(inputCol))).as($(outputCol), metadata)) } } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType val outputColName = $(outputCol) val outCol: StructField = inputType match { case DoubleType => BinaryAttribute.defaultAttr.withName(outputColName).toStructField() case _: VectorUDT => StructField(outputColName, new VectorUDT) case _ => throw new IllegalArgumentException(s"Data type $inputType is not supported.") } if (schema.fieldNames.contains(outputColName)) { throw new IllegalArgumentException(s"Output column $outputColName already exists.") } StructType(schema.fields :+ outCol) } @Since("1.4.1") override def copy(extra: ParamMap): Binarizer = defaultCopy(extra) } @Since("1.6.0") object Binarizer extends DefaultParamsReadable[Binarizer] { @Since("1.6.0") override def load(path: String): Binarizer = super.load(path) }
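As the two UDFs in transform() show, Binarizer accepts either a Double column or a Vector column. A minimal sketch for the Double case, with illustrative names:

import org.apache.spark.ml.feature.Binarizer

val binarizer = new Binarizer()
  .setInputCol("score")         // DoubleType column, assumed to exist in df
  .setOutputCol("binary_score")
  .setThreshold(0.5)            // values > 0.5 map to 1.0, everything else to 0.0

val binarized = binarizer.transform(df)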
Example 76
Source File: Word2Vec.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import scala.util.Random import org.apache.spark.ml import org.apache.spark.ml.{PipelineStage, Transformer} import org.apache.spark.ml.feature.Word2VecModel import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, split} import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object Word2Vec extends BenchmarkAlgorithm with TestFromTraining { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ val df = DataGenerator.generateDoc( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, vocabSize, docLength, "text" ) df.select(split(col("text"), " ").as("text")) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { new ml.feature.Word2Vec().setInputCol("text") } override def testAdditionalMethods( ctx: MLBenchContext, model: Transformer): Map[String, () => _] = { import ctx.params._ val rng = new Random(ctx.seed()) val word2vecModel = model.asInstanceOf[Word2VecModel] val testWord = Vectors.dense(Array.fill(word2vecModel.getVectorSize)(rng.nextGaussian())) Map("findSynonyms" -> (() => { word2vecModel.findSynonyms(testWord, numSynonymsToFind) })) } }
Example 77
Source File: FPGrowth.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.fpm import org.apache.spark.ml import org.apache.spark.ml.{PipelineStage, Transformer} import org.apache.spark.ml.fpm.FPGrowthModel import org.apache.spark.sql.DataFrame import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object FPGrowth extends BenchmarkAlgorithm with TestFromTraining { def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ DataGenerator.generateItemSet( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numItems, itemSetSize) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { new ml.fpm.FPGrowth() .setItemsCol("items") } override def testAdditionalMethods( ctx: MLBenchContext, model: Transformer): Map[String, () => _] = { val fpModel = model.asInstanceOf[FPGrowthModel] Map("associationRules" -> (() => { fpModel.associationRules.count() })) } }
Example 78
Source File: NaiveBayes.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.classification import org.apache.spark.ml import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer} import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator} import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object NaiveBayes extends BenchmarkAlgorithm with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ val rng = ctx.newGenerator() // Max possible arity of a feature in generated training/test data for NaiveBayes models val maxFeatureArity = 20 // All features for Naive Bayes must be categorical, i.e. have arity >= 2 val featureArity = 0.until(numFeatures).map(_ => 2 + rng.nextInt(maxFeatureArity - 2)).toArray DataGenerator.generateMixedFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, featureArity) } override protected def trueModel(ctx: MLBenchContext): Transformer = { import ctx.params._ val rng = ctx.newGenerator() // pi = log of class priors, whose dimension is C (number of classes) // theta = log of class conditional probabilities, whose dimension is C (number of classes) // by D (number of features) val unnormalizedProbs = 0.until(numClasses).map(_ => rng.nextDouble() + 1e-5).toArray val logProbSum = math.log(unnormalizedProbs.sum) val piArray = unnormalizedProbs.map(prob => math.log(prob) - logProbSum) // For class i, set the class-conditional probability of feature i to 0.7, and split up the // remaining probability mass across the other features val currClassProb = 0.7 val thetaArray = Array.tabulate(numClasses) { i: Int => val baseProbMass = (1 - currClassProb) / (numFeatures - 1) val probs = Array.fill[Double](numFeatures)(baseProbMass) probs(i) = currClassProb probs }.map(_.map(math.log)) // Initialize new Naive Bayes model val pi = Vectors.dense(piArray) val theta = new DenseMatrix(numClasses, numFeatures, thetaArray.flatten, true) ModelBuilderSSP.newNaiveBayesModel(pi, theta) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.classification.NaiveBayes() .setSmoothing(smoothing) } override protected def evaluator(ctx: MLBenchContext): Evaluator = new MulticlassClassificationEvaluator() }
Example 79
Source File: LinearSVC.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.classification import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator} import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer} import org.apache.spark.ml import org.apache.spark.ml.linalg.Vectors import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object LinearSVC extends BenchmarkAlgorithm with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numFeatures) } override protected def trueModel(ctx: MLBenchContext): Transformer = { val rng = ctx.newGenerator() val coefficients = Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) // Small intercept to prevent some skew in the data. val intercept = 0.01 * (2 * rng.nextDouble - 1) ModelBuilderSSP.newLinearSVCModel(coefficients, intercept) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.classification.LinearSVC() .setTol(tol) .setMaxIter(maxIter) .setRegParam(regParam) } override protected def evaluator(ctx: MLBenchContext): Evaluator = new MulticlassClassificationEvaluator() }
Example 80
Source File: GBTClassification.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.classification import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ object GBTClassification extends BenchmarkAlgorithm with TreeOrForestClassifier { import TreeOrForestEstimator.getFeatureArity override protected def trueModel(ctx: MLBenchContext): Transformer = { import ctx.params._ // We add +1 to the depth to make it more likely that many iterations of boosting are needed // to model the true tree. ModelBuilderSSP.newDecisionTreeClassificationModel(depth + 1, numClasses, getFeatureArity(ctx), ctx.seed()) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ // TODO: subsamplingRate, featureSubsetStrategy // TODO: cacheNodeIds, checkpoint? new GBTClassifier() .setMaxDepth(depth) .setMaxIter(maxIter) .setSeed(ctx.seed()) } }
Example 81
Source File: LogisticRegression.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.classification import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator} import org.apache.spark.ml.{Estimator, ModelBuilderSSP, PipelineStage, Transformer} import org.apache.spark.ml import org.apache.spark.ml.linalg.Vectors object LogisticRegression extends BenchmarkAlgorithm with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numFeatures) } override protected def trueModel(ctx: MLBenchContext): Transformer = { val rng = ctx.newGenerator() val coefficients = Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) // Small intercept to prevent some skew in the data. val intercept = 0.01 * (2 * rng.nextDouble - 1) ModelBuilderSSP.newLogisticRegressionModel(coefficients, intercept) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.classification.LogisticRegression() .setTol(tol) .setMaxIter(maxIter) .setRegParam(regParam) } override protected def evaluator(ctx: MLBenchContext): Evaluator = new MulticlassClassificationEvaluator() }
Example 82
Source File: GLMRegression.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.regression import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.GeneralizedLinearRegression import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object GLMRegression extends BenchmarkAlgorithm with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numFeatures) } override protected def trueModel(ctx: MLBenchContext): Transformer = { import ctx.params._ val rng = ctx.newGenerator() val coefficients = Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) // Small intercept to prevent some skew in the data. val intercept = 0.01 * (2 * rng.nextDouble - 1) val m = ModelBuilderSSP.newGLR(coefficients, intercept) m.set(m.link, link.get) m.set(m.family, family.get) m } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new GeneralizedLinearRegression() .setLink(link) .setFamily(family) .setRegParam(regParam) .setMaxIter(maxIter) .setTol(tol) } override protected def evaluator(ctx: MLBenchContext): Evaluator = new RegressionEvaluator() }
Example 83
Source File: LinearRegression.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.regression import org.apache.spark.ml import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object LinearRegression extends BenchmarkAlgorithm with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numFeatures) } override protected def trueModel(ctx: MLBenchContext): Transformer = { val rng = ctx.newGenerator() val coefficients = Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) // Small intercept to prevent some skew in the data. val intercept = 0.01 * (2 * rng.nextDouble - 1) ModelBuilderSSP.newLinearRegressionModel(coefficients, intercept) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.regression.LinearRegression() .setSolver("l-bfgs") .setRegParam(regParam) .setMaxIter(maxIter) .setTol(tol) } override protected def evaluator(ctx: MLBenchContext): Evaluator = new RegressionEvaluator() }
Example 84
Source File: Cleaner.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions.clean import com.hankcs.hanlp.HanLP import config.paramconf.{HasOutputCol, HasInputCol} import functions.MySchemaUtils import functions.clean.chinese.BCConvert import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, Dataset} setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1) override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema, logging = true) val cleanFunc = udf {line: String => var cleaned = "" getFanJian match { case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line) case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line) case _ => cleaned = line } getQuanBan match { case "q2b" => cleaned = BCConvert.qj2bj(cleaned) case "b2q" => cleaned = BCConvert.bj2qj(cleaned) case _ => cleaned = cleaned } cleaned } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record => val outputIndex = record.fieldIndex($(outputCol)) record.getString(outputIndex).length >= getMinLineLen } } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.typeName.equals(StringType.typeName), s"Input type must be StringType but got $inputType.") MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable) } } object Cleaner extends DefaultParamsReadable[Cleaner] { override def load(path: String): Cleaner = super.load(path) }
Example 85
Source File: LightPipeline.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import org.apache.spark.ml.{PipelineModel, Transformer} import org.apache.spark.sql.{DataFrame, Dataset} import scala.collection.JavaConverters._ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddingsVectors: Boolean = false) { private var ignoreUnsupported = false def setIgnoreUnsupported(v: Boolean): Unit = ignoreUnsupported = v def getIgnoreUnsupported: Boolean = ignoreUnsupported def getStages: Array[Transformer] = pipelineModel.stages def transform(dataFrame: Dataset[_]): DataFrame = pipelineModel.transform(dataFrame) def fullAnnotate(target: String, startWith: Map[String, Seq[Annotation]] = Map.empty[String, Seq[Annotation]]): Map[String, Seq[Annotation]] = { getStages.foldLeft(startWith)((annotations, transformer) => { transformer match { case documentAssembler: DocumentAssembler => annotations.updated(documentAssembler.getOutputCol, documentAssembler.assemble(target, Map.empty[String, String])) case lazyAnnotator: AnnotatorModel[_] if lazyAnnotator.getLazyAnnotator => annotations case recursiveAnnotator: HasRecursiveTransform[_] with AnnotatorModel[_] => val combinedAnnotations = recursiveAnnotator.getInputCols.foldLeft(Seq.empty[Annotation])((inputs, name) => inputs ++ annotations.getOrElse(name, Nil)) annotations.updated(recursiveAnnotator.getOutputCol, recursiveAnnotator.annotate(combinedAnnotations, pipelineModel)) case annotator: AnnotatorModel[_] => val combinedAnnotations = annotator.getInputCols.foldLeft(Seq.empty[Annotation])((inputs, name) => inputs ++ annotations.getOrElse(name, Nil)) annotations.updated(annotator.getOutputCol, annotator.annotate(combinedAnnotations)) case finisher: Finisher => annotations.filterKeys(finisher.getInputCols.contains) case rawModel: RawAnnotator[_] => if (ignoreUnsupported) annotations else throw new IllegalArgumentException(s"model ${rawModel.uid} does not support LightPipeline." + s" Call setIgnoreUnsupported(boolean) on LightPipeline to ignore") case pipeline: PipelineModel => new LightPipeline(pipeline, parseEmbeddingsVectors).fullAnnotate(target, annotations) case _ => annotations } }) } def fullAnnotate(targets: Array[String]): Array[Map[String, Seq[Annotation]]] = { targets.par.map(target => { fullAnnotate(target) }).toArray } def fullAnnotateJava(target: String): java.util.Map[String, java.util.List[JavaAnnotation]] = { fullAnnotate(target).mapValues(_.map(aa => JavaAnnotation(aa.annotatorType, aa.begin, aa.end, aa.result, aa.metadata.asJava)).asJava).asJava } def fullAnnotateJava(targets: java.util.ArrayList[String]): java.util.List[java.util.Map[String, java.util.List[JavaAnnotation]]] = { targets.asScala.par.map(target => { fullAnnotateJava(target) }).toList.asJava } def annotate(target: String): Map[String, Seq[String]] = { fullAnnotate(target).mapValues(_.map(a => { a.annotatorType match { case (AnnotatorType.WORD_EMBEDDINGS | AnnotatorType.SENTENCE_EMBEDDINGS) if (parseEmbeddingsVectors) => a.embeddings.mkString(" ") case _ => a.result } })) } def annotate(targets: Array[String]): Array[Map[String, Seq[String]]] = { targets.par.map(target => { annotate(target) }).toArray } def annotateJava(target: String): java.util.Map[String, java.util.List[String]] = { annotate(target).mapValues(_.asJava).asJava } def annotateJava(targets: java.util.ArrayList[String]): java.util.List[java.util.Map[String, java.util.List[String]]] = { targets.asScala.par.map(target => { annotateJava(target) }).toList.asJava } }
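A minimal sketch of driving the LightPipeline above directly from strings, bypassing DataFrames entirely; the fitted spark-nlp PipelineModel is assumed to exist.

import com.johnsnowlabs.nlp.LightPipeline

val light = new LightPipeline(fittedNlpPipeline)  // fittedNlpPipeline: PipelineModel, assumed

// Annotate a single string without building a DataFrame.
val single: Map[String, Seq[String]] = light.annotate("Spark NLP makes light work of this sentence.")

// Or annotate a batch; the inputs are processed in parallel.
val batch = light.annotate(Array("first document", "second document"))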
Example 86
Source File: AnnotatorApproach.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import com.johnsnowlabs.storage.HasStorage import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.{Estimator, Model, PipelineModel, Transformer} import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.types.{ArrayType, MetadataBuilder, StructField, StructType} import org.apache.spark.ml.util.DefaultParamsWritable override final def transformSchema(schema: StructType): StructType = { require(validate(schema), s"Wrong or missing inputCols annotators in $uid.\n" + msgHelper(schema) + s"\nMake sure such annotators exist in your pipeline, " + s"with the right output names and that they have following annotator types: " + s"${inputAnnotatorTypes.mkString(", ")}") val metadataBuilder: MetadataBuilder = new MetadataBuilder() metadataBuilder.putString("annotatorType", outputAnnotatorType) val outputFields = schema.fields :+ StructField(getOutputCol, ArrayType(Annotation.dataType), nullable = false, metadataBuilder.build) StructType(outputFields) } }
Example 87
Source File: RecursivePipeline.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import org.apache.spark.internal.Logging import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter} import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset} import scala.collection.mutable.ListBuffer class RecursivePipeline(override val uid: String, baseStages: Array[PipelineStage]) extends Pipeline { def this() = this(Identifiable.randomUID("RECURSIVE_PIPELINE"), Array.empty) def this(uid: String) = this(uid, Array.empty) def this(pipeline: Pipeline) = this(pipeline.uid, pipeline.getStages) this.setStages(baseStages) override def fit(dataset: Dataset[_]): PipelineModel = { transformSchema(dataset.schema, logging = true) val theStages = $(stages) var indexOfLastEstimator = -1 theStages.view.zipWithIndex.foreach { case (stage, index) => stage match { case _: Estimator[_] => indexOfLastEstimator = index case _ => } } var curDataset = dataset val transformers = ListBuffer.empty[Transformer] theStages.view.zipWithIndex.foreach { case (stage, index) => if (index <= indexOfLastEstimator) { val transformer = stage match { case estimator: HasRecursiveFit[_] => estimator.recursiveFit(curDataset, new Pipeline(uid).setStages(transformers.toArray).fit(dataset)) case estimator: Estimator[_] => estimator.fit(curDataset) case t: Transformer => t case _ => throw new IllegalArgumentException( s"Does not support stage $stage of type ${stage.getClass}") } if (index < indexOfLastEstimator) { curDataset = transformer.transform(curDataset) } transformers += transformer } else { transformers += stage.asInstanceOf[Transformer] } } createPipeline(dataset, transformers.toArray) } } class RecursivePipelineModel(override val uid: String, innerPipeline: PipelineModel) extends Model[RecursivePipelineModel] with MLWritable with Logging { def this(pipeline: PipelineModel) = this(pipeline.uid, pipeline) // drops right at most because is itself included private def createRecursiveAnnotators(dataset: Dataset[_]): PipelineModel = new Pipeline(uid).setStages(innerPipeline.stages.dropRight(1)).fit(dataset) override def copy(extra: ParamMap): RecursivePipelineModel = { new RecursivePipelineModel(uid, innerPipeline.copy(extra)) } override def write: MLWriter = { innerPipeline.write } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) innerPipeline.stages.foldLeft(dataset.toDF)((cur, transformer) => transformer match { case t: HasRecursiveTransform[_] => t.recursiveTransform(cur, createRecursiveAnnotators(dataset)) case t: AnnotatorModel[_] if t.getLazyAnnotator => cur case t: Transformer => t.transform(cur) }) } override def transformSchema(schema: StructType): StructType = { innerPipeline.transformSchema(schema) } }
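RecursivePipeline is used as a drop-in replacement for Pipeline; as the fit() override shows, estimators implementing HasRecursiveFit additionally receive a pipeline of the stages fitted so far. A sketch under the assumption that the usual spark-nlp stages and a training DataFrame already exist:

import com.johnsnowlabs.nlp.RecursivePipeline

val recursive = new RecursivePipeline()
  .setStages(Array(documentAssembler, sentenceDetector, tokenizer))  // assumed spark-nlp stages

val model = recursive.fit(trainingDf)        // trainingDf assumed
val annotated = model.transform(trainingDf)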
Example 88
Source File: Gather.scala From spark-ext with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.HasOutputCol
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.ext.functions._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

private[feature] trait GatherParams extends Params with HasKeyCol with HasValueCol with HasOutputCol {

  val primaryKeyCols: Param[Array[String]] = new StringArrayParam(this, "primaryKeyCols",
    "Primary key column names",
    ParamValidators.arrayLengthGt(0))

  val valueAgg: Param[String] = new Param[String](this, "valueAgg",
    "Aggregate function applied to valueCol: 'sum' or 'count'",
    ParamValidators.inArray(Array("sum", "count")))

  def getPrimaryKeyCols: Array[String] = $(primaryKeyCols)

  def getValueAgg: String = $(valueAgg)
}

class Gather(override val uid: String) extends Transformer with GatherParams {

  def this() = this(Identifiable.randomUID("gather"))

  def setPrimaryKeyCols(value: String*): this.type = set(primaryKeyCols, value.toArray)

  def setKeyCol(value: String): this.type = set(keyCol, value)

  def setValueCol(value: String): this.type = set(valueCol, value)

  def setValueAgg(value: String): this.type = set(valueAgg, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(
    valueAgg -> "sum"
  )

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)

    val pkCols = $(primaryKeyCols).map(col)

    val grouped = dataset.groupBy(pkCols :+ col($(keyCol)) : _*)

    val aggregateCol = s"${uid}_value_aggregate"
    val aggregated = $(valueAgg) match {
      case "sum" => grouped.agg(sum($(valueCol)) as aggregateCol)
      case "count" => grouped.agg(count($(valueCol)) as aggregateCol)
    }

    val metadata = outputSchema($(outputCol)).metadata

    aggregated
      .groupBy(pkCols: _*)
      .agg(collectArray(struct(
        col($(keyCol)),
        col(aggregateCol).cast(DoubleType).as($(valueCol))
      )).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val valueFunName = $(valueAgg)

    val keyColName = $(keyCol)
    val keyColDataType = schema(keyColName).dataType
    keyColDataType match {
      case _: NumericType =>
      case _: StringType =>
      case other =>
        throw new IllegalArgumentException(s"Key column data type $other is not supported.")
    }

    val valueColName = $(valueCol)
    val valueColDataType = schema(valueColName).dataType
    valueColDataType match {
      case _: NumericType =>
      case _: StringType if valueFunName == "count" =>
      case other =>
        throw new IllegalArgumentException(s"Value data type $other is not supported with value aggregate $valueFunName.")
    }

    val pkFields = $(primaryKeyCols).map(schema.apply)
    val rollupType = StructType(Array(
      StructField($(keyCol), keyColDataType),
      StructField($(valueCol), DoubleType)
    ))
    val rollupField = StructField($(outputCol), ArrayType(rollupType), nullable = false)

    StructType(pkFields :+ rollupField)
  }

  override def copy(extra: ParamMap): Gather = defaultCopy(extra)
}
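A short usage sketch for Gather, assuming a transaction-style DataFrame; the column names and sample rows are made up for illustration, and the HasKeyCol/HasValueCol traits (elided from the excerpt) are assumed to supply the params behind setKeyCol and setValueCol.

import org.apache.spark.ml.feature.Gather
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// toy clickstream data: one row per (cookie, site) impression count
val df = Seq(
  ("u1", "google.com", 10L),
  ("u1", "cnn.com", 3L),
  ("u2", "google.com", 2L)
).toDF("cookie_id", "site", "impressions")

val gather = new Gather()
  .setPrimaryKeyCols("cookie_id")
  .setKeyCol("site")
  .setValueCol("impressions")
  .setValueAgg("sum")      // sum impressions per (cookie_id, site)
  .setOutputCol("sites")

// one row per cookie_id with an array<struct<site, impressions>> column
val gathered = gather.transform(df)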
Example 89
Source File: S2CellTransformer.scala From spark-ext with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import com.google.common.geometry.{S2LatLng, S2CellId} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} class S2CellTransformer(override val uid: String) extends Transformer { def this() = this(Identifiable.randomUID("S2CellTransformer")) // Input/Output column names val latCol: Param[String] = new Param[String](this, "latCol", "latitude column") val lonCol: Param[String] = new Param[String](this, "lonCol", "longitude column") val cellCol: Param[String] = new Param[String](this, "cellCol", "S2 Cell Id column") val level: Param[Int] = new IntParam(this, "level", "S2 Level [0, 30]", (i: Int) => ParamValidators.gtEq(0)(i) && ParamValidators.ltEq(30)(i)) // Default parameters setDefault( latCol -> "lat", lonCol -> "lon", cellCol -> "cell", level -> 10 ) def getLatCol: String = $(latCol) def getLonCol: String = $(lonCol) def getCellCol: String = $(cellCol) def getLevel: Int = $(level) def setLatCol(value: String): this.type = set(latCol, value) def setLonCol(value: String): this.type = set(lonCol, value) def setCellCol(value: String): this.type = set(cellCol, value) def setLevel(value: Int): this.type = set(level, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val currentLevel = $(level) val t = udf { (lat: Double, lon: Double) => val cellId = S2CellId.fromLatLng(S2LatLng.fromDegrees(lat, lon)) cellId.parent(currentLevel).toToken } val metadata = outputSchema($(cellCol)).metadata dataset.select(col("*"), t(col($(latCol)), col($(lonCol))).as($(cellCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val latColumnName = $(latCol) val latDataType = schema(latColumnName).dataType require(latDataType == DoubleType, s"The latitude column $latColumnName must be Double type, " + s"but got $latDataType.") val lonColumnName = $(lonCol) val lonDataType = schema(lonColumnName).dataType require(lonDataType == DoubleType, s"The longitude column $lonColumnName must be Double type, " + s"but got $lonDataType.") val inputFields = schema.fields val outputColName = $(cellCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") val attr = NominalAttribute.defaultAttr.withName($(cellCol)) val outputFields = inputFields :+ attr.toStructField() StructType(outputFields) } override def copy(extra: ParamMap): S2CellTransformer = defaultCopy(extra) }
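A usage sketch for S2CellTransformer, assuming the latitude/longitude columns use the transformer's default names; the coordinates below are illustrative.

import org.apache.spark.ml.feature.S2CellTransformer
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val checkins = Seq(
  ("a", 52.0867, 5.1169),    // Utrecht
  ("b", 40.7484, -73.9857)   // New York
).toDF("id", "lat", "lon")

// adds a nominal "cell" column holding the level-6 S2 cell token
val withCells = new S2CellTransformer().setLevel(6).transform(checkins)
withCells.show()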
Example 90
Source File: IndexToValue.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.featurize

import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol, Wrappable}
import com.microsoft.ml.spark.core.schema.{CategoricalColumnInfo, CategoricalUtilities}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import com.microsoft.ml.spark.core.schema.SchemaConstants._

import scala.reflect.ClassTag
import reflect.runtime.universe.TypeTag

object IndexToValue extends DefaultParamsReadable[IndexToValue]

  override def transform(dataset: Dataset[_]): DataFrame = {
    val info = new CategoricalColumnInfo(dataset.toDF(), getInputCol)
    require(info.isCategorical, "column " + getInputCol + " is not Categorical")
    val dataType = info.dataType
    val getLevel =
      dataType match {
        case _: IntegerType => getLevelUDF[Int](dataset)
        case _: LongType => getLevelUDF[Long](dataset)
        case _: DoubleType => getLevelUDF[Double](dataset)
        case _: StringType => getLevelUDF[String](dataset)
        case _: BooleanType => getLevelUDF[Boolean](dataset)
        case _ => throw new Exception("Unsupported type " + dataType.toString)
      }
    dataset.withColumn(getOutputCol, getLevel(dataset(getInputCol)).as(getOutputCol))
  }

  private class Default[T] { var value: T = _ }

  def getLevelUDF[T: TypeTag](dataset: Dataset[_])(implicit ct: ClassTag[T]): UserDefinedFunction = {
    val map = CategoricalUtilities.getMap[T](dataset.schema(getInputCol).metadata)
    udf((index: Int) => {
      if (index == map.numLevels && map.hasNullLevel) {
        new Default[T].value
      } else {
        map.getLevelOption(index)
          .getOrElse(throw new IndexOutOfBoundsException(
            "Invalid metadata: Index greater than number of levels in metadata, " +
              s"index: $index, levels: ${map.numLevels}"))
      }
    })
  }

  def transformSchema(schema: StructType): StructType = {
    val metadata = schema(getInputCol).metadata
    val dataType =
      if (metadata.contains(MMLTag)) {
        CategoricalColumnInfo.getDataType(metadata, throwOnInvalid = true).get
      } else {
        schema(getInputCol).dataType
      }
    val newField = StructField(getOutputCol, dataType)
    if (schema.fieldNames.contains(getOutputCol)) {
      val index = schema.fieldIndex(getOutputCol)
      val fields = schema.fields
      fields(index) = newField
      StructType(fields)
    } else {
      schema.add(newField)
    }
  }

  def copy(extra: ParamMap): this.type = defaultCopy(extra)
}
Example 91
Source File: StratifiedRepartition.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.contracts.{HasLabelCol, Wrappable}
import org.apache.spark.RangePartitioner
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.HasSeed
import org.apache.spark.ml.util._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

  override def transform(dataset: Dataset[_]): DataFrame = {
    // Count unique values in label column
    val distinctLabelCounts = dataset.select(getLabelCol).groupBy(getLabelCol).count().collect()
    val labelToCount = distinctLabelCounts.map(row => (row.getInt(0), row.getLong(1)))
    val labelToFraction = getMode match {
      case SPConstants.Equal => getEqualLabelCount(labelToCount, dataset)
      case SPConstants.Mixed =>
        val equalLabelToCount = getEqualLabelCount(labelToCount, dataset)
        val normalizedRatio = equalLabelToCount.map { case (label, count) => count }.sum / labelToCount.length
        labelToCount.map { case (label, count) => (label, count / normalizedRatio) }.toMap
      case SPConstants.Original => labelToCount.map { case (label, count) => (label, 1.0) }.toMap
      case _ => throw new Exception(s"Unknown mode specified to StratifiedRepartition: $getMode")
    }
    val labelColIndex = dataset.schema.fieldIndex(getLabelCol)
    val spdata = dataset.toDF().rdd.keyBy(row => row.getInt(labelColIndex))
      .sampleByKeyExact(true, labelToFraction, getSeed)
      .mapPartitions(keyToRow => keyToRow.zipWithIndex.map { case ((key, row), index) => (index, row) })
    val rangePartitioner = new RangePartitioner(dataset.rdd.getNumPartitions, spdata)
    val rspdata = spdata.partitionBy(rangePartitioner)
      .mapPartitions(keyToRow => keyToRow.map { case (key, row) => row }).persist()
    dataset.sqlContext.createDataFrame(rspdata, dataset.schema)
  }

  private def getEqualLabelCount(labelToCount: Array[(Int, Long)], dataset: Dataset[_]): Map[Int, Double] = {
    val maxLabelCount = Math.max(labelToCount.map { case (label, count) => count }.max, dataset.rdd.getNumPartitions)
    labelToCount.map { case (label, count) => (label, maxLabelCount.toDouble / count) }.toMap
  }

  def transformSchema(schema: StructType): StructType = schema

  def copy(extra: ParamMap): StratifiedRepartition = defaultCopy(extra)
}
Example 92
Source File: UDFTransformer.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasInputCols, HasOutputCol, Wrappable} import com.microsoft.ml.spark.core.env.InternalWrapper import com.microsoft.ml.spark.core.serialize.ComplexParam import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param.{ParamMap, UDFParam, UDPyFParam} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.execution.python.UserDefinedPythonFunction import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.types.{DataType, StructField, StructType} import org.apache.spark.sql.{Column, DataFrame, Dataset} import org.apache.spark.sql.functions.col object UDFTransformer extends ComplexParamsReadable[UDFTransformer] override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) if (isSet(inputCol)) { dataset.withColumn(getOutputCol, applyUDF(dataset.col(getInputCol))) } else { dataset.withColumn(getOutputCol, applyUDFOnCols(getInputCols.map(col): _*)) } } def validateAndTransformSchema(schema: StructType): StructType = { if (isSet(inputCol)) schema(getInputCol) else schema(Set(getInputCols: _*)) schema.add(StructField(getOutputCol, getDataType)) } def transformSchema(schema: StructType): StructType = validateAndTransformSchema(schema) def copy(extra: ParamMap): UDFTransformer = defaultCopy(extra) }
Example 93
Source File: DropColumns.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.Wrappable import org.apache.spark.ml.Transformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} object DropColumns extends DefaultParamsReadable[DropColumns] override def transform(dataset: Dataset[_]): DataFrame = { verifySchema(dataset.schema) dataset.toDF().drop(getCols: _*) } def transformSchema(schema: StructType): StructType = { verifySchema(schema) val droppedCols = getCols.toSet StructType(schema.fields.filter(f => !droppedCols(f.name))) } def copy(extra: ParamMap): DropColumns = defaultCopy(extra) private def verifySchema(schema: StructType): Unit = { val providedCols = schema.fields.map(_.name).toSet val invalidCols = getCols.filter(!providedCols(_)) if (invalidCols.length > 0) { throw new NoSuchElementException( s"DataFrame does not contain specified columns: ${invalidCols.reduce(_ + "," + _)}") } } }
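The excerpt above only shows getCols, so the following sketch assumes a conventional setCols(values: Array[String]) setter on DropColumns; treat that setter as an assumption rather than confirmed API.

import com.microsoft.ml.spark.stages.DropColumns
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq((1, "a", true), (2, "b", false)).toDF("id", "name", "flag")

// setCols is assumed; the column param definition is elided from the excerpt
val dropped = new DropColumns().setCols(Array("flag")).transform(df)
dropped.columns // Array(id, name)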
Example 94
Source File: Repartition.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.Wrappable import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.ml.Transformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.types._ object Repartition extends DefaultParamsReadable[Repartition] override def transform(dataset: Dataset[_]): DataFrame = { if (getDisable) dataset.toDF else if (getN < dataset.rdd.getNumPartitions) dataset.coalesce(getN).toDF() else dataset.sqlContext.createDataFrame( dataset.rdd.repartition(getN).asInstanceOf[RDD[Row]], dataset.schema) } def transformSchema(schema: StructType): StructType = { schema } def copy(extra: ParamMap): this.type = defaultCopy(extra) }
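A sketch of how Repartition might be invoked; setN is an assumed setter for the param behind getN, and the disable flag is assumed to default to false since its definition is elided from the excerpt.

import com.microsoft.ml.spark.stages.Repartition
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

val df = spark.range(0, 1000).toDF("id")

// force an exact partition count before an expensive downstream stage
val repartitioned = new Repartition().setN(8).transform(df)  // setN assumed
repartitioned.rdd.getNumPartitions // 8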
Example 95
Source File: SelectColumns.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.Wrappable import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.ml.Transformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ object SelectColumns extends DefaultParamsReadable[SelectColumns] override def transform(dataset: Dataset[_]): DataFrame = { verifySchema(dataset.schema) dataset.toDF().select(getCols.map(col): _*) } def transformSchema(schema: StructType): StructType = { verifySchema(schema) val selectedCols = getCols.toSet StructType(schema.fields.filter(f => selectedCols(f.name))) } def copy(extra: ParamMap): SelectColumns = defaultCopy(extra) private def verifySchema(schema: StructType): Unit = { val providedCols = schema.fields.map(_.name).toSet val invalidCols = getCols.filter(!providedCols(_)) if (invalidCols.length > 0) { throw new NoSuchElementException( s"DataFrame does not contain specified columns: ${invalidCols.reduce(_ + "," + _)}") } } }
Example 96
Source File: Lambda.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.Wrappable import org.apache.spark.SparkContext import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param.{ParamMap, UDFParam} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object Lambda extends ComplexParamsReadable[Lambda] { def apply(f: Dataset[_] => DataFrame): Lambda = { new Lambda().setTransform(f) } } class Lambda(val uid: String) extends Transformer with Wrappable with ComplexParamsWritable { def this() = this(Identifiable.randomUID("Lambda")) val transformFunc = new UDFParam(this, "transformFunc", "holder for dataframe function") def setTransform(f: Dataset[_] => DataFrame): this.type = { set(transformFunc, udf(f, StringType)) } def getTransform: Dataset[_] => DataFrame = { $(transformFunc).f.asInstanceOf[Dataset[_] => DataFrame] } val transformSchemaFunc = new UDFParam(this, "transformSchemaFunc", "the output schema after the transformation") def setTransformSchema(f: StructType => StructType): this.type = { set(transformSchemaFunc, udf(f, StringType)) } def getTransformSchema: StructType => StructType = { $(transformSchemaFunc).f.asInstanceOf[StructType => StructType] } override def transform(dataset: Dataset[_]): DataFrame = { getTransform(dataset) } def transformSchema(schema: StructType): StructType = { if (get(transformSchemaFunc).isEmpty) { val sc = SparkContext.getOrCreate() val df = SparkSession.builder().getOrCreate().createDataFrame(sc.emptyRDD[Row], schema) transform(df).schema } else { getTransformSchema(schema) } } def copy(extra: ParamMap): Lambda = defaultCopy(extra) }
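Because Lambda stores an arbitrary Dataset[_] => DataFrame function, any ad-hoc DataFrame operation can be slotted into a pipeline; a brief sketch using the apply helper defined above:

import com.microsoft.ml.spark.stages.Lambda
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq((1, "a"), (2, "b"), (3, "c")).toDF("id", "name")

// the wrapped function becomes a Transformer stage
val keepEvenIds = Lambda(ds => ds.toDF().filter($"id" % 2 === 0))
keepEvenIds.transform(df).show()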
Example 97
Source File: Explode.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol, Wrappable} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.ml.Transformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ object Explode extends DefaultParamsReadable[Explode] class Explode(val uid: String) extends Transformer with HasInputCol with HasOutputCol with Wrappable with DefaultParamsWritable { def this() = this(Identifiable.randomUID("Explode")) setDefault(outputCol->(this.uid + "_output")) override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema) dataset.toDF().withColumn(getOutputCol, explode(col(getInputCol))) } def transformSchema(schema: StructType): StructType = { val innerType = schema(getInputCol).dataType match { case ArrayType(it, _) => it case _ => throw new IllegalArgumentException("Explode only accepts array columns") } schema.add(getOutputCol, innerType) } def copy(extra: ParamMap): Explode = defaultCopy(extra) }
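A quick sketch for Explode on an array column; setInputCol is assumed to come from the HasInputCol contract mixed in above (its definition is not part of the excerpt).

import com.microsoft.ml.spark.stages.Explode
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq((1, Seq("a", "b")), (2, Seq("c"))).toDF("id", "tokens")

// one output row per element of the "tokens" array
val exploded = new Explode().setInputCol("tokens").setOutputCol("token").transform(df)
exploded.show()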
Example 98
Source File: EvaluationUtils.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.automl import com.microsoft.ml.spark.core.metrics.MetricConstants import com.microsoft.ml.spark.core.schema.SchemaConstants import com.microsoft.ml.spark.train.{TrainClassifier, TrainRegressor, TrainedClassifierModel, TrainedRegressorModel} import org.apache.spark.injections.RegressionUtils import org.apache.spark.ml.classification.{ClassificationModel, Classifier} import org.apache.spark.ml.{PipelineStage, Transformer} import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.regression._ object EvaluationUtils { val ModelTypeUnsupportedErr = "Model type not supported for evaluation" // Find type of trained models def getModelType(model: PipelineStage): String = { model match { case _: TrainRegressor => SchemaConstants.RegressionKind case _: TrainClassifier => SchemaConstants.ClassificationKind case _: Classifier[_, _, _] => SchemaConstants.ClassificationKind case regressor: PipelineStage if RegressionUtils.isRegressor(regressor) => SchemaConstants.RegressionKind case _: DecisionTreeRegressor => SchemaConstants.RegressionKind case _: GBTRegressor => SchemaConstants.RegressionKind case _: RandomForestRegressor => SchemaConstants.RegressionKind case _: TrainedRegressorModel => SchemaConstants.RegressionKind case _: TrainedClassifierModel => SchemaConstants.ClassificationKind case evm: BestModel => getModelType(evm.getBestModel) case _: ClassificationModel[_, _] => SchemaConstants.ClassificationKind case _: RegressionModel[_, _] => SchemaConstants.RegressionKind case _ => throw new Exception(ModelTypeUnsupportedErr) } } def getMetricWithOperator(model: PipelineStage, evaluationMetric: String): (String, Ordering[Double]) = { val modelType = getModelType(model) getMetricWithOperator(modelType, evaluationMetric) } def getMetricWithOperator(modelType: String, evaluationMetric: String): (String, Ordering[Double]) = { val chooseHighest = Ordering.Double val chooseLowest = Ordering.Double.reverse val (evaluationMetricColumnName, operator): (String, Ordering[Double]) = modelType match { case SchemaConstants.RegressionKind => evaluationMetric match { case MetricConstants.MseSparkMetric => (MetricConstants.MseColumnName, chooseLowest) case MetricConstants.RmseSparkMetric => (MetricConstants.RmseColumnName, chooseLowest) case MetricConstants.R2SparkMetric => (MetricConstants.R2ColumnName, chooseHighest) case MetricConstants.MaeSparkMetric => (MetricConstants.MaeColumnName, chooseLowest) case _ => throw new Exception("Metric is not supported for regressors") } case SchemaConstants.ClassificationKind => evaluationMetric match { case MetricConstants.AucSparkMetric => (MetricConstants.AucColumnName, chooseHighest) case MetricConstants.PrecisionSparkMetric => (MetricConstants.PrecisionColumnName, chooseHighest) case MetricConstants.RecallSparkMetric => (MetricConstants.RecallColumnName, chooseHighest) case MetricConstants.AccuracySparkMetric => (MetricConstants.AccuracyColumnName, chooseHighest) case _ => throw new Exception("Metric is not supported for classifiers") } case _ => throw new Exception("Model type not supported for evaluation") } (evaluationMetricColumnName, operator) } def getModelParams(model: Transformer): ParamMap = { model match { case reg: TrainedRegressorModel => reg.getParamMap case cls: TrainedClassifierModel => cls.getParamMap case evm: BestModel => getModelParams(evm.getBestModel) case _ => 
throw new Exception("Model type not supported for evaluation") } } def modelParamsToString(model: Transformer): String = getModelParams(model).toSeq.map(pv => s"${pv.param.name}: ${pv.value}").sorted.mkString(", ") }
Example 99
Source File: VowpalWabbitInteractions.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.contracts.{HasInputCols, HasOutputCol, Wrappable}
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions.{col, struct, udf}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType

object VowpalWabbitInteractions extends ComplexParamsReadable[VowpalWabbitInteractions]

class VowpalWabbitInteractions(override val uid: String) extends Transformer
  with HasInputCols with HasOutputCol with HasNumBits with HasSumCollisions
  with Wrappable with ComplexParamsWritable {

  def this() = this(Identifiable.randomUID("VowpalWabbitInteractions"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val fieldSubset = dataset.schema.fields
      .filter(f => getInputCols.contains(f.name))

    val mask = getMask

    val mode = udf((r: Row) => {
      // compute the final number of features
      val numElems = (0 until r.length)
        .map(r.getAs[Vector](_).numNonzeros).product

      val newIndices = new Array[Int](numElems)
      val newValues = new Array[Double](numElems)

      // build interaction features using FNV-1
      val fnvPrime = 16777619
      var i = 0

      def interact(idx: Int, value: Double, ns: Int): Unit = {
        if (ns == r.size) {
          newIndices(i) += mask & idx
          newValues(i) += value
          i += 1
        } else {
          val idx1 = idx * fnvPrime

          r.getAs[Vector](ns).foreachActive { case (idx2, value2) =>
            interact(idx1 ^ idx2, value * value2, ns + 1)
          }
        }
      }

      // start the recursion
      interact(0, 1, 0)

      val (indicesSorted, valuesSorted) = VectorUtils.sortAndDistinct(newIndices, newValues, getSumCollisions)

      Vectors.sparse(1 << getNumBits, indicesSorted, valuesSorted)
    })

    dataset.toDF.withColumn(getOutputCol, mode.apply(struct(fieldSubset.map(f => col(f.name)): _*)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val fieldNames = schema.fields.map(_.name)
    for (f <- getInputCols)
      if (!fieldNames.contains(f))
        throw new IllegalArgumentException("missing input column " + f)
      else {
        val fieldType = schema.fields(schema.fieldIndex(f)).dataType
        if (fieldType != VectorType)
          throw new IllegalArgumentException("column " + f + " must be of type Vector but is " + fieldType.typeName)
      }

    schema.add(StructField(getOutputCol, VectorType, true))
  }

  override def copy(extra: ParamMap): VowpalWabbitInteractions = defaultCopy(extra)
}
Example 100
Source File: HTTPTransformer.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.io.http import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol, Wrappable} import com.microsoft.ml.spark.io.http.HandlingUtils.HandlerFunc import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import scala.concurrent.ExecutionContext import scala.concurrent.duration.Duration trait HasHandler extends Params { val handler: UDFParam = new UDFParam( this, "handler", "Which strategy to use when handling requests") override def transform(dataset: Dataset[_]): DataFrame = { val df = dataset.toDF() val enc = RowEncoder(transformSchema(df.schema)) val colIndex = df.schema.fieldNames.indexOf(getInputCol) val fromRow = HTTPRequestData.makeFromRowConverter val toRow = HTTPResponseData.makeToRowConverter df.mapPartitions { it => if (!it.hasNext) { Iterator() }else{ val c = clientHolder.get val responsesWithContext = c.sendRequestsWithContext(it.map{row => c.RequestWithContext(Option(row.getStruct(colIndex)).map(fromRow), Some(row)) }) responsesWithContext.map { rwc => Row.merge(rwc.context.get.asInstanceOf[Row], Row(rwc.response.flatMap(Option(_)).map(toRow).orNull)) } } }(enc) } def copy(extra: ParamMap): HTTPTransformer = defaultCopy(extra) def transformSchema(schema: StructType): StructType = { assert(schema(getInputCol).dataType == HTTPSchema.Request) schema.add(getOutputCol, HTTPSchema.Response, nullable=true) } }
Example 101
Source File: ParserSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.io.split1 import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import com.microsoft.ml.spark.io.http._ import org.apache.http.client.methods.HttpPost import org.apache.spark.ml.Transformer import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, SparkSession} trait ParserUtils extends WithServer { def sampleDf(spark: SparkSession): DataFrame = { val df = spark.createDataFrame((1 to 10).map(Tuple1(_))) .toDF("data") val df2 = new JSONInputParser().setInputCol("data") .setOutputCol("parsedInput").setUrl(url) .transform(df) .withColumn("unparsedOutput", udf({ x: Int => HTTPResponseData( Array(), Some(EntityData( "{\"foo\": \"here\"}".getBytes, None, None, None, false, false, false)), StatusLineData(ProtocolVersionData("foo", 1, 1), 200, "bar"), "en") }).apply(col("data")) ) new JSONOutputParser() .setDataType(new StructType().add("foo", StringType)) .setInputCol("unparsedOutput") .setOutputCol("parsedOutput") .transform(df2) } def makeTestObject[T <: Transformer](t: T, session: SparkSession): Seq[TestObject[T]] = { Seq(new TestObject(t, sampleDf(session))) } } class JsonInputParserSuite extends TransformerFuzzing[JSONInputParser] with ParserUtils { override def testObjects(): Seq[TestObject[JSONInputParser]] = makeTestObject( new JSONInputParser().setInputCol("data").setOutputCol("out") .setUrl(url), session) override def reader: MLReadable[_] = JSONInputParser } class JsonOutputParserSuite extends TransformerFuzzing[JSONOutputParser] with ParserUtils { override def testObjects(): Seq[TestObject[JSONOutputParser]] = makeTestObject( new JSONOutputParser().setInputCol("unparsedOutput").setOutputCol("out") .setDataType(new StructType().add("foo", StringType)), session) override def reader: MLReadable[_] = JSONOutputParser } class StringOutputParserSuite extends TransformerFuzzing[StringOutputParser] with ParserUtils { override def testObjects(): Seq[TestObject[StringOutputParser]] = makeTestObject( new StringOutputParser().setInputCol("unparsedOutput").setOutputCol("out"), session) override def reader: MLReadable[_] = StringOutputParser } class CustomInputParserSuite extends TransformerFuzzing[CustomInputParser] with ParserUtils { override def testObjects(): Seq[TestObject[CustomInputParser]] = makeTestObject( new CustomInputParser().setInputCol("data").setOutputCol("out") .setUDF({ x: Int => new HttpPost(s"http://$x") }), session) override def reader: MLReadable[_] = CustomInputParser } class CustomOutputParserSuite extends TransformerFuzzing[CustomOutputParser] with ParserUtils { override def testObjects(): Seq[TestObject[CustomOutputParser]] = makeTestObject( new CustomOutputParser().setInputCol("unparsedOutput").setOutputCol("out") .setUDF({ x: HTTPResponseData => x.locale }), session) override def reader: MLReadable[_] = CustomOutputParser }
Example 102
Source File: HashingTF.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
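Since this is the stock Spark ML HashingTF, the usual Tokenizer + HashingTF combination applies; a small end-to-end sketch:

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val sentences = Seq((0, "to be or not to be"), (1, "that is the question")).toDF("id", "text")

val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("tf")
  .setNumFeatures(1 << 10)   // keep the feature space small for the demo

val tf = hashingTF.transform(tokenizer.transform(sentences))
tf.select("id", "tf").show(truncate = false)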
Example 103
Source File: SQLTransformer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.Transformer import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.types.StructType @Since("1.6.0") def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val tableName = Identifiable.randomUID(uid) dataset.createOrReplaceTempView(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val result = dataset.sparkSession.sql(realStatement) dataset.sparkSession.catalog.dropTempView(tableName) result } @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { val spark = SparkSession.builder().getOrCreate() val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty)) val dummyDF = spark.createDataFrame(dummyRDD, schema) val tableName = Identifiable.randomUID(uid) val realStatement = $(statement).replace(tableIdentifier, tableName) dummyDF.createOrReplaceTempView(tableName) val outputSchema = spark.sql(realStatement).schema spark.catalog.dropTempView(tableName) outputSchema } @Since("1.6.0") override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } @Since("1.6.0") object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { @Since("1.6.0") override def load(path: String): SQLTransformer = super.load(path) }
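As the transform method shows, SQLTransformer registers the input as a temp view and substitutes it for the __THIS__ placeholder; a standard usage sketch:

import org.apache.spark.ml.feature.SQLTransformer
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")

val sqlTrans = new SQLTransformer().setStatement(
  "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

sqlTrans.transform(df).show()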
Example 104
Source File: Binarizer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.collection.mutable.ArrayBuilder import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.BinaryAttribute import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ @Since("1.4.0") def setOutputCol(value: String): this.type = set(outputCol, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema, logging = true) val schema = dataset.schema val inputType = schema($(inputCol)).dataType val td = $(threshold) val binarizerDouble = udf { in: Double => if (in > td) 1.0 else 0.0 } val binarizerVector = udf { (data: Vector) => val indices = ArrayBuilder.make[Int] val values = ArrayBuilder.make[Double] data.foreachActive { (index, value) => if (value > td) { indices += index values += 1.0 } } Vectors.sparse(data.size, indices.result(), values.result()).compressed } val metadata = outputSchema($(outputCol)).metadata inputType match { case DoubleType => dataset.select(col("*"), binarizerDouble(col($(inputCol))).as($(outputCol), metadata)) case _: VectorUDT => dataset.select(col("*"), binarizerVector(col($(inputCol))).as($(outputCol), metadata)) } } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType val outputColName = $(outputCol) val outCol: StructField = inputType match { case DoubleType => BinaryAttribute.defaultAttr.withName(outputColName).toStructField() case _: VectorUDT => StructField(outputColName, new VectorUDT) case _ => throw new IllegalArgumentException(s"Data type $inputType is not supported.") } if (schema.fieldNames.contains(outputColName)) { throw new IllegalArgumentException(s"Output column $outputColName already exists.") } StructType(schema.fields :+ outCol) } @Since("1.4.1") override def copy(extra: ParamMap): Binarizer = defaultCopy(extra) } @Since("1.6.0") object Binarizer extends DefaultParamsReadable[Binarizer] { @Since("1.6.0") override def load(path: String): Binarizer = super.load(path) }
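A standard usage sketch for Binarizer; per the transform above, values strictly greater than the threshold map to 1.0, everything else to 0.0, and both Double and Vector input columns are accepted.

import org.apache.spark.ml.feature.Binarizer
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val scores = Seq((0, 0.1), (1, 0.8), (2, 0.5)).toDF("id", "score")

val binarizer = new Binarizer()
  .setInputCol("score")
  .setOutputCol("label")
  .setThreshold(0.5)

binarizer.transform(scores).show()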
Example 105
Source File: HashingTF.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.feature import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} def setNumFeatures(value: Int): this.type = set(numFeatures, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)) val t = udf { terms: Seq[_] => hashingTF.transform(terms) } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) }
Example 106
Source File: Binarizer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.BinaryAttribute import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) val td = $(threshold) val binarizer = udf { in: Double => if (in > td) 1.0 else 0.0 } val outputColName = $(outputCol) val metadata = BinaryAttribute.defaultAttr.withName(outputColName).toMetadata() dataset.select(col("*"), binarizer(col($(inputCol))).as(outputColName, metadata)) } override def transformSchema(schema: StructType): StructType = { SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType) val inputFields = schema.fields val outputColName = $(outputCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") val attr = BinaryAttribute.defaultAttr.withName(outputColName) val outputFields = inputFields :+ attr.toStructField() StructType(outputFields) } override def copy(extra: ParamMap): Binarizer = defaultCopy(extra) }
Example 107
Source File: TypedTransformer.scala From frameless with Apache License 2.0 | 5 votes |
package frameless package ml import frameless.ops.SmartProject import org.apache.spark.ml.Transformer import shapeless.{Generic, HList} import shapeless.ops.hlist.{Prepend, Tupler} trait AppendTransformer[Inputs, Outputs, InnerTransformer <: Transformer] extends TypedTransformer { val transformer: InnerTransformer def transform[T, TVals <: HList, OutputsVals <: HList, OutVals <: HList, Out](ds: TypedDataset[T])( implicit i0: SmartProject[T, Inputs], i1: Generic.Aux[T, TVals], i2: Generic.Aux[Outputs, OutputsVals], i3: Prepend.Aux[TVals, OutputsVals, OutVals], i4: Tupler.Aux[OutVals, Out], i5: TypedEncoder[Out] ): TypedDataset[Out] = { val transformed = transformer.transform(ds.dataset).as[Out](TypedExpressionEncoder[Out]) TypedDataset.create[Out](transformed) } } object AppendTransformer { // Random name to a temp column added by a TypedTransformer (the proper name will be given by the Tuple-based encoder) private[ml] val tempColumnName = "I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMI" private[ml] val tempColumnName2 = "I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMJ" private[ml] val tempColumnName3 = "I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMK" }
Example 108
Source File: UnaryTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.base.unary import com.salesforce.op.UID import com.salesforce.op.features.FeatureSparkTypes import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.{OpPipelineStage1, OpTransformer} import org.apache.spark.ml.Transformer import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.util.ClosureUtils import scala.reflect.runtime.universe.TypeTag import scala.util.Try final class UnaryLambdaTransformer[I <: FeatureType, O <: FeatureType] ( operationName: String, val transformFn: I => O, uid: String = UID[UnaryLambdaTransformer[I, O]] )( implicit tti: TypeTag[I], tto: TypeTag[O], ttov: TypeTag[O#Value] ) extends UnaryTransformer[I, O](operationName = operationName, uid = uid)
Example 109
Source File: SwTransformerSpec.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.test import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.OpPipelineStage import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.Params import scala.reflect.ClassTag import scala.reflect.runtime.universe.WeakTypeTag def sparkStage: Option[SparkTransformerType] = transformer.getSparkMlStage() it should "have a Spark stage set" in { sparkStage match { case None => fail("Spark stage is not set") case Some(s) => withClue(s"Spark stage type is '${s.getClass.getName}' (expected '${stc.runtimeClass.getName}'):") { s.isInstanceOf[SparkTransformerType] shouldBe true } } } it should "have input column names set" in { transformer.getInputColParamNames() should not be empty } it should "have output column name set" in { transformer.getOutputColParamNames() should not be empty } it should "have inputs set on Spark stage" in { transformer.getInputColParamNames().flatMap(name => sparkStage.flatMap(s => s.get(s.getParam(name)))) shouldBe transformer.getInputFeatures().map(_.name) } it should "have output set on Spark stage" in { transformer.getOutputColParamNames().flatMap(name => sparkStage.flatMap(s => s.get(s.getParam(name)))) shouldBe Array(transformer.getOutputFeatureName) } }
Example 110
Source File: SwBinaryTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.sparkwrappers.generic import com.salesforce.op.UID import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.OpPipelineStage2 import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.Params import org.apache.spark.sql.{DataFrame, Dataset} import scala.reflect.runtime.universe.TypeTag class SwBinaryTransformer[I1 <: FeatureType, I2 <: FeatureType, O <: FeatureType, T <: Transformer with Params] ( val inputParam1Name: String, val inputParam2Name: String, val outputParamName: String, val operationName: String, private val sparkMlStageIn: Option[T], val uid: String = UID[SwBinaryTransformer[I1, I2, O, T]] )( implicit val tti1: TypeTag[I1], val tti2: TypeTag[I2], val tto: TypeTag[O], val ttov: TypeTag[O#Value] ) extends SwTransformer2[I1, I2, O, T] { setSparkMlStage(sparkMlStageIn) }
Example 111
Source File: SwTernaryTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.sparkwrappers.generic import com.salesforce.op.UID import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.OpPipelineStage3 import org.apache.spark.ml.param.Params import org.apache.spark.ml.{Model, Transformer} import org.apache.spark.sql._ import scala.reflect.runtime.universe.TypeTag class SwTernaryTransformer[I1 <: FeatureType, I2 <: FeatureType, I3 <: FeatureType, O <: FeatureType, T <: Model[T] with Params] ( val inputParam1Name: String, val inputParam2Name: String, val inputParam3Name: String, val outputParamName: String, val operationName: String, private val sparkMlStageIn: Option[T], val uid: String = UID[SwTernaryTransformer[I1, I2, I3, O, T]] )( implicit val tti1: TypeTag[I1], val tti2: TypeTag[I2], val tti3: TypeTag[I3], val tto: TypeTag[O], val ttov: TypeTag[O#Value] ) extends SwTransformer3[I1, I2, I3, O, T] { setSparkMlStage(sparkMlStageIn) }
Example 112
Source File: SwQuaternaryTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.sparkwrappers.generic import com.salesforce.op.UID import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.OpPipelineStage4 import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.Params import org.apache.spark.sql._ import scala.reflect.runtime.universe.TypeTag class SwQuaternaryTransformer[I1 <: FeatureType, I2 <: FeatureType, I3 <: FeatureType, I4 <: FeatureType, O <: FeatureType, T <: Transformer with Params] ( val inputParam1Name: String, val inputParam2Name: String, val inputParam3Name: String, val inputParam4Name: String, val outputParamName: String, val operationName: String, private val sparkMlStageIn: Option[T], val uid: String = UID[SwQuaternaryTransformer[I1, I2, I3, I4, O, T]] )( implicit val tti1: TypeTag[I1], val tti2: TypeTag[I2], val tti3: TypeTag[I3], val tti4: TypeTag[I4], val tto: TypeTag[O], val ttov: TypeTag[O#Value] ) extends SwTransformer4[I1, I2, I3, I4, O, T] { setSparkMlStage(sparkMlStageIn) }
Example 113
Source File: SwSequenceTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.sparkwrappers.generic import com.salesforce.op.UID import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.OpPipelineStageN import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.Params import org.apache.spark.sql._ import scala.reflect.runtime.universe.TypeTag class SwSequenceTransformer[I <: FeatureType, O <: FeatureType, T <: Transformer with Params] ( val inputParamName: String, val outputParamName: String, val operationName: String, private val sparkMlStageIn: Option[T], val uid: String = UID[SwSequenceTransformer[I, O, T]] )( implicit val tti: TypeTag[I], val tto: TypeTag[O], val ttov: TypeTag[O#Value] ) extends SwTransformerN[I, O, T] { setSparkMlStage(sparkMlStageIn) }
Example 114
Source File: SwUnaryTransformer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.sparkwrappers.generic import com.salesforce.op.UID import com.salesforce.op.features.types.FeatureType import com.salesforce.op.stages.OpPipelineStage1 import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.Params import org.apache.spark.sql._ import scala.reflect.runtime.universe.TypeTag class SwUnaryTransformer[I <: FeatureType, O <: FeatureType, T <: Transformer with Params] ( val inputParamName: String, val outputParamName: String, val operationName: String, private val sparkMlStageIn: Option[T], val uid: String = UID[SwUnaryTransformer[I, O, T]] )( implicit val tti: TypeTag[I], val tto: TypeTag[O], val ttov: TypeTag[O#Value] ) extends SwTransformer1[I, O, T] { setSparkMlStage(sparkMlStageIn) }
Example 115
Source File: OpNGramTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.types._ import com.salesforce.op.stages.sparkwrappers.specific.OpTransformerWrapper import com.salesforce.op.test.{SwTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.Transformer import org.apache.spark.ml.feature.NGram import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class OpNGramTest extends SwTransformerSpec[TextList, NGram, OpNGram] { val data = Seq("a b c d e f g").map(_.split(" ").toSeq.toTextList) val (inputData, textListFeature) = TestFeatureBuilder(data) val expectedResult = Seq(Seq("a b", "b c", "c d", "d e", "e f", "f g").toTextList) val bigrams = textListFeature.ngram() val transformer = bigrams.originStage.asInstanceOf[OpNGram] it should "generate unigrams" in { val unigrams = textListFeature.ngram(n = 1) val transformedData = unigrams.originStage.asInstanceOf[Transformer].transform(inputData) val results = transformedData.collect(unigrams) results(0) shouldBe data.head } it should "generate trigrams" in { val trigrams = textListFeature.ngram(n = 3) val transformedData = trigrams.originStage.asInstanceOf[Transformer].transform(inputData) val results = transformedData.collect(trigrams) results(0) shouldBe Seq("a b c", "b c d", "c d e", "d e f", "e f g").toTextList } it should "not allow n < 1" in { the[IllegalArgumentException] thrownBy textListFeature.ngram(n = 0) the[IllegalArgumentException] thrownBy textListFeature.ngram(n = -1) } }
Example 116
Source File: ToOccurTransformerTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.features.types._
import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.Transformer
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class ToOccurTransformerTest extends OpTransformerSpec[RealNN, ToOccurTransformer[Real]] {

  val testData = Seq(2.0.toReal, 0.0.toReal, Real(None))

  val (inputData, f1) = TestFeatureBuilder(testData)

  val expectedResult = Seq(1.0.toRealNN, 0.0.toRealNN, 0.0.toRealNN)

  val transformer: ToOccurTransformer[Real] = new ToOccurTransformer[Real]().setInput(f1)

  val extendedTestData = Seq(
    ("001", 0, None, Option(true), None),
    ("002", 1, None, None, Option(2.0)),
    ("003", 2, Option("abc"), Option(false), Option(0.0)),
    ("004", 0, Option("def"), Option(false), Option(1.0))
  ).map { case (leadId, numEmails, opptyId, doNotContact, numFormSubmits) =>
    (
      Text(leadId), numEmails.toRealNN,
      Text(opptyId), Binary(doNotContact), Real(numFormSubmits)
    )
  }

  lazy val (ds, leadId, numEmails, opptyId, doNotContact, numFormSubmits) =
    TestFeatureBuilder("leadId", "numEmails", "opptyId", "doNotContact", "numFormSubmits", extendedTestData)

  Spec[ToOccurTransformer[_]] should "convert features to boolean using shortcuts" in {
    val occurEmailOutput = numEmails.occurs(_.value.exists(_ > 1))
    val toOccurEmail = occurEmailOutput.originStage.asInstanceOf[Transformer]

    val occurFormOutput = numFormSubmits.occurs()
    val toOccurForm = occurFormOutput.originStage.asInstanceOf[Transformer]

    val occurContactOutput = doNotContact.occurs()
    val toOccurContact = occurContactOutput.originStage.asInstanceOf[Transformer]

    val toOccurDF = toOccurContact.transform(toOccurForm.transform(toOccurEmail.transform(ds)))

    val expected = Array(
      (Text("001"), 0.0.toRealNN, 0.0.toRealNN, 1.0.toRealNN),
      (Text("002"), 0.0.toRealNN, 1.0.toRealNN, 0.0.toRealNN),
      (Text("003"), 1.0.toRealNN, 0.0.toRealNN, 0.0.toRealNN),
      (Text("004"), 0.0.toRealNN, 1.0.toRealNN, 0.0.toRealNN)
    )

    toOccurDF.orderBy("leadId").collect(leadId, occurEmailOutput, occurFormOutput, occurContactOutput) shouldBe expected
  }

  it should "convert features to boolean" in {
    val toOccurEmail = new ToOccurTransformer[RealNN]().setInput(numEmails)
    val occurEmailOutput = toOccurEmail.getOutput()

    val toOccurForm = new ToOccurTransformer[Real]().setInput(numFormSubmits)
    val occurFormOutput = toOccurForm.getOutput()

    val toOccurContact = new ToOccurTransformer[Binary]().setInput(doNotContact)
    val occurContactOutput = toOccurContact.getOutput()

    val toOccurOppty = new ToOccurTransformer[Text](matchFn = _.nonEmpty).setInput(opptyId)
    val occurOpptyOutput = toOccurOppty.getOutput()

    val toOccurDF = toOccurOppty.transform(toOccurContact.transform(toOccurForm.transform(toOccurEmail.transform(ds))))

    val expected = Array(
      (Text("001"), 0.0.toRealNN, 0.0.toRealNN, 1.0.toRealNN, 0.0.toRealNN),
      (Text("002"), 1.0.toRealNN, 1.0.toRealNN, 0.0.toRealNN, 0.0.toRealNN),
      (Text("003"), 1.0.toRealNN, 0.0.toRealNN, 0.0.toRealNN, 1.0.toRealNN),
      (Text("004"), 0.0.toRealNN, 1.0.toRealNN, 0.0.toRealNN, 1.0.toRealNN)
    )

    toOccurDF.orderBy("leadId").collect(leadId,
      occurEmailOutput, occurFormOutput, occurContactOutput, occurOpptyOutput) shouldBe expected
  }
}
Example 117
Source File: TextNGramSimilarityTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.Transformer import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class TextNGramSimilarityTest extends OpTransformerSpec[RealNN, TextNGramSimilarity[Text]]{ val(inputData, f1, f2) = TestFeatureBuilder( Seq[(Text, Text)]( (Text("Hamlet: To be or not to be - that is the question."), Text("I like like Hamlet")), (Text("that is the question"), Text("There is no question")), (Text("Just some random text"), Text("I like like Hamlet")), (Text("Adobe CreativeSuite 5 Master Collection from cheap 4zp"), Text("Adobe CreativeSuite 5 Master Collection from cheap d1x")), (Text.empty, Text.empty), (Text(""), Text("")), (Text(""), Text.empty), (Text("asdf"), Text.empty), (Text.empty, Text("asdf")) ) ) val expectedResult = Seq(0.12666672468185425, 0.6083333492279053, 0.15873020887374878, 0.9629629850387573, 0.0, 0.0, 0.0, 0.0, 0.0).toRealNN val nGramSimilarity = f1.toNGramSimilarity(f2, toLowerCase = false) val transformer = nGramSimilarity.originStage.asInstanceOf[TextNGramSimilarity[Text]] it should "correctly compute char-n-gram similarity with nondefault ngram param" in { val nGramSimilarity = f1.toNGramSimilarity(f2, nGramSize = 4, toLowerCase = false) val transformedDs = nGramSimilarity.originStage.asInstanceOf[Transformer].transform(inputData) val actualOutput = transformedDs.collect(nGramSimilarity) actualOutput shouldBe Seq(0.11500000953674316, 0.5666666626930237, 0.1547619104385376, 0.9722222089767456, 0.0, 0.0, 0.0, 0.0, 0.0).toRealNN } }
Example 118
Source File: IsotonicRegressionCalibratorTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.types._ import com.salesforce.op.features.{Feature, FeatureLike} import com.salesforce.op.stages.impl.regression.IsotonicRegressionCalibrator import com.salesforce.op.stages.sparkwrappers.specific.OpBinaryEstimatorWrapper import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import org.apache.spark.ml.Transformer import org.apache.spark.ml.regression.{IsotonicRegression, IsotonicRegressionModel} import org.apache.spark.sql._ import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) class IsotonicRegressionCalibratorTest extends FlatSpec with TestSparkContext { val isoExpectedPredictions = Array(1, 2, 2, 2, 6, 16.5, 16.5, 17, 18) val isoExpectedModelBoundaries = Array(0, 1, 3, 4, 5, 6, 7, 8) val isoExpectedModelPredictions = Array(1, 2, 2, 6, 16.5, 16.5, 17.0, 18.0) val isoDataLabels = Seq(1, 2, 3, 1, 6, 17, 16, 17, 18) val isoTestData = isoDataLabels.zipWithIndex.map { case (label, i) => label.toRealNN -> i.toRealNN } val (isoScoresDF, isoLabels, isoScores): (DataFrame, Feature[RealNN], Feature[RealNN]) = TestFeatureBuilder(isoTestData) val antiExpectedPredictions = Array(7.0, 5.0, 4.0, 4.0, 1.0) val antiExpectedModelBoundaries = Array(0, 1, 2, 3, 4) val antiExpectedModelPredictions = Array(7.0, 5.0, 4.0, 4.0, 1.0) val antiDataLabels = Seq(7, 5, 3, 5, 1) val antiTestData = antiDataLabels.zipWithIndex.map { case (label, i) => label.toRealNN -> i.toRealNN } val (antiScoresDF, antiLabels, antiScores): (DataFrame, Feature[RealNN], Feature[RealNN]) = TestFeatureBuilder(antiTestData) Spec[IsotonicRegressionCalibrator] should "isotonically calibrate scores using shortcut" in { val calibratedScores = isoScores.toIsotonicCalibrated(isoLabels) val estimator = calibratedScores.originStage .asInstanceOf[OpBinaryEstimatorWrapper[RealNN, RealNN, RealNN, IsotonicRegression, IsotonicRegressionModel]] val model = estimator.fit(isoScoresDF).getSparkMlStage().get val predictionsDF = model.asInstanceOf[Transformer] .transform(isoScoresDF) validateOutput(calibratedScores, model, predictionsDF, true, isoExpectedPredictions, isoExpectedModelBoundaries, isoExpectedModelPredictions) } it should "isotonically calibrate scores" in { val isotonicCalibrator = new IsotonicRegressionCalibrator().setInput(isoLabels, isoScores) val calibratedScores = isotonicCalibrator.getOutput() val model = isotonicCalibrator.fit(isoScoresDF).getSparkMlStage().get val predictionsDF = model.asInstanceOf[Transformer] .transform(isoScoresDF) validateOutput(calibratedScores, model, predictionsDF, true, isoExpectedPredictions, isoExpectedModelBoundaries, isoExpectedModelPredictions) } it should "antitonically calibrate scores" in { val isIsotonic: Boolean = false val isotonicCalibrator = new IsotonicRegressionCalibrator().setIsotonic(isIsotonic).setInput(isoLabels, isoScores) val calibratedScores = isotonicCalibrator.getOutput() val model = isotonicCalibrator.fit(antiScoresDF).getSparkMlStage().get val predictionsDF = model.asInstanceOf[Transformer] .transform(antiScoresDF) validateOutput(calibratedScores, model, predictionsDF, isIsotonic, antiExpectedPredictions, antiExpectedModelBoundaries, antiExpectedModelPredictions) } def validateOutput(calibratedScores: FeatureLike[RealNN], model: IsotonicRegressionModel, predictionsDF: DataFrame, expectedIsIsotonic: Boolean, expectedPredictions: Array[Double], expectedModelBoundaries: Array[Int], 
expectedModelPredictions: Array[Double]): Unit = { val predictions = predictionsDF.select(calibratedScores.name).rdd.map { case Row(pred) => pred }.collect() val isIsotonic = model.getIsotonic isIsotonic should be(expectedIsIsotonic) predictions should contain theSameElementsInOrderAs expectedPredictions model.boundaries.toArray should contain theSameElementsInOrderAs expectedModelBoundaries model.predictions.toArray should contain theSameElementsInOrderAs expectedModelPredictions } }
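The test above exercises TransmogrifAI's wrapper around Spark ML's IsotonicRegression estimator. For readers unfamiliar with the underlying Spark API, here is a minimal standalone sketch of what the wrapped stage does; the SparkSession `spark`, the column names and the sample values are assumptions for illustration only, not part of the test above.

// Minimal sketch, assuming an existing SparkSession `spark`.
// IsotonicRegression accepts a DoubleType (or Vector) features column.
import org.apache.spark.ml.regression.IsotonicRegression

val data = spark.createDataFrame(Seq(
  (1.0, 0.0), (2.0, 1.0), (3.0, 2.0), (1.0, 3.0), (6.0, 4.0)
)).toDF("label", "features")

val ir = new IsotonicRegression()
  .setIsotonic(true)        // false gives an antitonic (non-increasing) fit, as in the third test above
  .setLabelCol("label")
  .setFeaturesCol("features")

val model = ir.fit(data)
// model.boundaries and model.predictions describe the fitted step function,
// which is what the test compares against isoExpectedModelBoundaries / isoExpectedModelPredictions.
model.transform(data).show()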
Example 119
Source File: TimePeriodMapTransformerTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.FeatureLike import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.date.DateTimeUtils import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.Transformer import org.joda.time.{DateTime => JDateTime} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class TimePeriodMapTransformerTest extends OpTransformerSpec[IntegralMap, TimePeriodMapTransformer[DateMap]] { val names: Seq[String] = Seq("n1", "n2", "n3", "n4") val dates: Seq[Long] = Seq( new JDateTime(1879, 3, 14, 0, 0, DateTimeUtils.DefaultTimeZone).getMillis, new JDateTime(1955, 11, 12, 10, 4, DateTimeUtils.DefaultTimeZone).getMillis, new JDateTime(1999, 3, 8, 12, 0, DateTimeUtils.DefaultTimeZone).getMillis, new JDateTime(2019, 4, 30, 13, 0, DateTimeUtils.DefaultTimeZone).getMillis ) val dateMap: DateMap = names.zip(dates).toMap.toDateMap val (inputData, f1) = TestFeatureBuilder(Seq[DateMap](dateMap)) override val transformer: TimePeriodMapTransformer[DateMap] = new TimePeriodMapTransformer(TimePeriod.DayOfMonth).setInput(f1) override val expectedResult: Seq[IntegralMap] = Seq( names.zip(Seq(14L, 12L, 8L, 30L)).toMap.toIntegralMap ) it should "transform with rich shortcuts" in { val n = "n1" val dmap = Map(n -> new JDateTime(1879, 3, 14, 0, 0, DateTimeUtils.DefaultTimeZone).getMillis) val (inputData2, d1, d2) = TestFeatureBuilder( Seq[(DateMap, DateTimeMap)]((dmap.toDateMap, dmap.toDateTimeMap)) ) def assertFeature(feature: FeatureLike[IntegralMap], expected: Seq[IntegralMap]): Unit = { val transformed = feature.originStage.asInstanceOf[Transformer].transform(inputData2) val actual = transformed.collect(feature) actual shouldBe expected } assertFeature(d1.toTimePeriod(TimePeriod.DayOfMonth), Seq(IntegralMap(Map(n -> 14)))) assertFeature(d2.toTimePeriod(TimePeriod.DayOfMonth), Seq(IntegralMap(Map(n -> 14)))) } }
Example 120
Source File: OpHashingTFTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.Feature import com.salesforce.op.features.types._ import com.salesforce.op.test.{SwTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.Transformer import org.apache.spark.ml.feature.HashingTF import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.DataFrame import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class OpHashingTFTest extends SwTransformerSpec[OPVector, HashingTF, OpHashingTF] { // scalastyle:off val testData = Seq( "Hamlet: To be or not to be - that is the question.", "Гамлет: Быть или не быть - вот в чём вопрос.", "המלט: להיות או לא להיות - זאת השאלה.", "Hamlet: Être ou ne pas être - telle est la question." ).map(_.toLowerCase.split(" ").toSeq.toTextList) // scalastyle:on val (inputData, f1): (DataFrame, Feature[TextList]) = TestFeatureBuilder(testData) val hashed = f1.tf(numTerms = 5) val transformer = hashed.originStage.asInstanceOf[OpHashingTF] val expectedResult: Seq[OPVector] = Seq( Vectors.sparse(5, Array(0, 1, 2, 3, 4), Array(2.0, 4.0, 2.0, 3.0, 1.0)), Vectors.sparse(5, Array(0, 1, 2, 3, 4), Array(4.0, 1.0, 3.0, 1.0, 1.0)), Vectors.sparse(5, Array(0, 2, 3, 4), Array(2.0, 2.0, 2.0, 2.0)), Vectors.sparse(5, Array(0, 1, 2, 4), Array(3.0, 5.0, 1.0, 2.0)) ).map(_.toOPVector) def hash( s: String, numOfFeatures: Int = TransmogrifierDefaults.DefaultNumOfFeatures, binary: Boolean = false ): Int = new org.apache.spark.mllib.feature.HashingTF(numOfFeatures).setBinary(binary).indexOf(s) it should "hash categorical data" in { val hashed = f1.tf() val transformedData = hashed.originStage.asInstanceOf[Transformer].transform(inputData) val results = transformedData.select(hashed.name).collect(hashed) hashed.name shouldBe hashed.originStage.getOutputFeatureName // scalastyle:off results.forall(_.value.size == TransmogrifierDefaults.DefaultNumOfFeatures) shouldBe true results(0).value(hash("be")) shouldBe 2.0 results(0).value(hash("that")) shouldBe 1.0 results(1).value(hash("быть")) shouldBe 2.0 results(2).value(hash("להיות")) shouldBe 2.0 results(3).value(hash("être")) shouldBe 2.0 // scalastyle:on } it should "hash categorical data with custom numFeatures" in { val numFeatures = 100 val hashed = f1.tf(numTerms = numFeatures) val transformedData = hashed.originStage.asInstanceOf[Transformer].transform(inputData) val results = transformedData.select(hashed.name).collect(hashed) // scalastyle:off results.forall(_.value.size == numFeatures) shouldBe true results(0).value(hash("be", numOfFeatures = numFeatures)) shouldBe 2.0 results(1).value(hash("быть", numOfFeatures = numFeatures)) shouldBe 2.0 results(2).value(hash("question", numOfFeatures = numFeatures)) shouldBe 0.0 // scalastyle:on } it should "hash categorical data when binary = true" in { val binary = true val hashed = f1.tf(binary = binary) val transformedData = hashed.originStage.asInstanceOf[Transformer].transform(inputData) val results = transformedData.select(hashed.name).collect(hashed) // scalastyle:off val values = Set(0.0, 1.0) results.forall(_.value.toArray.forall(values contains _)) shouldBe true results(0).value(hash("be", binary = binary)) shouldBe 1.0 results(1).value(hash("быть", binary = binary)) shouldBe 1.0 results(2).value(hash("question", binary = binary)) shouldBe 0.0 // scalastyle:on } }
Example 121
Source File: DateMapToUnitCircleVectorizerTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.sequence.SequenceModel import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.OpVectorMetadata import org.apache.spark.ml.{Estimator, Transformer} import org.apache.spark.ml.linalg.Vectors import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.RichMetadata._ import org.joda.time.{DateTime => JDateTime} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class DateMapToUnitCircleVectorizerTest extends OpEstimatorSpec[OPVector, SequenceModel[DateMap, OPVector], DateMapToUnitCircleVectorizer[DateMap]] with AttributeAsserts { val eps = 1E-4 val sampleDateTimes = Seq[JDateTime]( new JDateTime(2018, 2, 11, 0, 0, 0, 0), new JDateTime(2018, 11, 28, 6, 0, 0, 0), new JDateTime(2018, 2, 17, 12, 0, 0, 0), new JDateTime(2017, 4, 17, 18, 0, 0, 0), new JDateTime(1918, 2, 13, 3, 0, 0, 0) ) val (inputData, f1) = TestFeatureBuilder( sampleDateTimes.map(x => Map("a" -> x.getMillis, "b" -> x.getMillis).toDateMap) ) override val expectedResult: Seq[OPVector] = sampleDateTimes .map{ v => val rad = DateToUnitCircle.convertToRandians(Option(v.getMillis), TimePeriod.HourOfDay) (rad ++ rad).toOPVector } it should "work with its shortcut as a DateMap" in { val output = f1.toUnitCircle(TimePeriod.HourOfDay) val transformed = output.originStage.asInstanceOf[DateMapToUnitCircleVectorizer[DateMap]] .fit(inputData).transform(inputData) val field = transformed.schema(output.name) val actual = transformed.collect(output) assertNominal(field, Array.fill(actual.head.value.size)(false), actual) all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "work with its shortcut as a DateTimeMap" in { val (inputDataDT, f1DT) = TestFeatureBuilder( sampleDateTimes.map(x => Map("a" -> x.getMillis, "b" -> x.getMillis).toDateTimeMap) ) val output = f1DT.toUnitCircle(TimePeriod.HourOfDay) val transformed = output.originStage.asInstanceOf[DateMapToUnitCircleVectorizer[DateMap]] .fit(inputData).transform(inputData) val field = transformed.schema(output.name) val actual = transformed.collect(output) assertNominal(field, Array.fill(actual.head.value.size)(false), actual) all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "make the correct metadata" in { val fitted = estimator.fit(inputData) val meta = OpVectorMetadata(fitted.getOutputFeatureName, fitted.getMetadata()) meta.columns.length shouldBe 4 meta.columns.flatMap(_.grouping) shouldEqual Seq("a", "a", "b", "b") meta.columns.flatMap(_.descriptorValue) shouldEqual Seq("x_HourOfDay", "y_HourOfDay", "x_HourOfDay", "y_HourOfDay") } }
Example 122
Source File: SetNGramSimilarityTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.Transformer import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class SetNGramSimilarityTest extends OpTransformerSpec[RealNN, SetNGramSimilarity] { val (inputData, f1, f2) = TestFeatureBuilder( Seq( (Seq("Red", "Green"), Seq("Red")), (Seq("Red", "Green"), Seq("Yellow, Blue")), (Seq("Red", "Yellow"), Seq("Red", "Yellow")), (Seq[String](), Seq("Red", "Yellow")), (Seq[String](), Seq[String]()), (Seq[String](""), Seq[String]("asdf")), (Seq[String](""), Seq[String]("")), (Seq[String]("", ""), Seq[String]("", "")) ).map(v => v._1.toMultiPickList -> v._2.toMultiPickList) ) val expectedResult = Seq(0.3333333134651184, 0.09722214937210083, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0).toRealNN val catNGramSimilarity = f1.toNGramSimilarity(f2) val transformer = catNGramSimilarity.originStage.asInstanceOf[SetNGramSimilarity] it should "correctly compute char-n-gram similarity with nondefault ngram param" in { val cat5GramSimilarity = f1.toNGramSimilarity(f2, 5) val transformedDs = cat5GramSimilarity.originStage.asInstanceOf[Transformer].transform(inputData) val actualOutput = transformedDs.collect(cat5GramSimilarity) actualOutput shouldBe Seq(0.3333333432674408, 0.12361115217208862, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0).toRealNN } }
Example 123
Source File: IDFTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.types._ import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.feature.IDF import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.{Estimator, Transformer} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) class IDFTest extends FlatSpec with TestSparkContext { val data = Seq( Vectors.sparse(4, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(4, Array(1), Array(1.0)) ) lazy val (ds, f1) = TestFeatureBuilder(data.map(_.toOPVector)) Spec[IDF] should "compute inverted document frequency" in { val idf = f1.idf() val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds) val transformedData = model.asInstanceOf[Transformer].transform(ds) val results = transformedData.select(idf.name).collect(idf) idf.name shouldBe idf.originStage.getOutputFeatureName val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((data.length + 1.0) / (x + 1.0)) }) val expected = scaleDataWithIDF(data, expectedIdf) for { (res, exp) <- results.zip(expected) (x, y) <- res.value.toArray.zip(exp.toArray) } assert(math.abs(x - y) <= 1e-5) } it should "compute inverted document frequency when minDocFreq is 1" in { val idf = f1.idf(minDocFreq = 1) val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds) val transformedData = model.asInstanceOf[Transformer].transform(ds) val results = transformedData.select(idf.name).collect(idf) idf.name shouldBe idf.originStage.getOutputFeatureName val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) math.log((data.length + 1.0) / (x + 1.0)) else 0 }) val expected = scaleDataWithIDF(data, expectedIdf) for { (res, exp) <- results.zip(expected) (x, y) <- res.value.toArray.zip(exp.toArray) } assert(math.abs(x - y) <= 1e-5) } private def scaleDataWithIDF(dataSet: Seq[Vector], model: Vector): Seq[Vector] = { dataSet.map { case data: DenseVector => val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y } Vectors.dense(res) case data: SparseVector => val res = data.indices.zip(data.values).map { case (id, value) => (id, value * model(id)) } Vectors.sparse(data.size, res) } } }
Example 124
Source File: LangDetectorTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.text.Language import org.apache.spark.ml.Transformer import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class LangDetectorTest extends OpTransformerSpec[RealMap, LangDetector[Text]] { // scalastyle:off val (inputData, f1, f2, f3) = TestFeatureBuilder( Seq( ( "I've got a lovely bunch of coconuts".toText, "文化庁によりますと、世界文化遺産への登録を目指している、福岡県の「宗像・沖ノ島と関連遺産群」について、ユネスコの諮問機関は、8つの構成資産のうち、沖ノ島など4つについて、「世界遺産に登録することがふさわしい」とする勧告をまとめました。".toText, "Première détection d’une atmosphère autour d’une exoplanète de la taille de la Terre".toText ), ( "There they are, all standing in a row".toText, "地磁気発生の謎に迫る地球内部の環境、再現実験".toText, "Les deux commissions, créées respectivement en juin 2016 et janvier 2017".toText ), ( "Big ones, small ones, some as big as your head".toText, "大学レスリング界で「黒船」と呼ばれたカザフスタン出身の大型レスラーが、日本の男子グレコローマンスタイルの重量級強化のために一役買っている。山梨学院大をこの春卒業したオレッグ・ボルチン(24)。4月から新日本プロレスの親会社ブシロードに就職。自身も日本を拠点に、アマチュアレスリングで2020年東京五輪を目指す。".toText, "Il publie sa théorie de la relativité restreinte en 1905".toText ) ) ) // scalastyle:on val transformer = new LangDetector[Text]().setInput(f1) private val langMap = f1.detectLanguages() // English result val expectedResult: Seq[RealMap] = Seq( Map("en" -> 0.9999984360934321), Map("en" -> 0.9999900853228016), Map("en" -> 0.9999900116744931) ).map(_.toRealMap) it should "return empty RealMap when input text is empty" in { transformer.transformFn(Text.empty) shouldBe RealMap.empty } it should "detect Japanese language" in { assertDetectionResults( results = transformer.setInput(f2).transform(inputData).collect(transformer.getOutput()), expectedLanguage = Language.Japanese ) } it should "detect French language" in { assertDetectionResults( results = transformer.setInput(f3).transform(inputData).collect(transformer.getOutput()), expectedLanguage = Language.French ) } it should "has a working shortcut" in { val tokenized = f1.detectLanguages() assertDetectionResults( results = tokenized.originStage.asInstanceOf[Transformer].transform(inputData).collect(tokenized), expectedLanguage = Language.English ) } private def assertDetectionResults ( results: Array[RealMap], expectedLanguage: Language, confidence: Double = 0.99 ): Unit = results.foreach(res => { res.value.size shouldBe 1 res.value.contains(expectedLanguage.entryName) shouldBe true res.value(expectedLanguage.entryName) should be >= confidence }) }
Example 125
Source File: TimePeriodListTransformerTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.FeatureLike import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.date.DateTimeUtils import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.Transformer import org.joda.time.{DateTime => JDateTime} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class TimePeriodListTransformerTest extends OpTransformerSpec[OPVector, TimePeriodListTransformer[DateList]] { val dateList: DateList = Seq[Long]( new JDateTime(1879, 3, 14, 0, 0, DateTimeUtils.DefaultTimeZone).getMillis, new JDateTime(1955, 11, 12, 10, 4, DateTimeUtils.DefaultTimeZone).getMillis, new JDateTime(1999, 3, 8, 12, 0, DateTimeUtils.DefaultTimeZone).getMillis, new JDateTime(2019, 4, 30, 13, 0, DateTimeUtils.DefaultTimeZone).getMillis ).toDateList val (inputData, f1) = TestFeatureBuilder(Seq(dateList)) override val transformer: TimePeriodListTransformer[DateList] = new TimePeriodListTransformer(TimePeriod.DayOfMonth).setInput(f1) override val expectedResult: Seq[OPVector] = Seq(Seq(14, 12, 8, 30).map(_.toDouble).toVector.toOPVector) it should "transform with rich shortcuts" in { val dlist = List(new JDateTime(1879, 3, 14, 0, 0, DateTimeUtils.DefaultTimeZone).getMillis) val (inputData2, d1, d2) = TestFeatureBuilder( Seq[(DateList, DateTimeList)]((dlist.toDateList, dlist.toDateTimeList)) ) def assertFeature(feature: FeatureLike[OPVector], expected: Seq[OPVector]): Unit = { val transformed = feature.originStage.asInstanceOf[Transformer].transform(inputData2) val actual = transformed.collect(feature) actual shouldBe expected } assertFeature(d1.toTimePeriod(TimePeriod.DayOfMonth), Seq(Vector(14.0).toOPVector)) assertFeature(d2.toTimePeriod(TimePeriod.DayOfMonth), Seq(Vector(14.0).toOPVector)) } }
Example 126
Source File: TimePeriodTransformerTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.FeatureLike import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.date.DateTimeUtils import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.Transformer import org.joda.time.{DateTime => JDateTime} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class TimePeriodTransformerTest extends OpTransformerSpec[Integral, TimePeriodTransformer[Date]] { val (inputData, f1) = TestFeatureBuilder(Seq[Date]( new JDateTime(1879, 3, 14, 0, 0, DateTimeUtils.DefaultTimeZone).getMillis.toDate, new JDateTime(1955, 11, 12, 10, 4, DateTimeUtils.DefaultTimeZone).getMillis.toDate, new JDateTime(1999, 3, 8, 12, 0, DateTimeUtils.DefaultTimeZone).getMillis.toDate, Date.empty, new JDateTime(2019, 4, 30, 13, 0, DateTimeUtils.DefaultTimeZone).getMillis.toDate )) override val transformer: TimePeriodTransformer[Date] = new TimePeriodTransformer(TimePeriod.DayOfMonth).setInput(f1) override val expectedResult: Seq[Integral] = Seq(Integral(14), Integral(12), Integral(8), Integral.empty, Integral(30)) it should "correctly transform for all TimePeriod types" in { def assertFeature(feature: FeatureLike[Integral], expected: Seq[Integral]): Unit = { val transformed = feature.originStage.asInstanceOf[Transformer].transform(inputData) val actual = transformed.collect(feature) actual shouldBe expected } TimePeriod.values.foreach(tp => { val expected = tp match { case TimePeriod.DayOfMonth => Array(Integral(14), Integral(12), Integral(8), Integral.empty, Integral(30)) case TimePeriod.DayOfWeek => Array(Integral(5), Integral(6), Integral(1), Integral.empty, Integral(2)) case TimePeriod.DayOfYear => Array(Integral(73), Integral(316), Integral(67), Integral.empty, Integral(120)) case TimePeriod.HourOfDay => Array(Integral(0), Integral(10), Integral(12), Integral.empty, Integral(13)) case TimePeriod.MonthOfYear => Array(Integral(3), Integral(11), Integral(3), Integral.empty, Integral(4)) case TimePeriod.WeekOfMonth => Array(Integral(3), Integral(2), Integral(2), Integral.empty, Integral(5)) case TimePeriod.WeekOfYear => Array(Integral(11), Integral(46), Integral(11), Integral.empty, Integral(18)) case _ => throw new Exception(s"Unexpected TimePeriod encountered, $tp") } withClue(s"Assertion failed for TimePeriod $tp: ") { assertFeature(f1.toTimePeriod(tp), expected) } }) } }
Example 127
Source File: OpPipelineStageReaderWriterTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages import com.salesforce.op.features._ import com.salesforce.op.features.types._ import com.salesforce.op.stages.OpPipelineStageReaderWriter._ import com.salesforce.op.test.PassengerSparkFixtureTest import com.salesforce.op.utils.reflection.ReflectionUtils import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.{Model, Transformer} import org.apache.spark.sql.types.{DataType, Metadata, MetadataBuilder} import org.json4s.JsonAST.JValue import org.json4s.jackson.JsonMethods.{compact, parse, pretty, render} import org.json4s.{JArray, JObject} import org.scalatest.FlatSpec import org.slf4j.LoggerFactory // TODO: consider adding a read/write test for a spark wrapped stage as well private[stages] abstract class OpPipelineStageReaderWriterTest extends FlatSpec with PassengerSparkFixtureTest { val meta = new MetadataBuilder().putString("foo", "bar").build() val expectedFeaturesLength = 1 def stage: OpPipelineStageBase with Transformer val expected: Array[Real] val hasOutputName = true private val log = LoggerFactory.getLogger(this.getClass) private lazy val savePath = tempDir + "/" + this.getClass.getSimpleName + "-" + System.currentTimeMillis() private lazy val writer = new OpPipelineStageWriter(stage) private lazy val stageJsonString: String = writer.writeToJsonString(savePath) private lazy val stageJson: JValue = parse(stageJsonString) private lazy val isModel = stage.isInstanceOf[Model[_]] private val FN = FieldNames Spec(this.getClass) should "write stage uid" in { log.info(pretty(stageJson)) (stageJson \ FN.Uid.entryName).extract[String] shouldBe stage.uid } it should "write class name" in { (stageJson \ FN.Class.entryName).extract[String] shouldBe stage.getClass.getName } it should "write params map" in { val params = extractParams(stageJson).extract[Map[String, Any]] if (hasOutputName) { params should have size 4 params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema", "outputFeatureName") } else { params should have size 3 params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema") } } it should "write outputMetadata" in { val params = extractParams(stageJson) val metadataStr = compact(render(extractParams(stageJson) \ "outputMetadata")) val metadata = Metadata.fromJson(metadataStr) metadata shouldBe stage.getMetadata() } it should "write inputSchema" in { val schemaStr = compact(render(extractParams(stageJson) \ "inputSchema")) val schema = DataType.fromJson(schemaStr) schema shouldBe stage.getInputSchema() } it should "write input features" in { val jArray = (extractParams(stageJson) \ "inputFeatures").extract[JArray] jArray.values should have length expectedFeaturesLength val obj = jArray(0).extract[JObject] obj.values.keys shouldBe Set("name", "isResponse", "isRaw", "uid", "typeName", "stages", "originFeatures") } it should "write model ctor args" in { if (stage.isInstanceOf[Model[_]]) { val ctorArgs = (stageJson \ FN.CtorArgs.entryName).extract[JObject] val (_, args) = ReflectionUtils.bestCtorWithArgs(stage) ctorArgs.values.keys shouldBe args.map(_._1).toSet } } it should "load stage correctly" in { val reader = new OpPipelineStageReader(stage) val stageLoaded = reader.loadFromJsonString(stageJsonString, path = savePath) stageLoaded shouldBe a[OpPipelineStageBase] stageLoaded shouldBe a[Transformer] stageLoaded.getOutput() shouldBe a[FeatureLike[_]] val _ = stage.asInstanceOf[Transformer].transform(passengersDataSet) val transformed = stageLoaded.asInstanceOf[Transformer].transform(passengersDataSet) 
transformed.collect(stageLoaded.getOutput().asInstanceOf[FeatureLike[Real]]) shouldBe expected stageLoaded.uid shouldBe stage.uid stageLoaded.operationName shouldBe stage.operationName stageLoaded.getInputFeatures() shouldBe stage.getInputFeatures() stageLoaded.getInputSchema() shouldBe stage.getInputSchema() } private def extractParams(stageJson: JValue): JValue = { val defaultParamsMap = stageJson \ FN.DefaultParamMap.entryName val paramsMap = stageJson \ FN.ParamMap.entryName defaultParamsMap.merge(paramsMap) } }
Example 128
Source File: FeatureTestBase.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.test import com.salesforce.op.features._ import com.salesforce.op.features.types._ import com.salesforce.op.utils.spark.RichDataset.RichDataset import org.apache.spark.ml.{Estimator, Transformer} import org.apache.spark.sql.Dataset import org.scalatest.prop.PropertyChecks import org.scalatest.{Assertion, Suite} import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag def testOp[A <: FeatureType : TypeTag, B <: FeatureType : TypeTag, C <: FeatureType : TypeTag : FeatureTypeSparkConverter : ClassTag] ( op: FeatureLike[A] => FeatureLike[B] => FeatureLike[C] ): BinaryTester[A, B, C] = new BinaryTester[A, B, C] { def of(v: (A, B)*): Checker[C] = new Checker[C] { def expecting(z: C*): Assertion = { val (data, f1, f2) = TestFeatureBuilder[A, B](v) val f = op(f1)(f2) checkFeature(f, data, expected = z, clue = s"Testing ${f.originStage.operationName} on $v: ") } } } sealed abstract class UnaryTester[A <: FeatureType, C <: FeatureType : TypeTag : FeatureTypeSparkConverter : ClassTag] { def of(x: A*): Checker[C] } sealed abstract class BinaryTester[A <: FeatureType, B <: FeatureType, C <: FeatureType : TypeTag : FeatureTypeSparkConverter : ClassTag] { def of(x: A, y: B): Checker[C] = of((x, y)) def of(x: (A, B)*): Checker[C] } sealed abstract class Checker[C <: FeatureType : TypeTag : FeatureTypeSparkConverter : ClassTag] { def expecting(z: C*): Assertion protected def checkFeature(f: FeatureLike[C], data: Dataset[_], clue: String, expected: Seq[C]): Assertion = { val transformed = f.originStage match { case e: Estimator[_] => e.fit(data).transform(data) case t: Transformer => t.transform(data) } withClue(clue)( new RichDataset(transformed).collect[C](f) should contain theSameElementsInOrderAs expected ) } } }
Example 129
Source File: HashingTF.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

def setNumFeatures(value: Int): this.type = set(numFeatures, value)

override def transform(dataset: DataFrame): DataFrame = {
  val outputSchema = transformSchema(dataset.schema)
  val hashingTF = new feature.HashingTF($(numFeatures))
  val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
  val metadata = outputSchema($(outputCol)).metadata
  dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
}

override def transformSchema(schema: StructType): StructType = {
  val inputType = schema($(inputCol)).dataType
  require(inputType.isInstanceOf[ArrayType],
    s"The input column must be ArrayType, but got $inputType.")
  val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
  SchemaUtils.appendColumn(schema, attrGroup.toStructField())
}

override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
Example 130
Source File: Binarizer.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.BinaryAttribute import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) val td = $(threshold) val binarizer = udf { in: Double => if (in > td) 1.0 else 0.0 } val outputColName = $(outputCol) val metadata = BinaryAttribute.defaultAttr.withName(outputColName).toMetadata() dataset.select(col("*"), binarizer(col($(inputCol))).as(outputColName, metadata)) } override def transformSchema(schema: StructType): StructType = { SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType) val inputFields = schema.fields val outputColName = $(outputCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") val attr = BinaryAttribute.defaultAttr.withName(outputColName) val outputFields = inputFields :+ attr.toStructField() StructType(outputFields) } override def copy(extra: ParamMap): Binarizer = defaultCopy(extra) }
Example 131
Source File: LanguageAwareAnalyzer.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts import org.apache.lucene.analysis.util.StopwordAnalyzerBase import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.shared.HasOutputCol import org.apache.spark.ml.param.{Param, ParamMap, Params} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{ArrayType, StringType, StructType} def setOutputCol(value: String): this.type = set(outputCol, value) override def copy(extra: ParamMap): Transformer = { defaultCopy(extra) } def this() = this(Identifiable.randomUID("languageAnalyzer")) override def transform(dataset: Dataset[_]): DataFrame = { dataset.withColumn($(outputCol), stemmTextUDF(dataset.col($(inputColLang)), dataset.col($(inputColText)))).toDF } @DeveloperApi override def transformSchema(schema: StructType): StructType = { if ($(inputColText) equals $(outputCol)) { val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputColText))) SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), ArrayType(StringType, true)) } else { SchemaUtils.appendColumn(schema, $(outputCol), ArrayType(StringType, true)) } } } object LanguageAwareAnalyzer extends DefaultParamsReadable[LanguageAwareAnalyzer] { override def load(path: String): LanguageAwareAnalyzer = super.load(path) }
Example 132
Source File: LanguageDetectorTransformer.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts import com.google.common.base.Optional import com.optimaize.langdetect.LanguageDetector import com.optimaize.langdetect.i18n.LdLocale import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{StringType, StructType} import scala.collection.Map def setOutputCol(value: String): this.type = set(outputCol, value) def this() = this(Identifiable.randomUID("languageDetector")) override def transform(dataset: Dataset[_]): DataFrame = { dataset.withColumn($(outputCol), languageDetection(dataset.col($(inputCol)))) } override def copy(extra: ParamMap): Transformer = { defaultCopy(extra) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { SchemaUtils.appendColumn(schema, $(outputCol), StringType) } @transient object languageDetectorWrapped extends Serializable { val languageDetector: LanguageDetector = LanguageDetectorUtils.buildLanguageDetector( LanguageDetectorUtils.readListLangsBuiltIn(), $(minimalConfidence), $(languagePriors).toMap) } }
Example 133
Source File: HashBasedDeduplicator.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts

import odkl.analysis.spark.util.Logging
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.linalg.Vectors.norm
import org.apache.spark.ml.linalg.{BLAS, Vector}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, Row}

import scala.collection.mutable.ArrayBuffer

def setSimilarityTreshold(value: Double): this.type = set(similarityThreshold, value)

setDefault(new ParamPair[String](inputColHash, "hash"), new ParamPair[Double](similarityThreshold, 0.9))

def this() = this(Identifiable.randomUID("hashBasedDeduplication"))

override def transform(dataset: Dataset[_]): DataFrame = {
  dataset.sqlContext.createDataFrame(
    dataset.toDF
      .repartition(dataset.col($(inputColHash)))
      .sortWithinPartitions($(inputColHash))
      .rdd
      .mapPartitions((f: Iterator[Row]) => {
        if (f.hasNext) {
          var curHash: Long = -1L
          val vectorsBuffer = new ArrayBuffer[Vector](0) // unique vectors kept so far for the current hash bucket
          for (it <- f) yield {
            val newHash = it.getAs[Long]($(inputColHash))
            if (newHash == curHash) {
              val currentVector = it.getAs[Vector]($(inputColVector))
              // Is this vector sufficiently different from every vector already kept in the buffer?
              val isUnique = vectorsBuffer.forall(storedVector => {
                // cosine similarity below the threshold means "dissimilar enough to keep"
                (BLAS.dot(storedVector, currentVector) / (norm(storedVector, 2) * norm(currentVector, 2))) < $(similarityThreshold)
              })
              if (isUnique) {
                vectorsBuffer.append(currentVector)
                it
              } else {
                Row.empty // dummy row, filtered out below
              }
            } else {
              // new hash bucket: reset the buffer and keep the first vector
              vectorsBuffer.clear()
              vectorsBuffer.append(it.getAs[Vector]($(inputColVector)))
              curHash = newHash
              it
            }
          }
        } else {
          new Array[Row](0).toIterator // empty partition
        }
      }).filter(!_.equals(Row.empty)), // drop the dummy rows
    transformSchema(dataset.schema))
}

@DeveloperApi
override def transformSchema(schema: StructType): StructType = {
  schema
}

override def copy(extra: ParamMap): Transformer = defaultCopy(extra)
}
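The deduplication criterion above keeps a row only if the cosine similarity of its vector to every vector already kept for the same hash bucket stays below similarityThreshold. A small self-contained sketch of just that criterion (plain Scala; the function names are illustrative, not part of the library):

import org.apache.spark.ml.linalg.{Vector, Vectors}

// Cosine similarity between two ml vectors, mirroring the BLAS.dot / norm expression above.
def cosine(a: Vector, b: Vector): Double = {
  val dot = a.toArray.zip(b.toArray).map { case (x, y) => x * y }.sum
  dot / (Vectors.norm(a, 2) * Vectors.norm(b, 2))
}

// A candidate is kept when it is sufficiently dissimilar from everything kept so far.
def isUnique(candidate: Vector, kept: Seq[Vector], threshold: Double): Boolean =
  kept.forall(v => cosine(v, candidate) < threshold)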
Example 134
Source File: NGramExtractor.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.param.{IntParam, ParamMap, ParamPair, ParamValidators, Params} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{ArrayType, StringType, StructType} def setOutputCol(value: String): this.type = set(outputCol, value) setDefault(new ParamPair[Int](upperN, 2), new ParamPair[Int](lowerN, 1)) override def transform(dataset: Dataset[_]): DataFrame = { val lowerBound = $(lowerN) val upperBound = $(upperN) val nGramUDF = udf[Seq[String], Seq[String]](NGramUtils.nGramFun(_,lowerBound,upperBound)) dataset.withColumn($(outputCol), nGramUDF(dataset.col($(inputCol)))) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { if ($(inputCol) != $(outputCol)) { schema.add($(outputCol), new ArrayType(StringType, true)) } else { schema } } } object NGramExtractor extends DefaultParamsReadable[NGramExtractor] { override def load(path: String): NGramExtractor = super.load(path) }
Example 135
Source File: RegexpReplaceTransformer.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, Params} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{StringType, StructType} def setInputCol(value: String): this.type = set(inputCol, value) def this() = this(Identifiable.randomUID("RegexpReplaceTransformer")) override def transform(dataset: Dataset[_]): DataFrame = { dataset.withColumn($(outputCol), regexp_replace(dataset.col($(inputCol)), $(regexpPattern), $(regexpReplacement))) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { if ($(inputCol) equals $(outputCol)) { val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputCol))) SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), StringType) } else { SchemaUtils.appendColumn(schema, $(outputCol), StringType) } } } object RegexpReplaceTransformer extends DefaultParamsReadable[RegexpReplaceTransformer] { override def load(path: String): RegexpReplaceTransformer = super.load(path) }
Example 136
Source File: RandomProjectionsHasher.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts

import java.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vector}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{LongType, StructType}

def setDim(value: Long): this.type = set(dim, value)

def this() = this(Identifiable.randomUID("randomProjectionsHasher"))

override def transform(dataset: Dataset[_]): DataFrame = {
  val dimensity = {
    if (!isSet(dim)) {
      // If the dimension is not set, look it up in the column's AttributeGroup metadata
      // (as produced by OdklCountVectorizer).
      val vectorsIndex = dataset.schema.fieldIndex($(inputCol))
      AttributeGroup.fromStructField(dataset.schema.fields(vectorsIndex)).size
    } else {
      $(dim).toInt
    }
  }
  // the matrix of random vectors used to construct the hash
  val projectionMatrix = dataset.sqlContext.sparkContext.broadcast(
    Matrices.sprandn($(basisSize).toInt, dimensity, $(sparsity), new Random($(seed))).asInstanceOf[SparseMatrix])
  val binHashSparseVectorColumn = udf((vector: Vector) => {
    projectionMatrix.value.multiply(vector).values
      .map(f => if (f > 0) 1L else 0L)
      .view.zipWithIndex
      .foldLeft(0L) { case (acc, (v, i)) => acc | (v << i) }
  })
  dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol))))
}

override def copy(extra: ParamMap): Transformer = {
  defaultCopy(extra)
}

@DeveloperApi
override def transformSchema(schema: StructType): StructType = {
  SchemaUtils.appendColumn(schema, $(outputCol), LongType)
}
}
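The UDF above computes a locality-sensitive hash: the input vector is multiplied by a random projection matrix, each projected coordinate contributes one sign bit, and the bits are packed into a single Long. A stripped-down sketch of just the bit-packing step (illustrative; assumes at most 64 projections):

// Pack the signs of projected coordinates into a 64-bit hash.
def packSignBits(projected: Array[Double]): Long =
  projected.map(f => if (f > 0) 1L else 0L)
    .zipWithIndex
    .foldLeft(0L) { case (acc, (bit, i)) => acc | (bit << i) }

// packSignBits(Array(0.3, -1.2, 2.5)) == 5L  (bits 0 and 2 are set)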
Example 137
Source File: URLElimminator.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, Params}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}

def setInputCol(value: String): this.type = set(inputCol, value)

def this() = this(Identifiable.randomUID("URLEliminator"))

override def transform(dataset: Dataset[_]): DataFrame = {
  dataset.withColumn($(outputCol), filterTextUDF(dataset.col($(inputCol))))
}

override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

@DeveloperApi
override def transformSchema(schema: StructType): StructType = {
  if ($(inputCol) != $(outputCol)) {
    schema.add($(outputCol), StringType)
  } else {
    schema
  }
}
}

object URLElimminator extends DefaultParamsReadable[URLElimminator] {
  override def load(path: String): URLElimminator = super.load(path)
}
Example 138
Source File: NameAssigner.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasInputCols import org.apache.spark.ml.util.Identifiable import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.sql.{DataFrame, Dataset, functions} import org.apache.spark.sql.types.{Metadata, StringType, StructField, StructType} class NameAssigner(override val uid: String) extends Transformer with HasInputCols{ def setInputCols(column: String*) : this.type = set(inputCols, column.toArray) def this() = this(Identifiable.randomUID("NameAssigner")) override def transform(dataset: Dataset[_]): DataFrame = { $(inputCols) $(inputCols).foldLeft(dataset.toDF)((data, column) => { val metadata: Metadata = dataset.schema(column).metadata val attributes = AttributeGroup.fromStructField( StructField(column, new VectorUDT, nullable = false, metadata = metadata)) val map = attributes.attributes .map(arr => arr.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap) .getOrElse(Map()) val func = functions.udf[String, Number](x => if(x == null) { null } else { val i = x.intValue() map.getOrElse(i, i.toString) }) data.withColumn(column, func(data(column)).as(column, metadata)) }).toDF } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(schema.map(f => if ($(inputCols).contains(f.name)) { StructField(f.name, StringType, f.nullable, f.metadata) } else { f })) }
Example 139
Source File: VectorExplode.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl import odkl.analysis.spark.util.collection.OpenHashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.odkl.SparkSqlUtils import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row, functions} class VectorExplode(override val uid: String) extends Transformer with DefaultParamsWritable { val valueCol = new Param[String](this, "valueCol", "Name of the column to store value name.") def setValueCol(value: String) : this.type = set(valueCol, value) setDefault(valueCol -> "value") def this() = this(Identifiable.randomUID("vectorExplode")) override def transform(dataset: Dataset[_]): DataFrame = { val vectors: Array[StructField] = dataset.schema.fields.filter(_.dataType.isInstanceOf[VectorUDT]) val resultSchema = StructType(Seq( StructField($(valueCol), StringType, nullable = false)) ++ vectors.map(f => StructField(f.name, DoubleType, nullable = true)) ) val arraySize = resultSchema.size - 1 val names: Array[Map[Int, String]] = vectors.map( f => { AttributeGroup.fromStructField(f).attributes .map(attributes => attributes.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap) .getOrElse(Map()) }) val maxCapacity = names.map(_.size).max val explodeVectors : (Row => Array[Row]) = (r: Row ) => { val accumulator = new OpenHashMap[String,Array[Double]](maxCapacity) for(i <- 0 until r.length) { val vector = r.getAs[Vector](i) vector.foreachActive((index, value) => { val name = names(i).getOrElse(index, s"${vectors(i).name}_$index") accumulator.changeValue( name, Array.tabulate(arraySize) {ind => if(i == ind) value else Double.NaN}, v => {v(i) = value; v}) }) } accumulator.map(x => new GenericRowWithSchema( (Seq(x._1) ++ x._2.toSeq.map(v => if (v.isNaN) null else v)).toArray, resultSchema)).toArray } val vectorsStruct = functions.struct(vectors.map(f => dataset(f.name)): _*) val explodeUDF = SparkSqlUtils.customUDF(explodeVectors, ArrayType(resultSchema), Some(Seq(vectorsStruct.expr.dataType))) val expression = functions.explode(explodeUDF(vectorsStruct)) dataset .withColumn(uid, expression) .select( dataset.schema.fields.filterNot(_.dataType.isInstanceOf[VectorUDT]).map(f => dataset(f.name)) ++ resultSchema.fields.map(f => functions.expr(s"$uid.${f.name}").as(f.name)) :_*) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(schema.fields.map(x => x.dataType match { case vector: VectorUDT => StructField(x.name, typeFromVector(x)) case _ => x } )) def typeFromVector(field: StructField): StructType = { val attributes = AttributeGroup.fromStructField(field) StructType(attributes.attributes .map(_.map(a => a.name.getOrElse(s"_${a.index.get}"))) .getOrElse(Array.tabulate(attributes.size) { i => s"_$i" }) .map(name => StructField(name, DoubleType, nullable = false))) } }
Example 140
Source File: HashingTF.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

@Since("2.0.0")
def setBinary(value: Boolean): this.type = set(binary, value)

@Since("2.0.0")
override def transform(dataset: Dataset[_]): DataFrame = {
  val outputSchema = transformSchema(dataset.schema)
  val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
  // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
  val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
  val metadata = outputSchema($(outputCol)).metadata
  dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
}

@Since("1.4.0")
override def transformSchema(schema: StructType): StructType = {
  val inputType = schema($(inputCol)).dataType
  require(inputType.isInstanceOf[ArrayType],
    s"The input column must be ArrayType, but got $inputType.")
  val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
  SchemaUtils.appendColumn(schema, attrGroup.toStructField())
}

@Since("1.4.1")
override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
}
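A minimal usage sketch of this transformer, assuming an existing SparkSession `spark`; the column names and sample sentences are illustrative:

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

val sentences = spark.createDataFrame(Seq(
  (0, "spark spark hashing trick"),
  (1, "term frequency features")
)).toDF("id", "text")

val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("features")
  .setNumFeatures(32)     // small feature space for readability; hash collisions are expected
  .setBinary(false)       // true would produce 0/1 indicators instead of term counts

hashingTF.transform(tokenizer.transform(sentences)).select("features").show(false)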
Example 141
Source File: SQLTransformer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType

@Since("1.6.0")
def getStatement: String = $(statement)

private val tableIdentifier: String = "__THIS__"

@Since("2.0.0")
override def transform(dataset: Dataset[_]): DataFrame = {
  transformSchema(dataset.schema, logging = true)
  val tableName = Identifiable.randomUID(uid)
  dataset.createOrReplaceTempView(tableName)
  val realStatement = $(statement).replace(tableIdentifier, tableName)
  val result = dataset.sparkSession.sql(realStatement)
  // Call SessionCatalog.dropTempView to avoid unpersisting the possibly cached dataset.
  dataset.sparkSession.sessionState.catalog.dropTempView(tableName)
  result
}

@Since("1.6.0")
override def transformSchema(schema: StructType): StructType = {
  val spark = SparkSession.builder().getOrCreate()
  val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty))
  val dummyDF = spark.createDataFrame(dummyRDD, schema)
  val tableName = Identifiable.randomUID(uid)
  val realStatement = $(statement).replace(tableIdentifier, tableName)
  dummyDF.createOrReplaceTempView(tableName)
  val outputSchema = spark.sql(realStatement).schema
  spark.catalog.dropTempView(tableName)
  outputSchema
}

@Since("1.6.0")
override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
}
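For context, a short usage sketch of SQLTransformer: the SQL statement must reference the input dataset through the `__THIS__` placeholder shown above. The SparkSession `spark` and the columns are assumptions for illustration:

import org.apache.spark.ml.feature.SQLTransformer

val df = spark.createDataFrame(Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")

val sqlTrans = new SQLTransformer()
  .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

sqlTrans.transform(df).show()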
Example 142
Source File: Binarizer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.collection.mutable.ArrayBuilder

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.BinaryAttribute
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

@Since("1.4.0")
def setOutputCol(value: String): this.type = set(outputCol, value)

@Since("2.0.0")
override def transform(dataset: Dataset[_]): DataFrame = {
  val outputSchema = transformSchema(dataset.schema, logging = true)
  val schema = dataset.schema
  val inputType = schema($(inputCol)).dataType
  val td = $(threshold)

  val binarizerDouble = udf { in: Double => if (in > td) 1.0 else 0.0 }
  val binarizerVector = udf { (data: Vector) =>
    val indices = ArrayBuilder.make[Int]
    val values = ArrayBuilder.make[Double]
    data.foreachActive { (index, value) =>
      if (value > td) {
        indices += index
        values += 1.0
      }
    }
    Vectors.sparse(data.size, indices.result(), values.result()).compressed
  }

  val metadata = outputSchema($(outputCol)).metadata

  inputType match {
    case DoubleType =>
      dataset.select(col("*"), binarizerDouble(col($(inputCol))).as($(outputCol), metadata))
    case _: VectorUDT =>
      dataset.select(col("*"), binarizerVector(col($(inputCol))).as($(outputCol), metadata))
  }
}

@Since("1.4.0")
override def transformSchema(schema: StructType): StructType = {
  val inputType = schema($(inputCol)).dataType
  val outputColName = $(outputCol)
  val outCol: StructField = inputType match {
    case DoubleType =>
      BinaryAttribute.defaultAttr.withName(outputColName).toStructField()
    case _: VectorUDT =>
      StructField(outputColName, new VectorUDT)
    case _ =>
      throw new IllegalArgumentException(s"Data type $inputType is not supported.")
  }
  if (schema.fieldNames.contains(outputColName)) {
    throw new IllegalArgumentException(s"Output column $outputColName already exists.")
  }
  StructType(schema.fields :+ outCol)
}

@Since("1.4.1")
override def copy(extra: ParamMap): Binarizer = defaultCopy(extra)
}

@Since("1.6.0")
object Binarizer extends DefaultParamsReadable[Binarizer] {

  @Since("1.6.0")
  override def load(path: String): Binarizer = super.load(path)
}
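A brief usage sketch: Binarizer thresholds either a Double column or a Vector column, matching the two UDFs above. The SparkSession `spark` and the sample data are assumptions:

import org.apache.spark.ml.feature.Binarizer

val scores = spark.createDataFrame(Seq((0, 0.1), (1, 0.8), (2, 0.5))).toDF("id", "score")

val binarizer = new Binarizer()
  .setInputCol("score")
  .setOutputCol("binarized_score")
  .setThreshold(0.5)      // values strictly greater than the threshold map to 1.0

binarizer.transform(scores).show()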
Example 143
Source File: HashingTF.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} def setNumFeatures(value: Int): this.type = set(numFeatures, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)) val t = udf { terms: Seq[_] => hashingTF.transform(terms) } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
Example 144
Source File: SQLTransformer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkContext import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.param.{ParamMap, Param} import org.apache.spark.ml.Transformer import org.apache.spark.ml.util._ import org.apache.spark.sql.{SQLContext, DataFrame, Row} import org.apache.spark.sql.types.StructType @Since("1.6.0") def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" @Since("1.6.0") override def transform(dataset: DataFrame): DataFrame = { val tableName = Identifiable.randomUID(uid) dataset.registerTempTable(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val outputDF = dataset.sqlContext.sql(realStatement) outputDF } @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { val sc = SparkContext.getOrCreate() val sqlContext = SQLContext.getOrCreate(sc) val dummyRDD = sc.parallelize(Seq(Row.empty)) val dummyDF = sqlContext.createDataFrame(dummyRDD, schema) dummyDF.registerTempTable(tableIdentifier) val outputSchema = sqlContext.sql($(statement)).schema outputSchema } @Since("1.6.0") override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } @Since("1.6.0") object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { @Since("1.6.0") override def load(path: String): SQLTransformer = super.load(path) }
Example 145
Source File: Binarizer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.BinaryAttribute import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) val td = $(threshold) val binarizer = udf { in: Double => if (in > td) 1.0 else 0.0 } val outputColName = $(outputCol) val metadata = BinaryAttribute.defaultAttr.withName(outputColName).toMetadata() dataset.select(col("*"), binarizer(col($(inputCol))).as(outputColName, metadata)) } override def transformSchema(schema: StructType): StructType = { SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType) val inputFields = schema.fields val outputColName = $(outputCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") val attr = BinaryAttribute.defaultAttr.withName(outputColName) val outputFields = inputFields :+ attr.toStructField() StructType(outputFields) } override def copy(extra: ParamMap): Binarizer = defaultCopy(extra) } @Since("1.6.0") object Binarizer extends DefaultParamsReadable[Binarizer] { @Since("1.6.0") override def load(path: String): Binarizer = super.load(path) }
Example 146
Source File: IntermediateCacher.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

class IntermediateCacher(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("intermediateCacher"))
  }

  val inputCols = new StringArrayParam(this, "inputCols", "Input column names")

  def getInputCols: Array[String] = $(inputCols)

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)
  setDefault(inputCols -> Array.empty[String])

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    val intermediateDF = if ($(inputCols).isEmpty) dataset.toDF() else dataset.select($(inputCols).map(col(_)): _*)
    intermediateDF.cache()
  }

  override def copy(extra: ParamMap): IntermediateCacher = {
    defaultCopy(extra)
  }
}

object IntermediateCacher extends DefaultParamsReadable[IntermediateCacher]
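A minimal usage sketch inside a Pipeline; rawDF and the column names are illustrative:

import org.apache.spark.ml.Pipeline
import ws.vinta.albedo.transformers.IntermediateCacher

val intermediateCacher = new IntermediateCacher()
  .setInputCols(Array("user_id", "repo_id", "starring"))

// Projects to the selected columns and caches the result before later stages run.
val pipeline = new Pipeline()
  .setStages(Array(intermediateCacher))
val cachedDF = pipeline.fit(rawDF).transform(rawDF)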
Example 147
Source File: RankingMetricFormatter.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._
import ws.vinta.albedo.evaluators.RankingEvaluator._

class RankingMetricFormatter(override val uid: String, val sourceType: String)
  extends Transformer with DefaultParamsWritable {

  def this(sourceType: String) = {
    this(Identifiable.randomUID("rankingMetricFormatter"), sourceType)
  }

  val userCol = new Param[String](this, "userCol", "User column name")

  def getUserCol: String = $(userCol)

  def setUserCol(value: String): this.type = set(userCol, value)
  setDefault(userCol -> "user")

  val itemCol = new Param[String](this, "itemCol", "Item column name")

  def getItemCol: String = $(itemCol)

  def setItemCol(value: String): this.type = set(itemCol, value)
  setDefault(itemCol -> "item")

  val predictionCol = new Param[String](this, "predictionCol", "Prediction column name")

  def getPredictionCol: String = $(predictionCol)

  def setPredictionCol(value: String): this.type = set(predictionCol, value)
  setDefault(predictionCol -> "prediction")

  val topK = new IntParam(this, "topK", "Recommend top-k items for every user")

  def getTopK: Int = $(topK)

  def setTopK(value: Int): this.type = set(topK, value)
  setDefault(topK -> 15)

  override def transformSchema(schema: StructType): StructType = {
    Map($(userCol) -> IntegerType, $(itemCol) -> IntegerType)
      .foreach { case (columnName: String, expectedDataType: DataType) =>
        val actualDataType = schema(columnName).dataType
        require(actualDataType.equals(expectedDataType),
          s"Column $columnName must be of type $expectedDataType but was actually $actualDataType.")
      }
    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    sourceType match {
      case "als" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), col($(predictionCol)).desc, $(topK)))
      case "lr" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), toArrayUDF(col($(predictionCol))).getItem(1).desc, $(topK)))
    }
  }

  override def copy(extra: ParamMap): RankingMetricFormatter = {
    val copied = new RankingMetricFormatter(uid, sourceType)
    copyValues(copied, extra)
  }
}

object RankingMetricFormatter extends DefaultParamsReadable[RankingMetricFormatter]
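A minimal usage sketch for ALS output; alsPredictionDF and the column names are illustrative, and the user and item columns must be IntegerType as enforced in transformSchema:

import ws.vinta.albedo.transformers.RankingMetricFormatter

val rankingMetricFormatter = new RankingMetricFormatter("als")
  .setUserCol("user_id")
  .setItemCol("repo_id")
  .setPredictionCol("prediction")
  .setTopK(30)

val userPredictedItemsDF = rankingMetricFormatter.transform(alsPredictionDF)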
Example 148
Source File: UserRepoTransformer.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._

class UserRepoTransformer(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("userRepoTransformer"))
  }

  val inputCols: StringArrayParam = new StringArrayParam(this, "inputCols", "Input column names")

  def getInputCols: Array[String] = $(inputCols)

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)

  override def transformSchema(schema: StructType): StructType = {
    $(inputCols).foreach((inputColName: String) => {
      require(schema.fieldNames.contains(inputColName), s"Input column $inputColName must exist.")
    })

    val newFields: Array[StructField] = Array(
      StructField("repo_language_index_in_user_recent_repo_languages", IntegerType, nullable = false),
      StructField("repo_language_count_in_user_recent_repo_languages", IntegerType, nullable = false)
    )
    StructType(schema.fields ++ newFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    import dataset.sparkSession.implicits._

    dataset
      .withColumn("repo_language_index_in_user_recent_repo_languages",
        repoLanguageIndexInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages"))
      .withColumn("repo_language_count_in_user_recent_repo_languages",
        repoLanguageCountInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages"))
  }

  override def copy(extra: ParamMap): UserRepoTransformer = {
    defaultCopy(extra)
  }
}

object UserRepoTransformer extends DefaultParamsReadable[UserRepoTransformer]
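A minimal usage sketch; userRepoDF is assumed to contain the repo_language and user_recent_repo_languages columns consumed by the UDFs above:

import ws.vinta.albedo.transformers.UserRepoTransformer

val userRepoTransformer = new UserRepoTransformer()
  .setInputCols(Array("repo_language", "user_recent_repo_languages"))

val enrichedDF = userRepoTransformer.transform(userRepoDF)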
Example 149
Source File: SimpleVectorAssembler.scala From albedo with MIT License | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkException
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row}

import scala.collection.mutable.ArrayBuilder

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)

    val schema = dataset.schema
    val assembleFunc = udf { r: Row =>
      SimpleVectorAssembler.assemble(r.toSeq: _*)
    }
    val args = $(inputCols).map { c =>
      schema(c).dataType match {
        case DoubleType => dataset(c)
        case _: VectorUDT => dataset(c)
        case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid")
      }
    }

    dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputColNames = $(inputCols)
    val outputColName = $(outputCol)
    val inputDataTypes = inputColNames.map(name => schema(name).dataType)
    inputDataTypes.foreach {
      case _: NumericType | BooleanType =>
      case t if t.isInstanceOf[VectorUDT] =>
      case other =>
        throw new IllegalArgumentException(s"Data type $other is not supported.")
    }
    if (schema.fieldNames.contains(outputColName)) {
      throw new IllegalArgumentException(s"Output column $outputColName already exists.")
    }
    StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true))
  }

  override def copy(extra: ParamMap): SimpleVectorAssembler = defaultCopy(extra)
}

object SimpleVectorAssembler extends DefaultParamsReadable[SimpleVectorAssembler] {
  override def load(path: String): SimpleVectorAssembler = super.load(path)

  def assemble(vv: Any*): Vector = {
    val indices = ArrayBuilder.make[Int]
    val values = ArrayBuilder.make[Double]
    var cur = 0
    vv.foreach {
      case v: Double =>
        if (v != 0.0) {
          indices += cur
          values += v
        }
        cur += 1
      case vec: Vector =>
        vec.foreachActive { case (i, v) =>
          if (v != 0.0) {
            indices += cur + i
            values += v
          }
        }
        cur += vec.size
      case null =>
        // TODO: output Double.NaN?
        throw new SparkException("Values to assemble cannot be null.")
      case o =>
        throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.")
    }
    Vectors.sparse(cur, indices.result(), values.result()).compressed
  }
}
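A minimal usage sketch, assuming the full class (its declaration is elided above) also exposes a setInputCols setter alongside setOutputCol, as Spark's VectorAssembler does; df and the column names are illustrative:

val assembler = new SimpleVectorAssembler()
  .setInputCols(Array("age", "height", "embedding"))   // numeric, boolean or vector columns
  .setOutputCol("features")

val assembled = assembler.transform(df)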
Example 150
Source File: SparkTransformerBenchmark.scala From mleap with Apache License 2.0 | 5 votes |
package com.truecar.mleap.spark.benchmark

import java.io.{FileInputStream, File}

import com.esotericsoftware.kryo.io.Input
import com.truecar.mleap.runtime.LocalLeapFrame
import com.truecar.mleap.spark.benchmark.util.SparkSerializer
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.ml.Transformer
import org.scalameter.Bench

import scala.collection.JavaConverters._
import org.scalameter.api._
import org.scalameter.picklers.Implicits._
import org.apache.log4j.Logger
import org.apache.log4j.Level
import com.truecar.mleap.spark.MleapSparkSupport._
import spray.json._
import com.truecar.mleap.serialization.mleap.v1.MleapJsonSupport._

object SparkTransformerBenchmark extends Bench.ForkedTime {
  lazy override val executor = {
    SeparateJvmsExecutor(
      Executor.Warmer.Zero,
      Aggregator.min[Double],
      new Measurer.Default)
  }

  val classLoader = getClass.getClassLoader
  val regressionFile = new File("/tmp/spark.transformer.kryo")
  val frameFile = new File("/tmp/frame.json")

  val inputStream = new FileInputStream(regressionFile)
  val input = new Input(inputStream)

  val regression: Transformer = SparkSerializer().read(input)
  val lines = scala.io.Source.fromFile(frameFile).mkString
  val frame = lines.parseJson.convertTo[LocalLeapFrame]

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  val sparkConf = new SparkConf()
    .setAppName("Spark Transformer Benchmark")
    .setMaster("local[1]")
  val sc = new SparkContext(sparkConf)
  val sqlContext = new SQLContext(sc)

  val rdd = frame.dataset.data.map(a => Row(a.toSeq: _*)).toList.asJava
  val schema = frame.schema.toSpark
  val sparkFrame = sqlContext.createDataFrame(rdd, schema)

  val ranges = for {
    size <- Gen.range("size")(1000, 10000, 1000)
  } yield 0 until size

  measure method "transform" in {
    using(ranges) in { size =>
      size.foreach { _ =>
        regression.transform(sparkFrame).head
      }
    }
  }

  // sc.stop()
}
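For a quick sanity check without scalameter, the same operation can be timed with a plain helper; a rough single measurement, with no warm-up or forked JVMs:

def time[A](label: String)(block: => A): A = {
  val start = System.nanoTime()
  val result = block
  println(s"$label took ${(System.nanoTime() - start) / 1e6} ms")
  result
}

time("regression.transform") {
  regression.transform(sparkFrame).head
}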
Example 151
Source File: SparkTransformerConverter.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.converter.runtime

import com.truecar.mleap.runtime.transformer
import org.apache.spark.ml.Transformer

import scala.reflect.ClassTag

trait SparkTransformerConverter {
  var converters: Map[String, TransformerToMleap[_ <: Transformer, _ <: transformer.Transformer]] = Map()

  def addConverter[T <: Transformer, MT <: transformer.Transformer](converter: TransformerToMleap[T, MT])
                                                                    (implicit ct: ClassTag[T]): TransformerToMleap[T, MT] = {
    val name = ct.runtimeClass.getCanonicalName
    converters += (name -> converter)
    converter
  }

  def getConverter(key: String): TransformerToMleap[_ <: Transformer, _ <: transformer.Transformer] = {
    converters(key)
  }

  def convert(t: Transformer): transformer.Transformer = {
    getConverter(t.getClass.getCanonicalName).toMleapLifted(t)
  }
}
Example 152
Source File: TransformerToMleap.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.converter.runtime

import org.apache.spark.ml.Transformer

object TransformerToMleap {
  def apply[T, MT](t: T)
                  (implicit ttm: TransformerToMleap[T, MT]): MT = {
    ttm.toMleap(t)
  }

  def toMleap[T, MT](t: T)
                    (implicit ttm: TransformerToMleap[T, MT]): MT = {
    ttm.toMleap(t)
  }
}

trait TransformerToMleap[T, MT] {
  def toMleap(t: T): MT

  def toMleapLifted(t: Transformer): MT = {
    toMleap(t.asInstanceOf[T])
  }
}
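A purely illustrative sketch of the typeclass wiring: a toy instance that "converts" a Spark Binarizer to its uid string and is resolved through the implicit apply above. It is not a real mleap converter, just the resolution pattern:

import org.apache.spark.ml.feature.Binarizer

object TransformerToMleapExample {
  implicit object BinarizerToUid extends TransformerToMleap[Binarizer, String] {
    override def toMleap(t: Binarizer): String = t.uid
  }

  // The implicit instance is picked up by TransformerToMleap.apply.
  val uid: String = TransformerToMleap(new Binarizer())
}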
Example 153
Source File: MleapSparkSupport.scala From mleap with Apache License 2.0 | 5 votes |
package com.truecar.mleap.spark

import com.truecar.mleap.core.linalg
import com.truecar.mleap.runtime.transformer.{Transformer => MleapTransformer}
import com.truecar.mleap.runtime.{types, Row => MleapRow}
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.mleap.converter._
import org.apache.spark.ml.mleap.converter.runtime.{BaseTransformerConverter, TransformerToMleap}
import org.apache.spark.ml.mleap.converter.runtime.classification.DecisionTreeClassificationModelToMleap
import org.apache.spark.ml.mleap.converter.runtime.regression.DecisionTreeRegressionModelToMleap
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.tree._
import org.apache.spark.ml.Transformer
import org.apache.spark.mllib.linalg._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, SQLContext}

trait MleapSparkSupport extends BaseTransformerConverter {
  import scala.language.implicitConversions

  implicit def transformerToMleapLifted[T <: Transformer]
  (t: T)
  (implicit transformerToMleap: TransformerToMleap[T, _ <: MleapTransformer]): MleapTransformer = {
    transformerToMleap.toMleapLifted(t)
  }

  implicit def mleapTransformerWrapper[T <: MleapTransformer](t: T): MleapTransformerWrapper[T] = {
    MleapTransformerWrapper(t)
  }

  implicit def vectorToSpark(vector: linalg.Vector): VectorToSpark = VectorToSpark(vector)

  implicit def vectorToMleap(vector: Vector): VectorToMleap = VectorToMleap(vector)

  implicit def dataFrameToMleap(dataset: DataFrame): DataFrameToMleap = DataFrameToMleap(dataset)

  implicit def decisionTreeRegressionModelToMleap(tree: DecisionTreeRegressionModel): DecisionTreeRegressionModelToMleap =
    DecisionTreeRegressionModelToMleap(tree)

  implicit def decisionTreeClassificationModelToMleap(tree: DecisionTreeClassificationModel): DecisionTreeClassificationModelToMleap =
    DecisionTreeClassificationModelToMleap(tree)

  implicit def nodeToMleap(node: Node): NodeToMleap = NodeToMleap(node)

  implicit def splitToMleap(split: Split): SplitToMleap = SplitToMleap(split)

  implicit def structTypeToMleap(schema: StructType): StructTypeToMleap = StructTypeToMleap(schema)

  implicit def rowToSpark(row: MleapRow): RowToSpark = RowToSpark(row)

  implicit def structTypeToSpark(schema: types.StructType): StructTypeToSpark = StructTypeToSpark(schema)

  implicit def leapFrameToSpark[T: LeapFrameToSpark](frame: T): LeapFrameToSparkWrapper[T] = {
    LeapFrameToSparkWrapper(frame)
  }

  implicit def leapFrameToSparkConvert[T: LeapFrameToSpark](frame: T)
                                                           (implicit sqlContext: SQLContext): DataFrame = {
    implicitly[LeapFrameToSpark[T]].toSpark(frame)
  }

  implicit def dataFrameToLeapFrame(dataFrame: DataFrame): SparkLeapFrame = dataFrame.toMleap
}

object MleapSparkSupport extends MleapSparkSupport
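A hedged usage sketch of the implicit conversions above; dataFrame is assumed to be an existing Spark DataFrame, and SparkLeapFrame is assumed to resolve from the same com.truecar.mleap.spark package:

import com.truecar.mleap.spark.MleapSparkSupport._

// Converted implicitly via dataFrameToLeapFrame above.
val leapFrame: SparkLeapFrame = dataFrame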
Example 154
Source File: SparkSupport.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml

import com.ibm.aardpfark.avro.SchemaConverters
import com.ibm.aardpfark.pfa.document.{PFADocument, ToPFA}
import org.apache.avro.SchemaBuilder
import org.apache.spark.ml.{PipelineModel, Transformer}
import org.apache.spark.sql.types.StructType

object SparkSupport {

  def toPFA(t: Transformer, pretty: Boolean): String = {
    toPFATransformer(t).pfa.toJSON(pretty)
  }

  def toPFA(p: PipelineModel, s: StructType, pretty: Boolean): String = {
    val inputFields = s.map { f => f.copy(nullable = false) }
    val inputSchema = StructType(inputFields)
    val pipelineInput = SchemaBuilder.record(s"Input_${p.uid}")
    val inputAvroSchema = SchemaConverters.convertStructToAvro(inputSchema, pipelineInput, "")
    Merge.mergePipeline(p, inputAvroSchema).toJSON(pretty)
  }

  // testing implicit conversions for Spark ML PipelineModel and Transformer to PFA / JSON
  implicit private[aardpfark] def toPFATransformer(transformer: org.apache.spark.ml.Transformer): ToPFA = {
    val pkg = transformer.getClass.getPackage.getName
    val name = transformer.getClass.getSimpleName
    val pfaPkg = pkg.replace("org.apache", "com.ibm.aardpfark")
    val pfaClass = Class.forName(s"$pfaPkg.PFA$name")
    val ctor = pfaClass.getConstructors()(0)
    ctor.newInstance(transformer).asInstanceOf[ToPFA]
  }
}
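A minimal usage sketch; pipelineModel, trainingDF and someTransformer are assumed to exist, and the single-Transformer variant only works for types that have a matching PFA* implementation as resolved by the reflective lookup above:

import com.ibm.aardpfark.spark.ml.SparkSupport._

// Whole fitted pipeline, with its input schema:
val pipelineJson: String = toPFA(pipelineModel, trainingDF.schema, pretty = true)

// Single supported Transformer:
val transformerJson: String = toPFA(someTransformer, pretty = true)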
Example 155
Source File: SparkFeaturePFASuiteBase.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.pfa

import com.opendatagroup.hadrian.jvmcompiler.PFAEngine
import org.json4s.DefaultFormats

import org.apache.spark.ml.{PipelineModel, Transformer}
import org.apache.spark.sql.types.StructType

abstract class SparkPipelinePFASuiteBase[A <: Result](implicit m: Manifest[A])
  extends SparkPredictorPFASuiteBase[A] {
  import com.ibm.aardpfark.spark.ml.SparkSupport._

  protected val schema: StructType

  override protected def transformerToPFA(t: Transformer, pretty: Boolean): String = {
    toPFA(t.asInstanceOf[PipelineModel], schema, pretty)
  }
}

abstract class SparkFeaturePFASuiteBase[A <: Result](implicit m: Manifest[A])
  extends SparkPFASuiteBase {

  implicit val formats = DefaultFormats

  protected var isDebug = false

  import com.ibm.aardpfark.spark.ml.SparkSupport._
  import org.json4s._
  import org.json4s.native.JsonMethods._

  test("PFA transformer produces the same results as Spark transformer") {
    parityTest(sparkTransformer, input, expectedOutput)
  }

  protected def transformerToPFA(t: Transformer, pretty: Boolean): String = {
    toPFA(t, pretty)
  }

  protected def testInputVsExpected(
      engine: PFAEngine[AnyRef, AnyRef],
      input: Array[String],
      expectedOutput: Array[String]) = {
    import ApproxEquality._
    input.zip(expectedOutput).foreach { case (in, out) =>
      val pfaResult = engine.action(engine.jsonInput(in))
      val actual = parse(pfaResult.toString).extract[A]
      val expected = parse(out).extract[A]
      (actual, expected) match {
        case (a: ScalerResult, e: ScalerResult) => assert(a.scaled === e.scaled)
        case (a: Result, e: Result) => assert(a === e)
      }
    }
  }

  def parityTest(
      sparkTransformer: Transformer,
      input: Array[String],
      expectedOutput: Array[String]): Unit = {
    val PFAJson = transformerToPFA(sparkTransformer, pretty = true)
    if (isDebug) {
      println(PFAJson)
    }
    val engine = getPFAEngine(PFAJson)
    testInputVsExpected(engine, input, expectedOutput)
  }
}

case class ScalerResult(scaled: Seq[Double]) extends Result
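A minimal sketch of a concrete suite, assuming the base class declares sparkTransformer, input and expectedOutput as abstract members (as used in parityTest above), that aardpfark provides a PFA equivalent for the chosen transformer, and that the JSON records match the generated PFA field names; all values are illustrative:

import org.apache.spark.ml.feature.Binarizer

case class BinarizerResult(output: Double) extends Result

class BinarizerPFASuite extends SparkFeaturePFASuiteBase[BinarizerResult] {
  override val sparkTransformer = new Binarizer()
    .setInputCol("input")
    .setOutputCol("output")
    .setThreshold(0.5)

  override val input = Array("""{"input": 0.2}""", """{"input": 0.8}""")
  override val expectedOutput = Array("""{"output": 0.0}""", """{"output": 1.0}""")
}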