org.apache.spark.ml.param.IntParam Scala Examples
The following examples show how to use org.apache.spark.ml.param.IntParam.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
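Every example below follows the same small pattern: declare an IntParam on a Params owner, optionally attach a validator and a default, and read the value back with $(...). A minimal sketch of that pattern in isolation (the trait and param names here are illustrative, not taken from any of the projects below):

import org.apache.spark.ml.param.{IntParam, ParamValidators, Params}

// Hypothetical trait showing the declare / default / get / set pattern for IntParam.
trait HasNumBins extends Params {
  // owner, name, doc, and an optional validator (here: the value must be > 0)
  final val numBins: IntParam =
    new IntParam(this, "numBins", "number of bins to use", ParamValidators.gt(0))
  setDefault(numBins -> 10)

  final def getNumBins: Int = $(numBins)
  final def setNumBins(value: Int): this.type = set(numBins, value)
}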
Example 1
Source File: MNISTBenchmark.scala From spark-knn with Apache License 2.0 | 6 votes |
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable

object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val ns = if(args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if(args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if(args.length >= 3) args(2).toInt else 10
    val models = if(args.length >= 4) args(3).split(',') else Array("tree", "naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    // read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset = MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() // force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if(models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if(models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }
    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
}
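The Limiter above is about as small as a Transformer carrying an IntParam gets: one param, one setter, and a transform that reads it with $(n). A sketch of using it outside the benchmark pipeline (assumes an existing DataFrame df):

// df is assumed to be an existing DataFrame.
val limiter = new Limiter().setN(2500)
val limited = limiter.transform(df) // limits rows while preserving df's partition count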
Example 2
Source File: VParams.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import org.apache.spark.ml.param.{BooleanParam, IntParam, ParamValidators, Params}

private trait VParams extends Params {

  // column number of each block in feature block matrix
  val colsPerBlock: IntParam = new IntParam(this, "colsPerBlock",
    "column number of each block in feature block matrix.", ParamValidators.gt(0))
  setDefault(colsPerBlock -> 10000)

  def getColsPerBlock: Int = $(colsPerBlock)

  // row number of each block in feature block matrix
  val rowsPerBlock: IntParam = new IntParam(this, "rowsPerBlock",
    "row number of each block in feature block matrix.", ParamValidators.gt(0))
  setDefault(rowsPerBlock -> 10000)

  def getRowsPerBlock: Int = $(rowsPerBlock)

  // row partition number of feature block matrix
  // equals to partition number of coefficient vector
  val rowPartitions: IntParam = new IntParam(this, "rowPartitions",
    "row partition number of feature block matrix.", ParamValidators.gt(0))
  setDefault(rowPartitions -> 10)

  def getRowPartitions: Int = $(rowPartitions)

  // column partition number of feature block matrix
  val colPartitions: IntParam = new IntParam(this, "colPartitions",
    "column partition number of feature block matrix.", ParamValidators.gt(0))
  setDefault(colPartitions -> 10)

  def getColPartitions: Int = $(colPartitions)

  // Whether to eager persist distributed vector.
  val eagerPersist: BooleanParam = new BooleanParam(this, "eagerPersist",
    "Whether to eager persist distributed vector.")
  setDefault(eagerPersist -> false)

  def getEagerPersist: Boolean = $(eagerPersist)

  // The number of corrections used in the LBFGS update.
  val numCorrections: IntParam = new IntParam(this, "numCorrections",
    "The number of corrections used in the LBFGS update.")
  setDefault(numCorrections -> 10)

  def getNumCorrections: Int = $(numCorrections)

  val generatingFeatureMatrixBuffer: IntParam = new IntParam(this, "generatingFeatureMatrixBuffer",
    "Buffer size when generating features block matrix.")
  setDefault(generatingFeatureMatrixBuffer -> 1000)

  def getGeneratingFeatureMatrixBuffer: Int = $(generatingFeatureMatrixBuffer)

  val rowPartitionSplitNumOnGeneratingFeatureMatrix: IntParam = new IntParam(this,
    "rowPartitionSplitsNumOnGeneratingFeatureMatrix",
    "row partition splits number on generating features matrix.")
  setDefault(rowPartitionSplitNumOnGeneratingFeatureMatrix -> 1)

  def getRowPartitionSplitNumOnGeneratingFeatureMatrix: Int =
    $(rowPartitionSplitNumOnGeneratingFeatureMatrix)

  val compressFeatureMatrix: BooleanParam = new BooleanParam(this, "compressFeatureMatrix",
    "compress feature matrix.")
  setDefault(compressFeatureMatrix -> false)

  def getCompressFeatureMatrix: Boolean = $(compressFeatureMatrix)
}
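VParams only declares the params, their defaults, and getters; set is protected in Params, so a concrete class mixing the trait in has to supply a uid, a copy, and its own setters. A sketch under that assumption (the holder class is hypothetical, and because VParams is package-private it has to live in org.apache.spark.ml):

package org.apache.spark.ml

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable

// Hypothetical class mixing in VParams just to expose a couple of setters.
class VParamsHolder(override val uid: String) extends VParams {
  def this() = this(Identifiable.randomUID("vParamsHolder"))

  def setColsPerBlock(value: Int): this.type = set(colsPerBlock, value)
  def setRowsPerBlock(value: Int): this.type = set(rowsPerBlock, value)

  override def copy(extra: ParamMap): VParamsHolder = defaultCopy(extra)
}

// getColsPerBlock returns the default (10000) until overridden:
// new VParamsHolder().setColsPerBlock(2048).getColsPerBlock == 2048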
Example 3
Source File: WordLengthFilter.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.WordLengthFilterModel
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators, Params}
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

  final def getWordLength: Int = $(wordLength)
}

class WordLengthFilter(override val uid: String) extends Transformer
  with WordLengthFilterParams
  with DefaultParamsWritable {

  val defaultLength = 3
  var model: WordLengthFilterModel = new WordLengthFilterModel(defaultLength) // Initialize with default filter length 3

  def this(model: WordLengthFilterModel) = this(uid = Identifiable.randomUID("filter_words"))
  def this() = this(new WordLengthFilterModel)

  def setInputCol(value: String): this.type = set(inputCol, value)
  def setOutputCol(value: String): this.type = set(outputCol, value)
  def setWordLength(value: Int = defaultLength): this.type = set(wordLength, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    if (defaultLength != getWordLength) model = new WordLengthFilterModel(getWordLength)
    val filterWordsUdf = udf {
      (words: Seq[String]) => model(words)
    }

    dataset.withColumn($(outputCol), filterWordsUdf(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    require(schema($(inputCol)).dataType.isInstanceOf[ArrayType],
      s"Input column must be of type ArrayType(StringType,true) but got ${schema($(inputCol)).dataType}")
    val inputFields = schema.fields
    require(!inputFields.exists(_.name == $(outputCol)),
      s"Output column ${$(outputCol)} already exists.")

    StructType(schema.fields :+ StructField($(outputCol), ArrayType(StringType, true)))
  }
}

object WordLengthFilter extends DefaultParamsReadable[WordLengthFilter] {
  override def load(path: String): WordLengthFilter = super.load(path)
}
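Typical configuration of the filter (a sketch; assumes a DataFrame df with an array-of-strings column named "words"):

val filtered = new WordLengthFilter()
  .setInputCol("words")
  .setOutputCol("filtered_words")
  .setWordLength(3) // transform rebuilds the underlying WordLengthFilterModel when this differs from the default
  .transform(df)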
Example 4
Source File: GaussianProcessParams.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons

import org.apache.spark.ml.PredictorParams
import org.apache.spark.ml.commons.kernel.{Kernel, RBFKernel}
import org.apache.spark.ml.param.shared.{HasAggregationDepth, HasMaxIter, HasSeed, HasTol}
import org.apache.spark.ml.param.{DoubleParam, IntParam, Param}

private[ml] trait GaussianProcessParams extends PredictorParams
  with HasMaxIter with HasTol with HasAggregationDepth with HasSeed {

  final val activeSetProvider = new Param[ActiveSetProvider](this, "activeSetProvider",
    "the class which provides the active set used by Projected Process Approximation")

  final val kernel = new Param[() => Kernel](this, "kernel",
    "function of no arguments which returns " +
      "the kernel of the prior Gaussian Process")

  final val datasetSizeForExpert = new IntParam(this, "datasetSizeForExpert",
    "The number of data points fed to each expert. " +
      "Time and space complexity of training quadratically grows with it.")

  final val sigma2 = new DoubleParam(this, "sigma2",
    "The variance of noise in the inputs. The value is added to the diagonal of the " +
      "kernel Matrix. Also prevents numerical issues associated with inversion " +
      "of a computationally singular matrix ")

  final val activeSetSize = new IntParam(this, "activeSetSize",
    "Number of latent functions to project the process onto. " +
      "The size of the produced model and prediction complexity " +
      "linearly depend on this value.")

  def setActiveSetProvider(value: ActiveSetProvider): this.type = set(activeSetProvider, value)
  setDefault(activeSetProvider -> RandomActiveSetProvider)

  def setDatasetSizeForExpert(value: Int): this.type = set(datasetSizeForExpert, value)
  setDefault(datasetSizeForExpert -> 100)

  def setMaxIter(value: Int): this.type = set(maxIter, value)
  setDefault(maxIter -> 100)

  def setSigma2(value: Double): this.type = set(sigma2, value)
  setDefault(sigma2 -> 1e-3)

  def setKernel(value: () => Kernel): this.type = set(kernel, value)
  setDefault(kernel -> (() => new RBFKernel()))

  def setTol(value: Double): this.type = set(tol, value)
  setDefault(tol -> 1E-6)

  def setActiveSetSize(value: Int): this.type = set(activeSetSize, value)
  setDefault(activeSetSize -> 100)

  def setSeed(value: Long): this.type = set(seed, value)
}
Example 5
Source File: MovingAverage.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types._

  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(windowSize -> 3)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataSet.schema)
    val sparkContext = dataSet.sqlContext.sparkContext
    val inputType = outputSchema($(inputCol)).dataType
    val inputTypeBr = sparkContext.broadcast(inputType)
    val dataSetRdd = dataSet.rdd
    val inputColName = sparkContext.broadcast($(inputCol))
    val inputColIndex = dataSet.columns.indexOf($(inputCol))
    val inputColIndexBr = sparkContext.broadcast(inputColIndex)
    val windowSizeBr = sparkContext.broadcast($(windowSize))
    val maRdd = dataSetRdd.map { case (row: Row) =>
      val (array, rawValue) = if (inputTypeBr.value.isInstanceOf[VectorUDT]) {
        val vector = row.getAs[org.apache.spark.ml.linalg.Vector](inputColName.value)
        (vector.toArray, Vectors.dense(vector.toArray.drop(windowSizeBr.value - 1)))
      } else {
        val iterable = row.getAs[Iterable[Double]](inputColName.value)
        (iterable.toArray, Vectors.dense(iterable.toArray.drop(windowSizeBr.value - 1)))
      }
      val (before, after) = row.toSeq.splitAt(inputColIndexBr.value)
      Row(
        (before :+ rawValue) ++ after.tail :+ MovingAverageCalc
          .simpleMovingAverageArray(array, windowSizeBr.value): _*
      )
    }
    dataSet.sqlContext.createDataFrame(maRdd, outputSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema.add(StructField($(outputCol), ArrayType(DoubleType)))
  }

  override def copy(extra: ParamMap): MovingAverage[T] = defaultCopy(extra)
}

object MovingAverageCalc {
  private[ml] def simpleMovingAverageArray(values: Array[Double], period: Int): Array[Double] = {
    (for (i <- 1 to values.length)
      yield
      // TODO rollback this comment with the right size of features to make the meanaverage return
      // the features values for the first values of the calc
      if (i < period) 0d // values(i)
      else values.slice(i - period, i).sum / period).toArray.dropWhile(_ == 0d)
  }
}

object MovingAverage extends DefaultParamsReadable[MovingAverage[_]] {
  override def load(path: String): MovingAverage[_] = super.load(path)
}
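The core of the example is simpleMovingAverageArray, shown in full above: for a window of size period it emits the mean of each trailing window and then drops the leading zero padding. A quick worked check (note the method is private[ml], so the call assumes code living in the org.apache.spark.ml package):

// Moving average with period 3 over six points:
MovingAverageCalc.simpleMovingAverageArray(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0), 3)
// == Array(2.0, 3.0, 4.0, 5.0), the means of (1,2,3), (2,3,4), (3,4,5), (4,5,6)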
Example 6
Source File: Cleaner.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions.clean

import com.hankcs.hanlp.HanLP
import config.paramconf.{HasOutputCol, HasInputCol}
import functions.MySchemaUtils
import functions.clean.chinese.BCConvert
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset}

  setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1)

  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema, logging = true)

    val cleanFunc = udf { line: String =>
      var cleaned = ""
      getFanJian match {
        case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line)
        case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line)
        case _ => cleaned = line
      }

      getQuanBan match {
        case "q2b" => cleaned = BCConvert.qj2bj(cleaned)
        case "b2q" => cleaned = BCConvert.bj2qj(cleaned)
        case _ => cleaned = cleaned
      }

      cleaned
    }

    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter { record =>
      val outputIndex = record.fieldIndex($(outputCol))
      record.getString(outputIndex).length >= getMinLineLen
    }
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.typeName.equals(StringType.typeName),
      s"Input type must be StringType but got $inputType.")
    MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable)
  }
}

object Cleaner extends DefaultParamsReadable[Cleaner] {
  override def load(path: String): Cleaner = super.load(path)
}
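With the defaults set above (fanjan = "f2j", quanban = "q2b", minLineLen = 1), the transformer converts traditional Chinese to simplified, normalizes full-width characters to half-width, and drops rows whose cleaned text is shorter than minLineLen. A sketch of using it (the input/output-column setters are assumed to exist on the class header this snippet truncates):

val cleaner = new Cleaner()
  .setInputCol("text")     // assumed setter backed by the HasInputCol import above
  .setOutputCol("cleaned") // assumed setter backed by the HasOutputCol import above
val cleanedDF = cleaner.transform(rawDF) // rawDF must have a StringType "text" column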
Example 7
Source File: HasEmbeddingsProperties.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.AnnotatorType
import org.apache.spark.ml.param.{BooleanParam, IntParam, Params}
import org.apache.spark.sql.Column
import org.apache.spark.sql.types.MetadataBuilder

trait HasEmbeddingsProperties extends Params {

  val dimension = new IntParam(this, "dimension", "Number of embedding dimensions")

  def setDimension(value: Int): this.type = set(this.dimension, value)
  def getDimension: Int = $(dimension)

  protected def wrapEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.WORD_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }

  protected def wrapSentenceEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.SENTENCE_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }
}
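Both wrap helpers are protected, so they are meant to be called by whatever annotator mixes the trait in; they simply re-alias a column with annotatorType and dimension metadata attached. A sketch of such an owner (the class below is hypothetical):

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.Column

// Hypothetical stage mixing in HasEmbeddingsProperties.
class MyEmbeddingsStage(override val uid: String) extends HasEmbeddingsProperties {
  def this() = this(Identifiable.randomUID("myEmbeddingsStage"))
  override def copy(extra: ParamMap): MyEmbeddingsStage = defaultCopy(extra)

  // Attach word-embeddings metadata using the configured dimension.
  def tagEmbeddings(col: Column): Column = wrapEmbeddingsMetadata(col, $(dimension))
}

// new MyEmbeddingsStage().setDimension(300).getDimension == 300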
Example 8
Source File: S2CellTransformer.scala From spark-ext with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import com.google.common.geometry.{S2LatLng, S2CellId}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, StructType}

class S2CellTransformer(override val uid: String) extends Transformer {

  def this() = this(Identifiable.randomUID("S2CellTransformer"))

  // Input/Output column names
  val latCol: Param[String] = new Param[String](this, "latCol", "latitude column")
  val lonCol: Param[String] = new Param[String](this, "lonCol", "longitude column")
  val cellCol: Param[String] = new Param[String](this, "cellCol", "S2 Cell Id column")
  val level: Param[Int] = new IntParam(this, "level", "S2 Level [0, 30]",
    (i: Int) => ParamValidators.gtEq(0)(i) && ParamValidators.ltEq(30)(i))

  // Default parameters
  setDefault(
    latCol  -> "lat",
    lonCol  -> "lon",
    cellCol -> "cell",
    level   -> 10
  )

  def getLatCol: String = $(latCol)
  def getLonCol: String = $(lonCol)
  def getCellCol: String = $(cellCol)
  def getLevel: Int = $(level)

  def setLatCol(value: String): this.type = set(latCol, value)
  def setLonCol(value: String): this.type = set(lonCol, value)
  def setCellCol(value: String): this.type = set(cellCol, value)
  def setLevel(value: Int): this.type = set(level, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val currentLevel = $(level)
    val t = udf { (lat: Double, lon: Double) =>
      val cellId = S2CellId.fromLatLng(S2LatLng.fromDegrees(lat, lon))
      cellId.parent(currentLevel).toToken
    }
    val metadata = outputSchema($(cellCol)).metadata
    dataset.select(col("*"), t(col($(latCol)), col($(lonCol))).as($(cellCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val latColumnName = $(latCol)
    val latDataType = schema(latColumnName).dataType
    require(latDataType == DoubleType,
      s"The latitude column $latColumnName must be Double type, " +
        s"but got $latDataType.")

    val lonColumnName = $(lonCol)
    val lonDataType = schema(lonColumnName).dataType
    require(lonDataType == DoubleType,
      s"The longitude column $lonColumnName must be Double type, " +
        s"but got $lonDataType.")

    val inputFields = schema.fields
    val outputColName = $(cellCol)
    require(inputFields.forall(_.name != outputColName),
      s"Output column $outputColName already exists.")

    val attr = NominalAttribute.defaultAttr.withName($(cellCol))
    val outputFields = inputFields :+ attr.toStructField()
    StructType(outputFields)
  }

  override def copy(extra: ParamMap): S2CellTransformer = defaultCopy(extra)
}
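Since every setter is shown above, usage is straightforward (a sketch; geoDF is assumed to be a DataFrame with Double columns matching the lat/lon defaults):

val s2 = new S2CellTransformer().setLevel(12) // finer than the default level 10
val withCells = s2.transform(geoDF)           // appends a nominal "cell" column of S2 cell tokens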
Example 9
Source File: HashingTF.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
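Although the snippet omits the class header, this is the standard spark.ml HashingTF; a typical configuration looks like the following (a sketch, assuming a DataFrame docs with an array-of-strings column "words"):

val tf = new HashingTF()
  .setInputCol("words")
  .setOutputCol("features")
  .setNumFeatures(1 << 18) // numFeatures is the IntParam this example exercises
val featurized = tf.transform(docs)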
Example 10
Source File: PercentileCalibrator.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.{UnaryEstimator, UnaryModel}
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.ml.feature.QuantileDiscretizer
import org.apache.spark.ml.param.IntParam
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.MetadataBuilder

import scala.collection.Searching._

class PercentileCalibrator(uid: String = UID[PercentileCalibrator])
  extends UnaryEstimator[RealNN, RealNN](operationName = "percentCalibrator", uid = uid) {

  final val expectedNumBuckets = new IntParam(
    this, "expectedNumBuckets", "number of buckets to divide input data into"
  )
  setDefault(expectedNumBuckets, 100)

  def setExpectedNumBuckets(buckets: Int): this.type = set(expectedNumBuckets, buckets)

  def fitFn(dataset: Dataset[Option[Double]]): UnaryModel[RealNN, RealNN] = {

    val estimator: QuantileDiscretizer = new QuantileDiscretizer()
      .setNumBuckets($(expectedNumBuckets))
      .setRelativeError(0)
      .setInputCol(dataset.columns(0))
      .setOutputCol(dataset.columns(0) + "-out")

    val bucketizerModel = estimator.fit(dataset)

    val model = new PercentileCalibratorModel(
      splits = bucketizerModel.getSplits,
      actualNumBuckets = bucketizerModel.getSplits.length,
      expectedNumBuckets = $(expectedNumBuckets),
      operationName = operationName,
      uid = uid
    )

    val scaledBuckets = bucketizerModel.getSplits.map(v => model.transformFn(v.toRealNN).v.get)

    val meta = new MetadataBuilder()
      .putStringArray(PercentileCalibrator.OrigSplitsKey, bucketizerModel.getSplits.map(_.toString))
      .putStringArray(PercentileCalibrator.ScaledSplitsKey, scaledBuckets.map(_.toString)).build()
    setMetadata(meta.toSummaryMetadata())

    model
  }
}

final class PercentileCalibratorModel private[op]
(
  val splits: Array[Double],
  val actualNumBuckets: Int,
  val expectedNumBuckets: Int,
  operationName: String,
  uid: String
) extends UnaryModel[RealNN, RealNN](operationName = operationName, uid = uid) {

  def transformFn: RealNN => RealNN = (inScalar: RealNN) => {
    val calibrated = splits.search(inScalar.v.get) match {
      case Found(idx) => idx
      case InsertionPoint(idx) => idx
    }
    scale(actualNumBuckets, expectedNumBuckets, calibrated).toRealNN
  }

  private def scale(actualNumBuckets: Int, expectedBuckets: Int, calibrated: Int): Long = {
    if (actualNumBuckets >= expectedBuckets) {
      calibrated - 1 // make it start at zero
    } else {
      val (oldMin, newMin) = (0, 0)
      val (oldMax, newMax) = (Math.max(actualNumBuckets - 2, 0), Math.max(expectedBuckets - 1, 0))
      val oldRange = oldMax - oldMin
      oldRange match {
        case 0 => newMin
        case _ =>
          val newRange = (newMax - newMin).toDouble
          val newValue = (((calibrated - oldMin) * newRange) / oldRange) + newMin
          Math.min(newValue.round, newMax)
      }
    }
  }
}

case object PercentileCalibrator {
  val OrigSplitsKey: String = "origSplits"
  val ScaledSplitsKey: String = "scaledSplits"
}
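The scale helper linearly maps a bucket index from the range the QuantileDiscretizer actually produced onto the expected 0-based range. For example, with actualNumBuckets = 6 and expectedBuckets = 100, a calibrated index of 2 gives oldMax = 4, newMax = 99, and round((2 * 99.0) / 4) = 50, so splits from a sparse discretization still spread across the full 0-99 percentile scale.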
Example 11
Source File: HashingTF.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
Example 12
Source File: NGramExtractor.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamPair, ParamValidators, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}

  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(new ParamPair[Int](upperN, 2), new ParamPair[Int](lowerN, 1))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val lowerBound = $(lowerN)
    val upperBound = $(upperN)
    val nGramUDF = udf[Seq[String], Seq[String]](NGramUtils.nGramFun(_, lowerBound, upperBound))
    dataset.withColumn($(outputCol), nGramUDF(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), new ArrayType(StringType, true))
    } else {
      schema
    }
  }
}

object NGramExtractor extends DefaultParamsReadable[NGramExtractor] {
  override def load(path: String): NGramExtractor = super.load(path)
}
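A sketch of using the extractor (the input-column setter is assumed to mirror the setOutputCol shown above, since the class header is truncated in this snippet):

val ngrams = new NGramExtractor()
  .setInputCol("tokens")   // assumed setter, symmetric with setOutputCol above
  .setOutputCol("ngrams")
// With the defaults lowerN = 1 and upperN = 2, each token sequence is expanded into
// its unigrams plus bigrams (the exact join format is up to NGramUtils.nGramFun).
val result = ngrams.transform(tokenizedDF)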
Example 13
Source File: HasParallelism.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.param.shared

import scala.concurrent.ExecutionContext

import org.apache.spark.ml.param.{IntParam, Params, ParamValidators}
import org.apache.spark.util.ThreadUtils

  private[ml] def getExecutionContext: ExecutionContext = {
    getParallelism match {
      case 1 =>
        ThreadUtils.sameThread
      case n =>
        ExecutionContext.fromExecutorService(ThreadUtils
          .newDaemonCachedThreadPool(s"${this.getClass.getSimpleName}-thread-pool", n))
    }
  }
}
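getExecutionContext is what Spark's meta-algorithms use to fan work out: with parallelism 1 it degrades to the calling thread, otherwise it backs an ExecutionContext with a cached daemon thread pool. A sketch of how a caller mixing in HasParallelism might use it (the helper below is illustrative, not part of the trait):

import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.Duration

// `ec` would be the context returned by getExecutionContext above.
def fitInParallel[M](tasks: Seq[() => M], ec: ExecutionContext): Seq[M] = {
  val futures = tasks.map(task => Future { task() }(ec))
  futures.map(f => Await.result(f, Duration.Inf))
}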
Example 14
Source File: SageMakerAlgorithmParams.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.algorithms

import org.apache.spark.ml.param.{IntParam, Param, Params, ParamValidators}

  val featureDim: IntParam = new IntParam(this, "feature_dim",
    "The dimension of the input vectors. Must be > 0.", ParamValidators.gtEq(1))
  def getFeatureDim: Int = $(featureDim)

  protected def autoOrAboveParamValidator(lowerBound: Double,
                                          inclusive: Boolean): String => Boolean = {
    (value: String) =>
      try {
        value == "auto" || {
          if (inclusive) {
            value.toDouble >= lowerBound
          } else {
            value.toDouble > lowerBound
          }
        }
      } catch {
        case e: NumberFormatException => false
      }
  }

  protected def inArrayOrAboveParamValidator(validValues: Array[String],
                                             lowerBound: Double): String => Boolean = {
    (value: String) =>
      try {
        validValues.contains(value) || value.toDouble > lowerBound
      } catch {
        case e: NumberFormatException => false
      }
  }

  protected def parseTrueAndFalse(param: Param[String]): Boolean = {
    $(param) match {
      case "True" => true
      case "False" => false
      case _ => throw new IllegalArgumentException("Param is neither 'True' nor 'False'")
    }
  }
}
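The two validator factories return plain String => Boolean functions, so their behavior is easy to check in isolation (a sketch; the calls assume code inside a class mixing in this trait, since the methods are protected):

// Accepts "auto" or any number strictly greater than 0:
val validator = autoOrAboveParamValidator(0, inclusive = false)
// validator("auto") == true, validator("2.5") == true, validator("0") == false, validator("abc") == false

// Accepts a whitelisted string or any number above the bound:
val miniBatchValidator = inArrayOrAboveParamValidator(Array("auto"), 0)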
Example 15
Source File: HashingTF.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
}
Example 16
Source File: RankingMetricFormatter.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._
import ws.vinta.albedo.evaluators.RankingEvaluator._

class RankingMetricFormatter(override val uid: String, val sourceType: String)
  extends Transformer with DefaultParamsWritable {

  def this(sourceType: String) = {
    this(Identifiable.randomUID("rankingMetricFormatter"), sourceType)
  }

  val userCol = new Param[String](this, "userCol", "User column name")
  def getUserCol: String = $(userCol)
  def setUserCol(value: String): this.type = set(userCol, value)
  setDefault(userCol -> "user")

  val itemCol = new Param[String](this, "itemCol", "Item column name")
  def getItemCol: String = $(itemCol)
  def setItemCol(value: String): this.type = set(itemCol, value)
  setDefault(itemCol -> "item")

  val predictionCol = new Param[String](this, "predictionCol", "Prediction column name")
  def getPredictionCol: String = $(predictionCol)
  def setPredictionCol(value: String): this.type = set(predictionCol, value)
  setDefault(predictionCol -> "prediction")

  val topK = new IntParam(this, "topK", "Recommend top-k items for every user")
  def getTopK: Int = $(topK)
  def setTopK(value: Int): this.type = set(topK, value)
  setDefault(topK -> 15)

  override def transformSchema(schema: StructType): StructType = {
    Map($(userCol) -> IntegerType, $(itemCol) -> IntegerType)
      .foreach {
        case (columnName: String, expectedDataType: DataType) => {
          val actualDataType = schema(columnName).dataType
          require(actualDataType.equals(expectedDataType),
            s"Column $columnName must be of type $expectedDataType but was actually $actualDataType.")
        }
      }

    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    sourceType match {
      case "als" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), col($(predictionCol)).desc, $(topK)))
      case "lr" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), toArrayUDF(col($(predictionCol))).getItem(1).desc, $(topK)))
    }
  }

  override def copy(extra: ParamMap): RankingMetricFormatter = {
    val copied = new RankingMetricFormatter(uid, sourceType)
    copyValues(copied, extra)
  }
}

object RankingMetricFormatter extends DefaultParamsReadable[RankingMetricFormatter]
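A sketch of typical use (assumes a DataFrame of ALS predictions whose user and item columns are integers, as transformSchema requires; the column names are illustrative):

val formatter = new RankingMetricFormatter("als")
  .setUserCol("user_id")
  .setItemCol("repo_id")
  .setTopK(30)
val userPredictedItems = formatter.transform(alsPredictionDF)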