org.apache.spark.ml.param.IntParam Scala Examples
The following examples show how to use org.apache.spark.ml.param.IntParam.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
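Every example below follows the same small pattern: declare an IntParam on a Params owner, optionally attach a validator and a default, and read the value back with $(...). A minimal sketch of that pattern in isolation (the trait and param names here are illustrative, not taken from any of the projects below):

import org.apache.spark.ml.param.{IntParam, ParamValidators, Params}

// Hypothetical trait showing the declare / default / get / set pattern for IntParam.
trait HasNumBins extends Params {
  // owner, name, doc, and an optional validator (here: the value must be > 0)
  final val numBins: IntParam =
    new IntParam(this, "numBins", "number of bins to use", ParamValidators.gt(0))
  setDefault(numBins -> 10)

  final def getNumBins: Int = $(numBins)
  final def setNumBins(value: Int): this.type = set(numBins, value)
}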
Example 1
Source File: MNISTBenchmark.scala From spark-knn with Apache License 2.0 | 6 votes |
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable

object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val ns = if(args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if(args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if(args.length >= 3) args(2).toInt else 10
    val models = if(args.length >= 4) args(3).split(',') else Array("tree", "naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    // read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset = MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() // force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if(models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if(models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }
    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
}
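The Limiter above is about as small as a Transformer carrying an IntParam gets: one param, one setter, and a transform that reads it with $(n). A sketch of using it outside the benchmark pipeline (assumes an existing DataFrame df):

// df is assumed to be an existing DataFrame.
val limiter = new Limiter().setN(2500)
val limited = limiter.transform(df) // limits rows while preserving df's partition count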
Example 2
Source File: VParams.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import org.apache.spark.ml.param.{BooleanParam, IntParam, ParamValidators, Params}

private trait VParams extends Params {

  // column number of each block in feature block matrix
  val colsPerBlock: IntParam = new IntParam(this, "colsPerBlock",
    "column number of each block in feature block matrix.", ParamValidators.gt(0))
  setDefault(colsPerBlock -> 10000)

  def getColsPerBlock: Int = $(colsPerBlock)

  // row number of each block in feature block matrix
  val rowsPerBlock: IntParam = new IntParam(this, "rowsPerBlock",
    "row number of each block in feature block matrix.", ParamValidators.gt(0))
  setDefault(rowsPerBlock -> 10000)

  def getRowsPerBlock: Int = $(rowsPerBlock)

  // row partition number of feature block matrix
  // equals to partition number of coefficient vector
  val rowPartitions: IntParam = new IntParam(this, "rowPartitions",
    "row partition number of feature block matrix.", ParamValidators.gt(0))
  setDefault(rowPartitions -> 10)

  def getRowPartitions: Int = $(rowPartitions)

  // column partition number of feature block matrix
  val colPartitions: IntParam = new IntParam(this, "colPartitions",
    "column partition number of feature block matrix.", ParamValidators.gt(0))
  setDefault(colPartitions -> 10)

  def getColPartitions: Int = $(colPartitions)

  // Whether to eager persist distributed vector.
  val eagerPersist: BooleanParam = new BooleanParam(this, "eagerPersist",
    "Whether to eager persist distributed vector.")
  setDefault(eagerPersist -> false)

  def getEagerPersist: Boolean = $(eagerPersist)

  // The number of corrections used in the LBFGS update.
  val numCorrections: IntParam = new IntParam(this, "numCorrections",
    "The number of corrections used in the LBFGS update.")
  setDefault(numCorrections -> 10)

  def getNumCorrections: Int = $(numCorrections)

  val generatingFeatureMatrixBuffer: IntParam = new IntParam(this, "generatingFeatureMatrixBuffer",
    "Buffer size when generating features block matrix.")
  setDefault(generatingFeatureMatrixBuffer -> 1000)

  def getGeneratingFeatureMatrixBuffer: Int = $(generatingFeatureMatrixBuffer)

  val rowPartitionSplitNumOnGeneratingFeatureMatrix: IntParam = new IntParam(this,
    "rowPartitionSplitsNumOnGeneratingFeatureMatrix",
    "row partition splits number on generating features matrix.")
  setDefault(rowPartitionSplitNumOnGeneratingFeatureMatrix -> 1)

  def getRowPartitionSplitNumOnGeneratingFeatureMatrix: Int =
    $(rowPartitionSplitNumOnGeneratingFeatureMatrix)

  val compressFeatureMatrix: BooleanParam = new BooleanParam(this, "compressFeatureMatrix",
    "compress feature matrix.")
  setDefault(compressFeatureMatrix -> false)

  def getCompressFeatureMatrix: Boolean = $(compressFeatureMatrix)
}
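VParams only declares the params, their defaults, and getters; set is protected in Params, so a concrete class mixing the trait in has to supply a uid, a copy, and its own setters. A sketch under that assumption (the holder class is hypothetical, and because VParams is package-private it has to live in org.apache.spark.ml):

package org.apache.spark.ml

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable

// Hypothetical class mixing in VParams just to expose a couple of setters.
class VParamsHolder(override val uid: String) extends VParams {
  def this() = this(Identifiable.randomUID("vParamsHolder"))

  def setColsPerBlock(value: Int): this.type = set(colsPerBlock, value)
  def setRowsPerBlock(value: Int): this.type = set(rowsPerBlock, value)

  override def copy(extra: ParamMap): VParamsHolder = defaultCopy(extra)
}

// getColsPerBlock returns the default (10000) until overridden:
// new VParamsHolder().setColsPerBlock(2048).getColsPerBlock == 2048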
Example 3
Source File: WordLengthFilter.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.WordLengthFilterModel
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators, Params}
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

  final def getWordLength: Int = $(wordLength)
}

class WordLengthFilter(override val uid: String) extends Transformer
  with WordLengthFilterParams
  with DefaultParamsWritable {

  val defaultLength = 3
  var model: WordLengthFilterModel = new WordLengthFilterModel(defaultLength) // Initialize with default filter length 3

  def this(model: WordLengthFilterModel) = this(uid = Identifiable.randomUID("filter_words"))
  def this() = this(new WordLengthFilterModel)

  def setInputCol(value: String): this.type = set(inputCol, value)
  def setOutputCol(value: String): this.type = set(outputCol, value)
  def setWordLength(value: Int = defaultLength): this.type = set(wordLength, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    if (defaultLength != getWordLength) model = new WordLengthFilterModel(getWordLength)
    val filterWordsUdf = udf {
      (words: Seq[String]) => model(words)
    }

    dataset.withColumn($(outputCol), filterWordsUdf(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    require(schema($(inputCol)).dataType.isInstanceOf[ArrayType],
      s"Input column must be of type ArrayType(StringType,true) but got ${schema($(inputCol)).dataType}")
    val inputFields = schema.fields
    require(!inputFields.exists(_.name == $(outputCol)),
      s"Output column ${$(outputCol)} already exists.")

    StructType(schema.fields :+ StructField($(outputCol), ArrayType(StringType, true)))
  }
}

object WordLengthFilter extends DefaultParamsReadable[WordLengthFilter] {
  override def load(path: String): WordLengthFilter = super.load(path)
}
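Typical configuration of the filter (a sketch; assumes a DataFrame df with an array-of-strings column named "words"):

val filtered = new WordLengthFilter()
  .setInputCol("words")
  .setOutputCol("filtered_words")
  .setWordLength(3) // transform rebuilds the underlying WordLengthFilterModel when this differs from the default
  .transform(df)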
Example 4
Source File: GaussianProcessParams.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons

import org.apache.spark.ml.PredictorParams
import org.apache.spark.ml.commons.kernel.{Kernel, RBFKernel}
import org.apache.spark.ml.param.shared.{HasAggregationDepth, HasMaxIter, HasSeed, HasTol}
import org.apache.spark.ml.param.{DoubleParam, IntParam, Param}

private[ml] trait GaussianProcessParams extends PredictorParams
  with HasMaxIter with HasTol with HasAggregationDepth with HasSeed {

  final val activeSetProvider = new Param[ActiveSetProvider](this, "activeSetProvider",
    "the class which provides the active set used by Projected Process Approximation")

  final val kernel = new Param[() => Kernel](this, "kernel",
    "function of no arguments which returns " +
      "the kernel of the prior Gaussian Process")

  final val datasetSizeForExpert = new IntParam(this, "datasetSizeForExpert",
    "The number of data points fed to each expert. " +
      "Time and space complexity of training quadratically grows with it.")

  final val sigma2 = new DoubleParam(this, "sigma2",
    "The variance of noise in the inputs. The value is added to the diagonal of the " +
      "kernel Matrix. Also prevents numerical issues associated with inversion " +
      "of a computationally singular matrix ")

  final val activeSetSize = new IntParam(this, "activeSetSize",
    "Number of latent functions to project the process onto. " +
      "The size of the produced model and prediction complexity " +
      "linearly depend on this value.")

  def setActiveSetProvider(value: ActiveSetProvider): this.type = set(activeSetProvider, value)
  setDefault(activeSetProvider -> RandomActiveSetProvider)

  def setDatasetSizeForExpert(value: Int): this.type = set(datasetSizeForExpert, value)
  setDefault(datasetSizeForExpert -> 100)

  def setMaxIter(value: Int): this.type = set(maxIter, value)
  setDefault(maxIter -> 100)

  def setSigma2(value: Double): this.type = set(sigma2, value)
  setDefault(sigma2 -> 1e-3)

  def setKernel(value: () => Kernel): this.type = set(kernel, value)
  setDefault(kernel -> (() => new RBFKernel()))

  def setTol(value: Double): this.type = set(tol, value)
  setDefault(tol -> 1E-6)

  def setActiveSetSize(value: Int): this.type = set(activeSetSize, value)
  setDefault(activeSetSize -> 100)

  def setSeed(value: Long): this.type = set(seed, value)
}
Example 5
Source File: MovingAverage.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types._

  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(windowSize -> 3)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataSet.schema)
    val sparkContext = dataSet.sqlContext.sparkContext
    val inputType = outputSchema($(inputCol)).dataType
    val inputTypeBr = sparkContext.broadcast(inputType)
    val dataSetRdd = dataSet.rdd
    val inputColName = sparkContext.broadcast($(inputCol))
    val inputColIndex = dataSet.columns.indexOf($(inputCol))
    val inputColIndexBr = sparkContext.broadcast(inputColIndex)
    val windowSizeBr = sparkContext.broadcast($(windowSize))
    val maRdd = dataSetRdd.map { case (row: Row) =>
      val (array, rawValue) = if (inputTypeBr.value.isInstanceOf[VectorUDT]) {
        val vector = row.getAs[org.apache.spark.ml.linalg.Vector](inputColName.value)
        (vector.toArray, Vectors.dense(vector.toArray.drop(windowSizeBr.value - 1)))
      } else {
        val iterable = row.getAs[Iterable[Double]](inputColName.value)
        (iterable.toArray, Vectors.dense(iterable.toArray.drop(windowSizeBr.value - 1)))
      }
      val (before, after) = row.toSeq.splitAt(inputColIndexBr.value)
      Row(
        (before :+ rawValue) ++ after.tail :+ MovingAverageCalc
          .simpleMovingAverageArray(array, windowSizeBr.value): _*
      )
    }
    dataSet.sqlContext.createDataFrame(maRdd, outputSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema.add(StructField($(outputCol), ArrayType(DoubleType)))
  }

  override def copy(extra: ParamMap): MovingAverage[T] = defaultCopy(extra)
}

object MovingAverageCalc {
  private[ml] def simpleMovingAverageArray(values: Array[Double], period: Int): Array[Double] = {
    (for (i <- 1 to values.length)
      yield
      // TODO rollback this comment with the right size of features to make the meanaverage return
      // the features values for the first values of the calc
      if (i < period) 0d // values(i)
      else values.slice(i - period, i).sum / period).toArray.dropWhile(_ == 0d)
  }
}

object MovingAverage extends DefaultParamsReadable[MovingAverage[_]] {
  override def load(path: String): MovingAverage[_] = super.load(path)
}
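The core of the example is simpleMovingAverageArray, shown in full above: for a window of size period it emits the mean of each trailing window and then drops the leading zero padding. A quick worked check (note the method is private[ml], so the call assumes code living in the org.apache.spark.ml package):

// Moving average with period 3 over six points:
MovingAverageCalc.simpleMovingAverageArray(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0), 3)
// == Array(2.0, 3.0, 4.0, 5.0), the means of (1,2,3), (2,3,4), (3,4,5), (4,5,6)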
Example 6
Source File: Cleaner.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions.clean

import com.hankcs.hanlp.HanLP
import config.paramconf.{HasOutputCol, HasInputCol}
import functions.MySchemaUtils
import functions.clean.chinese.BCConvert
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset}

  setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1)

  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema, logging = true)

    val cleanFunc = udf { line: String =>
      var cleaned = ""
      getFanJian match {
        case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line)
        case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line)
        case _ => cleaned = line
      }

      getQuanBan match {
        case "q2b" => cleaned = BCConvert.qj2bj(cleaned)
        case "b2q" => cleaned = BCConvert.bj2qj(cleaned)
        case _ => cleaned = cleaned
      }

      cleaned
    }

    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter { record =>
      val outputIndex = record.fieldIndex($(outputCol))
      record.getString(outputIndex).length >= getMinLineLen
    }
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.typeName.equals(StringType.typeName),
      s"Input type must be StringType but got $inputType.")
    MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable)
  }
}

object Cleaner extends DefaultParamsReadable[Cleaner] {
  override def load(path: String): Cleaner = super.load(path)
}
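With the defaults set above (fanjan = "f2j", quanban = "q2b", minLineLen = 1), the transformer converts traditional Chinese to simplified, normalizes full-width characters to half-width, and drops rows whose cleaned text is shorter than minLineLen. A sketch of using it (the input/output-column setters are assumed to exist on the class header this snippet truncates):

val cleaner = new Cleaner()
  .setInputCol("text")     // assumed setter backed by the HasInputCol import above
  .setOutputCol("cleaned") // assumed setter backed by the HasOutputCol import above
val cleanedDF = cleaner.transform(rawDF) // rawDF must have a StringType "text" column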
Example 7
Source File: HasEmbeddingsProperties.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.AnnotatorType
import org.apache.spark.ml.param.{BooleanParam, IntParam, Params}
import org.apache.spark.sql.Column
import org.apache.spark.sql.types.MetadataBuilder

trait HasEmbeddingsProperties extends Params {

  val dimension = new IntParam(this, "dimension", "Number of embedding dimensions")

  def setDimension(value: Int): this.type = set(this.dimension, value)
  def getDimension: Int = $(dimension)

  protected def wrapEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.WORD_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }

  protected def wrapSentenceEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.SENTENCE_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }
}
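Both wrap helpers are protected, so they are meant to be called by whatever annotator mixes the trait in; they simply re-alias a column with annotatorType and dimension metadata attached. A sketch of such an owner (the class below is hypothetical):

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.Column

// Hypothetical stage mixing in HasEmbeddingsProperties.
class MyEmbeddingsStage(override val uid: String) extends HasEmbeddingsProperties {
  def this() = this(Identifiable.randomUID("myEmbeddingsStage"))
  override def copy(extra: ParamMap): MyEmbeddingsStage = defaultCopy(extra)

  // Attach word-embeddings metadata using the configured dimension.
  def tagEmbeddings(col: Column): Column = wrapEmbeddingsMetadata(col, $(dimension))
}

// new MyEmbeddingsStage().setDimension(300).getDimension == 300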
Example 8
Source File: S2CellTransformer.scala From spark-ext with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import com.google.common.geometry.{S2LatLng, S2CellId}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, StructType}

class S2CellTransformer(override val uid: String) extends Transformer {

  def this() = this(Identifiable.randomUID("S2CellTransformer"))

  // Input/Output column names
  val latCol: Param[String] = new Param[String](this, "latCol", "latitude column")
  val lonCol: Param[String] = new Param[String](this, "lonCol", "longitude column")
  val cellCol: Param[String] = new Param[String](this, "cellCol", "S2 Cell Id column")
  val level: Param[Int] = new IntParam(this, "level", "S2 Level [0, 30]",
    (i: Int) => ParamValidators.gtEq(0)(i) && ParamValidators.ltEq(30)(i))

  // Default parameters
  setDefault(
    latCol  -> "lat",
    lonCol  -> "lon",
    cellCol -> "cell",
    level   -> 10
  )

  def getLatCol: String = $(latCol)
  def getLonCol: String = $(lonCol)
  def getCellCol: String = $(cellCol)
  def getLevel: Int = $(level)

  def setLatCol(value: String): this.type = set(latCol, value)
  def setLonCol(value: String): this.type = set(lonCol, value)
  def setCellCol(value: String): this.type = set(cellCol, value)
  def setLevel(value: Int): this.type = set(level, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val currentLevel = $(level)
    val t = udf { (lat: Double, lon: Double) =>
      val cellId = S2CellId.fromLatLng(S2LatLng.fromDegrees(lat, lon))
      cellId.parent(currentLevel).toToken
    }
    val metadata = outputSchema($(cellCol)).metadata
    dataset.select(col("*"), t(col($(latCol)), col($(lonCol))).as($(cellCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val latColumnName = $(latCol)
    val latDataType = schema(latColumnName).dataType
    require(latDataType == DoubleType,
      s"The latitude column $latColumnName must be Double type, " +
        s"but got $latDataType.")

    val lonColumnName = $(lonCol)
    val lonDataType = schema(lonColumnName).dataType
    require(lonDataType == DoubleType,
      s"The longitude column $lonColumnName must be Double type, " +
        s"but got $lonDataType.")

    val inputFields = schema.fields
    val outputColName = $(cellCol)
    require(inputFields.forall(_.name != outputColName),
      s"Output column $outputColName already exists.")

    val attr = NominalAttribute.defaultAttr.withName($(cellCol))
    val outputFields = inputFields :+ attr.toStructField()
    StructType(outputFields)
  }

  override def copy(extra: ParamMap): S2CellTransformer = defaultCopy(extra)
}
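Since every setter is shown above, usage is straightforward (a sketch; geoDF is assumed to be a DataFrame with Double columns matching the lat/lon defaults):

val s2 = new S2CellTransformer().setLevel(12) // finer than the default level 10
val withCells = s2.transform(geoDF)           // appends a nominal "cell" column of S2 cell tokens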
Example 9
Source File: HashingTF.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
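Although the snippet omits the class header, this is the standard spark.ml HashingTF; a typical configuration looks like the following (a sketch, assuming a DataFrame docs with an array-of-strings column "words"):

val tf = new HashingTF()
  .setInputCol("words")
  .setOutputCol("features")
  .setNumFeatures(1 << 18) // numFeatures is the IntParam this example exercises
val featurized = tf.transform(docs)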
Example 10
Source File: PercentileCalibrator.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.{UnaryEstimator, UnaryModel}
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.ml.feature.QuantileDiscretizer
import org.apache.spark.ml.param.IntParam
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.MetadataBuilder

import scala.collection.Searching._

class PercentileCalibrator(uid: String = UID[PercentileCalibrator])
  extends UnaryEstimator[RealNN, RealNN](operationName = "percentCalibrator", uid = uid) {

  final val expectedNumBuckets = new IntParam(
    this, "expectedNumBuckets", "number of buckets to divide input data into"
  )
  setDefault(expectedNumBuckets, 100)

  def setExpectedNumBuckets(buckets: Int): this.type = set(expectedNumBuckets, buckets)

  def fitFn(dataset: Dataset[Option[Double]]): UnaryModel[RealNN, RealNN] = {

    val estimator: QuantileDiscretizer = new QuantileDiscretizer()
      .setNumBuckets($(expectedNumBuckets))
      .setRelativeError(0)
      .setInputCol(dataset.columns(0))
      .setOutputCol(dataset.columns(0) + "-out")

    val bucketizerModel = estimator.fit(dataset)

    val model = new PercentileCalibratorModel(
      splits = bucketizerModel.getSplits,
      actualNumBuckets = bucketizerModel.getSplits.length,
      expectedNumBuckets = $(expectedNumBuckets),
      operationName = operationName,
      uid = uid
    )

    val scaledBuckets = bucketizerModel.getSplits.map(v => model.transformFn(v.toRealNN).v.get)

    val meta = new MetadataBuilder()
      .putStringArray(PercentileCalibrator.OrigSplitsKey, bucketizerModel.getSplits.map(_.toString))
      .putStringArray(PercentileCalibrator.ScaledSplitsKey, scaledBuckets.map(_.toString)).build()
    setMetadata(meta.toSummaryMetadata())

    model
  }
}

final class PercentileCalibratorModel private[op]
(
  val splits: Array[Double],
  val actualNumBuckets: Int,
  val expectedNumBuckets: Int,
  operationName: String,
  uid: String
) extends UnaryModel[RealNN, RealNN](operationName = operationName, uid = uid) {

  def transformFn: RealNN => RealNN = (inScalar: RealNN) => {
    val calibrated = splits.search(inScalar.v.get) match {
      case Found(idx) => idx
      case InsertionPoint(idx) => idx
    }
    scale(actualNumBuckets, expectedNumBuckets, calibrated).toRealNN
  }

  private def scale(actualNumBuckets: Int, expectedBuckets: Int, calibrated: Int): Long = {
    if (actualNumBuckets >= expectedBuckets) {
      calibrated - 1 // make it start at zero
    } else {
      val (oldMin, newMin) = (0, 0)
      val (oldMax, newMax) = (Math.max(actualNumBuckets - 2, 0), Math.max(expectedBuckets - 1, 0))
      val oldRange = oldMax - oldMin
      oldRange match {
        case 0 => newMin
        case _ =>
          val newRange = (newMax - newMin).toDouble
          val newValue = (((calibrated - oldMin) * newRange) / oldRange) + newMin
          Math.min(newValue.round, newMax)
      }
    }
  }
}

case object PercentileCalibrator {
  val OrigSplitsKey: String = "origSplits"
  val ScaledSplitsKey: String = "scaledSplits"
}
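The scale helper linearly maps a bucket index from the range the QuantileDiscretizer actually produced onto the expected 0-based range. For example, with actualNumBuckets = 6 and expectedBuckets = 100, a calibrated index of 2 gives oldMax = 4, newMax = 99, and round((2 * 99.0) / 4) = 50, so splits from a sparse discretization still spread across the full 0-99 percentile scale.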
Example 11
Source File: HashingTF.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
Example 12
Source File: NGramExtractor.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamPair, ParamValidators, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}

  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(new ParamPair[Int](upperN, 2), new ParamPair[Int](lowerN, 1))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val lowerBound = $(lowerN)
    val upperBound = $(upperN)
    val nGramUDF = udf[Seq[String], Seq[String]](NGramUtils.nGramFun(_, lowerBound, upperBound))
    dataset.withColumn($(outputCol), nGramUDF(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), new ArrayType(StringType, true))
    } else {
      schema
    }
  }
}

object NGramExtractor extends DefaultParamsReadable[NGramExtractor] {
  override def load(path: String): NGramExtractor = super.load(path)
}
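A sketch of using the extractor (the input-column setter is assumed to mirror the setOutputCol shown above, since the class header is truncated in this snippet):

val ngrams = new NGramExtractor()
  .setInputCol("tokens")   // assumed setter, symmetric with setOutputCol above
  .setOutputCol("ngrams")
// With the defaults lowerN = 1 and upperN = 2, each token sequence is expanded into
// its unigrams plus bigrams (the exact join format is up to NGramUtils.nGramFun).
val result = ngrams.transform(tokenizedDF)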
Example 13
Source File: HasParallelism.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.param.shared

import scala.concurrent.ExecutionContext

import org.apache.spark.ml.param.{IntParam, Params, ParamValidators}
import org.apache.spark.util.ThreadUtils

  private[ml] def getExecutionContext: ExecutionContext = {
    getParallelism match {
      case 1 =>
        ThreadUtils.sameThread
      case n =>
        ExecutionContext.fromExecutorService(ThreadUtils
          .newDaemonCachedThreadPool(s"${this.getClass.getSimpleName}-thread-pool", n))
    }
  }
}
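getExecutionContext is what Spark's meta-algorithms use to fan work out: with parallelism 1 it degrades to the calling thread, otherwise it backs an ExecutionContext with a cached daemon thread pool. A sketch of how a caller mixing in HasParallelism might use it (the helper below is illustrative, not part of the trait):

import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.Duration

// `ec` would be the context returned by getExecutionContext above.
def fitInParallel[M](tasks: Seq[() => M], ec: ExecutionContext): Seq[M] = {
  val futures = tasks.map(task => Future { task() }(ec))
  futures.map(f => Await.result(f, Duration.Inf))
}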
Example 14
Source File: SageMakerAlgorithmParams.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.algorithms

import org.apache.spark.ml.param.{IntParam, Param, Params, ParamValidators}

  val featureDim: IntParam = new IntParam(this, "feature_dim",
    "The dimension of the input vectors. Must be > 0.", ParamValidators.gtEq(1))
  def getFeatureDim: Int = $(featureDim)

  protected def autoOrAboveParamValidator(lowerBound: Double,
                                          inclusive: Boolean): String => Boolean = {
    (value: String) =>
      try {
        value == "auto" || {
          if (inclusive) {
            value.toDouble >= lowerBound
          } else {
            value.toDouble > lowerBound
          }
        }
      } catch {
        case e: NumberFormatException => false
      }
  }

  protected def inArrayOrAboveParamValidator(validValues: Array[String],
                                             lowerBound: Double): String => Boolean = {
    (value: String) =>
      try {
        validValues.contains(value) || value.toDouble > lowerBound
      } catch {
        case e: NumberFormatException => false
      }
  }

  protected def parseTrueAndFalse(param: Param[String]): Boolean = {
    $(param) match {
      case "True" => true
      case "False" => false
      case _ => throw new IllegalArgumentException("Param is neither 'True' nor 'False'")
    }
  }
}
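The two validator factories return plain String => Boolean functions, so their behavior is easy to check in isolation (a sketch; the calls assume code inside a class mixing in this trait, since the methods are protected):

// Accepts "auto" or any number strictly greater than 0:
val validator = autoOrAboveParamValidator(0, inclusive = false)
// validator("auto") == true, validator("2.5") == true, validator("0") == false, validator("abc") == false

// Accepts a whitelisted string or any number above the bound:
val miniBatchValidator = inArrayOrAboveParamValidator(Array("auto"), 0)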
Example 15
Source File: HashingTF.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
}
Example 16
Source File: RankingMetricFormatter.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._
import ws.vinta.albedo.evaluators.RankingEvaluator._

class RankingMetricFormatter(override val uid: String, val sourceType: String)
  extends Transformer with DefaultParamsWritable {

  def this(sourceType: String) = {
    this(Identifiable.randomUID("rankingMetricFormatter"), sourceType)
  }

  val userCol = new Param[String](this, "userCol", "User column name")
  def getUserCol: String = $(userCol)
  def setUserCol(value: String): this.type = set(userCol, value)
  setDefault(userCol -> "user")

  val itemCol = new Param[String](this, "itemCol", "Item column name")
  def getItemCol: String = $(itemCol)
  def setItemCol(value: String): this.type = set(itemCol, value)
  setDefault(itemCol -> "item")

  val predictionCol = new Param[String](this, "predictionCol", "Prediction column name")
  def getPredictionCol: String = $(predictionCol)
  def setPredictionCol(value: String): this.type = set(predictionCol, value)
  setDefault(predictionCol -> "prediction")

  val topK = new IntParam(this, "topK", "Recommend top-k items for every user")
  def getTopK: Int = $(topK)
  def setTopK(value: Int): this.type = set(topK, value)
  setDefault(topK -> 15)

  override def transformSchema(schema: StructType): StructType = {
    Map($(userCol) -> IntegerType, $(itemCol) -> IntegerType)
      .foreach {
        case (columnName: String, expectedDataType: DataType) => {
          val actualDataType = schema(columnName).dataType
          require(actualDataType.equals(expectedDataType),
            s"Column $columnName must be of type $expectedDataType but was actually $actualDataType.")
        }
      }

    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    sourceType match {
      case "als" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), col($(predictionCol)).desc, $(topK)))
      case "lr" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), toArrayUDF(col($(predictionCol))).getItem(1).desc, $(topK)))
    }
  }

  override def copy(extra: ParamMap): RankingMetricFormatter = {
    val copied = new RankingMetricFormatter(uid, sourceType)
    copyValues(copied, extra)
  }
}

object RankingMetricFormatter extends DefaultParamsReadable[RankingMetricFormatter]
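A sketch of typical use (assumes a DataFrame of ALS predictions whose user and item columns are integers, as transformSchema requires; the column names are illustrative):

val formatter = new RankingMetricFormatter("als")
  .setUserCol("user_id")
  .setItemCol("repo_id")
  .setTopK(30)
val userPredictedItems = formatter.transform(alsPredictionDF)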