org.apache.spark.ml.param.shared.HasSeed Scala Examples

The following examples show how to use org.apache.spark.ml.param.shared.HasSeed.
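HasSeed is one of Spark ML's shared param traits: mixing it in contributes a seed LongParam with a class-name-derived default and a getSeed accessor, so each component only has to add its own fluent setter. A minimal sketch of the pattern (the trait name and package are hypothetical; note the shared param traits are package-private to org.apache.spark.ml in most Spark versions, which is why two of the examples below declare packages under it):

package org.apache.spark.ml.example

import org.apache.spark.ml.param.Params
import org.apache.spark.ml.param.shared.HasSeed

// Mixing in HasSeed supplies the `seed` param, its default (derived from
// the class name's hash), and `getSeed`; only the setter is added here.
private[ml] trait MyRandomizedParams extends Params with HasSeed {
  def setSeed(value: Long): this.type = set(seed, value)
}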
Example 1
Source File: GaussianProcessParams.scala    From spark-gp    with Apache License 2.0
package org.apache.spark.ml.commons

import org.apache.spark.ml.PredictorParams
import org.apache.spark.ml.commons.kernel.{Kernel, RBFKernel}
import org.apache.spark.ml.param.shared.{HasAggregationDepth, HasMaxIter, HasSeed, HasTol}
import org.apache.spark.ml.param.{DoubleParam, IntParam, Param}

private[ml] trait GaussianProcessParams extends PredictorParams
  with HasMaxIter with HasTol with HasAggregationDepth with HasSeed {

  final val activeSetProvider = new Param[ActiveSetProvider](this, "activeSetProvider",
    "the class which provides the active set used by Projected Process Approximation")

  final val kernel = new Param[() => Kernel](this,
    "kernel", "function of no arguments which returns " +
      "the kernel of the prior Gaussian Process")

  final val datasetSizeForExpert = new IntParam(this,
    "datasetSizeForExpert", "The number of data points fed to each expert. " +
      "Time and space complexity of training grows quadratically with it.")

  final val sigma2 = new DoubleParam(this,
    "sigma2", "The variance of noise in the inputs. The value is added to the diagonal of the " +
      "kernel matrix. It also prevents numerical issues associated with inversion " +
      "of a computationally singular matrix.")

  final val activeSetSize = new IntParam(this,
    "activeSetSize", "Number of latent functions to project the process onto. " +
      "The size of the produced model and prediction complexity " +
      "linearly depend on this value.")

  def setActiveSetProvider(value : ActiveSetProvider): this.type = set(activeSetProvider, value)
  setDefault(activeSetProvider -> RandomActiveSetProvider)

  def setDatasetSizeForExpert(value: Int): this.type = set(datasetSizeForExpert, value)
  setDefault(datasetSizeForExpert -> 100)

  def setMaxIter(value: Int): this.type = set(maxIter, value)
  setDefault(maxIter -> 100)

  def setSigma2(value: Double): this.type = set(sigma2, value)
  setDefault(sigma2 -> 1e-3)

  def setKernel(value: () => Kernel): this.type = set(kernel, value)
  setDefault(kernel -> (() => new RBFKernel()))

  def setTol(value: Double): this.type = set(tol, value)
  setDefault(tol -> 1e-6)

  def setActiveSetSize(value: Int): this.type = set(activeSetSize, value)
  setDefault(activeSetSize -> 100)

  def setSeed(value: Long): this.type = set(seed, value)
} 
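These params compose into spark-gp's estimators. A hypothetical usage sketch (the GaussianProcessRegression class and the parameter values are assumptions, not part of the snippet above):

import org.apache.spark.ml.commons.kernel.RBFKernel
import org.apache.spark.ml.regression.GaussianProcessRegression  // provided by spark-gp

// Every setter below comes from GaussianProcessParams; setSeed fixes the
// random active-set selection so training is reproducible.
val gp = new GaussianProcessRegression()
  .setKernel(() => new RBFKernel())
  .setDatasetSizeForExpert(200)
  .setActiveSetSize(300)
  .setSigma2(1e-4)
  .setSeed(42L)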
Example 2
Source File: StratifiedRepartition.scala    From mmlspark    with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.contracts.{HasLabelCol, Wrappable}
import org.apache.spark.RangePartitioner
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.HasSeed
import org.apache.spark.ml.util._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

// The class declaration was dropped from this snippet; a minimal
// reconstruction of the members used below (SPConstants and the full
// mode param description live elsewhere in the mmlspark sources):
class StratifiedRepartition(val uid: String) extends Transformer
  with Wrappable with HasLabelCol with HasSeed {

  def this() = this(Identifiable.randomUID("StratifiedRepartition"))

  // Repartitioning strategy: SPConstants.Equal, Mixed, or Original.
  val mode = new Param[String](this, "mode", "equal, mixed, or original")
  def getMode: String = $(mode)

  override def transform(dataset: Dataset[_]): DataFrame = {
    // Count unique values in label column
    val distinctLabelCounts = dataset.select(getLabelCol).groupBy(getLabelCol).count().collect()
    val labelToCount = distinctLabelCounts.map(row => (row.getInt(0), row.getLong(1)))
    val labelToFraction =
      getMode match {
        case SPConstants.Equal => getEqualLabelCount(labelToCount, dataset)
        case SPConstants.Mixed =>
          val equalLabelToCount = getEqualLabelCount(labelToCount, dataset)
          val normalizedRatio = equalLabelToCount.map { case (label, count) => count }.sum / labelToCount.length
          labelToCount.map { case (label, count) => (label, count / normalizedRatio)}.toMap
        case SPConstants.Original => labelToCount.map { case (label, count) => (label, 1.0) }.toMap
        case _ => throw new Exception(s"Unknown mode specified to StratifiedRepartition: $getMode")
      }
    val labelColIndex = dataset.schema.fieldIndex(getLabelCol)
    val spdata = dataset.toDF().rdd.keyBy(row => row.getInt(labelColIndex))
      .sampleByKeyExact(true, labelToFraction, getSeed)
      .mapPartitions(keyToRow => keyToRow.zipWithIndex.map { case ((key, row), index) => (index, row) })
    val rangePartitioner = new RangePartitioner(dataset.rdd.getNumPartitions, spdata)
    val rspdata = spdata.partitionBy(rangePartitioner).mapPartitions(keyToRow =>
      keyToRow.map { case (key, row) => row }).persist()
    dataset.sqlContext.createDataFrame(rspdata, dataset.schema)
  }

  private def getEqualLabelCount(labelToCount: Array[(Int, Long)], dataset: Dataset[_]): Map[Int, Double] = {
    val maxLabelCount = Math.max(labelToCount.map { case (label, count) => count }.max, dataset.rdd.getNumPartitions)
    labelToCount.map { case (label, count) => (label, maxLabelCount.toDouble / count) }.toMap
  }

  override def transformSchema(schema: StructType): StructType = schema

  override def copy(extra: ParamMap): StratifiedRepartition = defaultCopy(extra)
} 
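A hypothetical invocation of the transformer above (the column name, the setMode setter, and the input DataFrame are assumptions based on the params referenced in transform):

// Rebalance the DataFrame so each label value is spread evenly across
// partitions; the seed fixes the stratified sampling across runs.
val balanced = new StratifiedRepartition()
  .setLabelCol("label")
  .setMode(SPConstants.Equal)
  .setSeed(1234L)
  .transform(df)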
Example 3
Source File: RandomProjectionsHasher.scala    From pravda-ml    with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import java.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vector}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{LongType, StructType}

// The class declaration and param definitions were dropped from this
// snippet; a minimal reconstruction of the members used below:
class RandomProjectionsHasher(override val uid: String) extends Transformer
  with HasInputCol with HasOutputCol with HasSeed {

  // Dimensionality of the input vectors; if unset it is inferred from metadata.
  val dim = new LongParam(this, "dim", "dimensionality of the input vectors")
  // Density of the sparse random projection matrix.
  val sparsity = new DoubleParam(this, "sparsity", "density of the random projection matrix")
  // Number of random projections, i.e. the number of bits in the hash.
  val basisSize = new LongParam(this, "basisSize", "number of random projections (hash bits)")

  def setDim(value: Long): this.type = set(dim, value)


  def this() = this(Identifiable.randomUID("randomProjectionsHasher"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val dimensionality = {
      if (!isSet(dim)) {
        // If dim is not set, infer the dimensionality from the AttributeGroup
        // metadata of the input column (as written, e.g., by OdklCountVectorizer).
        val vectorsIndex = dataset.schema.fieldIndex($(inputCol))
        AttributeGroup.fromStructField(dataset.schema.fields(vectorsIndex)).size
      } else {
        $(dim).toInt
      }
    }
    // The sparse matrix of random projection vectors used to construct the hash.
    val projectionMatrix = dataset.sqlContext.sparkContext.broadcast(
      Matrices.sprandn($(basisSize).toInt, dimensionality, $(sparsity), new Random($(seed))).asInstanceOf[SparseMatrix])

    val binHashSparseVectorColumn = udf((vector: Vector) => {
      // Project the vector, take the sign of each projection, and pack the
      // resulting bits into a single Long.
      projectionMatrix.value.multiply(vector).values
        .map(f => if (f > 0) 1L else 0L)
        .view.zipWithIndex
        .foldLeft(0L) { case (acc, (v, i)) => acc | (v << i) }
    })
    dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), LongType)
  }

}
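A hypothetical usage sketch (the setInputCol/setOutputCol setters mirror the HasInputCol/HasOutputCol params, and the input DataFrame is an assumption):

// Hash each sparse feature vector into a single Long of projection-sign
// bits; fixing the seed makes the projection matrix, and therefore the
// hash values, deterministic across runs.
val hasher = new RandomProjectionsHasher()
  .setInputCol("features")
  .setOutputCol("hash")
  .setDim(10000L)
  .setSeed(123L)
val hashed = hasher.transform(featurized)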