org.apache.spark.ml.param.shared.HasSeed Scala Examples
The following examples show how to use org.apache.spark.ml.param.shared.HasSeed.
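All three examples follow the same pattern: a Params trait or Transformer mixes in HasSeed, which contributes a seed Param with a default value, and the component only adds a setSeed setter. The following minimal sketch illustrates the pattern; the SeededShuffler class is hypothetical and not taken from any of the projects below, and on Spark versions where HasSeed is private[ml] the class must live under the org.apache.spark.ml package, as Examples 1 and 3 do.

package org.apache.spark.ml.example

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasSeed
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions.rand
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset}

// Hypothetical transformer: reorders rows pseudo-randomly, driven by the shared `seed` Param.
class SeededShuffler(override val uid: String) extends Transformer with HasSeed {

  def this() = this(Identifiable.randomUID("seededShuffler"))

  // HasSeed already declares `seed` and gives it a default; only the setter is added here.
  def setSeed(value: Long): this.type = set(seed, value)

  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.toDF().orderBy(rand($(seed)))

  override def transformSchema(schema: StructType): StructType = schema

  override def copy(extra: ParamMap): SeededShuffler = defaultCopy(extra)
}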
Example 1
Source File: GaussianProcessParams.scala From spark-gp with Apache License 2.0
package org.apache.spark.ml.commons

import org.apache.spark.ml.PredictorParams
import org.apache.spark.ml.commons.kernel.{Kernel, RBFKernel}
import org.apache.spark.ml.param.shared.{HasAggregationDepth, HasMaxIter, HasSeed, HasTol}
import org.apache.spark.ml.param.{DoubleParam, IntParam, Param}

private[ml] trait GaussianProcessParams extends PredictorParams
  with HasMaxIter with HasTol with HasAggregationDepth with HasSeed {

  final val activeSetProvider = new Param[ActiveSetProvider](this, "activeSetProvider",
    "the class which provides the active set used by Projected Process Approximation")

  final val kernel = new Param[() => Kernel](this, "kernel",
    "function of no arguments which returns the kernel of the prior Gaussian Process")

  final val datasetSizeForExpert = new IntParam(this, "datasetSizeForExpert",
    "The number of data points fed to each expert. " +
      "Time and space complexity of training quadratically grows with it.")

  final val sigma2 = new DoubleParam(this, "sigma2",
    "The variance of noise in the inputs. The value is added to the diagonal of the " +
      "kernel Matrix. Also prevents numerical issues associated with inversion " +
      "of a computationally singular matrix")

  final val activeSetSize = new IntParam(this, "activeSetSize",
    "Number of latent functions to project the process onto. " +
      "The size of the produced model and prediction complexity " +
      "linearly depend on this value.")

  def setActiveSetProvider(value: ActiveSetProvider): this.type = set(activeSetProvider, value)
  setDefault(activeSetProvider -> RandomActiveSetProvider)

  def setDatasetSizeForExpert(value: Int): this.type = set(datasetSizeForExpert, value)
  setDefault(datasetSizeForExpert -> 100)

  def setMaxIter(value: Int): this.type = set(maxIter, value)
  setDefault(maxIter -> 100)

  def setSigma2(value: Double): this.type = set(sigma2, value)
  setDefault(sigma2 -> 1e-3)

  def setKernel(value: () => Kernel): this.type = set(kernel, value)
  setDefault(kernel -> (() => new RBFKernel()))

  def setTol(value: Double): this.type = set(tol, value)
  setDefault(tol -> 1E-6)

  def setActiveSetSize(value: Int): this.type = set(activeSetSize, value)
  setDefault(activeSetSize -> 100)

  def setSeed(value: Long): this.type = set(seed, value)
}
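A hedged usage sketch follows. It assumes spark-gp exposes a GaussianProcessRegression estimator that mixes in GaussianProcessParams (the estimator name and the data path are assumptions, not shown in the snippet above). The point is that setSeed, inherited via HasSeed, fixes the randomness in active-set selection and makes training reproducible.

import org.apache.spark.ml.commons.kernel.RBFKernel
import org.apache.spark.ml.regression.GaussianProcessRegression  // assumed spark-gp estimator name
import org.apache.spark.sql.SparkSession

object GaussianProcessSeedExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("gp-seed").master("local[*]").getOrCreate()
    // Any LIBSVM-style file with "label"/"features" columns will do (path is illustrative).
    val training = spark.read.format("libsvm").load("data/regression_data.txt")

    val gp = new GaussianProcessRegression()
      .setKernel(() => new RBFKernel())  // prior kernel, a Param from GaussianProcessParams
      .setDatasetSizeForExpert(200)      // data points fed to each expert
      .setActiveSetSize(100)             // size of the projected active set
      .setSeed(42L)                      // HasSeed: fixes the random active-set selection

    val model = gp.fit(training)
    model.transform(training).show(5)
    spark.stop()
  }
}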
Example 2
Source File: StratifiedRepartition.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.contracts.{HasLabelCol, Wrappable}
import org.apache.spark.RangePartitioner
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.HasSeed
import org.apache.spark.ml.util._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

// The class declaration and its Param definitions (mode, labelCol, seed) are elided in this excerpt.

  override def transform(dataset: Dataset[_]): DataFrame = {
    // Count unique values in label column
    val distinctLabelCounts = dataset.select(getLabelCol).groupBy(getLabelCol).count().collect()
    val labelToCount = distinctLabelCounts.map(row => (row.getInt(0), row.getLong(1)))
    val labelToFraction = getMode match {
      case SPConstants.Equal => getEqualLabelCount(labelToCount, dataset)
      case SPConstants.Mixed =>
        val equalLabelToCount = getEqualLabelCount(labelToCount, dataset)
        val normalizedRatio = equalLabelToCount.map { case (label, count) => count }.sum / labelToCount.length
        labelToCount.map { case (label, count) => (label, count / normalizedRatio) }.toMap
      case SPConstants.Original => labelToCount.map { case (label, count) => (label, 1.0) }.toMap
      case _ => throw new Exception(s"Unknown mode specified to StratifiedRepartition: $getMode")
    }
    val labelColIndex = dataset.schema.fieldIndex(getLabelCol)
    val spdata = dataset.toDF().rdd.keyBy(row => row.getInt(labelColIndex))
      .sampleByKeyExact(true, labelToFraction, getSeed)
      .mapPartitions(keyToRow => keyToRow.zipWithIndex.map { case ((key, row), index) => (index, row) })
    val rangePartitioner = new RangePartitioner(dataset.rdd.getNumPartitions, spdata)
    val rspdata = spdata.partitionBy(rangePartitioner)
      .mapPartitions(keyToRow => keyToRow.map { case (key, row) => row }).persist()
    dataset.sqlContext.createDataFrame(rspdata, dataset.schema)
  }

  private def getEqualLabelCount(labelToCount: Array[(Int, Long)], dataset: Dataset[_]): Map[Int, Double] = {
    val maxLabelCount = Math.max(labelToCount.map { case (label, count) => count }.max, dataset.rdd.getNumPartitions)
    labelToCount.map { case (label, count) => (label, maxLabelCount.toDouble / count) }.toMap
  }

  def transformSchema(schema: StructType): StructType = schema

  // Fixed: the original excerpt declared the return type as DropColumns, which does not match this class.
  def copy(extra: ParamMap): StratifiedRepartition = defaultCopy(extra)
}
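A hedged usage sketch follows. The setLabelCol, setMode and setSeed setters and the "equal" mode string are assumptions inferred from the getLabelCol, getMode, getSeed and SPConstants references in the excerpt above; the seed passed to sampleByKeyExact via HasSeed makes the stratified sampling reproducible across runs.

import com.microsoft.ml.spark.stages.StratifiedRepartition
import org.apache.spark.sql.SparkSession

object StratifiedRepartitionExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("stratified-repartition").master("local[*]").getOrCreate()
    import spark.implicits._

    // Integer label column, as required by row.getInt(labelColIndex) in transform().
    val df = Seq((0, "a"), (0, "b"), (1, "c"), (1, "d"), (1, "e"), (2, "f")).toDF("label", "value")

    val repartitioned = new StratifiedRepartition()
      .setLabelCol("label") // column whose classes are balanced across partitions
      .setMode("equal")     // assumed spelling of SPConstants.Equal
      .setSeed(1234L)       // HasSeed: makes sampleByKeyExact deterministic
      .transform(df)

    repartitioned.show()
    spark.stop()
  }
}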
Example 3
Source File: RandomProjectionsHasher.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import java.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vector}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{LongType, StructType}

// The class declaration and the dim, basisSize and sparsity Param definitions are elided in this excerpt.

  def setDim(value: Long): this.type = set(dim, value)

  def this() = this(Identifiable.randomUID("randomProjectionsHasher"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val dimensity = {
      if (!isSet(dim)) {
        // If dim is not set, look up the vector size from the AttributeGroup metadata,
        // as produced by OdklCountVectorizer.
        val vectorsIndex = dataset.schema.fieldIndex($(inputCol))
        AttributeGroup.fromStructField(dataset.schema.fields(vectorsIndex)).size
      } else {
        $(dim).toInt
      }
    }
    // The matrix of random vectors used to construct the hash.
    val projectionMatrix = dataset.sqlContext.sparkContext.broadcast(
      Matrices.sprandn($(basisSize).toInt, dimensity, $(sparsity), new Random($(seed)))
        .asInstanceOf[SparseMatrix])
    val binHashSparseVectorColumn = udf((vector: Vector) => {
      projectionMatrix.value.multiply(vector).values
        .map(f => if (f > 0) 1L else 0L)
        .view.zipWithIndex
        .foldLeft(0L) { case (acc, (v, i)) => acc | (v << i) }
    })
    dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), LongType)
  }
}
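A hedged usage sketch follows. The setInputCol, setOutputCol and setSeed setters are assumed to be exposed by the elided class declaration via HasInputCol, HasOutputCol and HasSeed, and basisSize and sparsity are assumed to have defaults (otherwise they must be set as well). The seed fixes the random projection matrix, so the same input vector always hashes to the same bucket.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.odkl.texts.RandomProjectionsHasher
import org.apache.spark.sql.SparkSession

object RandomProjectionsHasherExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("rp-hasher").master("local[*]").getOrCreate()
    import spark.implicits._

    // Sparse feature vectors, e.g. the output of a CountVectorizer.
    val df = Seq(
      Tuple1(Vectors.sparse(4, Seq((0, 1.0), (3, 2.0)))),
      Tuple1(Vectors.sparse(4, Seq((1, 3.0), (2, 1.0))))
    ).toDF("features")

    val hashed = new RandomProjectionsHasher()
      .setInputCol("features") // assumed setter from HasInputCol (declaration elided above)
      .setOutputCol("hash")    // assumed setter from HasOutputCol
      .setDim(4L)              // dimensionality of the input vectors
      .setSeed(2017L)          // HasSeed: fixes the random projection matrix
      .transform(df)           // assumes basisSize and sparsity have defaults; set them otherwise

    hashed.show()
    spark.stop()
  }
}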