org.apache.spark.sql.functions.rand Scala Examples
The following examples show how to use org.apache.spark.sql.functions.rand.
Each example links back to its original project and source file.
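For context, rand() produces a column of independent, uniformly distributed values in [0.0, 1.0), and rand(seed) makes the output reproducible for a fixed seed and data partitioning. A minimal, self-contained sketch (the session and column names are illustrative, not taken from the examples below):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.rand

object RandQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("rand-demo").getOrCreate()

    // A small DataFrame with a single "id" column: 0..9
    val df = spark.range(10).toDF("id")

    df.withColumn("u", rand())          // uniform values in [0.0, 1.0), different on every run
      .withColumn("u_seeded", rand(42)) // reproducible for a fixed seed and partitioning
      .show(false)

    spark.stop()
  }
}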
Example 1
Source File: UserActionsRateSource.scala From spark-structured-streaming-examples with Apache License 2.0
package com.phylosoft.spark.learning.sql.streaming.source.rate

import org.apache.spark.sql.functions.{col, lit, pmod, rand}
import org.apache.spark.sql.{DataFrame, SparkSession}

class UserActionsRateSource(val spark: SparkSession,
                            val rowsPerSecond: String = "5",
                            val numPartitions: String = "1")
  extends RateSource {

  def loadUserActions(): DataFrame = {
    readStream()
      .where((rand() * 100).cast("integer") < 30) // 30 out of every 100 user actions
      .select(pmod(col("value"), lit(9)).as("userId"), col("timestamp").as("actionTime"))
  }

}
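The readStream() helper comes from the project's RateSource base class, which is not shown here; it presumably wraps Spark's built-in rate source. A self-contained sketch of the same thinning pattern applied directly to that source (the option values and console sink are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, pmod, rand}

val spark = SparkSession.builder().master("local[*]").appName("user-actions-sketch").getOrCreate()

// Spark's built-in rate source emits (timestamp, value) rows at a fixed rate.
val userActions = spark.readStream
  .format("rate")
  .option("rowsPerSecond", "5")
  .load()
  .where((rand() * 100).cast("integer") < 30) // keep roughly 30% of the generated rows
  .select(pmod(col("value"), lit(9)).as("userId"), col("timestamp").as("actionTime"))

// Print the surviving rows to the console for inspection.
userActions.writeStream.format("console").start().awaitTermination()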
Example 2
Source File: AdRateSources.scala From spark-structured-streaming-examples with Apache License 2.0
package com.phylosoft.spark.learning.sql.streaming.source.rate

import org.apache.spark.sql.functions.{col, rand}
import org.apache.spark.sql.{DataFrame, SparkSession}

class AdRateSources(val spark: SparkSession,
                    val rowsPerSecond: String = "5",
                    val numPartitions: String = "1")
  extends RateSource {

  def loadImpressions(): DataFrame = {
    readStream()
      .select(
        col("value").as("adId"),
        col("timestamp").as("impressionTime"))
  }

  def loadClicks(): DataFrame = {
    readStream()
      .where((rand() * 100).cast("integer") < 10) // 10 out of every 100 impressions result in a click
      .select((col("value") - 50).as("adId"), col("timestamp").as("clickTime")) // -50 so that a click with the same id as an impression is generated much later (i.e. delayed data)
      .where("adId > 0")
  }

}
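The two loaders are shaped for a stream-stream join of clicks back to impressions on adId. The project code for that join is not shown here; one plausible sketch, with illustrative watermark and interval values:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.expr

// impressions: (adId, impressionTime), clicks: (adId, clickTime), as produced above.
def joinClicksToImpressions(impressions: DataFrame, clicks: DataFrame): DataFrame = {
  // Rename the key on each side so the join condition is unambiguous.
  val imp = impressions.withWatermark("impressionTime", "1 minute")
    .withColumnRenamed("adId", "impressionAdId")
  val clk = clicks.withWatermark("clickTime", "2 minutes")
    .withColumnRenamed("adId", "clickAdId")

  // Inner stream-stream join bounded by an event-time range so state can be cleaned up.
  imp.join(
    clk,
    expr("""
      clickAdId = impressionAdId AND
      clickTime >= impressionTime AND
      clickTime <= impressionTime + interval 1 hour
    """))
}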
Example 3
Source File: SchemaColumnSelection.scala From data-faker with MIT License
package com.dunnhumby.datafaker.schema.table.columns

import scala.reflect.runtime.universe.TypeTag
import java.sql.{Date, Timestamp}
import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{rand, udf}

case class SchemaColumnSelection[T](override val name: String, values: List[T])(implicit tag: TypeTag[T]) extends SchemaColumn {
  override def column(rowID: Option[Column] = None): Column = {
    val intToSelectionUDF = udf((index: Int) => {
      values(index)
    })

    intToSelectionUDF(rand() * values.length % values.length)
  }
}

object SchemaColumnSelectionProtocol extends SchemaColumnSelectionProtocol
trait SchemaColumnSelectionProtocol extends YamlParserProtocol {

  import net.jcazevedo.moultingyaml._

  implicit object SchemaColumnSelectionFormat extends YamlFormat[SchemaColumnSelection[_]] {

    override def read(yaml: YamlValue): SchemaColumnSelection[_] = {
      val fields = yaml.asYamlObject.fields
      val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError("data_type not set"))
      val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set"))
      val values = fields.getOrElse(YamlString("values"), deserializationError("selection values not set"))

      dataType match {
        case SchemaColumnDataType.Int => SchemaColumnSelection(name, values.convertTo[List[Int]])
        case SchemaColumnDataType.Long => SchemaColumnSelection(name, values.convertTo[List[Long]])
        case SchemaColumnDataType.Float => SchemaColumnSelection(name, values.convertTo[List[Float]])
        case SchemaColumnDataType.Double => SchemaColumnSelection(name, values.convertTo[List[Double]])
        case SchemaColumnDataType.Date => SchemaColumnSelection(name, values.convertTo[List[Date]])
        case SchemaColumnDataType.Timestamp => SchemaColumnSelection(name, values.convertTo[List[Timestamp]])
        case SchemaColumnDataType.String => SchemaColumnSelection(name, values.convertTo[List[String]])
        case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Selection}")
      }
    }

    override def write(obj: SchemaColumnSelection[_]): YamlValue = ???

  }

}
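The UDF above maps a scaled rand() value to an index into the values list. The same per-row random selection can also be expressed with built-in functions (Spark 2.4+), avoiding a UDF; a small sketch with illustrative values:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{array, element_at, floor, lit, rand}

val spark = SparkSession.builder().master("local[*]").appName("selection-sketch").getOrCreate()

val values = Seq("red", "green", "blue")

// Build an array literal and index into it at a uniformly random 1-based position.
val randomChoice = element_at(
  array(values.map(lit): _*),
  (floor(rand() * values.length) + 1).cast("int")
)

spark.range(5).withColumn("colour", randomChoice).show(false)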
Example 4
Source File: SchemaColumnRandom.scala From data-faker with MIT License
package com.dunnhumby.datafaker.schema.table.columns

import java.sql.{Date, Timestamp}
import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{to_utc_timestamp, round, rand, from_unixtime, to_date}
import org.apache.spark.sql.types.{IntegerType, LongType}

trait SchemaColumnRandom[T] extends SchemaColumn

object SchemaColumnRandom {
  val FloatDP = 3
  val DoubleDP = 3

  def apply(name: String, min: Int, max: Int): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Long, max: Long): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Float, max: Float): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Double, max: Double): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Date, max: Date): SchemaColumn = SchemaColumnRandomDate(name, min, max)
  def apply(name: String, min: Timestamp, max: Timestamp): SchemaColumn = SchemaColumnRandomTimestamp(name, min, max)
  def apply(name: String): SchemaColumn = SchemaColumnRandomBoolean(name)
}

private case class SchemaColumnRandomNumeric[T: Numeric](override val name: String, min: T, max: T) extends SchemaColumnRandom[T] {
  override def column(rowID: Option[Column] = None): Column = {
    import Numeric.Implicits._

    (min, max) match {
      case (_: Int, _: Int) => round(rand() * (max - min) + min, 0).cast(IntegerType)
      case (_: Long, _: Long) => round(rand() * (max - min) + min, 0).cast(LongType)
      case (_: Float, _: Float) => round(rand() * (max - min) + min, SchemaColumnRandom.FloatDP)
      case (_: Double, _: Double) => round(rand() * (max - min) + min, SchemaColumnRandom.DoubleDP)
    }
  }
}

private case class SchemaColumnRandomTimestamp(override val name: String, min: Timestamp, max: Timestamp) extends SchemaColumnRandom[Timestamp] {
  override def column(rowID: Option[Column] = None): Column = {
    val minTime = min.getTime / 1000
    val maxTime = max.getTime / 1000
    to_utc_timestamp(from_unixtime(rand() * (maxTime - minTime) + minTime), "UTC")
  }
}

private case class SchemaColumnRandomDate(override val name: String, min: Date, max: Date) extends SchemaColumnRandom[Date] {
  val timestamp = SchemaColumnRandomTimestamp(name, new Timestamp(min.getTime), new Timestamp(max.getTime + 86400000))
  override def column(rowID: Option[Column] = None): Column = to_date(timestamp.column())
}

private case class SchemaColumnRandomBoolean(override val name: String) extends SchemaColumnRandom[Boolean] {
  override def column(rowID: Option[Column] = None): Column = rand() < 0.5f
}

object SchemaColumnRandomProtocol extends SchemaColumnRandomProtocol
trait SchemaColumnRandomProtocol extends YamlParserProtocol {

  import net.jcazevedo.moultingyaml._

  implicit object SchemaColumnRandomFormat extends YamlFormat[SchemaColumnRandom[_]] {

    override def read(yaml: YamlValue): SchemaColumnRandom[_] = {
      val fields = yaml.asYamlObject.fields
      val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set"))
      val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError(s"data_type not set for $name"))

      if (dataType == SchemaColumnDataType.Boolean) {
        SchemaColumnRandomBoolean(name)
      } else {
        val min = fields.getOrElse(YamlString("min"), deserializationError(s"min not set for $name"))
        val max = fields.getOrElse(YamlString("max"), deserializationError(s"max not set for $name"))

        dataType match {
          case SchemaColumnDataType.Int => SchemaColumnRandomNumeric(name, min.convertTo[Int], max.convertTo[Int])
          case SchemaColumnDataType.Long => SchemaColumnRandomNumeric(name, min.convertTo[Long], max.convertTo[Long])
          case SchemaColumnDataType.Float => SchemaColumnRandomNumeric(name, min.convertTo[Float], max.convertTo[Float])
          case SchemaColumnDataType.Double => SchemaColumnRandomNumeric(name, min.convertTo[Double], max.convertTo[Double])
          case SchemaColumnDataType.Date => SchemaColumnRandomDate(name, min.convertTo[Date], max.convertTo[Date])
          case SchemaColumnDataType.Timestamp => SchemaColumnRandomTimestamp(name, min.convertTo[Timestamp], max.convertTo[Timestamp])
          case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Random}")
        }
      }
    }

    override def write(obj: SchemaColumnRandom[_]): YamlValue = ???

  }

}
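The core pattern in SchemaColumnRandomNumeric is scaling rand()'s [0, 1) output into a [min, max] range. A standalone sketch of that pattern (the range values are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{rand, round}
import org.apache.spark.sql.types.IntegerType

val spark = SparkSession.builder().master("local[*]").appName("random-range-sketch").getOrCreate()

val (min, max) = (10, 20)

// Scale rand() into [min, max] and round to a whole number, as in the Int case above.
val randomInt = round(rand() * (max - min) + min, 0).cast(IntegerType)

spark.range(5).withColumn("value", randomInt).show(false)

Note that rounding makes the two endpoints roughly half as likely as the interior values, which is usually acceptable for test-data generation.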
Example 5
Source File: SampleRowsUniformly.scala From mimir with Apache License 2.0
package mimir.algebra.sampling

import org.apache.spark.sql.functions.{rand, udf}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Filter}
import play.api.libs.json._
import mimir.algebra._

case class SampleRowsUniformly(probability: Double) extends SamplingMode {
  override def toString = s"WITH PROBABILITY $probability"

  def apply(plan: LogicalPlan, seed: Long): LogicalPlan = {
    // Adapted from Spark's df.stat.sampleBy method
    val r = rand(seed)
    val f = udf { (x: Double) => x < probability }
    Filter(
      f(r).expr,
      plan
    )
  }

  def expressions: Seq[Expression] = Seq()
  def rebuildExpressions(x: Seq[Expression]): SamplingMode = this

  def toJson: JsValue = JsObject(Map[String, JsValue](
    "mode" -> JsString(SampleRowsUniformly.MODE),
    "probability" -> JsNumber(probability)
  ))
}

object SampleRowsUniformly {
  val MODE = "uniform_probability"

  def parseJson(json: Map[String, JsValue]): Option[SampleRowsUniformly] = {
    if (json("mode").as[String].equals(MODE)) {
      Some(SampleRowsUniformly(json("probability").as[Double]))
    } else {
      None
    }
  }
}
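This class builds the filter at the Catalyst LogicalPlan level, but the idea is a plain Bernoulli sample: keep each row independently when a seeded rand() value falls below the probability. A self-contained DataFrame-level sketch of the same idea (data, probability, and seed are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.rand

val spark = SparkSession.builder().master("local[*]").appName("uniform-sample-sketch").getOrCreate()

val df = spark.range(1000).toDF("id")
val probability = 0.1
val seed = 42L

// Keep each row independently with the given probability; the seed makes the sample repeatable.
val bernoulliSample = df.where(rand(seed) < probability)

// Spark's built-in sampler does essentially the same thing.
val builtInSample = df.sample(withReplacement = false, fraction = probability, seed = seed)

println(s"bernoulli: ${bernoulliSample.count()}, built-in: ${builtInSample.count()}")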
Example 6
Source File: SampleStratifiedOn.scala From mimir with Apache License 2.0
package mimir.algebra.sampling

import org.apache.spark.sql.functions.{rand, udf, col}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Filter}
import play.api.libs.json._
import mimir.algebra._
import mimir.exec.spark.RAToSpark
import mimir.serialization.{Json => MimirJson}

case class SampleStratifiedOn(column: ID, t: Type, strata: Map[PrimitiveValue, Double]) extends SamplingMode {
  val sparkStrata =
    strata.map { case (v, p) => RAToSpark.getNative(v, t) -> p }
          .toMap

  override def toString = s"ON $column WITH STRATA ${strata.map { case (v, p) => s"$v -> $p" }.mkString(" | ")}"

  def apply(plan: LogicalPlan, seed: Long): LogicalPlan = {
    // Adapted from Spark's df.stat.sampleBy method
    val c = col(column.id)
    val r = rand(seed)
    val f = udf { (stratum: Any, x: Double) => x < sparkStrata.getOrElse(stratum, 0.0) }
    Filter(
      f(c, r).expr,
      plan
    )
  }

  def expressions: Seq[Expression] = Seq(Var(column))
  def rebuildExpressions(x: Seq[Expression]): SamplingMode = {
    x(0) match {
      case Var(newColumn) => SampleStratifiedOn(newColumn, t, strata)
      case _ => throw new RAException("Internal Error: Rewriting stratification variable with arbitrary expression")
    }
  }

  def toJson: JsValue = JsObject(Map[String, JsValue](
    "mode" -> JsString(SampleStratifiedOn.MODE),
    "column" -> JsString(column.id),
    "type" -> MimirJson.ofType(t),
    "strata" -> JsArray(
      strata
        .toSeq
        .map { case (v, p) =>
          JsObject(Map[String, JsValue](
            "value" -> MimirJson.ofPrimitive(v),
            "probability" -> JsNumber(p)
          ))
        }
    )
  ))
}

object SampleStratifiedOn {
  val MODE = "stratified_on"

  def parseJson(json: Map[String, JsValue]): Option[SampleStratifiedOn] = {
    if (json("mode").as[String].equals(MODE)) {
      val t = MimirJson.toType(json("type"))

      Some(SampleStratifiedOn(
        ID(json("column").as[String]),
        t,
        json("strata")
          .as[Seq[Map[String, JsValue]]]
          .map { stratum =>
            MimirJson.toPrimitive(t, stratum("value")) -> stratum("probability").as[Double]
          }
          .toMap
      ))
    } else {
      None
    }
  }
}
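The comment in apply notes that the logic is adapted from Spark's df.stat.sampleBy. For comparison, a small self-contained sketch using that built-in API directly; the toy data, strata, and fractions are illustrative. Strata missing from the fractions map are dropped (fraction 0.0), matching the getOrElse above.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, when}

val spark = SparkSession.builder().master("local[*]").appName("stratified-sample-sketch").getOrCreate()

// A toy DataFrame with a stratification key: every tenth row is "rare".
val df = spark.range(1000).toDF("id")
  .withColumn("key", when(col("id") % 10 === 0, "rare").otherwise("common"))

// Keep 100% of the "rare" stratum and 5% of the "common" stratum.
val stratified = df.stat.sampleBy("key", Map("rare" -> 1.0, "common" -> 0.05), seed = 42L)

stratified.groupBy("key").count().show()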