org.apache.spark.sql.functions.when Scala Examples
The following examples show how to use org.apache.spark.sql.functions.when.
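Before the project examples, a minimal standalone sketch of the function itself may help (everything in it is illustrative: the SparkSession, the column n, and the labels are made up rather than taken from any of the projects below). when(condition, value) builds a conditional Column; chaining otherwise(fallback) supplies the value for rows where the condition did not match.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, when}

val spark = SparkSession.builder().master("local[*]").appName("when-demo").getOrCreate()

// Label each row: n > 2 becomes "big", every other row falls through to "small".
val labelled = spark.range(5).toDF("n")
  .withColumn("size", when(col("n") > 2, "big").otherwise("small"))

labelled.show()

Several when calls can be chained before the final otherwise; if otherwise is omitted, unmatched rows come back as NULL.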
Example 1
Source File: AnyValInstances.scala From cleanframes with Apache License 2.0
package cleanframes.instances

import cleanframes.Cleaner
import org.apache.spark.sql.functions.{lower, trim, when, lit}
import org.apache.spark.sql.types._

trait AnyValInstances
  extends IntInstances
    with ByteInstances
    with CharInstances
    with ShortInstances
    with LongInstances
    with FloatInstances
    with DoubleInstances
    with BooleanInstances
    with NumericAnyValInstance

trait IntInstances {
  implicit lazy val integerType: SparkDataType[Int] = new SparkDataType[Int] {
    override def getDataType: DataType = IntegerType
  }
}

trait ByteInstances {
  implicit lazy val byteType: SparkDataType[Byte] = new SparkDataType[Byte] {
    override def getDataType: DataType = ByteType
  }
}

trait CharInstances {
  implicit val stdStringToChar: String => Char = _.charAt(0)
}

trait ShortInstances {
  implicit lazy val shortType: SparkDataType[Short] = new SparkDataType[Short] {
    override def getDataType: DataType = ShortType
  }
}

trait LongInstances {
  implicit lazy val longType: SparkDataType[Long] = new SparkDataType[Long] {
    override def getDataType: DataType = LongType
  }
}

trait FloatInstances {
  implicit lazy val floatType: SparkDataType[Float] = new SparkDataType[Float] {
    override def getDataType: DataType = FloatType
  }
}

trait DoubleInstances {
  implicit lazy val doubleType: SparkDataType[Double] = new SparkDataType[Double] {
    override def getDataType: DataType = DoubleType
  }
}

trait BooleanInstances {
  implicit lazy val booleanCleaner: Cleaner[Option[Boolean]] = {
    Cleaner.materialize { (frame, name, alias) =>
      List(
        when(
          trim(lower(frame.col(name.get))) === "true",
          lit(true) cast BooleanType
        ).otherwise(false) as alias.get
      )
    }
  }
}
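The part of this file that actually exercises when is booleanCleaner: it trims and lower-cases a string column and maps the value "true" to a Boolean true, with everything else (including nulls) falling through to false. A rough standalone sketch of the same when/otherwise logic outside the cleanframes machinery (the sample rows and the column name active are invented for illustration):

import org.apache.spark.sql.functions.{lit, lower, trim, when}
import org.apache.spark.sql.types.BooleanType
import spark.implicits._  // assumes a SparkSession named `spark` is in scope

val raw = Seq(" TRUE ", "false", "yes", null).toDF("active")

val cleaned = raw.select(
  when(trim(lower(raw("active"))) === "true", lit(true) cast BooleanType)
    .otherwise(false)
    .as("active"))
// " TRUE " -> true; "false", "yes" and null all fall through to false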
Example 2
Source File: package.scala From amadou with Apache License 2.0
package com.mediative

import org.apache.spark.sql._
import org.apache.spark.sql.functions.when

package object amadou {
  type Config = com.typesafe.config.Config
  type Gauge = io.prometheus.client.Gauge
  type Counter = io.prometheus.client.Counter

  implicit class SparkHdfsUrlReaderOps(val self: DataFrameReader) extends AnyVal {
    def csv(url: HdfsUrl*) = self.csv(url.map(_.toString): _*)
    def json(url: HdfsUrl*) = self.json(url.map(_.toString): _*)
    def load(url: HdfsUrl*) = self.load(url.map(_.toString): _*)
    def orc(url: HdfsUrl*) = self.orc(url.map(_.toString): _*)
    def parquet(url: HdfsUrl*) = self.parquet(url.map(_.toString): _*)
    def text(url: HdfsUrl*) = self.text(url.map(_.toString): _*)
    def textFile(url: HdfsUrl*) = self.textFile(url.map(_.toString): _*)
  }

  implicit class SparkHdfsUrlWriteOps[T](val self: DataFrameWriter[T]) extends AnyVal {
    def csv(url: HdfsUrl) = self.csv(url.toString)
    def json(url: HdfsUrl) = self.json(url.toString)
    def save(url: HdfsUrl) = self.save(url.toString)
    def orc(url: HdfsUrl) = self.orc(url.toString)
    def parquet(url: HdfsUrl) = self.parquet(url.toString)
    def text(url: HdfsUrl) = self.text(url.toString)
  }

  implicit class SymbolToStage(val self: Symbol) extends AnyVal {
    def stage[I, T](f: Stage.Context[I] => T) = Stage(self.name)(f)
    def source[T](read: Stage.Context[SparkSession] => Dataset[T]) = Stage.source(self.name)(read)
    def transform[S, T](transform: Stage.Context[Dataset[S]] => Dataset[T]) = Stage.transform(self.name)(transform)
    def sink[T](write: Stage.Context[Dataset[T]] => Unit) = Stage.sink(self.name)(write)
  }

  // In the original file `nullify` is an extension method over Column; the excerpt dropped
  // the enclosing implicit class, so the wrapper is reconstructed here (the name ColumnOps
  // is assumed, not taken from the source).
  implicit class ColumnOps(val self: Column) extends AnyVal {
    def nullify: Column = when(self === "null", null).otherwise(self)
  }
}
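The when usage here is the nullify helper at the end: data sources sometimes deliver missing values as the literal string "null", and nullify rewrites those to a real SQL NULL so that Spark's usual null handling (isNull, na.fill, outer joins) behaves as expected. A standalone sketch of the same pattern (toy data; the column name amount is invented):

import org.apache.spark.sql.functions.{col, when}
import spark.implicits._  // assumes a SparkSession named `spark` is in scope

val df = Seq("42", "null", "7").toDF("amount")

// The string "null" becomes a real NULL; every other value passes through unchanged.
val fixed = df.select(when(col("amount") === "null", null).otherwise(col("amount")).as("amount"))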
Example 3
Source File: TriangleCount.scala From graphframes with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{array, col, explode, when}

import org.graphframes.GraphFrame
import org.graphframes.GraphFrame.{DST, ID, LONG_DST, LONG_SRC, SRC}

class TriangleCount private[graphframes] (private val graph: GraphFrame) extends Arguments {

  def run(): DataFrame = {
    TriangleCount.run(graph)
  }
}

private object TriangleCount {

  private def run(graph: GraphFrame): DataFrame = {
    // Dedup edges by flipping them to have LONG_SRC < LONG_DST
    // TODO (when we drop support for Spark 1.4): Use functions greatest, least instead of UDFs
    val dedupedE = graph.indexedEdges
      .filter(s"$LONG_SRC != $LONG_DST")
      .selectExpr(
        s"if($LONG_SRC < $LONG_DST, $SRC, $DST) as $SRC",
        s"if($LONG_SRC < $LONG_DST, $DST, $SRC) as $DST")
      .dropDuplicates(Seq(SRC, DST))
    val g2 = GraphFrame(graph.vertices, dedupedE)
    // Because SRC < DST, there exists only one type of triangles:
    // - Non-cycle with one edge flipped. These are counted 1 time each by motif finding.
    val triangles = g2.find("(a)-[]->(b); (b)-[]->(c); (a)-[]->(c)")
    val triangleCounts = triangles
      .select(explode(array(col("a.id"), col("b.id"), col("c.id"))).as(ID))
      .groupBy(ID)
      .count()
    val v = graph.vertices
    val countsCol = when(col("count").isNull, 0L).otherwise(col("count"))
    val newV = v.join(triangleCounts, v(ID) === triangleCounts(ID), "left_outer")
      .select(countsCol.as(COUNT_ID) +: v.columns.map(v.apply): _*)
    newV
  }

  private val COUNT_ID = "count"
}
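In this example, when is doing null handling after a left outer join: vertices that never appear in a triangle have no matching row in triangleCounts, so their count column comes back NULL and is replaced with 0. The same trick in isolation (toy vertex and count tables, purely illustrative; coalesce(col("count"), lit(0L)) would work just as well):

import org.apache.spark.sql.functions.{col, when}
import spark.implicits._  // assumes a SparkSession named `spark` is in scope

val vertices = Seq(1L, 2L, 3L).toDF("id")
val counts = Seq((1L, 2L)).toDF("id", "count")  // only vertex 1 belongs to a triangle

val withCounts = vertices
  .join(counts, vertices("id") === counts("id"), "left_outer")
  .select(vertices("id"), when(col("count").isNull, 0L).otherwise(col("count")).as("count"))
// vertices 2 and 3 get 0 instead of NULL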
Example 4
Source File: PythonColumnTransformationExample.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperations.examples

import org.apache.spark.sql.functions.when

import ai.deepsense.deeplang.{DOperable, ExecutionContext}
import ai.deepsense.deeplang.doperables.{PythonColumnTransformer, TargetTypeChoices}
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice
import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice
import ai.deepsense.deeplang.doperations.PythonColumnTransformation
import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection

class PythonColumnTransformationExample
  extends AbstractOperationExample[PythonColumnTransformation] {

  val inputColumnName = "Weight"
  val outputColumnName = "WeightCutoff"

  // This is mocked because Python executor is not available in tests.
  class PythonColumnTransformationMock extends PythonColumnTransformation {
    override def execute(arg: DataFrame)(
        context: ExecutionContext): (DataFrame, PythonColumnTransformer) = {
      val sdf = arg.sparkDataFrame
      val resultSparkDataFrame = sdf.select(
        sdf("*"),
        when(sdf(inputColumnName) > 2.0, 2.0).otherwise(sdf(inputColumnName))
          .alias(outputColumnName))
      (DataFrame.fromSparkDataFrame(resultSparkDataFrame), mock[PythonColumnTransformer])
    }
  }

  override def dOperation: PythonColumnTransformation = {
    val op = new PythonColumnTransformationMock()
    val inPlace = NoInPlaceChoice()
      .setOutputColumn(s"$outputColumnName")
    val single = SingleColumnChoice()
      .setInputColumn(NameSingleColumnSelection(inputColumnName))
      .setInPlace(inPlace)
    op.transformer
      .setTargetType(TargetTypeChoices.DoubleTargetTypeChoice())
      .setSingleOrMultiChoice(single)
      .setCodeParameter(
        "def transform_value(value, column_name):\n" +
          " return min(value, 2.0)")
    op.set(op.transformer.extractParamMap())
  }

  override def fileNames: Seq[String] = Seq("example_animals")
}
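Inside the mock, when caps the Weight column at 2.0, mirroring the Python snippet min(value, 2.0). Stripped of the Seahorse operation wrappers, the Spark part reduces to something like the sketch below (the animal rows are invented; only the column names Weight and WeightCutoff come from the example above):

import org.apache.spark.sql.functions.{col, when}
import spark.implicits._  // assumes a SparkSession named `spark` is in scope

val animals = Seq(("cat", 1.5), ("horse", 4.2)).toDF("name", "Weight")

val capped = animals.select(
  col("*"),
  when(col("Weight") > 2.0, 2.0).otherwise(col("Weight")).alias("WeightCutoff"))
// 1.5 stays 1.5; 4.2 is cut off at 2.0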
Example 5
Source File: PythonColumnTransformationExample.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperations.examples

import org.apache.spark.sql.functions.when

import io.deepsense.deeplang.{DOperable, ExecutionContext}
import io.deepsense.deeplang.doperables.{PythonColumnTransformer, TargetTypeChoices}
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice
import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice
import io.deepsense.deeplang.doperations.PythonColumnTransformation
import io.deepsense.deeplang.params.selections.NameSingleColumnSelection

class PythonColumnTransformationExample
  extends AbstractOperationExample[PythonColumnTransformation] {

  val inputColumnName = "Weight"
  val outputColumnName = "WeightCutoff"

  // This is mocked because Python executor is not available in tests.
  class PythonColumnTransformationMock extends PythonColumnTransformation {
    override def execute(arg: DataFrame)(
        context: ExecutionContext): (DataFrame, PythonColumnTransformer) = {
      val sdf = arg.sparkDataFrame
      val resultSparkDataFrame = sdf.select(
        sdf("*"),
        when(sdf(inputColumnName) > 2.0, 2.0).otherwise(sdf(inputColumnName))
          .alias(outputColumnName))
      (DataFrame.fromSparkDataFrame(resultSparkDataFrame), mock[PythonColumnTransformer])
    }
  }

  override def dOperation: PythonColumnTransformation = {
    val op = new PythonColumnTransformationMock()
    val inPlace = NoInPlaceChoice()
      .setOutputColumn(s"$outputColumnName")
    val single = SingleColumnChoice()
      .setInputColumn(NameSingleColumnSelection(inputColumnName))
      .setInPlace(inPlace)
    op.transformer
      .setTargetType(TargetTypeChoices.DoubleTargetTypeChoice())
      .setSingleOrMultiChoice(single)
      .setCodeParameter(
        "def transform_value(value, column_name):\n" +
          " return min(value, 2.0)")
    op.set(op.transformer.extractParamMap())
  }

  override def fileNames: Seq[String] = Seq("example_animals")
}