org.apache.spark.sql.functions Scala Examples
The following examples show how to use org.apache.spark.sql.functions.
Each example is taken from an open-source project; the originating project and source file are named in the heading above each example.
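Before the individual examples, here is a minimal, self-contained sketch (not taken from any of the projects below; all names are illustrative) of the usual way to call into org.apache.spark.sql.functions: referencing members through the functions object, as most examples on this page do, and combining column expressions built with lit, col, struct and expr.

import org.apache.spark.sql.{SparkSession, functions}

object FunctionsQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("functions-demo").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "label")

    val result = df
      .withColumn("constant", functions.lit(42))                          // literal column
      .withColumn("upper_label", functions.upper(functions.col("label"))) // column transformation
      .withColumn("nested", functions.struct($"id", $"label"))            // nested struct column
      .filter(functions.expr("id > 1"))                                   // SQL snippet as a Column

    result.show()
    spark.stop()
  }
}

Importing org.apache.spark.sql.functions._ instead of going through the functions object is equally common; the examples below mostly prefer the qualified form.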
Example 1
Source File: NestedCaseClassesTest.scala From cleanframes with Apache License 2.0
package cleanframes

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.sql.functions
import org.scalatest.{FlatSpec, Matchers}

class NestedCaseClassesTest extends FlatSpec with Matchers with DataFrameSuiteBase {

  "Cleaner" should "compile and use a custom transformer for a custom type" in {
    import cleanframes.syntax._ // to use `.clean`
    import spark.implicits._

    // define test data for a dataframe
    val input = Seq(
      // @formatter:off
      ("1",         "1",         "1",         "1",         null),
      (null,        "2",         null,        "2",         "corrupted"),
      ("corrupted", null,        "corrupted", null,        "true"),
      ("4",         "corrupted", "4",         "4",         "false"),
      ("5",         "5",         "5",         "corrupted", "false"),
      ("6",         "6",         "6",         "6",         "true")
      // @formatter:on
    )
      // give column names that are known to you
      .toDF("col1", "col2", "col3", "col4", "col5")

    // import standard functions for conversions shipped with the library
    import cleanframes.instances.all._

    // !important: you need to give a new structure to allow to access sub elements
    val renamed = input.select(
      functions.struct(
        input.col("col1") as "a_col_1",
        input.col("col2") as "a_col_2"
      ) as "a",
      functions.struct(
        input.col("col3") as "b_col_1",
        input.col("col4") as "b_col_2"
      ) as "b",
      input.col("col5") as "c"
    )

    val result = renamed.clean[AB]
      .as[AB]
      .collect

    result should {
      contain theSameElementsAs Seq(
        // @formatter:off
        AB(A(Some(1), Some(1)), B(Some(1), Some(1.0)), Some(false)),
        AB(A(None,    Some(2)), B(None,    Some(2.0)), Some(false)),
        AB(A(None,    None),    B(None,    None),      Some(true)),
        AB(A(Some(4), None),    B(Some(4), Some(4.0)), Some(false)),
        AB(A(Some(5), Some(5)), B(Some(5), None),      Some(false)),
        AB(A(Some(6), Some(6)), B(Some(6), Some(6.0)), Some(true))
        // @formatter:on
      )
    }
  }
}

case class A(a_col_1: Option[Int], a_col_2: Option[Float])
case class B(b_col_1: Option[Float], b_col_2: Option[Double])
case class AB(a: A, b: B, c: Option[Boolean])
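The core move in this test is functions.struct, which packs flat columns into a nested column whose field names line up with a case class, so the frame can be read back with .as[...]. A stripped-down sketch of just that pattern, with hypothetical column and class names:

import org.apache.spark.sql.{SparkSession, functions}

object StructForNestedCaseClass {
  case class Inner(x: Option[Int], y: Option[Int])
  case class Outer(inner: Inner, z: Option[String])

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    val flat = Seq((1, 2, "a"), (3, 4, "b")).toDF("col1", "col2", "col3")

    // Group flat columns into a struct whose field names match the case class fields.
    val nested = flat.select(
      functions.struct(
        flat.col("col1") as "x",
        flat.col("col2") as "y"
      ) as "inner",
      flat.col("col3") as "z"
    )

    nested.as[Outer].collect().foreach(println)
    spark.stop()
  }
}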
Example 2
Source File: Cleaner.scala From cleanframes with Apache License 2.0
package cleanframes

import org.apache.spark.sql.{Column, DataFrame, functions}
import shapeless.labelled.FieldType
import shapeless.{::, HList, HNil, LabelledGeneric, Lazy, Witness}

trait Cleaner[A] {
  def clean(frame: DataFrame, name: Option[String], alias: Option[String]): List[Column]
}

object Cleaner {
  def apply[A](frame: DataFrame, name: Option[String], alias: Option[String])(implicit env: Cleaner[A]): DataFrame = {
    frame.select(
      env.clean(frame, name, alias): _*
    )
  }

  def materialize[A](func: (DataFrame, Option[String], Option[String]) => List[Column]): Cleaner[A] = new Cleaner[A] {
    override def clean(frame: DataFrame, name: Option[String], alias: Option[String]): List[Column] = func(frame, name, alias)
  }

  implicit val hnilCleaner: Cleaner[HNil] = materialize((_, _, _) => Nil)

  implicit def genericObjectCleaner[A, H <: HList](implicit
                                                   gen: LabelledGeneric.Aux[A, H],
                                                   hCleaner: Lazy[Cleaner[H]]): Cleaner[A] =
    materialize((frame, name, alias) => {
      val structColumn = functions.struct(
        hCleaner.value.clean(frame, name, alias): _*
      )

      List(
        alias
          .map(structColumn.as)
          .getOrElse(structColumn)
      )
    })

  implicit def hlistObjectCleaner[K <: Symbol, H, T <: HList](implicit
                                                              witness: Witness.Aux[K],
                                                              hCleaner: Lazy[Cleaner[H]],
                                                              tCleaner: Cleaner[T]): Cleaner[FieldType[K, H] :: T] = {
    val fieldName: String = witness.value.name

    materialize { (frame, name, alias) =>
      val columnName = alias match {
        case None | Some(`reserved_root_level_alias`) => fieldName
        case Some(alias) => s"$alias.$fieldName"
      }

      val hColumns = hCleaner.value.clean(frame, Some(columnName), alias = Some(fieldName))
      val tColumns = tCleaner.clean(frame, name, alias)

      hColumns ::: tColumns
    }
  }
}
Example 3
Source File: MergeClauseSuite.scala From spark-acid with Apache License 2.0
package com.qubole.spark.hiveacid.merge

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.{AnalysisException, functions}

class MergeClauseSuite extends SparkFunSuite {

  def insertClause(addCondition: Boolean = true): MergeWhenNotInsert = {
    if (addCondition) {
      MergeWhenNotInsert(Some(functions.expr("x > 2").expr),
        Seq(functions.col("x").expr, functions.col("y").expr))
    } else {
      MergeWhenNotInsert(None,
        Seq(functions.col("x").expr, functions.col("y").expr))
    }
  }

  def updateClause(addCondition: Boolean = true): MergeWhenUpdateClause = {
    if (addCondition) {
      val updateCondition = Some(functions.expr("a > 2").expr)
      MergeWhenUpdateClause(updateCondition,
        Map("b" -> functions.lit(3).expr), isStar = false)
    } else {
      MergeWhenUpdateClause(None,
        Map("b" -> functions.lit(3).expr), isStar = false)
    }
  }

  def deleteClause(addCondition: Boolean = true): MergeWhenDelete = {
    if (addCondition) {
      MergeWhenDelete(Some(functions.expr("a < 1").expr))
    } else {
      MergeWhenDelete(None)
    }
  }

  test("Validate MergeClauses") {
    val clauses = Seq(insertClause(), updateClause(), deleteClause())
    MergeWhenClause.validate(clauses)
  }

  test("Invalid MergeClause cases") {
    val invalidMerge = "MERGE Validation Error: "

    // empty clauses
    checkInvalidMergeClause(invalidMerge + MergeWhenClause.atleastOneClauseError, Seq())

    // multi update or insert clauses
    val multiUpdateClauses = Seq(updateClause(), updateClause(), insertClause())
    checkInvalidMergeClause(invalidMerge + MergeWhenClause.justOneClausePerTypeError, multiUpdateClauses)

    // multi match clauses with first clause without condition
    val invalidMultiMatch = Seq(updateClause(false), deleteClause())
    checkInvalidMergeClause(invalidMerge + MergeWhenClause.matchClauseConditionError, invalidMultiMatch)

    // invalid Update Clause
    val invalidUpdateClause = MergeWhenUpdateClause(None, Map(), isStar = false)
    val thrown = intercept[IllegalArgumentException] {
      MergeWhenClause.validate(Seq(invalidUpdateClause))
    }
    assert(thrown.getMessage === "UPDATE Clause in MERGE should have one or more SET Values")
  }

  private def checkInvalidMergeClause(invalidMessage: String, multiUpdateClauses: Seq[MergeWhenClause]) = {
    val thrown = intercept[AnalysisException] {
      MergeWhenClause.validate(multiUpdateClauses)
    }
    assert(thrown.message === invalidMessage)
  }
}
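Note how the suite calls .expr on the results of functions.expr, functions.col and functions.lit to obtain the underlying Catalyst Expression that the merge clause case classes expect. A minimal sketch of that conversion on its own (names are illustrative; printing simply shows the unresolved expression trees):

import org.apache.spark.sql.{SparkSession, functions}
import org.apache.spark.sql.catalyst.expressions.Expression

object ColumnToExpression {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()

    // functions.expr parses a SQL snippet into a Column; .expr exposes the Catalyst Expression behind it.
    val condition: Expression = functions.expr("x > 2").expr
    val columnRef: Expression = functions.col("x").expr
    val literal: Expression   = functions.lit(3).expr

    // The unresolved expression trees can be inspected or handed to APIs that expect Catalyst expressions.
    println(condition)
    println(columnRef)
    println(literal)

    spark.stop()
  }
}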
Example 4
Source File: VectorExplodeSpec.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl

import odkl.analysis.spark.TestEnv
import odkl.analysis.spark.util.SQLOperations
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{functions, Row}
import org.apache.spark.sql.types.{StructType, StructField, DoubleType}
import org.scalatest.FlatSpec

class VectorExplodeSpec extends FlatSpec with TestEnv with org.scalatest.Matchers with SQLOperations with WithModels with HasMetricsBlock {

  case class Point(id: Int, vector: Vector, mean: Vector)

  lazy val data = sqlc.createDataFrame(Seq(
    Point(1, Vectors.dense(1.0, 3.0), Vectors.dense(10.0, 30.0)),
    Point(2, Vectors.dense(2.0, 4.0), Vectors.sparse(2, Array(1), Array(20.0)))
  ))

  lazy val withMetadata = data.withColumn(
    "vector",
    data("vector").as("vector", new AttributeGroup("vector", Array[Attribute](
      NumericAttribute.defaultAttr.withName("fixed"),
      NumericAttribute.defaultAttr.withName("var")
    )).toMetadata()))
    .withColumn(
      "mean",
      data("mean").as("mean", new AttributeGroup("vector", Array[Attribute](
        NumericAttribute.defaultAttr.withName("fixed"),
        NumericAttribute.defaultAttr.withName("var")
      )).toMetadata()))

  lazy val explode = new VectorExplode().transform(withMetadata)

  "Explode " should " add data" in {
    val result = explode.orderBy("id", "value").collect()

    result(0).getInt(0) should be(1)
    result(0).getString(1) should be("fixed")
    result(0).getDouble(2) should be(1.0)
    result(0).getDouble(3) should be(10.0)

    result(1).getInt(0) should be(1)
    result(1).getString(1) should be("var")
    result(1).getDouble(2) should be(3.0)
    result(1).getDouble(3) should be(30.0)

    result(2).getInt(0) should be(2)
    result(2).getString(1) should be("fixed")
    result(2).getDouble(2) should be(2.0)
    result(2).isNullAt(3) should be(true)

    result(3).getInt(0) should be(2)
    result(3).getString(1) should be("var")
    result(3).getDouble(2) should be(4.0)
    result(3).getDouble(3) should be(20.0)
  }

  "Explode " should " create schema" in {
    val fields = explode.schema.fields

    fields(0).name should be("id")
    fields(1).name should be("value")
    fields(2).name should be("vector")
    fields(3).name should be("mean")
  }
}
Example 5
Source File: SimpleReproContext.scala From pravda-ml with Apache License 2.0
package org.apache.spark.repro

import org.apache.spark.ml.param.{Param, ParamPair, Params}
import org.apache.spark.ml.util.MLWritable
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession, functions}

class SimpleReproContext private
(spark: SparkSession, basePath: String, tags: Seq[(String, String)]) extends ReproContext {

  def this(basePath: String)(implicit spark: SparkSession) = this(spark, basePath, Seq())

  var accumulatedMetrics: Seq[DataFrame] = Seq()

  var accumulatedParams: Seq[(Seq[String], Iterable[ParamPair[_]])] = Seq()

  override def persistEstimator(estimator: MLWritable): Unit = {
    estimator.save(basePath + "/estimator")
  }

  override def persistModel(model: MLWritable): Unit = {
    model.save(basePath + "/model")
  }

  override def dive(tags: Seq[(String, String)]): ReproContext = new SimpleReproContext(
    spark, basePath, this.tags ++ tags)

  override def logParamPairs(params: Iterable[ParamPair[_]], path: Seq[String]): Unit =
    accumulatedParams = accumulatedParams :+ path -> params

  override def logMetircs(metrics: => DataFrame): Unit = accumulatedMetrics = accumulatedMetrics :+ metrics

  override def start(): Unit = {
    import spark.implicits._
    accumulatedParams.map {
      case (path, params) => params.view
        .map(x => x.param.name -> x.param.asInstanceOf[Param[Any]].jsonEncode(x.value))
        .toSeq
        .toDF("param", "value")
        .withColumn("path", functions.lit(path.mkString("/")))
    }.reduce(_ unionByName _)
      .write.parquet(taggedPrefix + "/params")
  }

  override def finish(): Unit = {
    accumulatedMetrics.reduceOption(_ unionByName _).foreach(
      _.write.parquet(taggedPrefix + "/metrics"))
  }

  private def taggedPrefix: String = {
    tags.map(x => x._1 + "=" + x._2).mkString(basePath + "/", "/", "")
  }
}
Example 6
Source File: VectorExplode.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl

import odkl.analysis.spark.util.collection.OpenHashMap
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.odkl.SparkSqlUtils
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, functions}

class VectorExplode(override val uid: String) extends
  Transformer with DefaultParamsWritable {

  val valueCol = new Param[String](this, "valueCol", "Name of the column to store value name.")

  def setValueCol(value: String): this.type = set(valueCol, value)

  setDefault(valueCol -> "value")

  def this() = this(Identifiable.randomUID("vectorExplode"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val vectors: Array[StructField] = dataset.schema.fields.filter(_.dataType.isInstanceOf[VectorUDT])

    val resultSchema = StructType(Seq(
      StructField($(valueCol), StringType, nullable = false)) ++
      vectors.map(f => StructField(f.name, DoubleType, nullable = true))
    )

    val arraySize = resultSchema.size - 1

    val names: Array[Map[Int, String]] = vectors.map(
      f => {
        AttributeGroup.fromStructField(f).attributes
          .map(attributes => attributes.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
          .getOrElse(Map())
      })

    val maxCapacity = names.map(_.size).max

    val explodeVectors: (Row => Array[Row]) = (r: Row) => {
      val accumulator = new OpenHashMap[String, Array[Double]](maxCapacity)

      for (i <- 0 until r.length) {
        val vector = r.getAs[Vector](i)

        vector.foreachActive((index, value) => {
          val name = names(i).getOrElse(index, s"${vectors(i).name}_$index")

          accumulator.changeValue(
            name,
            Array.tabulate(arraySize) { ind => if (i == ind) value else Double.NaN },
            v => { v(i) = value; v })
        })
      }

      accumulator.map(x => new GenericRowWithSchema(
        (Seq(x._1) ++ x._2.toSeq.map(v => if (v.isNaN) null else v)).toArray,
        resultSchema)).toArray
    }

    val vectorsStruct = functions.struct(vectors.map(f => dataset(f.name)): _*)
    val explodeUDF = SparkSqlUtils.customUDF(explodeVectors, ArrayType(resultSchema), Some(Seq(vectorsStruct.expr.dataType)))

    val expression = functions.explode(explodeUDF(vectorsStruct))

    dataset
      .withColumn(uid, expression)
      .select(
        dataset.schema.fields.filterNot(_.dataType.isInstanceOf[VectorUDT]).map(f => dataset(f.name)) ++
          resultSchema.fields.map(f => functions.expr(s"$uid.${f.name}").as(f.name)): _*)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields.map(x =>
      x.dataType match {
        case vector: VectorUDT => StructField(x.name, typeFromVector(x))
        case _ => x
      }
    ))

  def typeFromVector(field: StructField): StructType = {
    val attributes = AttributeGroup.fromStructField(field)
    StructType(attributes.attributes
      .map(_.map(a => a.name.getOrElse(s"_${a.index.get}")))
      .getOrElse(Array.tabulate(attributes.size) { i => s"_$i" })
      .map(name => StructField(name, DoubleType, nullable = false)))
  }
}
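The transformer above ultimately relies on functions.explode to turn the array produced by its UDF into one output row per element. A much simpler hedged sketch of explode by itself, on a plain array column with illustrative names:

import org.apache.spark.sql.{SparkSession, functions}

object ExplodeBasics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq(
      (1, Seq(1.0, 3.0)),
      (2, Seq(2.0, 4.0))
    ).toDF("id", "values")

    // explode emits one row per array element while keeping the other columns.
    df.select($"id", functions.explode($"values").as("value")).show()

    spark.stop()
  }
}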
Example 7
Source File: HasConfigurations.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.hyperopt

import org.apache.spark.ml.odkl.ModelWithSummary
import org.apache.spark.ml.odkl.ModelWithSummary.Block
import org.apache.spark.ml.param.{Param, Params}
import org.apache.spark.repro.MetricsExtractor
import org.apache.spark.repro.ReproContext.logMetircs
import org.apache.spark.sql.{DataFrame, functions}

trait HasConfigurations extends Params with MetricsExtractor {
  val configurations: Block = Block("configurations")

  val configurationIndexColumn = new Param[String](this, "configurationIndexColumn",
    "Name of the column to store id of config for further analysis.")
  val resultingMetricColumn = new Param[String](this, "resultingMetricColumn",
    "Name of the column to store resulting metrics for further analysis.")
  val errorColumn = new Param[String](this, "errorColumn",
    "Name of the column to store text of the error if occurs.")

  def getConfigurationIndexColumn: String = $(configurationIndexColumn)

  def setConfigurationIndexColumn(value: String): this.type = set(configurationIndexColumn, value)

  def getResultingMetricColumn: String = $(resultingMetricColumn)

  def setResultingMetricColumn(value: String): this.type = set(resultingMetricColumn, value)

  def getErrorColumn: String = $(errorColumn)

  def setErrorColumn(value: String): this.type = set(errorColumn, value)

  setDefault(
    configurationIndexColumn -> "configurationIndex",
    resultingMetricColumn -> "resultingMetric",
    errorColumn -> "error"
  )

  protected def extractImpl(model: ModelWithSummary[_]): Option[DataFrame] = {
    // Report only resulting metrics to the context assuming that detailed metrics
    // were reported by forks.
    model.summary.blocks.get(configurations).map(data => data.select(
      data(getConfigurationIndexColumn).as("invertedStep"),
      data(getResultingMetricColumn).as("value"),
      functions.lit("target").as("metric")
    ))
  }
}
Example 8
Source File: NameAssigner.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasInputCols
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Dataset, functions}
import org.apache.spark.sql.types.{Metadata, StringType, StructField, StructType}

class NameAssigner(override val uid: String) extends Transformer with HasInputCols {

  def setInputCols(column: String*): this.type = set(inputCols, column.toArray)

  def this() = this(Identifiable.randomUID("NameAssigner"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    $(inputCols)

    $(inputCols).foldLeft(dataset.toDF)((data, column) => {
      val metadata: Metadata = dataset.schema(column).metadata
      val attributes = AttributeGroup.fromStructField(
        StructField(column, new VectorUDT, nullable = false, metadata = metadata))

      val map = attributes.attributes
        .map(arr => arr.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
        .getOrElse(Map())

      val func = functions.udf[String, Number](x => if (x == null) {
        null
      } else {
        val i = x.intValue()
        map.getOrElse(i, i.toString)
      })

      data.withColumn(column, func(data(column)).as(column, metadata))
    }).toDF
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.map(f => if ($(inputCols).contains(f.name)) {
      StructField(f.name, StringType, f.nullable, f.metadata)
    } else {
      f
    }))
}
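NameAssigner wraps a plain Scala lookup function as a column expression with functions.udf. A minimal sketch of that pattern in isolation; unlike the transformer above it uses an unboxed Int argument and therefore skips the null handling, and all names are illustrative:

import org.apache.spark.sql.{SparkSession, functions}

object UdfBasics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    val names = Map(0 -> "fixed", 1 -> "var")

    // Wrap a Scala function as a UDF that maps an index to its attribute name.
    val indexToName = functions.udf((i: Int) => names.getOrElse(i, i.toString))

    val df = Seq(0, 1, 2).toDF("index")
    df.withColumn("name", indexToName($"index")).show()

    spark.stop()
  }
}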
Example 9
Source File: XGBoostUtils.scala From pravda-ml with Apache License 2.0
package ml.dmlc.xgboost4j.scala.spark

import ml.dmlc.xgboost4j.scala.Booster
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.{BooleanParam, Params}
import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol}
import org.apache.spark.sql.{Dataset, functions}

object XGBoostUtils {
  def getBooster(x: XGBoostClassificationModel): Booster = x._booster

  def getBooster(x: XGBoostRegressionModel): Booster = x._booster
}

trait OkXGBoostParams extends HasFeaturesCol with HasPredictionCol {
  this: Params =>

  val densifyInput = new BooleanParam(this, "densifyInput",
    "In order to fix the difference between spark and xgboost sparsity treatment")
  val predictAsDouble = new BooleanParam(this, "predictAsDouble",
    "Whether to cast XGBoost prediction to double, matching common behavior for other predictors.")
  val addRawTrees = new BooleanParam(this, "addRawTrees",
    "Whether to add raw trees block to model summary.")
  val addSignificance = new BooleanParam(this, "addSignificance",
    "Whether to add feature significance block to model summary.")

  def setAddSignificance(value: Boolean): this.type = set(addSignificance, value)

  def setAddRawTrees(value: Boolean): this.type = set(addRawTrees, value)

  def setDensifyInput(value: Boolean): this.type = set(densifyInput, value)

  def setPredictAsDouble(value: Boolean): this.type = set(predictAsDouble, value)

  protected def densifyIfNeeded(dataset: Dataset[_]): Dataset[_] = {
    if ($(densifyInput)) {
      val densify = functions.udf((x: Vector) => x.toDense)
      val col = getFeaturesCol
      val metadata = dataset.schema(col).metadata

      dataset.withColumn(
        col,
        densify(dataset(col)).as(col, metadata))
    } else {
      dataset
    }
  }
}

trait OkXGBoostClassifierParams extends XGBoostClassifierParams with OkXGBoostParams

trait OkXGBoostRegressorParams extends XGBoostRegressorParams with OkXGBoostParams
Example 10
Source File: SparkTableTest.scala From morpheus with Apache License 2.0
package org.opencypher.morpheus.impl

import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, functions}
import org.opencypher.morpheus.impl.table.SparkTable.{DataFrameTable, _}
import org.opencypher.morpheus.testing.MorpheusTestSuite
import org.opencypher.okapi.testing.Bag
import org.opencypher.okapi.testing.Bag._
import org.scalatest.Matchers
import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks

import scala.collection.JavaConverters._
import scala.collection.mutable.WrappedArray.ofLong

class SparkTableTest extends MorpheusTestSuite with Matchers with ScalaCheckDrivenPropertyChecks {
  import morpheus.sparkSession.sqlContext.implicits._

  it("it should cast integer columns to long") {
    val df = sparkSession.createDataFrame(List(
      Row(1, 2L, Array(42), Array(42), Array(42L), Row(42, 42L))
    ).asJava, StructType(Seq(
      StructField("a", IntegerType, nullable = true),
      StructField("b", LongType, nullable = false),
      StructField("c", ArrayType(IntegerType, containsNull = true), nullable = true),
      StructField("d", ArrayType(IntegerType, containsNull = false), nullable = false),
      StructField("e", ArrayType(LongType, containsNull = false), nullable = false),
      StructField("f", StructType(Seq(
        StructField("foo", IntegerType, true),
        StructField("bar", LongType, false)
      )), nullable = true)
    )))

    val updatedDf = df.castToLong

    updatedDf.schema should equal(StructType(Seq(
      StructField("a", LongType, nullable = true),
      StructField("b", LongType, nullable = false),
      StructField("c", ArrayType(LongType, containsNull = true), nullable = true),
      StructField("d", ArrayType(LongType, containsNull = false), nullable = false),
      StructField("e", ArrayType(LongType, containsNull = false), nullable = false),
      StructField("f", StructType(Seq(
        StructField("foo", LongType, true),
        StructField("bar", LongType, true)
      )), nullable = false)
    )))

    updatedDf.collect().toBag should equal(Bag(
      Row(1L, 2L, new ofLong(Array(42L)), new ofLong(Array(42L)), new ofLong(Array(42L)), Row(42L, 42L))
    ))
  }

  // This test verifies that https://issues.apache.org/jira/browse/SPARK-26572 is still fixed
  describe("distinct workaround") {
    it("detects if the Spark bug is still fixed") {
      val baseTable = Seq(1, 1).toDF("idx")

      // Uses Spark distinct
      val distinctWithId = baseTable.distinct.withColumn("id", functions.monotonically_increasing_id())

      val monotonicallyOnLeft = distinctWithId.join(baseTable, "idx")

      // Bug in Spark: "monotonically_increasing_id" is pushed down when it shouldn't be. Push down only happens
      // when the DF containing the "monotonically_increasing_id" expression is on the left side of the join.
      monotonicallyOnLeft.select("id").collect().map(_.get(0)).distinct.length shouldBe 1
    }
  }
}
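The test above checks that SPARK-26572 stays fixed: monotonically_increasing_id is non-deterministic, and the bug caused it to be recomputed when pushed through a join. Independent of that fix, a common defensive pattern is to materialize generated ids before reusing the DataFrame; a hedged sketch of that approach (not taken from the morpheus code base, names are illustrative):

import org.apache.spark.sql.{SparkSession, functions}
import org.apache.spark.storage.StorageLevel

object StableGeneratedIds {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    val base = Seq("a", "a", "b").toDF("key")

    // monotonically_increasing_id is non-deterministic; persisting pins the generated values
    // so later joins or repeated actions see the same ids.
    val withId = base.distinct()
      .withColumn("id", functions.monotonically_increasing_id())
      .persist(StorageLevel.MEMORY_ONLY)
    withId.count() // force materialization

    withId.join(base, "key").show()

    spark.stop()
  }
}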
Example 11
Source File: EncodeLongTest.scala From morpheus with Apache License 2.0
package org.opencypher.morpheus.impl.encoders

import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
import org.apache.spark.sql.catalyst.expressions.{Alias, GenericInternalRow}
import org.apache.spark.sql.functions
import org.apache.spark.sql.functions.typedLit
import org.opencypher.morpheus.api.value.MorpheusElement._
import org.opencypher.morpheus.impl.expressions.EncodeLong
import org.opencypher.morpheus.impl.expressions.EncodeLong._
import org.opencypher.morpheus.testing.MorpheusTestSuite
import org.scalatestplus.scalacheck.Checkers

class EncodeLongTest extends MorpheusTestSuite with Checkers {

  it("encodes longs correctly") {
    check((l: Long) => {
      val scala = l.encodeAsMorpheusId.toList
      val spark = typedLit[Long](l).encodeLongAsMorpheusId.expr.eval().asInstanceOf[Array[Byte]].toList
      scala === spark
    }, minSuccessful(1000))
  }

  it("encoding/decoding is symmetric") {
    check((l: Long) => {
      val encoded = l.encodeAsMorpheusId
      val decoded = decodeLong(encoded)
      decoded === l
    }, minSuccessful(1000))
  }

  it("scala version encodes longs correctly") {
    0L.encodeAsMorpheusId.toList should equal(List(0.toByte))
  }

  it("spark version encodes longs correctly") {
    typedLit[Long](0L).encodeLongAsMorpheusId.expr.eval().asInstanceOf[Array[Byte]].array.toList should equal(List(0.toByte))
  }

  describe("Spark expression") {

    it("converts longs into byte arrays using expression interpreter") {
      check((l: Long) => {
        val positive = l & Long.MaxValue
        val inputRow = new GenericInternalRow(Array[Any](positive))
        val encodeLong = EncodeLong(functions.lit(positive).expr)
        val interpreted = encodeLong.eval(inputRow).asInstanceOf[Array[Byte]]
        val decoded = decodeLong(interpreted)

        decoded === positive
      }, minSuccessful(1000))
    }

    it("converts longs into byte arrays using expression code gen") {
      check((l: Long) => {
        val positive = l & Long.MaxValue
        val inputRow = new GenericInternalRow(Array[Any](positive))
        val encodeLong = EncodeLong(functions.lit(positive).expr)
        val plan = GenerateMutableProjection.generate(Alias(encodeLong, s"Optimized($encodeLong)")() :: Nil)
        val codegen = plan(inputRow).get(0, encodeLong.dataType).asInstanceOf[Array[Byte]]
        val decoded = decodeLong(codegen)

        decoded === positive
      }, minSuccessful(1000))
    }
  }
}
Example 12
Source File: DataFrameOutputExample.scala From morpheus with Apache License 2.0
// tag::full-example[]
package org.opencypher.morpheus.examples

import org.apache.spark.sql.{DataFrame, functions}
import org.opencypher.morpheus.api.MorpheusSession
import org.opencypher.morpheus.api.MorpheusSession._
import org.opencypher.morpheus.util.App
import org.opencypher.okapi.api.graph.CypherResult

object DataFrameOutputUsingAliasExample extends App {
  // 1) Create Morpheus session and retrieve Spark session
  implicit val morpheus: MorpheusSession = MorpheusSession.local()

  // 2) Load social network data via case class instances
  val socialNetwork = morpheus.readFrom(SocialNetworkData.persons, SocialNetworkData.friendships)

  // 3) Query graph with Cypher
  val results = socialNetwork.cypher(
    """|MATCH (a:Person)-[r:FRIEND_OF]->(b)
       |RETURN a.name AS person1, b.name AS person2, r.since AS friendsSince""".stripMargin)

  // 4) Extract DataFrame representing the query result
  val df: DataFrame = results.records.asDataFrame

  // 5) Select aliased return items from the query result
  val projection: DataFrame = df
    .select("person1", "friendsSince", "person2")
    .orderBy(functions.to_date(df.col("friendsSince"), "dd/mm/yyyy"))

  projection.show()
}
// end::full-example[]
Example 13
Source File: YelpHelpers.scala From morpheus with Apache License 2.0
package org.opencypher.morpheus.integration.yelp

import org.apache.spark.sql.types.{ArrayType, DateType, IntegerType, LongType}
import org.apache.spark.sql.{Column, DataFrame, SparkSession, functions}
import org.opencypher.morpheus.api.io.GraphElement.sourceIdKey
import org.opencypher.morpheus.api.io.Relationship.{sourceEndNodeKey, sourceStartNodeKey}
import org.opencypher.morpheus.impl.table.SparkTable._
import org.opencypher.morpheus.integration.yelp.YelpConstants._

object YelpHelpers {

  case class YelpTables(
    userDf: DataFrame,
    businessDf: DataFrame,
    reviewDf: DataFrame
  )

  def loadYelpTables(inputPath: String)(implicit spark: SparkSession): YelpTables = {
    import spark.implicits._

    log("read business.json", 2)
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")
    log("read review.json", 2)
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")
    log("read user.json", 2)
    val rawUserDf = spark.read.json(s"$inputPath/user.json")

    val businessDf = rawBusinessDf.select($"business_id".as(sourceIdKey), $"business_id", $"name", $"address", $"city", $"state")
    val reviewDf = rawReviewDf.select($"review_id".as(sourceIdKey), $"user_id".as(sourceStartNodeKey), $"business_id".as(sourceEndNodeKey), $"stars", $"date".cast(DateType))
    val userDf = rawUserDf.select(
      $"user_id".as(sourceIdKey),
      $"name",
      $"yelping_since".cast(DateType),
      functions.split($"elite", ",").cast(ArrayType(LongType)).as("elite"))

    YelpTables(userDf, businessDf, reviewDf)
  }

  def printYelpStats(inputPath: String)(implicit spark: SparkSession): Unit = {
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")

    import spark.implicits._

    rawBusinessDf.select($"city", $"state").distinct().show()
    rawBusinessDf.withColumnRenamed("business_id", "id")
      .join(rawReviewDf, $"id" === $"business_id")
      .groupBy($"city", $"state")
      .count().as("count")
      .orderBy($"count".desc, $"state".asc)
      .show(100)
  }

  def extractYelpCitySubset(inputPath: String, outputPath: String, city: String)(implicit spark: SparkSession): Unit = {
    import spark.implicits._

    def emailColumn(userId: String): Column = functions.concat($"$userId", functions.lit("@yelp.com"))

    val rawUserDf = spark.read.json(s"$inputPath/user.json")
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")

    val businessDf = rawBusinessDf.filter($"city" === city)
    val reviewDf = rawReviewDf
      .join(businessDf, Seq("business_id"), "left_semi")
      .withColumn("user_email", emailColumn("user_id"))
      .withColumnRenamed("stars", "stars_tmp")
      .withColumn("stars", $"stars_tmp".cast(IntegerType))
      .drop("stars_tmp")
    val userDf = rawUserDf
      .join(reviewDf, Seq("user_id"), "left_semi")
      .withColumn("email", emailColumn("user_id"))
    val friendDf = userDf
      .select($"email".as("user1_email"), functions.explode(functions.split($"friends", ", ")).as("user2_id"))
      .withColumn("user2_email", emailColumn("user2_id"))
      .select(s"user1_email", s"user2_email")

    businessDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/business.json")
    reviewDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/review.json")
    userDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/user.json")
    friendDf.write.json(s"$outputPath/$cityGraphName/$yelpBookDB/friend.json")
  }

  implicit class DataFrameOps(df: DataFrame) {
    def prependIdColumn(idColumn: String, prefix: String): DataFrame =
      df.transformColumns(idColumn)(column => functions.concat(functions.lit(prefix), column).as(idColumn))
  }
}
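Two functions do most of the work in extractYelpCitySubset: functions.split plus functions.explode expand the comma-separated friends string into one row per friend, and functions.concat builds the synthetic e-mail addresses. A stripped-down sketch of those two moves with illustrative column names:

import org.apache.spark.sql.{SparkSession, functions}

object SplitExplodeConcat {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    val users = Seq(
      ("u1", "u2, u3"),
      ("u2", "u1")
    ).toDF("user_id", "friends")

    val friendPairs = users
      // split the comma-separated list, then explode it into one row per friend
      .select($"user_id", functions.explode(functions.split($"friends", ", ")).as("friend_id"))
      // derive a synthetic e-mail address from the id
      .withColumn("user_email", functions.concat($"user_id", functions.lit("@yelp.com")))

    friendPairs.show()
    spark.stop()
  }
}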
Example 14
Source File: EdgeListDataSource.scala From morpheus with Apache License 2.0
package org.opencypher.morpheus.api.io.edgelist

import org.apache.spark.sql.functions
import org.apache.spark.sql.types.{LongType, StructField, StructType}
import org.opencypher.morpheus.api.MorpheusSession
import org.opencypher.morpheus.api.io.GraphElement.sourceIdKey
import org.opencypher.morpheus.api.io.Relationship.{sourceEndNodeKey, sourceStartNodeKey}
import org.opencypher.morpheus.api.io.edgelist.EdgeListDataSource._
import org.opencypher.morpheus.api.io.{MorpheusNodeTable, MorpheusRelationshipTable}
import org.opencypher.morpheus.schema.MorpheusSchema
import org.opencypher.okapi.api.graph.{GraphName, PropertyGraph}
import org.opencypher.okapi.api.io.PropertyGraphDataSource
import org.opencypher.okapi.api.schema.{PropertyGraphSchema, PropertyKeys}
import org.opencypher.okapi.impl.exception.UnsupportedOperationException

object EdgeListDataSource {
  val NODE_LABEL = "V"
  val REL_TYPE = "E"
  val GRAPH_NAME = GraphName("graph")

  val SCHEMA: PropertyGraphSchema = MorpheusSchema.empty
    .withNodePropertyKeys(Set(NODE_LABEL), PropertyKeys.empty)
    .withRelationshipPropertyKeys(REL_TYPE, PropertyKeys.empty)
}

case class EdgeListDataSource(path: String, options: Map[String, String] = Map.empty)(implicit morpheus: MorpheusSession)
  extends PropertyGraphDataSource {

  override def hasGraph(name: GraphName): Boolean = name == GRAPH_NAME

  override def graph(name: GraphName): PropertyGraph = {
    val reader = options.foldLeft(morpheus.sparkSession.read) {
      case (current, (key, value)) => current.option(key, value)
    }

    val rawRels = reader
      .schema(StructType(Seq(
        StructField(sourceStartNodeKey, LongType),
        StructField(sourceEndNodeKey, LongType))))
      .csv(path)
      .withColumn(sourceIdKey, functions.monotonically_increasing_id())
      .select(sourceIdKey, sourceStartNodeKey, sourceEndNodeKey)

    val rawNodes = rawRels
      .select(rawRels.col(sourceStartNodeKey).as(sourceIdKey))
      .union(rawRels.select(rawRels.col(sourceEndNodeKey).as(sourceIdKey)))
      .distinct()

    morpheus.graphs.create(MorpheusNodeTable(Set(NODE_LABEL), rawNodes), MorpheusRelationshipTable(REL_TYPE, rawRels))
  }

  override def schema(name: GraphName): Option[PropertyGraphSchema] = Some(SCHEMA)

  override def store(name: GraphName, graph: PropertyGraph): Unit =
    throw UnsupportedOperationException("Storing an edge list is not supported")

  override def delete(name: GraphName): Unit =
    throw UnsupportedOperationException("Deleting an edge list is not supported")

  override val graphNames: Set[GraphName] = Set(GRAPH_NAME)
}
Example 15
Source File: ConcatColumnBenchmark.scala From morpheus with Apache License 2.0
package org.opencypher.morpheus.jmh

import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{DataFrame, SparkSession, functions}
import org.apache.spark.storage.StorageLevel
import org.opencypher.morpheus.impl.MorpheusFunctions
import org.opencypher.morpheus.impl.expressions.EncodeLong._
import org.openjdk.jmh.annotations._

@State(Scope.Benchmark)
@BenchmarkMode(Array(Mode.AverageTime))
class ConcatColumnBenchmark {

  implicit var sparkSession: SparkSession = _

  var df: DataFrame = _

  @Setup
  def setUp(): Unit = {
    sparkSession = SparkSession.builder().master("local[*]").getOrCreate()
    val fromRow = 100000000L
    val numRows = 1000000
    val rangeDf = sparkSession.range(fromRow, fromRow + numRows).toDF("i")
    val indexCol = rangeDf.col("i")
    df = rangeDf
      .withColumn("s", indexCol.cast(StringType))
      .withColumn("b", indexCol.encodeLongAsMorpheusId)
      .partitionAndCache
  }

  @Benchmark
  def concatWs(): Int = {
    val result = df.withColumn("c", functions.concat_ws("|", df.col("i"), df.col("s"), df.col("b")))
    result.select("c").collect().length
  }

  @Benchmark
  def serialize(): Int = {
    val result = df.withColumn("c", MorpheusFunctions.serialize(df.col("i"), df.col("s"), df.col("b")))
    result.select("c").collect().length
  }

  implicit class DataFrameSetup(df: DataFrame) {

    def partitionAndCache: DataFrame = {
      val cached = df.repartition(10).persist(StorageLevel.MEMORY_ONLY)
      cached.count()
      cached
    }
  }
}
Example 16
Source File: DeleteScalaSuite.scala From delta with Apache License 2.0
package org.apache.spark.sql.delta

import org.apache.spark.sql.delta.test.DeltaSQLCommandTest
import io.delta.tables.{DeltaTable, DeltaTableTestUtils}
import org.apache.spark.sql.{functions, Row}

class DeleteScalaSuite extends DeleteSuiteBase with DeltaSQLCommandTest {

  import testImplicits._

  test("delete usage test - without condition") {
    append(Seq((1, 10), (2, 20), (3, 30), (4, 40)).toDF("key", "value"))
    val table = io.delta.tables.DeltaTable.forPath(tempPath)
    table.delete()
    checkAnswer(readDeltaTable(tempPath), Nil)
  }

  test("delete usage test - with condition") {
    append(Seq((1, 10), (2, 20), (3, 30), (4, 40)).toDF("key", "value"))
    val table = io.delta.tables.DeltaTable.forPath(tempPath)
    table.delete("key = 1 or key = 2")
    checkAnswer(readDeltaTable(tempPath), Row(3, 30) :: Row(4, 40) :: Nil)
  }

  test("delete usage test - with Column condition") {
    append(Seq((1, 10), (2, 20), (3, 30), (4, 40)).toDF("key", "value"))
    val table = io.delta.tables.DeltaTable.forPath(tempPath)
    table.delete(functions.expr("key = 1 or key = 2"))
    checkAnswer(readDeltaTable(tempPath), Row(3, 30) :: Row(4, 40) :: Nil)
  }

  override protected def executeDelete(target: String, where: String = null): Unit = {

    def parse(tableNameWithAlias: String): (String, Option[String]) = {
      tableNameWithAlias.split(" ").toList match {
        case tableName :: Nil => tableName -> None // just table name
        case tableName :: alias :: Nil =>
          // tablename SPACE alias OR tab SPACE lename
          val ordinary = (('a' to 'z') ++ ('A' to 'Z') ++ ('0' to '9')).toSet
          if (!alias.forall(ordinary.contains(_))) {
            (tableName + " " + alias) -> None
          } else {
            tableName -> Some(alias)
          }
        case _ =>
          fail(s"Could not build parse '$tableNameWithAlias' for table and optional alias")
      }
    }

    val deltaTable: DeltaTable = {
      val (tableNameOrPath, optionalAlias) = parse(target)
      val isPath: Boolean = tableNameOrPath.startsWith("delta.")
      val table = if (isPath) {
        val path = tableNameOrPath.stripPrefix("delta.`").stripSuffix("`")
        io.delta.tables.DeltaTable.forPath(spark, path)
      } else {
        DeltaTableTestUtils.createTable(spark.table(tableNameOrPath), DeltaLog.forTable(spark, tableNameOrPath))
      }
      optionalAlias.map(table.as(_)).getOrElse(table)
    }

    if (where != null) {
      deltaTable.delete(where)
    } else {
      deltaTable.delete()
    }
  }
}
Example 17
Source File: DeltaTableOperations.scala From delta with Apache License 2.0
package io.delta.tables.execution

import scala.collection.Map

import org.apache.spark.sql.delta.{DeltaErrors, DeltaHistoryManager, DeltaLog, PreprocessTableUpdate}
import org.apache.spark.sql.delta.commands.{DeleteCommand, DeltaGenerateCommand, VacuumCommand}
import org.apache.spark.sql.delta.util.AnalysisHelper
import io.delta.tables.DeltaTable

import org.apache.spark.sql.{functions, Column, DataFrame, Dataset}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression}
import org.apache.spark.sql.catalyst.plans.logical._

trait DeltaTableOperations extends AnalysisHelper { self: DeltaTable =>

  protected def executeDelete(condition: Option[Expression]): Unit = improveUnsupportedOpError {
    val delete = DeleteFromTable(self.toDF.queryExecution.analyzed, condition)
    toDataset(sparkSession, delete)
  }

  protected def executeHistory(deltaLog: DeltaLog, limit: Option[Int]): DataFrame = {
    val history = new DeltaHistoryManager(deltaLog)
    val spark = self.toDF.sparkSession
    spark.createDataFrame(history.getHistory(limit))
  }

  protected def executeGenerate(tblIdentifier: String, mode: String): Unit = {
    val tableId: TableIdentifier = sparkSession
      .sessionState
      .sqlParser
      .parseTableIdentifier(tblIdentifier)
    val generate = DeltaGenerateCommand(mode, tableId)
    generate.run(sparkSession)
  }

  protected def executeUpdate(
      set: Map[String, Column],
      condition: Option[Column]): Unit = improveUnsupportedOpError {
    val assignments = set.map { case (targetColName, column) =>
      Assignment(UnresolvedAttribute.quotedString(targetColName), column.expr)
    }.toSeq
    val update = UpdateTable(self.toDF.queryExecution.analyzed, assignments, condition.map(_.expr))
    toDataset(sparkSession, update)
  }

  protected def executeVacuum(
      deltaLog: DeltaLog,
      retentionHours: Option[Double]): DataFrame = {
    VacuumCommand.gc(sparkSession, deltaLog, false, retentionHours)
    sparkSession.emptyDataFrame
  }

  protected def toStrColumnMap(map: Map[String, String]): Map[String, Column] = {
    map.toSeq.map { case (k, v) => k -> functions.expr(v) }.toMap
  }

  protected def sparkSession = self.toDF.sparkSession
}
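toStrColumnMap illustrates a small but handy trick: accepting SQL snippets as plain strings and turning them into Column objects with functions.expr. A minimal hedged sketch of the same idea applied directly to a DataFrame (column names and expressions are illustrative):

import org.apache.spark.sql.{Column, SparkSession, functions}

object ExprFromStrings {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq((1, 10), (2, 20)).toDF("key", "value")

    // User-supplied SQL snippets keyed by target column name, converted to Columns via functions.expr.
    val updates: Map[String, Column] =
      Map("value" -> "value + 1", "doubled" -> "value * 2").map { case (k, v) => k -> functions.expr(v) }

    // Apply each derived column to the DataFrame.
    val updated = updates.foldLeft(df) { case (acc, (name, col)) => acc.withColumn(name, col) }
    updated.show()

    spark.stop()
  }
}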
Example 18
Source File: DataFrameUtils.scala From m3d-engine with Apache License 2.0
package com.adidas.analytics.util

import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, functions}
import org.slf4j.{Logger, LoggerFactory}

object DataFrameUtils {

  private val logger: Logger = LoggerFactory.getLogger(getClass)

  type FilterFunction = Row => Boolean

  type PartitionCriteria = Seq[(String, String)]

  def mapPartitionsToDirectories(partitionCriteria: PartitionCriteria): Seq[String] = {
    partitionCriteria.map {
      case (columnName, columnValue) => s"$columnName=$columnValue"
    }
  }

  def buildPartitionsCriteriaMatcherFunc(multiplePartitionsCriteria: Seq[PartitionCriteria], schema: StructType): FilterFunction = {
    val targetPartitions = multiplePartitionsCriteria.flatten.map(_._1).toSet
    val fieldNameToMatchFunctionMapping = schema.fields.filter {
      case StructField(name, _, _, _) => targetPartitions.contains(name)
    }.map {
      case StructField(name, _: ByteType, _, _)    => name -> ((r: Row, value: String) => r.getAs[Byte](name) == value.toByte)
      case StructField(name, _: ShortType, _, _)   => name -> ((r: Row, value: String) => r.getAs[Short](name) == value.toShort)
      case StructField(name, _: IntegerType, _, _) => name -> ((r: Row, value: String) => r.getAs[Int](name) == value.toInt)
      case StructField(name, _: LongType, _, _)    => name -> ((r: Row, value: String) => r.getAs[Long](name) == value.toLong)
      case StructField(name, _: FloatType, _, _)   => name -> ((r: Row, value: String) => r.getAs[Float](name) == value.toFloat)
      case StructField(name, _: DoubleType, _, _)  => name -> ((r: Row, value: String) => r.getAs[Double](name) == value.toDouble)
      case StructField(name, _: BooleanType, _, _) => name -> ((r: Row, value: String) => r.getAs[Boolean](name) == value.toBoolean)
      case StructField(name, _: StringType, _, _)  => name -> ((r: Row, value: String) => r.getAs[String](name) == value)
    }.toMap

    def convertPartitionCriteriaToFilterFunctions(partitionCriteria: PartitionCriteria): Seq[FilterFunction] =
      partitionCriteria.map {
        case (name, value) => (row: Row) => fieldNameToMatchFunctionMapping(name)(row, value)
      }

    def joinSinglePartitionFilterFunctionsWithAnd(partitionFilterFunctions: Seq[FilterFunction]): FilterFunction =
      partitionFilterFunctions
        .reduceOption((predicate1, predicate2) => (row: Row) => predicate1(row) && predicate2(row))
        .getOrElse((_: Row) => false)

    multiplePartitionsCriteria
      .map(convertPartitionCriteriaToFilterFunctions)
      .map(joinSinglePartitionFilterFunctionsWithAnd)
      .reduceOption((predicate1, predicate2) => (row: Row) => predicate1(row) || predicate2(row))
      .getOrElse((_: Row) => false)
  }

  implicit class DataFrameHelper(df: DataFrame) {

    def collectPartitions(targetPartitions: Seq[String]): Seq[PartitionCriteria] = {
      logger.info(s"Collecting unique partitions for partitions columns (${targetPartitions.mkString(", ")})")
      val partitions = df.selectExpr(targetPartitions: _*).distinct().collect()

      partitions.map { row =>
        targetPartitions.map { columnName =>
          Option(row.getAs[Any](columnName)) match {
            case Some(columnValue) => columnName -> columnValue.toString
            case None => throw new RuntimeException(s"Partition column '$columnName' contains null value")
          }
        }
      }
    }

    def addMissingColumns(targetSchema: StructType): DataFrame = {
      val dataFieldsSet = df.schema.fieldNames.toSet
      val selectColumns = targetSchema.fields.map { field =>
        if (dataFieldsSet.contains(field.name)) {
          functions.col(field.name)
        } else {
          functions.lit(null).cast(field.dataType).as(field.name)
        }
      }
      df.select(selectColumns: _*)
    }

    def isEmpty: Boolean = df.head(1).isEmpty

    def nonEmpty: Boolean = df.head(1).nonEmpty
  }
}
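addMissingColumns aligns a DataFrame to a target schema by selecting existing columns as-is and filling the absent ones with typed nulls via functions.lit(null).cast(...). A hedged usage sketch of the same pattern written without the implicit class (schema and column names are illustrative):

import org.apache.spark.sql.{SparkSession, functions}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object AlignToTargetSchema {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a")).toDF("id", "name")

    val targetSchema = StructType(Seq(
      StructField("id", IntegerType),
      StructField("name", StringType),
      StructField("comment", StringType) // not present in df
    ))

    val existing = df.schema.fieldNames.toSet
    val aligned = df.select(targetSchema.fields.map { field =>
      if (existing.contains(field.name)) functions.col(field.name)
      else functions.lit(null).cast(field.dataType).as(field.name) // typed null for the missing column
    }: _*)

    aligned.printSchema()
    aligned.show()
    spark.stop()
  }
}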