org.apache.spark.sql.Column Scala Examples
The following examples show how to use org.apache.spark.sql.Column.
Each example is taken from an open-source project; the project name, source file, and license are listed above each snippet.
Example 1
Source File: Cleaner.scala From cleanframes with Apache License 2.0
package cleanframes

import org.apache.spark.sql.{Column, DataFrame, functions}
import shapeless.labelled.FieldType
import shapeless.{::, HList, HNil, LabelledGeneric, Lazy, Witness}

trait Cleaner[A] {
  def clean(frame: DataFrame, name: Option[String], alias: Option[String]): List[Column]
}

object Cleaner {
  def apply[A](frame: DataFrame, name: Option[String], alias: Option[String])(implicit env: Cleaner[A]): DataFrame = {
    frame.select(
      env.clean(frame, name, alias): _*
    )
  }

  def materialize[A](func: (DataFrame, Option[String], Option[String]) => List[Column]): Cleaner[A] = new Cleaner[A] {
    override def clean(frame: DataFrame, name: Option[String], alias: Option[String]): List[Column] =
      func(frame, name, alias)
  }

  implicit val hnilCleaner: Cleaner[HNil] = materialize((_, _, _) => Nil)

  implicit def genericObjectCleaner[A, H <: HList](
      implicit gen: LabelledGeneric.Aux[A, H],
      hCleaner: Lazy[Cleaner[H]]): Cleaner[A] =
    materialize((frame, name, alias) => {
      val structColumn = functions.struct(
        hCleaner.value.clean(frame, name, alias): _*
      )

      List(
        alias
          .map(structColumn.as)
          .getOrElse(structColumn)
      )
    })

  implicit def hlistObjectCleaner[K <: Symbol, H, T <: HList](
      implicit witness: Witness.Aux[K],
      hCleaner: Lazy[Cleaner[H]],
      tCleaner: Cleaner[T]): Cleaner[FieldType[K, H] :: T] = {
    val fieldName: String = witness.value.name

    materialize { (frame, name, alias) =>
      val columnName = alias match {
        case None | Some(`reserved_root_level_alias`) => fieldName
        case Some(alias)                              => s"$alias.$fieldName"
      }

      val hColumns = hCleaner.value.clean(frame, Some(columnName), alias = Some(fieldName))
      val tColumns = tCleaner.clean(frame, name, alias)

      hColumns ::: tColumns
    }
  }
}
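The type-class machinery above ultimately composes plain Column operations: it wraps the cleaned child columns in functions.struct and aliases the result with Column.as. A minimal stand-alone sketch of that underlying pattern (the session setup, column names, and casts here are illustrative, not part of cleanframes):

import org.apache.spark.sql.{functions, SparkSession}

object StructAliasSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("struct-alias").getOrCreate()
    import spark.implicits._

    val frame = Seq(("1", "true"), ("x", "yes")).toDF("col1", "col2")

    // Group several cleaned columns into one struct column and alias it,
    // the same Column operations the Cleaner instances compose.
    val nested = frame.select(
      functions.struct(
        functions.col("col1").cast("int").as("col1"),
        functions.col("col2").cast("boolean").as("col2")
      ).as("cleaned")
    )

    nested.printSchema()
    spark.stop()
  }
}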
Example 2
Source File: HiveAcidRelation.scala From spark-acid with Apache License 2.0
package com.qubole.spark.hiveacid.datasource import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Column, DataFrame, Row, SQLContext, SparkSession} import org.apache.spark.sql.sources.{BaseRelation, Filter, InsertableRelation, PrunedFilteredScan} import org.apache.spark.sql.types._ import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable, SparkAcidConf} import com.qubole.spark.hiveacid.hive.HiveAcidMetadata import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenNotInsert} import org.apache.spark.sql.catalyst.AliasIdentifier import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import collection.JavaConversions._ case class HiveAcidRelation(sparkSession: SparkSession, fullyQualifiedTableName: String, parameters: Map[String, String]) extends BaseRelation with InsertableRelation with PrunedFilteredScan with Logging { private val hiveAcidMetadata: HiveAcidMetadata = HiveAcidMetadata.fromSparkSession( sparkSession, fullyQualifiedTableName ) private val hiveAcidTable: HiveAcidTable = new HiveAcidTable(sparkSession, hiveAcidMetadata, parameters) private val readOptions = SparkAcidConf(sparkSession, parameters) override def sqlContext: SQLContext = sparkSession.sqlContext override val schema: StructType = if (readOptions.includeRowIds) { hiveAcidMetadata.tableSchemaWithRowId } else { hiveAcidMetadata.tableSchema } override def insert(data: DataFrame, overwrite: Boolean): Unit = { // sql insert into and overwrite if (overwrite) { hiveAcidTable.insertOverwrite(data) } else { hiveAcidTable.insertInto(data) } } def update(condition: Option[Column], newValues: Map[String, Column]): Unit = { hiveAcidTable.update(condition, newValues) } def delete(condition: Column): Unit = { hiveAcidTable.delete(condition) } override def sizeInBytes: Long = { val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor (sparkSession.sessionState.conf.defaultSizeInBytes * compressionFactor).toLong } def merge(sourceDf: DataFrame, mergeExpression: Expression, matchedClause: Seq[MergeWhenClause], notMatched: Option[MergeWhenNotInsert], sourceAlias: Option[AliasIdentifier], targetAlias: Option[AliasIdentifier]): Unit = { hiveAcidTable.merge(sourceDf, mergeExpression, matchedClause, notMatched, sourceAlias, targetAlias) } def getHiveAcidTable(): HiveAcidTable = { hiveAcidTable } // FIXME: should it be true / false. Recommendation seems to // be to leave it as true override val needConversion: Boolean = false override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { val readOptions = SparkAcidConf(sparkSession, parameters) // sql "select *" hiveAcidTable.getRdd(requiredColumns, filters, readOptions) } }
Example 3
Source File: FunctionalDependencyConstraint.scala From drunken-data-quality with Apache License 2.0
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class FunctionalDependencyConstraint(determinantSet: Seq[String], dependentSet: Seq[String]) extends Constraint { require(determinantSet.nonEmpty, "determinantSet must not be empty") require(dependentSet.nonEmpty, "dependentSet must not be empty") val fun = (df: DataFrame) => { val determinantColumns = determinantSet.map(columnName => new Column(columnName)) val dependentColumns = dependentSet.map(columnName => new Column(columnName)) val maybeRelevantSelection = Try(df.select(determinantColumns ++ dependentColumns: _*)) val maybeDeterminantValueCounts = maybeRelevantSelection.map(_.distinct.groupBy(determinantColumns: _*).count) val maybeViolatingDeterminantValuesCount = maybeDeterminantValueCounts.map(_.filter(new Column("count") =!= 1).count) FunctionalDependencyConstraintResult( constraint = this, data = maybeViolatingDeterminantValuesCount.toOption.map(FunctionalDependencyConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeViolatingDeterminantValuesCount, _ == 0) ) } } case class FunctionalDependencyConstraintResult(constraint: FunctionalDependencyConstraint, data: Option[FunctionalDependencyConstraintResultData], status: ConstraintStatus) extends ConstraintResult[FunctionalDependencyConstraint] { val message: String = { val maybeFailedRows = data.map(_.failedRows) val maybeRowPluralS = maybeFailedRows.map(failedRows => if (failedRows == 1) "" else "s") val dependentSet = constraint.dependentSet val determinantString = s"${constraint.determinantSet.mkString(", ")}" val dependentString = s"${dependentSet.mkString(", ")}" val (columnPluralS, columnVerb) = if (dependentSet.size == 1) ("", "is") else ("s", "are") (status, maybeFailedRows, maybeRowPluralS) match { case (ConstraintSuccess, Some(0), _) => s"Column$columnPluralS $dependentString $columnVerb functionally dependent on $determinantString." case (ConstraintFailure, Some(failedRows), Some(rowPluralS)) => s"Column$columnPluralS $dependentString $columnVerb not functionally dependent on " + s"$determinantString ($failedRows violating determinant value$rowPluralS)." case (ConstraintError(throwable), None, None) => s"Checking whether column$columnPluralS $dependentString $columnVerb functionally " + s"dependent on $determinantString failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class FunctionalDependencyConstraintResultData(failedRows: Long)
Example 4
Source File: UpdateCommand.scala From spark-acid with Apache License 2.0
package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command

import com.qubole.spark.hiveacid.HiveAcidErrors
import com.qubole.spark.hiveacid.datasource.HiveAcidRelation
import org.apache.spark.sql.{Column, Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.LogicalRelation

case class UpdateCommand(
    table: LogicalPlan,
    setExpressions: Map[String, Expression],
    condition: Option[Expression])
  extends RunnableCommand {

  override def children: Seq[LogicalPlan] = Seq(table)
  override def output: Seq[Attribute] = Seq.empty
  override lazy val resolved: Boolean = childrenResolved

  override def run(sparkSession: SparkSession): Seq[Row] = {
    if (children.size != 1) {
      throw new IllegalArgumentException("UPDATE command should have one table to update, whereas this has: " +
        children.size)
    }
    children(0) match {
      case LogicalRelation(relation: HiveAcidRelation, _, _, _) => {
        val setColumns = setExpressions.mapValues(expr => new Column(expr))
        val updateFilterColumn = condition.map(new Column(_))
        relation.update(updateFilterColumn, setColumns)
      }
      case LogicalRelation(_, _, Some(catalogTable), _) =>
        throw HiveAcidErrors.tableNotAcidException(catalogTable.qualifiedName)
      case _ => throw HiveAcidErrors.tableNotAcidException(table.toString())
    }
    Seq.empty[Row]
  }
}
Example 5
Source File: DeleteCommand.scala From spark-acid with Apache License 2.0
package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command

import com.qubole.spark.hiveacid.HiveAcidErrors
import com.qubole.spark.hiveacid.datasource.HiveAcidRelation
import org.apache.spark.sql.{Column, Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.LogicalRelation

case class DeleteCommand(
    table: LogicalPlan,
    condition: Expression)
  extends RunnableCommand {

  // We don't want `table` in children as sometimes we don't want to transform it.
  override def children: Seq[LogicalPlan] = Seq(table)
  override def output: Seq[Attribute] = Seq.empty
  override lazy val resolved: Boolean = childrenResolved

  override def run(sparkSession: SparkSession): Seq[Row] = {
    if (children.size != 1) {
      throw new IllegalArgumentException("DELETE command should specify exactly one table, whereas this has: " +
        children.size)
    }
    children(0) match {
      case LogicalRelation(relation: HiveAcidRelation, _, _, _) => {
        relation.delete(new Column(condition))
      }
      case _ => throw HiveAcidErrors.tableNotAcidException(table.toString())
    }
    Seq.empty[Row]
  }
}
Example 6
Source File: SimilarityFunctions.scala From spark-stringmetric with MIT License
package com.github.mrpowers.spark.stringmetric

import com.github.mrpowers.spark.stringmetric.expressions.HammingDistance
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.functions._
import java.util.Locale

import org.apache.commons.text.similarity.{
  CosineDistance,
  JaccardSimilarity,
  JaroWinklerDistance,
  FuzzyScore
}

object SimilarityFunctions {
  private def withExpr(expr: Expression): Column = new Column(expr)

  val cosine_distance = udf[Option[Double], String, String](cosineDistanceFun)

  def cosineDistanceFun(s1: String, s2: String): Option[Double] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val cd = new CosineDistance()
    Some(cd(s1, s2))
  }

  val fuzzy_score = udf[Option[Integer], String, String](fuzzyScoreFun)

  def fuzzyScoreFun(s1: String, s2: String): Option[Integer] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val f = new FuzzyScore(Locale.ENGLISH)
    Some(f.fuzzyScore(str1, str2))
  }

  def hamming(s1: Column, s2: Column): Column = withExpr {
    HammingDistance(s1.expr, s2.expr)
  }

  val jaccard_similarity = udf[Option[Double], String, String](jaccardSimilarityFun)

  def jaccardSimilarityFun(s1: String, s2: String): Option[Double] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val j = new JaccardSimilarity()
    Some(j.apply(str1, str2))
  }

  val jaro_winkler = udf[Option[Double], String, String](jaroWinlkerFun)

  def jaroWinlkerFun(s1: String, s2: String): Option[Double] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val j = new JaroWinklerDistance()
    Some(j.apply(str1, str2))
  }
}
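A possible usage sketch for the object above, assuming the spark-stringmetric artifact is on the classpath and a local SparkSession; the sample data is made up:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import com.github.mrpowers.spark.stringmetric.SimilarityFunctions

object SimilarityFunctionsUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("stringmetric").getOrCreate()
    import spark.implicits._

    // Hamming distance expects strings of equal length.
    val df = Seq(("night", "nacht"), ("cat", "cot")).toDF("s1", "s2")

    df.withColumn("jaccard", SimilarityFunctions.jaccard_similarity(col("s1"), col("s2")))
      .withColumn("hamming", SimilarityFunctions.hamming(col("s1"), col("s2"))) // Catalyst expression, not a UDF
      .show()

    spark.stop()
  }
}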
Example 7
Source File: PostgresIntegrationSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.jdbc import java.sql.Connection import java.util.Properties import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.expressions.{Literal, If} import org.apache.spark.tags.DockerTest @DockerTest class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite { override val db = new DatabaseOnDocker { override val imageName = "postgres:9.4.5" override val env = Map( "POSTGRES_PASSWORD" -> "rootpass" ) override val jdbcPort = 5432 override def getJdbcUrl(ip: String, port: Int): String = s"jdbc:postgresql://$ip:$port/postgres?user=postgres&password=rootpass" } override def dataPreparation(conn: Connection): Unit = { conn.prepareStatement("CREATE DATABASE foo").executeUpdate() conn.setCatalog("foo") conn.prepareStatement("CREATE TABLE bar (c0 text, c1 integer, c2 double precision, c3 bigint, " + "c4 bit(1), c5 bit(10), c6 bytea, c7 boolean, c8 inet, c9 cidr, " + "c10 integer[], c11 text[], c12 real[])").executeUpdate() conn.prepareStatement("INSERT INTO bar VALUES ('hello', 42, 1.25, 123456789012345, B'0', " + "B'1000100101', E'\\\\xDEADBEEF', true, '172.16.0.42', '192.168.0.0/16', " + """'{1, 2}', '{"a", null, "b"}', '{0.11, 0.22}')""").executeUpdate() } test("Type mapping for various types") { val df = sqlContext.read.jdbc(jdbcUrl, "bar", new Properties) val rows = df.collect() assert(rows.length == 1) val types = rows(0).toSeq.map(x => x.getClass) assert(types.length == 13) assert(classOf[String].isAssignableFrom(types(0))) assert(classOf[java.lang.Integer].isAssignableFrom(types(1))) assert(classOf[java.lang.Double].isAssignableFrom(types(2))) assert(classOf[java.lang.Long].isAssignableFrom(types(3))) assert(classOf[java.lang.Boolean].isAssignableFrom(types(4))) assert(classOf[Array[Byte]].isAssignableFrom(types(5))) assert(classOf[Array[Byte]].isAssignableFrom(types(6))) assert(classOf[java.lang.Boolean].isAssignableFrom(types(7))) assert(classOf[String].isAssignableFrom(types(8))) assert(classOf[String].isAssignableFrom(types(9))) assert(classOf[Seq[Int]].isAssignableFrom(types(10))) assert(classOf[Seq[String]].isAssignableFrom(types(11))) assert(classOf[Seq[Double]].isAssignableFrom(types(12))) assert(rows(0).getString(0).equals("hello")) assert(rows(0).getInt(1) == 42) assert(rows(0).getDouble(2) == 1.25) assert(rows(0).getLong(3) == 123456789012345L) assert(rows(0).getBoolean(4) == false) // BIT(10)'s come back as ASCII strings of ten ASCII 0's and 1's... assert(java.util.Arrays.equals(rows(0).getAs[Array[Byte]](5), Array[Byte](49, 48, 48, 48, 49, 48, 48, 49, 48, 49))) assert(java.util.Arrays.equals(rows(0).getAs[Array[Byte]](6), Array[Byte](0xDE.toByte, 0xAD.toByte, 0xBE.toByte, 0xEF.toByte))) assert(rows(0).getBoolean(7) == true) assert(rows(0).getString(8) == "172.16.0.42") assert(rows(0).getString(9) == "192.168.0.0/16") assert(rows(0).getSeq(10) == Seq(1, 2)) assert(rows(0).getSeq(11) == Seq("a", null, "b")) assert(rows(0).getSeq(12).toSeq == Seq(0.11f, 0.22f)) } test("Basic write test") { val df = sqlContext.read.jdbc(jdbcUrl, "bar", new Properties) // Test only that it doesn't crash. df.write.jdbc(jdbcUrl, "public.barcopy", new Properties) // Test write null values. df.select(df.queryExecution.analyzed.output.map { a => Column(Literal.create(null, a.dataType)).as(a.name) }: _*).write.jdbc(jdbcUrl, "public.barcopy2", new Properties) } }
Example 8
Source File: FrequentItems.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.Logging import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, Column, DataFrame} private[sql] object FrequentItems extends Logging { private[sql] def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4, s"support ($support) must be greater than 1e-4.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes new DataFrame(df.sqlContext, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
Example 9
Source File: FrequentItems.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ object FrequentItems extends Logging { def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.treeAggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
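This singlePassFreqItems helper backs Spark's public DataFrameStatFunctions API, so end users normally reach it through df.stat.freqItems. A small sketch of that call (the data and support threshold are illustrative):

import org.apache.spark.sql.SparkSession

object FreqItemsUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("freq-items").getOrCreate()
    import spark.implicits._

    val df = Seq(1, 1, 1, 2, 3, 1, 2, 1).toDF("value")

    // Items with relative frequency above the 0.4 support threshold (false positives are possible).
    val freq = df.stat.freqItems(Seq("value"), 0.4)
    freq.show(truncate = false)

    spark.stop()
  }
}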
Example 10
Source File: UserDefinedFunction.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.InterfaceStability
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.ScalaUDF
import org.apache.spark.sql.types.DataType

  // Excerpt: the surrounding UserDefinedFunction case class declaration is omitted in this listing.
  def asNondeterministic(): UserDefinedFunction = {
    if (!_deterministic) {
      this
    } else {
      val udf = copyAll()
      udf._deterministic = false
      udf
    }
  }
}
Example 11
Source File: Minimum.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.min
import org.apache.spark.sql.types.{DoubleType, StructType}
import Analyzers._

case class MinState(minValue: Double) extends DoubleValuedState[MinState] {

  override def sum(other: MinState): MinState = {
    MinState(math.min(minValue, other.minValue))
  }

  override def metricValue(): Double = {
    minValue
  }
}

case class Minimum(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[MinState]("Minimum", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    min(conditionalSelection(column, where)).cast(DoubleType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[MinState] = {
    ifNoNullsIn(result, offset) { _ => MinState(result.getDouble(offset)) }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isNumeric(column) :: Nil
  }

  override def filterCondition: Option[String] = where
}
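Analyzers such as Minimum are normally executed through deequ's AnalysisRunner rather than called directly; a rough usage sketch is below (the runner imports follow deequ's documented API, and the data and filter are illustrative):

import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import com.amazon.deequ.analyzers.runners.AnalyzerContext.successMetricsAsDataFrame
import com.amazon.deequ.analyzers.{Minimum, Size}
import org.apache.spark.sql.SparkSession

object MinimumUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("deequ-minimum").getOrCreate()
    import spark.implicits._

    val df = Seq(1.0, 3.5, -2.0, 7.25).toDF("price")

    // Run the Minimum analyzer (optionally with a where filter) alongside Size.
    val result: AnalyzerContext = AnalysisRunner
      .onData(df)
      .addAnalyzer(Size())
      .addAnalyzer(Minimum("price", Some("price > 0")))
      .run()

    successMetricsAsDataFrame(spark, result).show(truncate = false)
    spark.stop()
  }
}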
Example 12
Source File: CountDistinct.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.metrics.DoubleMetric
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.count
import Analyzers._

case class CountDistinct(columns: Seq[String])
  extends ScanShareableFrequencyBasedAnalyzer("CountDistinct", columns) {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    count("*") :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = {
    toSuccessMetric(result.getLong(offset).toDouble)
  }
}

object CountDistinct {
  def apply(column: String): CountDistinct = {
    new CountDistinct(column :: Nil)
  }
}
Example 13
Source File: Distinctness.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.functions.{col, sum}
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.Column

case class Distinctness(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Distinctness", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    (sum(col(COUNT_COL).geq(1).cast(DoubleType)) / numRows) :: Nil
  }

  override def filterCondition: Option[String] = where
}

object Distinctness {
  def apply(column: String): Distinctness = {
    new Distinctness(column :: Nil)
  }
}
Example 14
Source File: Size.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.metrics.Entity
import org.apache.spark.sql.{Column, Row}
import Analyzers._

case class NumMatches(numMatches: Long) extends DoubleValuedState[NumMatches] {

  override def sum(other: NumMatches): NumMatches = {
    NumMatches(numMatches + other.numMatches)
  }

  override def metricValue(): Double = {
    numMatches.toDouble
  }
}

case class Size(where: Option[String] = None)
  extends StandardScanShareableAnalyzer[NumMatches]("Size", "*", Entity.Dataset)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    conditionalCount(where) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[NumMatches] = {
    ifNoNullsIn(result, offset) { _ => NumMatches(result.getLong(offset)) }
  }

  override def filterCondition: Option[String] = where
}
Example 15
Source File: MinLength.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers._
import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString}
import org.apache.spark.sql.functions.{length, min}
import org.apache.spark.sql.types.{DoubleType, StructType}
import org.apache.spark.sql.{Column, Row}

case class MinLength(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[MinState]("MinLength", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    min(length(conditionalSelection(column, where))).cast(DoubleType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[MinState] = {
    ifNoNullsIn(result, offset) { _ => MinState(result.getDouble(offset)) }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isString(column) :: Nil
  }

  override def filterCondition: Option[String] = where
}
Example 16
Source File: Uniqueness.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.types.DoubleType

case class Uniqueness(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Uniqueness", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    (sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) / numRows) :: Nil
  }

  override def filterCondition: Option[String] = where
}

object Uniqueness {

  def apply(column: String): Uniqueness = {
    new Uniqueness(column :: Nil)
  }

  def apply(column: String, where: Option[String]): Uniqueness = {
    new Uniqueness(column :: Nil, where)
  }
}
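Frequency-based analyzers like Uniqueness run over a pre-aggregated frequency table whose count column is COUNT_COL. A plain-Spark sketch of the same ratio, computed directly on a toy DataFrame (the column names and data are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, count, lit, sum}
import org.apache.spark.sql.types.DoubleType

object UniquenessSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("uniqueness").getOrCreate()
    import spark.implicits._

    val df = Seq("a", "b", "b", "c").toDF("id")
    val numRows = df.count()

    // Frequency table, then the fraction of values that occur exactly once.
    val frequencies = df.groupBy(col("id")).agg(count(lit(1)).as("absolute"))
    val uniqueness = frequencies
      .agg((sum(col("absolute").equalTo(lit(1)).cast(DoubleType)) / numRows).as("uniqueness"))

    uniqueness.show() // 0.5: two of the four rows carry a unique id
    spark.stop()
  }
}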
Example 17
Source File: Sum.scala From deequ with Apache License 2.0

package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
import org.apache.spark.sql.functions.sum
import org.apache.spark.sql.types.{DoubleType, StructType}
import org.apache.spark.sql.{Column, Row}
import Analyzers._

case class SumState(sum: Double) extends DoubleValuedState[SumState] {

  override def sum(other: SumState): SumState = {
    SumState(sum + other.sum)
  }

  override def metricValue(): Double = {
    sum
  }
}

case class Sum(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[SumState]("Sum", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    sum(conditionalSelection(column, where)).cast(DoubleType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[SumState] = {
    ifNoNullsIn(result, offset) { _ => SumState(result.getDouble(offset)) }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isNumeric(column) :: Nil
  }

  override def filterCondition: Option[String] = where
}
Example 18
Source File: MaxLength.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers._
import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString}
import org.apache.spark.sql.functions.{length, max}
import org.apache.spark.sql.types.{DoubleType, StructType}
import org.apache.spark.sql.{Column, Row}

case class MaxLength(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[MaxState]("MaxLength", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    max(length(conditionalSelection(column, where))).cast(DoubleType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] = {
    ifNoNullsIn(result, offset) { _ => MaxState(result.getDouble(offset)) }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isString(column) :: Nil
  }

  override def filterCondition: Option[String] = where
}
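Outside the analyzer framework, the aggregation above is just a max-of-length Column expression; a one-off plain-Spark equivalent might look like this (the column name and data are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, length, max}
import org.apache.spark.sql.types.DoubleType

object MaxLengthSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("max-length").getOrCreate()
    import spark.implicits._

    val df = Seq("a", "abc", "ab").toDF("name")

    // The same Column expression the analyzer contributes, evaluated directly.
    df.agg(max(length(col("name"))).cast(DoubleType).as("MaxLength(name)")).show() // 3.0

    spark.stop()
  }
}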
Example 19
Source File: Correlation.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} import com.amazon.deequ.metrics.Entity import org.apache.spark.sql.DeequFunctions.stateful_corr import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.types.StructType import Analyzers._ case class CorrelationState( n: Double, xAvg: Double, yAvg: Double, ck: Double, xMk: Double, yMk: Double) extends DoubleValuedState[CorrelationState] { require(n > 0.0, "Correlation undefined for n = 0.") override def sum(other: CorrelationState): CorrelationState = { val n1 = n val n2 = other.n val newN = n1 + n2 val dx = other.xAvg - xAvg val dxN = if (newN == 0.0) 0.0 else dx / newN val dy = other.yAvg - yAvg val dyN = if (newN == 0.0) 0.0 else dy / newN val newXAvg = xAvg + dxN * n2 val newYAvg = yAvg + dyN * n2 val newCk = ck + other.ck + dx * dyN * n1 * n2 val newXMk = xMk + other.xMk + dx * dxN * n1 * n2 val newYMk = yMk + other.yMk + dy * dyN * n1 * n2 CorrelationState(newN, newXAvg, newYAvg, newCk, newXMk, newYMk) } override def metricValue(): Double = { ck / math.sqrt(xMk * yMk) } } case class Correlation( firstColumn: String, secondColumn: String, where: Option[String] = None) extends StandardScanShareableAnalyzer[CorrelationState]("Correlation", s"$firstColumn,$secondColumn", Entity.Mutlicolumn) with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { val firstSelection = conditionalSelection(firstColumn, where) val secondSelection = conditionalSelection(secondColumn, where) stateful_corr(firstSelection, secondSelection) :: Nil } override def fromAggregationResult(result: Row, offset: Int): Option[CorrelationState] = { if (result.isNullAt(offset)) { None } else { val row = result.getAs[Row](offset) val n = row.getDouble(0) if (n > 0.0) { Some(CorrelationState( n, row.getDouble(1), row.getDouble(2), row.getDouble(3), row.getDouble(4), row.getDouble(5))) } else { None } } } override protected def additionalPreconditions(): Seq[StructType => Unit] = { hasColumn(firstColumn) :: isNumeric(firstColumn) :: hasColumn(secondColumn) :: isNumeric(secondColumn) :: Nil } override def filterCondition: Option[String] = where }
Example 20
Source File: Entropy.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, sum, udf}

case class Entropy(column: String, where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Entropy", column :: Nil)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    val summands = udf { (count: Double) =>
      if (count == 0.0) {
        0.0
      } else {
        -(count / numRows) * math.log(count / numRows)
      }
    }

    sum(summands(col(COUNT_COL))) :: Nil
  }

  override def filterCondition: Option[String] = where
}
Example 21
Source File: StandardDeviation.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} import org.apache.spark.sql.DeequFunctions.stateful_stddev_pop import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.types.StructType import Analyzers._ case class StandardDeviationState( n: Double, avg: Double, m2: Double) extends DoubleValuedState[StandardDeviationState] { require(n > 0.0, "Standard deviation is undefined for n = 0.") override def metricValue(): Double = { math.sqrt(m2 / n) } override def sum(other: StandardDeviationState): StandardDeviationState = { val newN = n + other.n val delta = other.avg - avg val deltaN = if (newN == 0.0) 0.0 else delta / newN StandardDeviationState(newN, avg + deltaN * other.n, m2 + other.m2 + delta * deltaN * n * other.n) } } case class StandardDeviation(column: String, where: Option[String] = None) extends StandardScanShareableAnalyzer[StandardDeviationState]("StandardDeviation", column) with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { stateful_stddev_pop(conditionalSelection(column, where)) :: Nil } override def fromAggregationResult(result: Row, offset: Int): Option[StandardDeviationState] = { if (result.isNullAt(offset)) { None } else { val row = result.getAs[Row](offset) val n = row.getDouble(0) if (n == 0.0) { None } else { Some(StandardDeviationState(n, row.getDouble(1), row.getDouble(2))) } } } override protected def additionalPreconditions(): Seq[StructType => Unit] = { hasColumn(column) :: isNumeric(column) :: Nil } override def filterCondition: Option[String] = where }
Example 22
Source File: ExactEqualityConstraint.scala From drunken-data-quality with Apache License 2.0
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class ExactEqualityConstraint(other: DataFrame) extends Constraint { val fun = (df: DataFrame) => { val tryEquality = Try { if (df.schema != other.schema) { throw new IllegalArgumentException("Schemas do not match") } val dfGroupCount = df.groupBy(df.columns.map(new Column(_)):_*).count() val otherGroupCount = other.groupBy(df.columns.map(new Column(_)):_*).count() val diffCount1 = dfGroupCount.except(otherGroupCount).count() val diffCount2 = otherGroupCount.except(dfGroupCount).count() (diffCount1, diffCount2) } ExactEqualityConstraintResult( constraint = this, data = tryEquality.toOption.map { case (leftToRightCount, rightToLeftCount) => ExactEqualityConstraintData(leftToRightCount, rightToLeftCount) }, status = ConstraintUtil.tryToStatus[(Long, Long)](tryEquality, { case (leftToRightCount, rightToLeftCount) => leftToRightCount + rightToLeftCount == 0 }) ) } } case class ExactEqualityConstraintResult(constraint: ExactEqualityConstraint, data: Option[ExactEqualityConstraintData], status: ConstraintStatus) extends ConstraintResult[ExactEqualityConstraint] { val message: String = { val otherName = constraint.other.toString() val maybeNonMatchingRows = data.map(data => (data.numNonMatchingLeftToRight, data.numNonMatchingRightToLeft)) val maybePluralS = maybeNonMatchingRows.map { case (leftToRightCount, rightToLeftCount) => ( if (leftToRightCount == 1) "" else "s", if (rightToLeftCount == 1) "" else "s" ) } val maybeVerb = maybeNonMatchingRows.map { case (leftToRightCount, rightToLeftCount) => ( if (leftToRightCount == 1) "is" else "are", if (rightToLeftCount == 1) "is" else "are" ) } (status, maybeNonMatchingRows, maybePluralS, maybeVerb) match { case (ConstraintSuccess, Some(_), Some(_), Some(_)) => s"It is equal to $otherName." case ( ConstraintFailure, Some((leftToRightRows, rightToLeftRows)), Some((leftToRightPluralS, rightToLeftPluralS)), Some((leftToRightVerb, rightToLeftVerb)) ) => s"It is not equal ($leftToRightRows distinct count row$leftToRightPluralS $leftToRightVerb " + s"present in the checked dataframe but not in the other " + s"and $rightToLeftRows distinct count row$rightToLeftPluralS $rightToLeftVerb " + s"present in the other dataframe but not in the checked one) to $otherName." case (ConstraintError(throwable), None, None, None) => s"Checking equality with $otherName failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class ExactEqualityConstraintData(numNonMatchingLeftToRight: Long, numNonMatchingRightToLeft: Long)
Example 23
Source File: DataFrameFunctions.scala From spark-flow with Apache License 2.0
package com.bloomberg.sparkflow.dc

import org.apache.spark.sql.{Column, Dataset, Row}

class DataFrameFunctions(self: DC[Row]) {

  def join(right: DC[Row]): DC[Row] = {
    val f = (left: Dataset[_], right: Dataset[_]) => {
      left.join(right)
    }
    val hashTarget = Seq("join")
    new MultiDatasetTransformDC(self, right, f, hashTarget)
  }

  def join(right: DC[Row], usingColumn: String): DC[Row] = {
    val f = (left: Dataset[_], right: Dataset[_]) => {
      left.join(right, usingColumn)
    }
    val hashTarget = Seq("join", usingColumn)
    new MultiDatasetTransformDC(self, right, f, hashTarget)
  }

  def join(right: DC[Row], joinExprs: Column): DC[Row] = join(right, joinExprs, "inner")

  def join(right: DC[Row], joinExprs: Column, joinType: String): DC[Row] = {
    val f = (left: Dataset[_], right: Dataset[_]) => {
      // Use the requested join type rather than defaulting to an inner join.
      left.join(right, joinExprs, joinType)
    }
    val hashTarget = Seq("join", joinType, joinExprs.toString())
    new MultiDatasetTransformDC(self, right, f, hashTarget)
  }
}
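These wrappers delegate to the standard Dataset.join overloads. A plain-Spark sketch of the two Column-relevant forms, a using-column join and an expression join with an explicit join type (the sample data is made up):

import org.apache.spark.sql.SparkSession

object JoinOverloadsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("joins").getOrCreate()
    import spark.implicits._

    val people = Seq((1, "foo"), (2, "bar")).toDF("id", "name")
    val ages   = Seq((1, 30), (3, 40)).toDF("id", "age")

    people.join(ages, "id").show()                                       // using-column join
    people.join(ages, people("id") === ages("id"), "left_outer").show()  // Column expression join with a join type

    spark.stop()
  }
}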
Example 24
Source File: StructuredRepartition.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.structuredstreaming.application import com.intel.hibench.common.streaming.metrics.KafkaReporter import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig import org.apache.spark.sql.Column import org.apache.spark.sql.DataFrame import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.Row import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions._ class StructuredRepartition() extends StructuredBenchBase { override def process(ds: DataFrame, config: SparkBenchConfig) = { // Get the singleton instance of SparkSession val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate() import spark.implicits._ val results = ds.repartition(config.coreNumber) val query = results.writeStream .foreach(new ForeachWriter[Row] { var reporter: KafkaReporter = _ def open(partitionId: Long, version: Long): Boolean = { val reportTopic = config.reporterTopic val brokerList = config.brokerList reporter = new KafkaReporter(reportTopic, brokerList) true } def close(errorOrNull: Throwable): Unit = {} def process(record: Row): Unit = { val inTime = record(0).asInstanceOf[String].toLong val outTime = System.currentTimeMillis() reporter.report(inTime, outTime) } }) .start() query.awaitTermination() } }
Example 25
Source File: StructuredIdentity.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.structuredstreaming.application import com.intel.hibench.common.streaming.metrics.KafkaReporter import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig import org.apache.spark.sql.Column import org.apache.spark.sql.DataFrame import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.Row import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions._ class StructuredIdentity() extends StructuredBenchBase { override def process(ds: DataFrame, config: SparkBenchConfig) = { // Get the singleton instance of SparkSession val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate() import spark.implicits._ val query = ds.writeStream .foreach(new ForeachWriter[Row] { var reporter: KafkaReporter = _ def open(partitionId: Long, version: Long): Boolean = { val reportTopic = config.reporterTopic val brokerList = config.brokerList reporter = new KafkaReporter(reportTopic, brokerList) true } def close(errorOrNull: Throwable): Unit = {} def process(record: Row): Unit = { val inTime = record(0).asInstanceOf[String].toLong val outTime = System.currentTimeMillis() reporter.report(inTime, outTime) } }) .start() query.awaitTermination() } }
Example 26
Source File: QueryPeopleTest.scala From apache-spark-test with Apache License 2.0
package com.github.dnvriend.spark.dataset import com.github.dnvriend.TestSpec import org.apache.spark.sql.{ Column, DataFrame } class QueryPeopleTest extends TestSpec { it should "query using DSL" in withSparkSession { spark => import spark.implicits._ import org.apache.spark.sql.functions._ val people: DataFrame = spark.read.parquet(TestSpec.PeopleParquet).cache() // name, age people.select('name).limit(1).as[String].head() shouldBe "foo" people.select($"name").limit(1).as[String].head() shouldBe "foo" people.select("name").limit(1).as[String].head() shouldBe "foo" people.select('age).limit(1).as[Int].head() shouldBe 30 people.select($"age").limit(1).as[Int].head() shouldBe 30 people.select("age").limit(1).as[Int].head() shouldBe 30 // select a column from the Dataset val col1: Column = people("name") val col2: Column = people.col("name") val departments: DataFrame = Seq((1, "sales"), (2, "administration"), (3, "human resources")) .toDF("department_id", "department_name").cache() people .withColumn("department_id", lit(1)) .withColumn("age_plus_ten", people("age") + 10) .as[(String, Int, Int, Int)].limit(1).head() shouldBe ("foo", 30, 1, 40) people .withColumn("department_id", lit(1)) .withColumn("age_plus_ten", people("age") + 10) .as('people_dep_age) .join(departments, col("people_dep_age.department_id").equalTo(departments.col("department_id"))) .select($"people_dep_age.name", col("people_dep_age.age"), departments.col("department_name")) .as[(String, Int, String)].limit(1).head() shouldBe ("foo", 30, "sales") val peopleDepAge: DataFrame = people .withColumn("department_id", lit(1)) .withColumn("age_plus_ten", people("age") + 10) peopleDepAge .join(departments, peopleDepAge("department_id") === departments("department_id")) .select(peopleDepAge("name"), peopleDepAge("age"), departments("department_name")) .as[(String, Int, String)].limit(1).head() shouldBe ("foo", 30, "sales") peopleDepAge.filter($"age" > 30) .join(departments, peopleDepAge("department_id") === departments("department_id")) .agg(avg($"age"), max($"age")).limit(1) .as[(Double, Int)].head() shouldBe (45.0, 50) } }
Example 27
Source File: DatasetUtil.scala From sona with Apache License 2.0
package org.apache.spark.util import org.apache.spark.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata} import org.apache.spark.sql.{Column, DataFrame, Dataset} object DatasetUtil { def withColumns[T](ds: Dataset[T], colNames: Seq[String], cols: Seq[Column], metadata: Seq[Metadata]): DataFrame = { require(colNames.size == cols.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of columns: ${cols.size}") require(colNames.size == metadata.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of metadata elements: ${metadata.size}") val sparkSession = ds.sparkSession val queryExecution = ds.queryExecution val resolver = sparkSession.sessionState.analyzer.resolver val output = queryExecution.analyzed.output checkColumnNameDuplication(colNames, "in given column names", sparkSession.sessionState.conf.caseSensitiveAnalysis) val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) => colName -> col.as(colName, metadata) }.toMap val replacedAndExistingColumns = output.map { field => columnMap.find { case (colName, _) => resolver(field.name, colName) } match { case Some((colName: String, col: Column)) => col.as(colName) case _ => new Column(field) } } val newColumns = columnMap.filter { case (colName, col) => !output.exists(f => resolver(f.name, colName)) }.map { case (colName, col) => col.as(colName) } ds.select(replacedAndExistingColumns ++ newColumns: _*) } def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = { withColumns(ds, Seq(colName), Seq(col), Seq(metadata)) } private def checkColumnNameDuplication(columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = { val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase) if (names.distinct.length != names.length) { val duplicateColumns = names.groupBy(identity).collect { case (x, ys) if ys.length > 1 => s"`$x`" } throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}") } } /** * Cast a column in a Dataset to Vector type. * * The supported data types of the input column are * - Vector * - float/double type Array. * * Note: The returned column does not have Metadata. * * @param dataset input DataFrame * @param colName column name. * @return Vector column */ def columnToVector(dataset: Dataset[_], colName: String): Column = { val columnDataType = dataset.schema(colName).dataType columnDataType match { case _: VectorUDT => col(colName) case fdt: ArrayType => val transferUDF = fdt.elementType match { case _: FloatType => udf(f = (vector: Seq[Float]) => { val inputArray = Array.fill[Double](vector.size)(0.0) vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble) Vectors.dense(inputArray) }) case _: DoubleType => udf((vector: Seq[Double]) => { Vectors.dense(vector.toArray) }) case other => throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector") } transferUDF(col(colName)) case other => throw new IllegalArgumentException(s"$other column cannot be cast to Vector") } } }
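A possible usage sketch for columnToVector above, assuming the sona artifact is on the classpath; the input data is illustrative:

import org.apache.spark.sql.SparkSession
import org.apache.spark.util.DatasetUtil

object ColumnToVectorUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("column-to-vector").getOrCreate()
    import spark.implicits._

    val df = Seq((1, Array(0.1, 0.2)), (2, Array(0.3, 0.4))).toDF("id", "features")

    // Cast the double-array column to the project's Vector type; column metadata is not preserved.
    val withVector = df.withColumn("featureVector", DatasetUtil.columnToVector(df, "features"))
    withVector.printSchema()

    spark.stop()
  }
}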
Example 28
Source File: NumberOfRowsConstraintTest.scala From drunken-data-quality with Apache License 2.0
package de.frosner.ddq.constraints import de.frosner.ddq.core.Check import de.frosner.ddq.testutils.{SparkContexts, TestData} import org.apache.spark.sql.Column import org.scalatest.{FlatSpec, Matchers} class NumberOfRowsConstraintTest extends FlatSpec with Matchers with SparkContexts { "A NumberOfRowsConstraint" should "succeed if the actual number of rows is equal to the expected" in { val check = Check(TestData.makeIntegerDf(spark, List(1, 2, 3))).hasNumRows(_ === 3) val constraint = check.constraints.head val result = NumberOfRowsConstraintResult( constraint = NumberOfRowsConstraint(new Column(NumberOfRowsConstraint.countKey) === 3), actual = 3L, status = ConstraintSuccess ) check.run().constraintResults shouldBe Map(constraint -> result) } it should "fail if the number of rows is not in the expected range" in { val check = Check(TestData.makeIntegerDf(spark, List(1, 2, 3))).hasNumRows( numRows => numRows < 3 || numRows > 3 ) val constraint = check.constraints.head val numRowsColumn = new Column(NumberOfRowsConstraint.countKey) val result = NumberOfRowsConstraintResult( constraint = NumberOfRowsConstraint(numRowsColumn < 3 || numRowsColumn > 3), actual = 3L, status = ConstraintFailure ) check.run().constraintResults shouldBe Map(constraint -> result) } "A NumberOfRowsConstraintResult" should "have the correct success message" in { val constraint = NumberOfRowsConstraint(new Column("count") > 5L) val result = NumberOfRowsConstraintResult( constraint = constraint, actual = 5L, status = ConstraintSuccess ) result.message shouldBe "The number of rows satisfies (count > 5)." } it should "have the correct failure message" in { val constraint = NumberOfRowsConstraint(new Column("count") === 5L) val result = NumberOfRowsConstraintResult( constraint = constraint, actual = 4L, status = ConstraintFailure ) result.message shouldBe "The actual number of rows 4 does not satisfy (count = 5)." } it should "throw an exception if it is created with an illegal combination of fields" in { intercept[IllegalConstraintResultException] { NumberOfRowsConstraintResult( constraint = NumberOfRowsConstraint(new Column("count") === 5L), status = ConstraintError(new IllegalArgumentException("error")), actual = 4L ) } } "NumberOfRowsConstraint.greaterThan" should "create a correct NumberOfRowsConstraint" in { val expected = 10 val constraint = NumberOfRowsConstraint.greaterThan(expected) constraint shouldBe NumberOfRowsConstraint(new Column("count") > expected) } "NumberOfRowsConstraint.lessThan" should "create a correct NumberOfRowsConstraint" in { val expected = 10 val constraint = NumberOfRowsConstraint.lessThan(expected) constraint shouldBe NumberOfRowsConstraint(new Column("count") < expected) } "NumberOfRowsConstraint.equalTo" should "create a correct NumberOfRowsConstraint" in { val expected = 10 val constraint = NumberOfRowsConstraint.equalTo(expected) constraint shouldBe NumberOfRowsConstraint(new Column("count") === expected) } }
Example 29
Source File: RegexConstraint.scala From drunken-data-quality with Apache License 2.0
package de.frosner.ddq.constraints import java.util.regex.Pattern import org.apache.spark.sql.functions._ import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class RegexConstraint(columnName: String, regex: String) extends Constraint { val fun = (df: DataFrame) => { val pattern = Pattern.compile(regex) val doesNotMatch = udf((column: String) => column != null && !pattern.matcher(column).find()) val maybeDoesNotMatchCount = Try(df.filter(doesNotMatch(new Column(columnName))).count) RegexConstraintResult( constraint = this, data = maybeDoesNotMatchCount.toOption.map(RegexConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeDoesNotMatchCount, _ == 0) ) } } case class RegexConstraintResult(constraint: RegexConstraint, data: Option[RegexConstraintResultData], status: ConstraintStatus) extends ConstraintResult[RegexConstraint] { val message: String = { val columnName = constraint.columnName val regex = constraint.regex val maybeFailedRows = data.map(_.failedRows) val maybePluralSAndVerb = maybeFailedRows.map(failedRows => if (failedRows == 1) ("", "does") else ("s", "do")) (status, maybeFailedRows, maybePluralSAndVerb) match { case (ConstraintSuccess, Some(0), _) => s"Column $columnName matches $regex" case (ConstraintFailure, Some(failedRows), Some((pluralS, verb))) => s"Column $columnName contains $failedRows row$pluralS that $verb not match $regex" case (ConstraintError(throwable), None, None) => s"Checking whether column $columnName matches $regex failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class RegexConstraintResultData(failedRows: Long)
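The constraint above counts non-null rows whose value fails a Pattern.find check. For most patterns, Column.rlike, which also performs a substring-style regex search, should give a close plain-Spark approximation; a sketch with illustrative data:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object RegexCheckSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("regex-check").getOrCreate()
    import spark.implicits._

    val df = Seq("a-1", "b-2", "oops", null).toDF("code")
    val regex = "^[a-z]-[0-9]$"

    // Non-null rows that do not match the pattern (nulls are ignored, as in the constraint above).
    val failed = df.filter(col("code").isNotNull && !col("code").rlike(regex)).count()
    println(s"$failed row(s) do not match $regex") // 1

    spark.stop()
  }
}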
Example 30
Source File: UniqueKeyConstraint.scala From drunken-data-quality with Apache License 2.0
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class UniqueKeyConstraint(columnNames: Seq[String]) extends Constraint { require(columnNames.nonEmpty) val fun = (df: DataFrame) => { val columns = columnNames.map(name => new Column(name)) val maybeNonUniqueRows = Try(df.groupBy(columns: _*).count.filter(new Column("count") > 1).count) UniqueKeyConstraintResult( constraint = this, data = maybeNonUniqueRows.toOption.map(UniqueKeyConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeNonUniqueRows, _ == 0) ) } } case class UniqueKeyConstraintResult(constraint: UniqueKeyConstraint, data: Option[UniqueKeyConstraintResultData], status: ConstraintStatus) extends ConstraintResult[UniqueKeyConstraint] { val message: String = { val columnNames = constraint.columnNames val columnsString = columnNames.mkString(", ") val isPlural = columnNames.length > 1 val columnNoun = "Column" + (if (isPlural) "s" else "") val columnVerb = if (isPlural) "are" else "is" val maybeNumNonUniqueTuples = data.map(_.numNonUniqueTuples) val maybePluralS = maybeNumNonUniqueTuples.map(numNonUniqueTuples => if (numNonUniqueTuples != 1) "s" else "") (status, maybeNumNonUniqueTuples, maybePluralS) match { case (ConstraintSuccess, Some(0), _) => s"$columnNoun $columnsString $columnVerb a key." case (ConstraintFailure, Some(numNonUniqueTuples), Some(pluralS)) => s"$columnNoun $columnsString $columnVerb not a key ($numNonUniqueTuples non-unique tuple$pluralS)." case (ConstraintError(throwable), None, None) => s"Checking whether ${columnNoun.toLowerCase()} $columnsString $columnVerb a key failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class UniqueKeyConstraintResultData(numNonUniqueTuples: Long)
Example 31
Source File: NeverNullConstraint.scala From drunken-data-quality with Apache License 2.0
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class NeverNullConstraint(columnName: String) extends Constraint { val fun = (df: DataFrame) => { val tryNullCount = Try(df.filter(new Column(columnName).isNull).count) NeverNullConstraintResult( constraint = this, data = tryNullCount.toOption.map(NeverNullConstraintResultData), status = ConstraintUtil.tryToStatus[Long](tryNullCount, _ == 0) ) } } case class NeverNullConstraintResult(constraint: NeverNullConstraint, data: Option[NeverNullConstraintResultData], status: ConstraintStatus) extends ConstraintResult[NeverNullConstraint] { val message: String = { val columnName = constraint.columnName val maybeNullRows = data.map(_.nullRows) val maybePluralS = maybeNullRows.map(nullRows => if (nullRows == 1) "" else "s") val maybeVerb = maybeNullRows.map(nullRows => if (nullRows == 1) "is" else "are") (status, maybeNullRows, maybePluralS, maybeVerb) match { case (ConstraintSuccess, Some(0), Some(pluralS), Some(verb)) => s"Column $columnName is never null." case (ConstraintFailure, Some(nullRows), Some(pluralS), Some(verb)) => s"Column $columnName contains $nullRows row$pluralS that $verb null (should never be null)." case (ConstraintError(throwable), None, None, None) => s"Checking column $columnName for being never null failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class NeverNullConstraintResultData(nullRows: Long)
Example 32
Source File: ApproxCountDistinct.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.hasColumn
import org.apache.spark.sql.DeequFunctions.stateful_approx_count_distinct
import org.apache.spark.sql.catalyst.expressions.aggregate.DeequHyperLogLogPlusPlusUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Column, Row}
import Analyzers._

case class ApproxCountDistinctState(words: Array[Long])
  extends DoubleValuedState[ApproxCountDistinctState] {

  override def sum(other: ApproxCountDistinctState): ApproxCountDistinctState = {
    ApproxCountDistinctState(DeequHyperLogLogPlusPlusUtils.merge(words, other.words))
  }

  override def metricValue(): Double = {
    DeequHyperLogLogPlusPlusUtils.count(words)
  }

  override def toString: String = {
    s"ApproxCountDistinctState(${words.mkString(",")})"
  }
}

case class ApproxCountDistinct(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[ApproxCountDistinctState]("ApproxCountDistinct", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    stateful_approx_count_distinct(conditionalSelection(column, where)) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[ApproxCountDistinctState] = {
    ifNoNullsIn(result, offset) { _ =>
      DeequHyperLogLogPlusPlusUtils.wordsFromBytes(result.getAs[Array[Byte]](offset))
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: Nil
  }

  override def filterCondition: Option[String] = where
}
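deequ's stateful_approx_count_distinct is a custom aggregate that exposes its HyperLogLog++ words so partial states can be merged. For a one-off estimate, Spark's built-in approx_count_distinct gives the same style of answer (the column name and data are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{approx_count_distinct, col}

object ApproxDistinctSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("approx-distinct").getOrCreate()
    import spark.implicits._

    val df = (1 to 10000).map(i => i % 1234).toDF("user_id")

    // HyperLogLog++-based estimate of the number of distinct values.
    df.agg(approx_count_distinct(col("user_id")).as("approxDistinct")).show()

    spark.stop()
  }
}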
Example 33
Source File: ForeignKeyConstraint.scala From drunken-data-quality with Apache License 2.0
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class ForeignKeyConstraint(columnNames: Seq[(String, String)], referenceTable: DataFrame) extends Constraint { val fun = (df: DataFrame) => { val renamedColumns = columnNames.map{ case (baseColumn, refColumn) => ("b_" + baseColumn, "r_" + refColumn)} val (baseColumns, refColumns) = columnNames.unzip val (renamedBaseColumns, renamedRefColumns) = renamedColumns.unzip // check if foreign key is a key in reference table val maybeNonUniqueRows = Try( referenceTable.groupBy(refColumns.map(new Column(_)):_*).count.filter(new Column("count") > 1).count ) if (maybeNonUniqueRows.toOption.exists(_ > 0)) { ForeignKeyConstraintResult( constraint = this, data = Some(ForeignKeyConstraintResultData(numNonMatchingRefs = None)), status = ConstraintFailure ) } else { // rename all columns to avoid ambiguous column references val maybeRenamedDfAndRef = maybeNonUniqueRows.map(_ => { val renamedDf = df.select(baseColumns.zip(renamedBaseColumns).map { case (original, renamed) => new Column(original).as(renamed) }: _*) val renamedRef = referenceTable.select(refColumns.zip(renamedRefColumns).map { case (original, renamed) => new Column(original).as(renamed) }: _*) (renamedDf, renamedRef) }) // check if left outer join yields some null values val maybeLeftOuterJoin = maybeRenamedDfAndRef.map { case (renamedDf, renamedRef) => val joinCondition = renamedColumns.map { case (baseColumn, refColumn) => new Column(baseColumn) === new Column(refColumn) }.reduce(_ && _) renamedDf.distinct.join(renamedRef, joinCondition, "outer") } val maybeNotMatchingRefs = maybeLeftOuterJoin.map(_.filter(renamedRefColumns.map(new Column(_).isNull).reduce(_ && _)).count) ForeignKeyConstraintResult( constraint = this, data = maybeNotMatchingRefs.toOption.map(Some(_)).map(ForeignKeyConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeNotMatchingRefs, _ == 0) ) } } } case class ForeignKeyConstraintResult(constraint: ForeignKeyConstraint, data: Option[ForeignKeyConstraintResultData], status: ConstraintStatus) extends ConstraintResult[ForeignKeyConstraint] { val message: String = { val referenceTable = constraint.referenceTable val columnNames = constraint.columnNames val columnsString = columnNames.map { case (baseCol, refCol) => baseCol + "->" + refCol }.mkString(", ") val isPlural = columnNames.length > 1 val (columnDo, columnDefine, columnIs, columnPluralS) = if (isPlural) ("do", "define", "are", "s") else ("does", "defines", "is", "") val columnNoun = "Column" + columnPluralS val maybeNumNonMatchingRefs = data.map(_.numNonMatchingRefs) (status, maybeNumNonMatchingRefs) match { case (ConstraintSuccess, Some(Some(0))) => s"$columnNoun $columnsString $columnDefine a foreign key " + s"pointing to the reference table $referenceTable." case (ConstraintFailure, Some(None)) => s"$columnNoun $columnsString $columnIs not a key in the reference table." case (ConstraintFailure, Some(Some(nonMatching))) => val (rowsNoun, rowsDo) = if (nonMatching != 1) ("rows", "do") else ("row", "does") s"$columnNoun $columnsString $columnDo not define a foreign key " + s"pointing to $referenceTable. $nonMatching $rowsNoun $rowsDo not match." case (ConstraintError(throwable), None) => s"Checking whether ${columnNoun.toLowerCase} $columnsString $columnDefine a foreign key failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class ForeignKeyConstraintResultData(numNonMatchingRefs: Option[Long])
Example 34
Source File: AnyOfConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class AnyOfConstraint(columnName: String, allowedValues: Set[Any]) extends Constraint { val fun = (df: DataFrame) => { val maybeError = Try(df.select(new Column(columnName))) // check if column is not ambiguous val maybeColumnIndex = maybeError.map(_ => df.columns.indexOf(columnName)) val maybeNotAllowedCount = maybeColumnIndex.map(columnIndex => df.rdd.filter(row => !row.isNullAt(columnIndex) && !allowedValues.contains(row.get(columnIndex))).count) AnyOfConstraintResult( constraint = this, data = maybeNotAllowedCount.toOption.map(AnyOfConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeNotAllowedCount, _ == 0) ) } } case class AnyOfConstraintResult(constraint: AnyOfConstraint, data: Option[AnyOfConstraintResultData], status: ConstraintStatus) extends ConstraintResult[AnyOfConstraint] { val message: String = { val allowed = constraint.allowedValues val columnName = constraint.columnName val maybeFailedRows = data.map(_.failedRows) val maybePluralSAndVerb = maybeFailedRows.map(failedRows => if (failedRows == 1) ("", "is") else ("s", "are")) (status, maybeFailedRows, maybePluralSAndVerb) match { case (ConstraintSuccess, Some(0), Some((pluralS, verb))) => s"Column $columnName contains only values in $allowed." case (ConstraintFailure, Some(failedRows), Some((pluralS, verb))) => s"Column $columnName contains $failedRows row$pluralS that $verb not in $allowed." case (ConstraintError(throwable), None, None) => s"Checking whether column $columnName contains only values in $allowed failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class AnyOfConstraintResultData(failedRows: Long)
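A minimal sketch (assumed data; not the ddq API): the same whitelist check written with Column.isin instead of an RDD filter, ignoring nulls just like the isNullAt guard above.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object AnyOfSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("any-of-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq("red", "green", "purple", null).toDF("colour")
    val allowed = Seq("red", "green", "blue")

    // null values are not counted, only non-null values outside the whitelist
    val failed = df.filter(col("colour").isNotNull && !col("colour").isin(allowed: _*)).count
    println(s"$failed row(s) outside $allowed")

    spark.stop()
  }
}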
Example 35
Source File: JoinableConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class JoinableConstraint(columnNames: Seq[(String, String)], referenceTable: DataFrame) extends Constraint { val fun = (df: DataFrame) => { val columnsMap = columnNames.toMap val renamedColumns = columnNames.map{ case (baseColumn, refColumn) => ("b_" + baseColumn, "r_" + refColumn)} val (baseColumns, refColumns) = columnNames.unzip val (renamedBaseColumns, renamedRefColumns) = renamedColumns.unzip val maybeNonUniqueRows = Try( referenceTable.groupBy(refColumns.map(new Column(_)):_*).count.filter(new Column("count") > 1).count ) // rename all columns to avoid ambiguous column references val maybeRenamedDfAndRef = maybeNonUniqueRows.map(_ => { val renamedDf = df.select(baseColumns.zip(renamedBaseColumns).map { case (original, renamed) => new Column(original).as(renamed) }: _*) val renamedRef = referenceTable.select(refColumns.zip(renamedRefColumns).map { case (original, renamed) => new Column(original).as(renamed) }: _*) (renamedDf, renamedRef) }) // check if join yields some values val maybeDistinctBeforeAndMatchingRows = maybeRenamedDfAndRef.map { case (renamedDf, renamedRef) => val renamedDfDistinct = renamedDf.distinct val distinctBefore = renamedDfDistinct.count val joinCondition = renamedColumns.map{ case (baseColumn, refColumn) => new Column(baseColumn) === new Column(refColumn) }.reduce(_ && _) val join = renamedDfDistinct.join(renamedRef, joinCondition) val matchingRows = join.distinct.count (distinctBefore, matchingRows) } JoinableConstraintResult( constraint = this, data = maybeDistinctBeforeAndMatchingRows.toOption.map{ case (distinctBefore, matchingRows) => JoinableConstraintResultData( distinctBefore = distinctBefore, matchingKeys = matchingRows ) }, status = ConstraintUtil.tryToStatus[Long](maybeDistinctBeforeAndMatchingRows.map{ case (distinctBefore, matchingRows) => matchingRows }, _ > 0) ) } } case class JoinableConstraintResult(constraint: JoinableConstraint, data: Option[JoinableConstraintResultData], status: ConstraintStatus) extends ConstraintResult[JoinableConstraint] { val maybeMatchRatio: Option[Double] = data.map(d => d.matchingKeys.toDouble / d.distinctBefore) val message: String = { val columnNames = constraint.columnNames val columnsString = columnNames.map{ case (baseCol, refCol) => baseCol + "->" + refCol }.mkString(", ") val maybeMatchPercentage = maybeMatchRatio.map(_ * 100.0) (status, data, maybeMatchPercentage) match { case (ConstraintSuccess, Some(JoinableConstraintResultData(distinctBefore, matchingKeys)), Some(matchPercentage)) => s"Key $columnsString can be used for joining. " + s"Join columns cardinality in base table: $distinctBefore. " + s"Join columns cardinality after joining: $matchingKeys (${"%.2f".format(matchPercentage)}" + "%)." case (ConstraintFailure, Some(_), Some(_)) => s"Key $columnsString cannot be used for joining (no result)." case (ConstraintError(throwable), None, None) => s"Checking whether $columnsString can be used for joining failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class JoinableConstraintResultData(distinctBefore: Long, matchingKeys: Long)
Example 36
Source File: ColumnColumnConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class ColumnColumnConstraint(constraintColumn: Column) extends Constraint { val fun = (df: DataFrame) => { val maybeFailingRows = Try { val succeedingRows = df.filter(constraintColumn).count df.count - succeedingRows } ColumnColumnConstraintResult( constraint = this, data = maybeFailingRows.toOption.map(ColumnColumnConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeFailingRows, _ == 0) ) } } case class ColumnColumnConstraintResult(constraint: ColumnColumnConstraint, data: Option[ColumnColumnConstraintResultData], status: ConstraintStatus) extends ConstraintResult[ColumnColumnConstraint] { val message: String = ColumnConstraintUtil.createColumnConstraintMessage( status = status, constraintResult = this, constraintString = constraint.constraintColumn.toString, maybeViolatingRows = data.map(_.failedRows) ) } case class ColumnColumnConstraintResultData(failedRows: Long)
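A minimal sketch (assumed data): counting the rows that violate an arbitrary Column predicate, which is exactly the failedRows quantity reported above.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object ColumnPredicateSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("col-pred-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq(1, 5, 10, 20).toDF("amount")
    val constraint = col("amount").between(0, 10)

    // rows where the predicate is false or null count as failing, as in the Try above
    val failing = df.count - df.filter(constraint).count
    println(s"$failing row(s) violate $constraint")

    spark.stop()
  }
}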
Example 37
Source File: ConditionalColumnConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class ConditionalColumnConstraint(statement: Column, implication: Column) extends Constraint { val fun = (df: DataFrame) => { val maybeFailingRows = Try { val succeedingRows = df.filter(!statement || implication).count df.count - succeedingRows } ConditionalColumnConstraintResult( constraint = this, data = maybeFailingRows.toOption.map(ConditionalColumnConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeFailingRows, _ == 0) ) } } case class ConditionalColumnConstraintResult(constraint: ConditionalColumnConstraint, data: Option[ConditionalColumnConstraintResultData], status: ConstraintStatus) extends ConstraintResult[ConditionalColumnConstraint] { val message: String = ColumnConstraintUtil.createColumnConstraintMessage( status = status, constraintResult = this, constraintString = s"${constraint.statement} -> ${constraint.implication}", maybeViolatingRows = data.map(_.failedRows) ) } case class ConditionalColumnConstraintResultData(failedRows: Long)
Example 38
Source File: TypeConversionConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class TypeConversionConstraint(columnName: String, convertedType: DataType) extends Constraint { val fun = (df: DataFrame) => { val originalColumn = new Column(columnName) val castedColumnName = columnName + "_casted" val maybeCasted = Try(df.select(originalColumn, originalColumn.cast(convertedType).as(castedColumnName))) val maybeFailedCastsAndOriginalType = maybeCasted.map(casted => { val failedCastsCount = casted.filter(new Column(castedColumnName).isNull && originalColumn.isNotNull).count val originalType = df.schema.find(_.name == columnName).get.dataType (failedCastsCount, originalType) }) TypeConversionConstraintResult( constraint = this, data = maybeFailedCastsAndOriginalType.toOption.map{ case (failedCastsCount, originalType) => TypeConversionConstraintResultData( originalType = originalType, failedRows = failedCastsCount ) }, status = ConstraintUtil.tryToStatus[Long](maybeFailedCastsAndOriginalType.map{ case (failedCastsCount, originalType) => failedCastsCount }, _ == 0) ) } } case class TypeConversionConstraintResult(constraint: TypeConversionConstraint, data: Option[TypeConversionConstraintResultData], status: ConstraintStatus) extends ConstraintResult[TypeConversionConstraint] { val message: String = { val convertedType = constraint.convertedType val columnName = constraint.columnName val maybePluralSVerb = data.map(data => if (data.failedRows == 1) ("", "is") else ("s", "are")) (status, data, maybePluralSVerb) match { case (ConstraintSuccess, Some(TypeConversionConstraintResultData(originalType, 0)), _) => s"Column $columnName can be converted from $originalType to $convertedType." case (ConstraintFailure, Some(TypeConversionConstraintResultData(originalType, failedRows)), Some((pluralS, verb))) => s"Column $columnName cannot be converted from $originalType to $convertedType. " + s"$failedRows row$pluralS could not be converted." case (ConstraintError(throwable), None, None) => s"Checking whether column $columnName can be converted to $convertedType failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class TypeConversionConstraintResultData(originalType: DataType, failedRows: Long)
Example 39
Source File: DateFormatConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import java.text.SimpleDateFormat import org.apache.spark.sql.functions._ import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class DateFormatConstraint(columnName: String, formatString: String) extends Constraint { val fun = (df: DataFrame) => { val cannotBeDate = udf((column: String) => column != null && Try { val format = new SimpleDateFormat(formatString) format.setLenient(false) format.parse(column) }.isFailure) val maybeCannotBeDateCount = Try(df.filter(cannotBeDate(new Column(columnName))).count) DateFormatConstraintResult( this, data = maybeCannotBeDateCount.toOption.map(DateFormatConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeCannotBeDateCount, _ == 0) ) } } case class DateFormatConstraintResult(constraint: DateFormatConstraint, data: Option[DateFormatConstraintResultData], status: ConstraintStatus) extends ConstraintResult[DateFormatConstraint] { val message: String = { val format = constraint.formatString val columnName = constraint.columnName val maybeFailedRows = data.map(_.failedRows) val maybePluralS = maybeFailedRows.map(failedRows => if (failedRows == 1) "" else "s") val maybeVerb = maybeFailedRows.map(failedRows => if (failedRows == 1) "is" else "are") (status, maybeFailedRows, maybePluralS, maybeVerb) match { case (ConstraintSuccess, Some(0), _, _) => s"Column $columnName is formatted by $format." case (ConstraintFailure, Some(failedRows), Some(pluralS), Some(verb)) => s"Column $columnName contains $failedRows row$pluralS that $verb not formatted by $format." case (ConstraintError(throwable), None, None, None) => s"Checking whether column $columnName is formatted by $format failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class DateFormatConstraintResultData(failedRows: Long)
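A minimal sketch (assumed column name and data; Spark 2.2+): the built-in to_date with an explicit pattern can flag unparseable values without a SimpleDateFormat UDF.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, to_date}

object DateFormatSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("date-format-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq("2020-01-31", "not a date", null).toDF("day")

    // non-null values that do not parse with the given pattern come back as null
    val failed = df.filter(col("day").isNotNull && to_date(col("day"), "yyyy-MM-dd").isNull).count
    println(s"$failed row(s) not formatted as yyyy-MM-dd")

    spark.stop()
  }
}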
Example 40
Source File: AlwaysNullConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class AlwaysNullConstraint(columnName: String) extends Constraint { override val fun = (df: DataFrame) => { val tryNotNullCount = Try(df.filter(new Column(columnName).isNotNull).count) AlwaysNullConstraintResult( constraint = this, status = ConstraintUtil.tryToStatus[Long](tryNotNullCount, _ == 0), data = tryNotNullCount.toOption.map(AlwaysNullConstraintResultData) ) } } case class AlwaysNullConstraintResult(constraint: AlwaysNullConstraint, status: ConstraintStatus, data: Option[AlwaysNullConstraintResultData] ) extends ConstraintResult[AlwaysNullConstraint] { val message: String = { val columnName = constraint.columnName val maybeNonNullRows = data.map(_.nonNullRows) val maybePluralS = maybeNonNullRows.map(n => if (n == 1) "" else "s") (status, maybeNonNullRows, maybePluralS) match { case (ConstraintError(throwable), None, None) => s"Checking column $columnName for being always null failed: $throwable" case (ConstraintSuccess, Some(0), Some(pluralS)) => s"Column $columnName is always null." case (ConstraintFailure, Some(nonNullRows), Some(pluralS)) => s"Column $columnName contains $nonNullRows non-null row$pluralS (should always be null)." case default => throw IllegalConstraintResultException(this) } } } case class AlwaysNullConstraintResultData(nonNullRows: Long)
Example 41
Source File: NumberOfRowsConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.functions.count import org.apache.spark.sql.{Column, DataFrame} case class NumberOfRowsConstraint private[ddq] (expected: Column) extends Constraint { val fun = (df: DataFrame) => { val countDf = df.agg(count(new Column("*")).as(NumberOfRowsConstraint.countKey)) val actual = countDf.collect().map(_.getLong(0)).apply(0) val satisfied = countDf.select(expected).collect().map(_.getBoolean(0)).apply(0) NumberOfRowsConstraintResult( constraint = this, actual = actual, status = if (satisfied) ConstraintSuccess else ConstraintFailure ) } } object NumberOfRowsConstraint { private[constraints] val countKey: String = "count" def apply(expected: Column => Column): NumberOfRowsConstraint = { new NumberOfRowsConstraint(expected(new Column(countKey))) } def greaterThan(expected: Int): NumberOfRowsConstraint = { NumberOfRowsConstraint(_ > expected) } def lessThan(expected: Int): NumberOfRowsConstraint = { NumberOfRowsConstraint(_ < expected) } def equalTo(expected: Int): NumberOfRowsConstraint = { NumberOfRowsConstraint(_ === expected) } } case class NumberOfRowsConstraintResult(constraint: NumberOfRowsConstraint, actual: Long, status: ConstraintStatus) extends ConstraintResult[NumberOfRowsConstraint] { val message: String = { val expected = constraint.expected status match { case ConstraintSuccess => s"The number of rows satisfies $expected." case ConstraintFailure => s"The actual number of rows $actual does not satisfy $expected." case default => throw IllegalConstraintResultException(this) } } }
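A minimal sketch (assumed data; plain Spark, not the ddq companion object): the constraint above evaluates a user-supplied Column => Column predicate against count("*"), which can be reproduced directly.

import org.apache.spark.sql.{Column, SparkSession}
import org.apache.spark.sql.functions.{col, count}

object RowCountSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("row-count-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq("a", "b", "c").toDF("letter")
    val expected: Column => Column = _ >= 2   // the user-supplied predicate on the row count

    val countDf = df.agg(count(col("*")).as("count"))
    val actual = countDf.collect().head.getLong(0)
    val satisfied = countDf.select(expected(col("count"))).collect().head.getBoolean(0)
    println(s"row count $actual, constraint satisfied: $satisfied")

    spark.stop()
  }
}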
Example 42
Source File: HivemallUtils.scala From hivemall-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, DataFrame, Row, UserDefinedFunction} object HivemallUtils { // # of maximum dimensions for feature vectors val maxDims = 100000000 def funcVectorizer(dense: Boolean = false, dims: Int = maxDims) : UserDefinedFunction = { udf(funcVectorizerImpl(dense, dims)) } private def funcVectorizerImpl(dense: Boolean, dims: Int) : Seq[String] => Vector = { if (dense) { // Dense features i: Seq[String] => { val features = new Array[Double](dims) i.map { ft => val s = ft.split(":").ensuring(_.size == 2) features(s(0).toInt) = s(1).toDouble } Vectors.dense(features) } } else { // Sparse features i: Seq[String] => { val features = i.map { ft => // val s = ft.split(":").ensuring(_.size == 2) val s = ft.split(":") (s(0).toInt, s(1).toDouble) } Vectors.sparse(dims, features) } } } }
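A minimal usage sketch (assumed data and column names; uses the Spark 2.x ml.linalg types rather than the older mllib ones above): turning "index:value" strings into a sparse vector column with a UDF.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object VectorizerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("vectorizer-sketch").getOrCreate()
    import spark.implicits._

    val dims = 10
    val toSparse = udf { feats: Seq[String] =>
      val pairs = feats.map { ft =>
        val Array(i, v) = ft.split(":", 2)   // "index:value"
        (i.toInt, v.toDouble)
      }
      Vectors.sparse(dims, pairs)
    }

    val df = Seq(Seq("0:1.0", "3:2.5"), Seq("1:4.0")).toDF("features")
    df.withColumn("vector", toSparse(col("features"))).show(false)

    spark.stop()
  }
}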
Example 43
Source File: udfs.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.Column import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.DoubleType import scala.collection.mutable //scalastyle:off object udfs { def get_value_at(colName: String, i: Int): Column = { udf({ vec: org.apache.spark.ml.linalg.Vector => vec(i) }, DoubleType)(col(colName)) } val to_vector: UserDefinedFunction = udf({ arr: Seq[Double] => Vectors.dense(arr.toArray) }, VectorType) def to_vector(colName: String): Column = to_vector(col(colName)) }
Example 44
Source File: UDFTransformer.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasInputCols, HasOutputCol, Wrappable} import com.microsoft.ml.spark.core.env.InternalWrapper import com.microsoft.ml.spark.core.serialize.ComplexParam import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param.{ParamMap, UDFParam, UDPyFParam} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.execution.python.UserDefinedPythonFunction import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.types.{DataType, StructField, StructType} import org.apache.spark.sql.{Column, DataFrame, Dataset} import org.apache.spark.sql.functions.col object UDFTransformer extends ComplexParamsReadable[UDFTransformer] override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) if (isSet(inputCol)) { dataset.withColumn(getOutputCol, applyUDF(dataset.col(getInputCol))) } else { dataset.withColumn(getOutputCol, applyUDFOnCols(getInputCols.map(col): _*)) } } def validateAndTransformSchema(schema: StructType): StructType = { if (isSet(inputCol)) schema(getInputCol) else schema(Set(getInputCols: _*)) schema.add(StructField(getOutputCol, getDataType)) } def transformSchema(schema: StructType): StructType = validateAndTransformSchema(schema) def copy(extra: ParamMap): UDFTransformer = defaultCopy(extra) }
Example 45
Source File: ServingUDFs.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package org.apache.spark.sql.execution.streaming import com.microsoft.ml.spark.io.http.HTTPResponseData import com.microsoft.ml.spark.io.http.HTTPSchema.{binary_to_response, empty_response, string_to_response} import org.apache.spark.sql.execution.streaming.continuous.HTTPSourceStateHolder import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.{lit, struct, to_json, udf} import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, Row} import scala.util.Try object ServingUDFs { private def jsonReply(c: Column) = string_to_response(to_json(c)) def makeReplyUDF(data: Column, dt: DataType, code: Column = lit(200), reason: Column = lit("Success")): Column = { dt match { case NullType => empty_response(code, reason) case StringType => string_to_response(data, code, reason) case BinaryType => binary_to_response(data) case _: StructType => jsonReply(data) case _: MapType => jsonReply(data) case at: ArrayType => at.elementType match { case _: StructType => jsonReply(data) case _: MapType => jsonReply(data) case _ => jsonReply(struct(data)) } case _ => jsonReply(struct(data)) } } private def sendReplyHelper(mapper: Row => HTTPResponseData)(serviceName: String, reply: Row, id: Row): Boolean = { if (Option(reply).isEmpty || Option(id).isEmpty) { null.asInstanceOf[Boolean] //scalastyle:ignore null } else { Try(HTTPSourceStateHolder.getServer(serviceName).replyTo(id.getString(0), id.getString(1), mapper(reply))) .toOption.isDefined } } def sendReplyUDF: UserDefinedFunction = { val toData = HTTPResponseData.makeFromRowConverter udf(sendReplyHelper(toData) _, BooleanType) } }
Example 46
Source File: DeltaTableOperations.scala From delta with Apache License 2.0 | 5 votes |
package io.delta.tables.execution import scala.collection.Map import org.apache.spark.sql.delta.{DeltaErrors, DeltaHistoryManager, DeltaLog, PreprocessTableUpdate} import org.apache.spark.sql.delta.commands.{DeleteCommand, DeltaGenerateCommand, VacuumCommand} import org.apache.spark.sql.delta.util.AnalysisHelper import io.delta.tables.DeltaTable import org.apache.spark.sql.{functions, Column, DataFrame, Dataset} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression} import org.apache.spark.sql.catalyst.plans.logical._ trait DeltaTableOperations extends AnalysisHelper { self: DeltaTable => protected def executeDelete(condition: Option[Expression]): Unit = improveUnsupportedOpError { val delete = DeleteFromTable(self.toDF.queryExecution.analyzed, condition) toDataset(sparkSession, delete) } protected def executeHistory(deltaLog: DeltaLog, limit: Option[Int]): DataFrame = { val history = new DeltaHistoryManager(deltaLog) val spark = self.toDF.sparkSession spark.createDataFrame(history.getHistory(limit)) } protected def executeGenerate(tblIdentifier: String, mode: String): Unit = { val tableId: TableIdentifier = sparkSession .sessionState .sqlParser .parseTableIdentifier(tblIdentifier) val generate = DeltaGenerateCommand(mode, tableId) generate.run(sparkSession) } protected def executeUpdate( set: Map[String, Column], condition: Option[Column]): Unit = improveUnsupportedOpError { val assignments = set.map { case (targetColName, column) => Assignment(UnresolvedAttribute.quotedString(targetColName), column.expr) }.toSeq val update = UpdateTable(self.toDF.queryExecution.analyzed, assignments, condition.map(_.expr)) toDataset(sparkSession, update) } protected def executeVacuum( deltaLog: DeltaLog, retentionHours: Option[Double]): DataFrame = { VacuumCommand.gc(sparkSession, deltaLog, false, retentionHours) sparkSession.emptyDataFrame } protected def toStrColumnMap(map: Map[String, String]): Map[String, Column] = { map.toSeq.map { case (k, v) => k -> functions.expr(v) }.toMap } protected def sparkSession = self.toDF.sparkSession }
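A usage sketch of the public API that fronts these helpers (assumed table location; assumes delta-core is on the classpath and a Delta table already exists at that path): io.delta.tables.DeltaTable exposes update, delete and history with Column arguments.

import io.delta.tables.DeltaTable
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit}

object DeltaOpsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("delta-ops-sketch").getOrCreate()

    // hypothetical path; DeltaTable.forPath fails if no Delta table lives there
    val table = DeltaTable.forPath(spark, "/tmp/events")

    // Column-based condition and assignments, as wired up in executeUpdate/executeDelete
    table.update(col("status") === "pending", Map("status" -> lit("expired")))
    table.delete(col("ts") < lit("2019-01-01"))
    table.history(10).show(false)

    spark.stop()
  }
}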
Example 47
Source File: HasEmbeddingsProperties.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.nlp.AnnotatorType import org.apache.spark.ml.param.{BooleanParam, IntParam, Params} import org.apache.spark.sql.Column import org.apache.spark.sql.types.MetadataBuilder trait HasEmbeddingsProperties extends Params { val dimension = new IntParam(this, "dimension", "Number of embedding dimensions") def setDimension(value: Int): this.type = set(this.dimension, value) def getDimension: Int = $(dimension) protected def wrapEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = { val metadataBuilder: MetadataBuilder = new MetadataBuilder() metadataBuilder.putString("annotatorType", AnnotatorType.WORD_EMBEDDINGS) metadataBuilder.putLong("dimension", embeddingsDim.toLong) embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref)) col.as(col.toString, metadataBuilder.build) } protected def wrapSentenceEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = { val metadataBuilder: MetadataBuilder = new MetadataBuilder() metadataBuilder.putString("annotatorType", AnnotatorType.SENTENCE_EMBEDDINGS) metadataBuilder.putLong("dimension", embeddingsDim.toLong) embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref)) col.as(col.toString, metadataBuilder.build) } }
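A minimal sketch (core Spark only; assumed column names and metadata keys): the mechanism wrapEmbeddingsMetadata relies on is Column.as(alias, Metadata), which attaches custom metadata to a column's schema field.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.MetadataBuilder

object ColumnMetadataSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("metadata-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq((1, Seq(0.1, 0.2)), (2, Seq(0.3, 0.4))).toDF("id", "embeddings")

    val meta = new MetadataBuilder()
      .putString("annotatorType", "word_embeddings")
      .putLong("dimension", 2L)
      .build()

    val annotated = df.select(col("id"), col("embeddings").as("embeddings", meta))
    println(annotated.schema("embeddings").metadata.json)

    spark.stop()
  }
}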
Example 48
Source File: PartitionHelpers.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.algo.core import org.apache.spark.sql.functions.col import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} trait PartitionHelpers { protected def getDistinctPartitions(outputDataFrame: DataFrame, targetPartitions: Seq[String]): Dataset[Row] = { val targetPartitionsColumns: Seq[Column] = targetPartitions.map(partitionString => col(partitionString)) outputDataFrame.select(targetPartitionsColumns: _*).distinct } protected def getParameterValue(row: Row, partitionString: String): String = createParameterValue(row.get(row.fieldIndex(partitionString))) protected def createParameterValue(partitionRawValue: Any): String = partitionRawValue match { case value: java.lang.Short => value.toString case value: java.lang.Integer => value.toString case value: scala.Predef.String => "'" + value + "'" case null => throw new Exception("Partition Value is null. No support for null partitions!") case value => throw new Exception("Unsupported partition DataType: " + value.getClass) } }
Example 49
Source File: FrequentItems.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ object FrequentItems extends Logging { def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
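A minimal usage sketch (assumed data): the public entry point for this internal helper is DataFrameStatFunctions.freqItems, reachable as df.stat.freqItems.

import org.apache.spark.sql.SparkSession

object FreqItemsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("freq-items-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq(1, 1, 1, 2, 2, 3, 4, 5).toDF("n")

    // items with relative frequency of at least 0.3; the result may contain false positives
    df.stat.freqItems(Seq("n"), 0.3).show(false)

    spark.stop()
  }
}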
Example 50
Source File: DataFrameModifierHelper.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.writer import com.stratio.sparta.sdk.pipeline.autoCalculations.AutoCalculatedField import com.stratio.sparta.sdk.pipeline.output.Output import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.{Column, DataFrame} object DataFrameModifierHelper { def applyAutoCalculateFields(dataFrame: DataFrame, autoCalculateFields: Seq[AutoCalculatedField], auxSchema: StructType): DataFrame = autoCalculateFields.headOption match { case Some(firstAutoCalculate) => applyAutoCalculateFields( addColumnToDataFrame(dataFrame, firstAutoCalculate, auxSchema), autoCalculateFields.drop(1), auxSchema) case None => dataFrame } private[driver] def addColumnToDataFrame(dataFrame: DataFrame, autoCalculateField: AutoCalculatedField, auxSchema: StructType): DataFrame = { (autoCalculateField.fromNotNullFields, autoCalculateField.fromPkFields, autoCalculateField.fromFields, autoCalculateField.fromFixedValue) match { case (Some(fromNotNullFields), _, _, _) => val fields = fieldsWithAuxMetadata(dataFrame.schema.fields, auxSchema.fields).flatMap(field => if (!field.nullable) Some(col(field.name)) else None).toSeq addField(fromNotNullFields.field.name, fromNotNullFields.field.outputType, dataFrame, fields) case (None, Some(fromPkFields), _, _) => val fields = fieldsWithAuxMetadata(dataFrame.schema.fields, auxSchema.fields).flatMap(field => if (field.metadata.contains(Output.PrimaryKeyMetadataKey)) Some(col(field.name)) else None).toSeq addField(fromPkFields.field.name, fromPkFields.field.outputType, dataFrame, fields) case (None, None, Some(fromFields), _) => val fields = autoCalculateField.fromFields.get.fromFields.map(field => col(field)) addField(fromFields.field.name, fromFields.field.outputType, dataFrame, fields) case (None, None, None, Some(fromFixedValue)) => addLiteral(fromFixedValue.field.name, fromFixedValue.field.outputType, dataFrame, fromFixedValue.value) case _ => dataFrame } } private[driver] def addField(name: String, outputType: String, dataFrame: DataFrame, fields: Seq[Column]): DataFrame = outputType match { case "string" => dataFrame.withColumn(name, concat_ws(Output.Separator, fields: _*)) case "array" => dataFrame.withColumn(name, array(fields: _*)) case "map" => dataFrame.withColumn(name, struct(fields: _*)) case _ => dataFrame } private[driver] def addLiteral(name: String, outputType: String, dataFrame: DataFrame, literal: String): DataFrame = outputType match { case "string" => dataFrame.withColumn(name, lit(literal)) case "array" => dataFrame.withColumn(name, array(lit(literal))) case "map" => dataFrame.withColumn(name, struct(lit(literal))) case _ => dataFrame } private[driver] def fieldsWithAuxMetadata(dataFrameFields: Array[StructField], auxFields: Array[StructField]) = dataFrameFields.map(field => { auxFields.find(auxField => auxField.name == field.name) match { case Some(auxFounded) => field.copy(metadata = auxFounded.metadata) case None => field } }) }
Example 51
Source File: ShortestPaths.scala From graphframes with Apache License 2.0 | 5 votes |
package org.graphframes.lib import java.util import scala.collection.JavaConverters._ import org.apache.spark.graphx.{lib => graphxlib} import org.apache.spark.sql.{Column, DataFrame, Row} import org.apache.spark.sql.api.java.UDF1 import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{IntegerType, MapType} import org.graphframes.GraphFrame def landmarks(value: util.ArrayList[Any]): this.type = { landmarks(value.asScala) } def run(): DataFrame = { ShortestPaths.run(graph, check(lmarks, "landmarks")) } } private object ShortestPaths { private def run(graph: GraphFrame, landmarks: Seq[Any]): DataFrame = { val idType = graph.vertices.schema(GraphFrame.ID).dataType val longIdToLandmark = landmarks.map(l => GraphXConversions.integralId(graph, l) -> l).toMap val gx = graphxlib.ShortestPaths.run( graph.cachedTopologyGraphX, longIdToLandmark.keys.toSeq.sorted).mapVertices { case (_, m) => m.toSeq } val g = GraphXConversions.fromGraphX(graph, gx, vertexNames = Seq(DISTANCE_ID)) val distanceCol: Column = if (graph.hasIntegralIdType) { // It seems there are no easy way to convert a sequence of pairs into a map val mapToLandmark = udf { distances: Seq[Row] => distances.map { case Row(k: Long, v: Int) => k -> v }.toMap } mapToLandmark(g.vertices(DISTANCE_ID)) } else { val func = new UDF1[Seq[Row], Map[Any, Int]] { override def call(t1: Seq[Row]): Map[Any, Int] = { t1.map { case Row(k: Long, v: Int) => longIdToLandmark(k) -> v }.toMap } } val mapToLandmark = udf(func, MapType(idType, IntegerType, false)) mapToLandmark(col(DISTANCE_ID)) } val cols = graph.vertices.columns.map(col) :+ distanceCol.as(DISTANCE_ID) g.vertices.select(cols: _*) } private val DISTANCE_ID = "distances" }
Example 52
Source File: VerifyVowpalWabbitRegressorFuzzing.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.vw import com.microsoft.ml.spark.core.test.benchmarks.DatasetUtils import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject} import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.{Column, DataFrame} class VerifyVowpalWabbitRegressorFuzzing extends EstimatorFuzzing[VowpalWabbitRegressor] { val numPartitions = 2 def readCSV(fileName: String, fileLocation: String): DataFrame = { session.read .option("header", "true").option("inferSchema", "true") .option("treatEmptyValuesAsNulls", "false") .option("delimiter", if (fileName.endsWith(".csv")) "," else "\t") .csv(fileLocation) } override def reader: MLReadable[_] = VowpalWabbitRegressor override def modelReader: MLReadable[_] = VowpalWabbitRegressionModel override def testObjects(): Seq[TestObject[VowpalWabbitRegressor]] = { val fileName = "energyefficiency2012_data.train.csv" val columnsFilter = Some("X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2") val labelCol = "Y1" val fileLocation = DatasetUtils.regressionTrainFile(fileName).toString val readDataset = readCSV(fileName, fileLocation).repartition(numPartitions) val dataset = if (columnsFilter.isDefined) { readDataset.select(columnsFilter.get.split(",").map(new Column(_)): _*) } else { readDataset } val featuresColumn = "features" val featurizer = new VowpalWabbitFeaturizer() .setInputCols(dataset.columns.filter(col => col != labelCol)) .setOutputCol("features") val vw = new VowpalWabbitRegressor() val predCol = "pred" val trainData = featurizer.transform(dataset) val model = vw.setLabelCol(labelCol) .setFeaturesCol("features") .setPredictionCol(predCol) .fit(trainData) Seq(new TestObject( vw, trainData)) } }
Example 53
Source File: FrequentItems.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ object FrequentItems extends Logging { def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.treeAggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
Example 54
Source File: UserDefinedFunction.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.expressions.ScalaUDF import org.apache.spark.sql.types.DataType def asNondeterministic(): UserDefinedFunction = { if (!_deterministic) { this } else { val udf = copyAll() udf._deterministic = false udf } } } // We have to use a name different than `UserDefinedFunction` here, to avoid breaking the binary // compatibility of the auto-generate UserDefinedFunction object. private[sql] object SparkUserDefinedFunction { def create( f: AnyRef, dataType: DataType, inputSchemas: Seq[Option[ScalaReflection.Schema]]): UserDefinedFunction = { val inputTypes = if (inputSchemas.contains(None)) { None } else { Some(inputSchemas.map(_.get.dataType)) } val udf = new UserDefinedFunction(f, dataType, inputTypes) udf.nullableTypes = Some(inputSchemas.map(_.map(_.nullable).getOrElse(true))) udf } }
Example 55
Source File: GroupSortedDataset.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted.sql import scala.reflect.ClassTag import org.apache.spark.sql.{ Column, Dataset, Encoder } import org.apache.spark.sql.functions.col import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder } import com.tresata.spark.sorted.{ mapStreamIterator, mapStreamIteratorWithContext, newWCreate } object GroupSortedDataset { private[sql] def apply[K: Encoder, V](dataset: Dataset[(K, V)], numPartitions: Option[Int], reverse: Boolean, sortBy: Column => Column): GroupSortedDataset[K, V] = { val key = col(dataset.columns.head) val valueSort = { val sort = sortBy(col(dataset.columns.last)) if (reverse) sort.desc else sort.asc } new GroupSortedDataset(numPartitions.map(dataset.repartition(_, key)).getOrElse(dataset.repartition(key)).sortWithinPartitions(key, valueSort)) } } class GroupSortedDataset[K: Encoder, V] private (dataset: Dataset[(K, V)]) extends Serializable { def toDS: Dataset[(K, V)] = dataset def mapStreamByKey[W: Encoder, C](c: () => C)(f: (C, Iterator[V]) => TraversableOnce[W]): Dataset[(K, W)] = { implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W]) dataset.mapPartitions(mapStreamIteratorWithContext(_)(c, f)) } def mapStreamByKey[W: Encoder](f: Iterator[V] => TraversableOnce[W]): Dataset[(K, W)] = { implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W]) dataset.mapPartitions(mapStreamIterator(_)(f)) } def foldLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = { val wCreate = newWCreate(w) mapStreamByKey(iter => Iterator(iter.foldLeft(wCreate())(f))) } def reduceLeftByKey[W >: V: Encoder](f: (W, V) => W): Dataset[(K, W)] = mapStreamByKey(iter => Iterator(iter.reduceLeft(f))) def scanLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = { val wCreate = newWCreate(w) mapStreamByKey(_.scanLeft(wCreate())(f)) } }
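A minimal sketch (assumed data; plain Spark rather than the spark-sorted API): the core of GroupSortedDataset is a repartition on the key column followed by sortWithinPartitions, after which each partition can be streamed key by key.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object GroupSortSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("group-sort-sketch").getOrCreate()
    import spark.implicits._

    val ds = Seq(("a", 3), ("b", 1), ("a", 1), ("b", 2)).toDS()

    val groupSorted = ds
      .repartition(col("_1"))                          // all values of a key land in one partition
      .sortWithinPartitions(col("_1"), col("_2").asc)  // and arrive sorted per key

    // within a partition, keys are contiguous and values ascending
    groupSorted.mapPartitions(_.map { case (k, v) => s"$k:$v" }).show(false)

    spark.stop()
  }
}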
Example 56
Source File: Filter.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.common import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.spark.sql.{Column, DataFrame} class Filter extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Filter by condition" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) var condition: String = _ override def setProperties(map: Map[String, Any]): Unit = { condition = MapUtil.get(map,"condition").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val condition = new PropertyDescriptor().name("condition"). displayName("condition") .description("The condition you want to filter") .defaultValue("name=='zhangsan'") .required(true) .example("name=='zhangsan'") descriptor = condition :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/common/SelectField.png") } override def getGroup(): List[String] = { List(StopGroup.CommonGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val df = in.read() var filterDF : DataFrame = df.filter(condition) out.write(filterDF) } }
Example 57
Source File: SelectField.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.common import cn.piflow._ import cn.piflow.conf._ import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.spark.sql.{Column, DataFrame} import scala.beans.BeanProperty class SelectField extends ConfigurableStop { val authorEmail: String = "[email protected]" val description: String = "Select data column" val inportList: List[String] = List(Port.DefaultPort) val outportList: List[String] = List(Port.DefaultPort) var columnNames:String = _ def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val df = in.read() val field = columnNames.split(",").map(x => x.trim) val columnArray : Array[Column] = new Array[Column](field.size) for(i <- 0 to field.size - 1){ columnArray(i) = new Column(field(i)) } var finalFieldDF : DataFrame = df.select(columnArray:_*) out.write(finalFieldDF) } def initialize(ctx: ProcessContext): Unit = { } def setProperties(map : Map[String, Any]): Unit = { columnNames = MapUtil.get(map,"columnNames").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val inports = new PropertyDescriptor() .name("columnNames") .displayName("ColumnNames") .description("Select the column you want,multiple columns separated by commas") .defaultValue("") .required(true) .example("id,name") descriptor = inports :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/common/SelectField.png") } override def getGroup(): List[String] = { List(StopGroup.CommonGroup) } }
Example 58
Source File: Join.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.common import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.spark.sql.{Column, DataFrame} class Join extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Table joins include full join, left join, right join and inner join" override val inportList: List[String] =List(Port.LeftPort,Port.RightPort) override val outportList: List[String] = List(Port.DefaultPort) var joinMode:String=_ var correlationColumn:String=_ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val leftDF = in.read(Port.LeftPort) val rightDF = in.read(Port.RightPort) var seq: Seq[String]= Seq() correlationColumn.split(",").foreach(x=>{ seq = seq .++(Seq(x.trim.toString)) }) var df: DataFrame = null joinMode match { case "inner" =>df = leftDF.join(rightDF, seq) case "left" => df = leftDF.join(rightDF,seq,"left_outer") case "right" => df = leftDF.join(rightDF,seq,"right_outer") case "full_outer" => df = leftDF.join(rightDF,seq,"outer") } out.write(df) } override def setProperties(map: Map[String, Any]): Unit = { joinMode = MapUtil.get(map,"joinMode").asInstanceOf[String] correlationColumn = MapUtil.get(map,"correlationColumn").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val joinMode = new PropertyDescriptor() .name("joinMode") .displayName("JoinMode") .description("For table associations, you can choose inner,left,right,full") .allowableValues(Set("inner","left","right","full_outer")) .defaultValue("inner") .required(true) .example("left") descriptor = joinMode :: descriptor val correlationColumn = new PropertyDescriptor() .name("correlationColumn") .displayName("CorrelationColumn") .description("Columns associated with tables,if multiple are separated by commas") .defaultValue("") .required(true) .example("id,name") descriptor = correlationColumn :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/common/Join.png") } override def getGroup(): List[String] = { List(StopGroup.CommonGroup) } override def initialize(ctx: ProcessContext): Unit = { } }
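A minimal sketch (assumed data): the four join modes above map directly onto DataFrame.join with a Seq of common column names and a join-type string.

import org.apache.spark.sql.SparkSession

object JoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("join-sketch").getOrCreate()
    import spark.implicits._

    val left  = Seq((1, "a"), (2, "b")).toDF("id", "l")
    val right = Seq((2, "x"), (3, "y")).toDF("id", "r")

    left.join(right, Seq("id")).show()                 // inner
    left.join(right, Seq("id"), "left_outer").show()   // left
    left.join(right, Seq("id"), "right_outer").show()  // right
    left.join(right, Seq("id"), "outer").show()        // full outer

    spark.stop()
  }
}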
Example 59
Source File: JsonUtil.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.util import org.apache.spark.sql.functions.explode import org.apache.spark.sql.{Column, DataFrame, SQLContext, SparkSession} import scala.collection.mutable.ArrayBuffer object JsonUtil extends Serializable{ // The tag you want to parse. To expand an array field, write it as MasterField_ChildField, e.g. links_name def ParserJsonDF(df:DataFrame,tag:String): DataFrame = { var openArrField:String="" var ArrSchame:String="" var tagARR: Array[String] = tag.split(",") var tagNew:String="" for(tt<-tagARR){ if(tt.indexOf("_")> -1){ // the tag contains "_" val openField: Array[String] = tt.split("_") openArrField=openField(0) ArrSchame+=(openField(1)+",") }else{ tagNew+=(tt+",") } } tagNew+=openArrField ArrSchame=ArrSchame.substring(0,ArrSchame.length-1) tagARR = tagNew.split(",") var FinalDF:DataFrame=df // if the user specified fields to return var strings: Seq[Column] =tagNew.split(",").toSeq.map(p => new Column(p)) if(tag.length>0){ val df00 = FinalDF.select(strings : _*) FinalDF=df00 } // if the user chose an array field to expand and supplied a schema if(openArrField.length>0&&ArrSchame.length>0){ val schames: Array[String] = ArrSchame.split(",") var selARR:ArrayBuffer[String]=ArrayBuffer()// collects the expanded field paths // iterate over the tags and wrap each field in a Column object var coARR:ArrayBuffer[Column]=ArrayBuffer()// used by the select on the expanded field val sss = tagNew.split(",")// used by toDF after the field has been expanded var co: Column =null for(each<-tagARR){ if(each==openArrField){ co = explode(FinalDF(openArrField)) for(x<-schames){ selARR+=(openArrField+"."+x) } }else{ selARR+=each co=FinalDF(each) } coARR+=co } println("###################") selARR.foreach(println(_)) var selSEQ: Seq[Column] = selARR.toSeq.map(q => new Column(q)) var df01: DataFrame = FinalDF.select(coARR : _*).toDF(sss:_*) FinalDF = df01.select(selSEQ : _*) } FinalDF } }
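A minimal sketch (assumed JSON shape; Spark 2.2+ for reading JSON from a Dataset[String]): the two building blocks ParserJsonDF combines are explode on an array column and a select with dotted paths into the exploded struct.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, explode}

object JsonExplodeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("json-explode-sketch").getOrCreate()
    import spark.implicits._

    val json = Seq(
      """{"name":"n1","links":[{"href":"/a","rel":"self"},{"href":"/b","rel":"next"}]}"""
    ).toDS()
    val df = spark.read.json(json)

    df.select(col("name"), explode(col("links")).as("links"))
      .select(col("name"), col("links.href"), col("links.rel"))
      .show(false)

    spark.stop()
  }
}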
Example 60
Source File: FieldPointer.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.model.pointers import org.apache.spark.sql import org.apache.spark.sql.Column import org.archive.archivespark.model._ import org.archive.archivespark.util.SelectorUtil trait GenericFieldPointer[+R <: EnrichRoot, +T] extends Serializable { this: FieldPointer[_, _] => } trait FieldPointer[Root <: EnrichRoot, T] extends GenericFieldPointer[Root, T] { def path[R <: Root](root: EnrichRootCompanion[R]): Seq[String] def get(root: Root): Option[T] = enrichable(root).map(_.get) def exists(root: Root): Boolean = root[T](path(root)).isDefined def enrichable(root: Root): Option[TypedEnrichable[T]] = { val initialized = init(root, excludeFromOutput = false) initialized[T](path(initialized)) } def multi: MultiFieldPointer[Root, T] = new SingleToMultiFieldPointer[Root, T](this) def init[R <: Root](root: R, excludeFromOutput: Boolean): R = root def pathTo[R <: Root](root: EnrichRootCompanion[R], field: String): Seq[String] = path(root) ++ SelectorUtil.parse(field) def col(root: EnrichRootCompanion[Root]): Column = sql.functions.col(SelectorUtil.toString(path(root).filter(f => f != "*" && !f.startsWith("[")))) def parent[A]: FieldPointer[Root, A] = new RelativeFieldPointer(this, 1, Seq.empty) def child[A](field: String): FieldPointer[Root, A] = new RelativeFieldPointer(this, 0, Seq(field)) def sibling[A](field: String): FieldPointer[Root, A] = new RelativeFieldPointer(this, 1, Seq(field)) def mapEnrichable[A](field: String)(f: TypedEnrichable[T] => A): EnrichFunc[Root, T, A] = { val sourcePointer = this new EnrichFunc[Root, T, A] { override def source: FieldPointer[Root, T] = sourcePointer override def fields: Seq[String] = Seq(field) override def derive(source: TypedEnrichable[T], derivatives: Derivatives): Unit = { derivatives << f(source) } } } def map[A](field: String)(f: T => A): EnrichFunc[Root, T, A] = mapEnrichable(field)(e => f(e.get)) def mapMultiEnrichable[A](field: String)(f: TypedEnrichable[T] => Seq[A]): MultiEnrichFunc[Root, T, A] = { val sourcePointer = this new MultiEnrichFunc[Root, T, A] { override def source: FieldPointer[Root, T] = sourcePointer override def fields: Seq[String] = Seq(field) override def derive(source: TypedEnrichable[T], derivatives: Derivatives): Unit = { derivatives.setNext(MultiValueEnrichable(f(source))) } } } def mapMulti[A](field: String)(f: T => Seq[A]): MultiEnrichFunc[Root, T, A] = mapMultiEnrichable(field)(e => f(e.get)) def mapIdentity(field: String): EnrichFunc[Root, T, T] = { val sourcePointer = this new EnrichFunc[Root, T, T] { override def source: FieldPointer[Root, T] = sourcePointer override def fields: Seq[String] = Seq(field) override def derive(source: TypedEnrichable[T], derivatives: Derivatives): Unit = { derivatives.setNext(IdentityField[T]) } } } } object FieldPointer { def apply[Root <: EnrichRoot, T](path: String): FieldPointer[Root, T] = apply(SelectorUtil.parse(path)) def apply[Root <: EnrichRoot, T](path: Seq[String]): FieldPointer[Root, T] = new PathFieldPointer(path) def multi[Root <: EnrichRoot, T](path: String): MultiFieldPointer[Root, T] = multi(SelectorUtil.parse(path)) def multi[Root <: EnrichRoot, T](path: Seq[String]): MultiFieldPointer[Root, T] = apply(path).multi def root[Root <: TypedEnrichRoot[T], T]: FieldPointer[Root, T] = new PathFieldPointer(Seq.empty) }
Example 61
Source File: FrequentItems.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ object FrequentItems extends Logging { def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
Example 62
Source File: SchemaColumnSelection.scala From data-faker with MIT License | 5 votes |
package com.dunnhumby.datafaker.schema.table.columns import scala.reflect.runtime.universe.TypeTag import java.sql.{Date, Timestamp} import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol import org.apache.spark.sql.Column import org.apache.spark.sql.functions.{rand, udf} case class SchemaColumnSelection[T](override val name: String, values: List[T])(implicit tag: TypeTag[T]) extends SchemaColumn { override def column(rowID: Option[Column] = None): Column = { val intToSelectionUDF = udf((index: Int) => { values(index) }) intToSelectionUDF(rand() * values.length % values.length) } } object SchemaColumnSelectionProtocol extends SchemaColumnSelectionProtocol trait SchemaColumnSelectionProtocol extends YamlParserProtocol { import net.jcazevedo.moultingyaml._ implicit object SchemaColumnSelectionFormat extends YamlFormat[SchemaColumnSelection[_]] { override def read(yaml: YamlValue): SchemaColumnSelection[_] = { val fields = yaml.asYamlObject.fields val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError("data_type not set")) val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set")) val values = fields.getOrElse(YamlString("values"), deserializationError("selection values not set")) dataType match { case SchemaColumnDataType.Int => SchemaColumnSelection(name, values.convertTo[List[Int]]) case SchemaColumnDataType.Long => SchemaColumnSelection(name, values.convertTo[List[Long]]) case SchemaColumnDataType.Float => SchemaColumnSelection(name, values.convertTo[List[Float]]) case SchemaColumnDataType.Double => SchemaColumnSelection(name, values.convertTo[List[Double]]) case SchemaColumnDataType.Date => SchemaColumnSelection(name, values.convertTo[List[Date]]) case SchemaColumnDataType.Timestamp => SchemaColumnSelection(name, values.convertTo[List[Timestamp]]) case SchemaColumnDataType.String => SchemaColumnSelection(name, values.convertTo[List[String]]) case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Selection}") } } override def write(obj: SchemaColumnSelection[_]): YamlValue = ??? } }
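A minimal sketch (assumed values and column name): the rand-plus-UDF pattern above, picking a random element of a fixed list for every row.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{floor, rand, udf}

object SelectionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("selection-sketch").getOrCreate()

    val values = Seq("bronze", "silver", "gold")
    val pick = udf((i: Int) => values(i))

    // rand() is in [0, 1), so the floored product is always a valid index
    spark.range(5)
      .withColumn("tier", pick(floor(rand() * values.length).cast("int")))
      .show(false)

    spark.stop()
  }
}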
Example 63
Source File: UniqueValueRatio.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Analyzers.COUNT_COL import com.amazon.deequ.metrics.DoubleMetric import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.functions.{col, count, lit, sum} import org.apache.spark.sql.types.DoubleType case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None) extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns) with FilterableAnalyzer { override def aggregationFunctions(numRows: Long): Seq[Column] = { sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) :: count("*") :: Nil } override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = { val numUniqueValues = result.getDouble(offset) val numDistinctValues = result.getLong(offset + 1).toDouble toSuccessMetric(numUniqueValues / numDistinctValues) } override def filterCondition: Option[String] = where } object UniqueValueRatio { def apply(column: String): UniqueValueRatio = { new UniqueValueRatio(column :: Nil) } def apply(column: String, where: Option[String]): UniqueValueRatio = { new UniqueValueRatio(column :: Nil, where) } }
Example 64
Source File: Mean.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.functions.{count, sum} import org.apache.spark.sql.types.{DoubleType, StructType, LongType} import Analyzers._ case class MeanState(sum: Double, count: Long) extends DoubleValuedState[MeanState] { override def sum(other: MeanState): MeanState = { MeanState(sum + other.sum, count + other.count) } override def metricValue(): Double = { if (count == 0L) Double.NaN else sum / count } } case class Mean(column: String, where: Option[String] = None) extends StandardScanShareableAnalyzer[MeanState]("Mean", column) with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { sum(conditionalSelection(column, where)).cast(DoubleType) :: count(conditionalSelection(column, where)).cast(LongType) :: Nil } override def fromAggregationResult(result: Row, offset: Int): Option[MeanState] = { ifNoNullsIn(result, offset, howMany = 2) { _ => MeanState(result.getDouble(offset), result.getLong(offset + 1)) } } override protected def additionalPreconditions(): Seq[StructType => Unit] = { hasColumn(column) :: isNumeric(column) :: Nil } override def filterCondition: Option[String] = where }
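A minimal sketch (assumed data; plain Spark, not the deequ internals): the conditional-selection idea behind the where parameter, i.e. nulling out rows that fail the filter with when() so that aggregates ignore them.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{avg, col, expr, when}

object ConditionalMeanSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("conditional-mean-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq(("de", 10.0), ("de", 20.0), ("us", 100.0)).toDF("country", "price")

    // rows not matching the predicate become null and are skipped by avg/sum/count
    val conditional = when(expr("country = 'de'"), col("price"))
    df.agg(avg(conditional).as("mean_price_de")).show()

    spark.stop()
  }
}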
Example 65
Source File: ArrangePostprocessor.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.postprocessors import com.typesafe.config.Config import it.agilelab.bigdata.DataQuality.checks.CheckResult import it.agilelab.bigdata.DataQuality.metrics.MetricResult import it.agilelab.bigdata.DataQuality.sources.HdfsFile import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig import it.agilelab.bigdata.DataQuality.utils import it.agilelab.bigdata.DataQuality.utils.DQSettings import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter} import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, NumericType} import org.apache.spark.sql.{Column, DataFrame, SQLContext} import scala.collection.JavaConversions._ final class ArrangePostprocessor(config: Config, settings: DQSettings) extends BasicPostprocessor(config, settings) { private case class ColumnSelector(name: String, tipo: Option[String] = None, format: Option[String] = None, precision: Option[Integer] = None) { def toColumn()(implicit df: DataFrame): Column = { val dataType: Option[NumericType with Product with Serializable] = tipo.getOrElse("").toUpperCase match { case "DOUBLE" => Some(DoubleType) case "INT" => Some(IntegerType) case "LONG" => Some(LongType) case _ => None } import org.apache.spark.sql.functions.format_number import org.apache.spark.sql.functions.format_string (dataType, precision, format) match { case (Some(dt), None, None) => df(name).cast(dt) case(Some(dt), None, Some(f)) => format_string(f, df(name).cast(dt)).alias(name) case (Some(dt), Some(p),None) => format_number(df(name).cast(dt), p).alias(name) case (None, Some(p), None) => format_number(df(name), p).alias(name) case (None, None, Some(f)) => format_string(f, df(name)).alias(name) case _ => df(name) } } } private val vs = config.getString("source") private val target: HdfsTargetConfig = { val conf = config.getConfig("saveTo") utils.parseTargetConfig(conf)(settings).get } private val columns: Seq[ColumnSelector] = config.getAnyRefList("columnOrder").map { case x: String => ColumnSelector(x) case x: java.util.HashMap[_, String] => { val (name, v) = x.head.asInstanceOf[String Tuple2 _] v match { case v: String => ColumnSelector(name, Option(v)) case v: java.util.HashMap[String, _] => { val k = v.head._1 val f = v.head._2 f match { case f: Integer => ColumnSelector(name, Option(k), None, Option(f)) case f: String => ColumnSelector(name, Option(k), Option(f)) } } } } } override def process(vsRef: Set[HdfsFile], metRes: Seq[MetricResult], chkRes: Seq[CheckResult])( implicit fs: FileSystem, sqlContext: SQLContext, settings: DQSettings): HdfsFile = { val reqVS: HdfsFile = vsRef.filter(vr => vr.id == vs).head implicit val df: DataFrame = HdfsReader.load(reqVS, settings.ref_date).head val arrangeDF = df.select(columns.map(_.toColumn): _*) HdfsWriter.saveVirtualSource(arrangeDF, target, settings.refDateString)( fs, sqlContext.sparkContext) new HdfsFile(target) } }
Example 66
Source File: FramelessSyntax.scala From frameless with Apache License 2.0 | 5 votes |
package frameless import org.apache.spark.sql.{Column, DataFrame, Dataset} trait FramelessSyntax { implicit class ColumnSyntax(self: Column) { def typedColumn[T, U: TypedEncoder]: TypedColumn[T, U] = new TypedColumn[T, U](self) def typedAggregate[T, U: TypedEncoder]: TypedAggregate[T, U] = new TypedAggregate[T, U](self) } implicit class DatasetSyntax[T: TypedEncoder](self: Dataset[T]) { def typed: TypedDataset[T] = TypedDataset.create[T](self) } implicit class DataframeSyntax(self: DataFrame){ def unsafeTyped[T: TypedEncoder]: TypedDataset[T] = TypedDataset.createUnsafe(self) } }
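A sketch of how this syntax might be used, assuming the trait is brought into scope via a local object (frameless also exposes it through its own syntax imports) and that TypedEncoder derivation for the case class is available.

import frameless.{TypedColumn, TypedDataset}
import org.apache.spark.sql.SparkSession

// Any object extending FramelessSyntax brings the implicit classes into scope.
object syntax extends frameless.FramelessSyntax
import syntax._

case class Person(name: String, age: Int)

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val ds = Seq(Person("ada", 36), Person("bob", 42)).toDS()

val typed: TypedDataset[Person] = ds.typed                              // DatasetSyntax
val age: TypedColumn[Person, Int] = ds("age").typedColumn[Person, Int]  // ColumnSyntax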
Example 67
Source File: basics.scala From odsc-west-streaming-trends with GNU General Public License v3.0 | 5 votes |
//spark-shell -i basics.scala import org.apache.spark.sql.types._ import spark.implicits._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset, Column, Row} case class Coffee( name: String, roast:Int, region:String, bean: String, acidity:Int = 1, bitterness:Int = 1, flavors: Seq[String] ) case class CoffeeRating( coffeeName: String, score: Int, notes: Option[String] = None ) val availableCoffee = Seq( Coffee(name="folgers", roast=2, region="US", bean="robusta", acidity=7, bitterness=10, flavors=Seq("nutty")), Coffee(name="yuban", roast=2, region="Mexico", bean="robusta", acidity=6, bitterness=7, flavors=Seq("nutty")), Coffee(name="nespresso", roast=2, region="Cuba", bean="arabica", acidity=5, bitterness=3, flavors=Seq("nutty", "chocolate")), Coffee(name="ritual", roast=1, region="Brazil", bean="arabica", acidity=2, bitterness=1, flavors=Seq("fruity", "floral", "chocolate")), Coffee(name="four barrel", roast=1, region="Columbia", bean="arabica", flavors=Seq("nutty", "fruity")) ) val rawCoffeeRatings = Seq( CoffeeRating("folgers",1,Some("terrible")), CoffeeRating("folgers",2,Some("meh")), CoffeeRating("yuban",3,Some("worth the money")), CoffeeRating("nespresso",2,Some("it's coffee")), CoffeeRating("ritual",5,Some("fantastic")), CoffeeRating("four barrel",3), CoffeeRating("four barrel",5,Some("my fav")), CoffeeRating("ritual",4) ) def expandArray(df: DataFrame, col: Column): DataFrame = { val colName = col.toString() val values = df .selectExpr(s"explode($colName) as $colName") .select(col).distinct() .map { _.getString(0) } .collect().toSeq val expandedRows = values.foldLeft[DataFrame](df)( (d, v) => d.withColumn(v, when(array_contains(col, v), 1).otherwise(0)) ) expandedRows } // take the available coffee and add it to the stand val coffeeStand = spark.createDataset(availableCoffee) val coffeeRatings = spark.createDataset(rawCoffeeRatings) val coffeeWithRatings = coffeeStand.join(coffeeRatings, coffeeStand("name") === coffeeRatings("coffeeName")).drop("coffeeName") val sparkWay = coffeeWithRatings.groupBy("name").agg(avg("score") as "rating").sort(desc("rating")) // create memory sql table coffeeWithRatings.createOrReplaceTempView("coffee_ratings") val sqlWay = spark.sql("select name, avg(score) as rating from coffee_ratings GROUP BY name ORDER BY rating DESC") sparkWay.explain(true) sparkWay.show(10, false) sqlWay.explain(true) sqlWay.show(10, false)
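A short continuation of the script above (same spark-shell session assumed) showing expandArray one-hot encoding the flavors column; the flavor column names come from the sample data.

// Each distinct flavor becomes its own 0/1 column on the coffee stand DataFrame.
val flavorMatrix = expandArray(coffeeStand.toDF(), col("flavors"))
flavorMatrix.select("name", "nutty", "fruity", "chocolate", "floral").show(10, false)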
Example 68
Source File: FrequentItems.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.Logging import org.apache.spark.sql.{Column, DataFrame, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types.{ArrayType, StructField, StructType} private[sql] object FrequentItems extends Logging { private[sql] def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4, s"support ($support) must be greater than 1e-4.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) } val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toSeq) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes new DataFrame(df.sqlContext, LocalRelation(schema, Seq(resultRow))) } }
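singlePassFreqItems is the internal implementation behind the public df.stat.freqItems call; a quick sketch with made-up data:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq(1, 1, 1, 2, 3, 1, 2).map(i => (i, s"item_$i")).toDF("id", "label")

// Output columns are named <input>_freqItems and hold arrays of candidate
// frequent items (the result may contain false positives, never false negatives).
df.stat.freqItems(Seq("id", "label"), 0.4).show(false)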
Example 69
Source File: SchemaColumnFixed.scala From data-faker with MIT License | 5 votes |
package com.dunnhumby.datafaker.schema.table.columns import java.sql.{Date, Timestamp} import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol import org.apache.spark.sql.Column import org.apache.spark.sql.functions.lit case class SchemaColumnFixed[T](override val name: String, value: T) extends SchemaColumn { override def column(rowID: Option[Column] = None): Column = lit(value) } object SchemaColumnFixedProtocol extends SchemaColumnFixedProtocol trait SchemaColumnFixedProtocol extends YamlParserProtocol { import net.jcazevedo.moultingyaml._ implicit object SchemaColumnFixedFormat extends YamlFormat[SchemaColumnFixed[_]] { override def read(yaml: YamlValue): SchemaColumnFixed[_] = { val fields = yaml.asYamlObject.fields val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set")) val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError(s"data_type not set for $name")) val value = fields.getOrElse(YamlString("value"), deserializationError(s"value not set for $name")) dataType match { case SchemaColumnDataType.Int => SchemaColumnFixed(name, value.convertTo[Int]) case SchemaColumnDataType.Long => SchemaColumnFixed(name, value.convertTo[Long]) case SchemaColumnDataType.Float => SchemaColumnFixed(name, value.convertTo[Float]) case SchemaColumnDataType.Double => SchemaColumnFixed(name, value.convertTo[Double]) case SchemaColumnDataType.Date => SchemaColumnFixed(name, value.convertTo[Date]) case SchemaColumnDataType.Timestamp => SchemaColumnFixed(name, value.convertTo[Timestamp]) case SchemaColumnDataType.String => SchemaColumnFixed(name, value.convertTo[String]) case SchemaColumnDataType.Boolean => SchemaColumnFixed(name, value.convertTo[Boolean]) case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Fixed}") } } override def write(obj: SchemaColumnFixed[_]): YamlValue = ??? } }
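A small sketch of the fixed column in isolation (hypothetical column name and value); column() simply returns lit(value), so every generated row carries the same constant.

import com.dunnhumby.datafaker.schema.table.columns.SchemaColumnFixed
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

val country = SchemaColumnFixed("country", "GB")
spark.range(3).select(country.column().as(country.name)).show()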
Example 70
Source File: SchemaColumnSequential.scala From data-faker with MIT License | 5 votes |
package com.dunnhumby.datafaker.schema.table.columns import java.sql.{Date, Timestamp} import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol import org.apache.spark.sql.Column import org.apache.spark.sql.functions.{to_utc_timestamp, from_unixtime, monotonically_increasing_id, to_date} trait SchemaColumnSequential[T] extends SchemaColumn object SchemaColumnSequential { def apply(name: String, start: Int, step: Int): SchemaColumn = SchemaColumnSequentialNumeric(name, start, step) def apply(name: String, start: Long, step: Long): SchemaColumn = SchemaColumnSequentialNumeric(name, start, step) def apply(name: String, start: Float, step: Float): SchemaColumn = SchemaColumnSequentialNumeric(name, start, step) def apply(name: String, start: Double, step: Double): SchemaColumn = SchemaColumnSequentialNumeric(name, start, step) def apply(name: String, start: Date, step: Int): SchemaColumn = SchemaColumnSequentialDate(name, start, step) def apply(name: String, start: Timestamp, step: Int): SchemaColumn = SchemaColumnSequentialTimestamp(name, start, step) } private case class SchemaColumnSequentialNumeric[T: Numeric](override val name: String, start: T, step: T) extends SchemaColumnSequential[T] { override def column(rowID: Option[Column] = Some(monotonically_increasing_id)): Column = (rowID.get * step) + start } private case class SchemaColumnSequentialTimestamp(override val name: String, start: Timestamp, stepSeconds: Int) extends SchemaColumnSequential[Timestamp] { override def column(rowID: Option[Column] = Some(monotonically_increasing_id)): Column = { val startTime = start.getTime / 1000 to_utc_timestamp(from_unixtime(rowID.get * stepSeconds + startTime), "UTC") } } private case class SchemaColumnSequentialDate(override val name: String, start: Date, stepDays: Int) extends SchemaColumnSequential[Date] { val timestamp = SchemaColumnSequentialTimestamp(name, new Timestamp(start.getTime), stepDays * 86400) override def column(rowID: Option[Column]): Column = to_date(timestamp.column()) } object SchemaColumnSequentialProtocol extends SchemaColumnSequentialProtocol trait SchemaColumnSequentialProtocol extends YamlParserProtocol { import net.jcazevedo.moultingyaml._ implicit object SchemaColumnSequentialFormat extends YamlFormat[SchemaColumnSequential[_]] { override def read(yaml: YamlValue): SchemaColumnSequential[_] = { val fields = yaml.asYamlObject.fields val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError("data_type not set")) val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set")) val start = fields.getOrElse(YamlString("start"), deserializationError("start not set")) val step = fields.getOrElse(YamlString("step"), deserializationError("step not set")) dataType match { case "Int" => SchemaColumnSequentialNumeric(name, start.convertTo[Int], step.convertTo[Int]) case "Long" => SchemaColumnSequentialNumeric(name, start.convertTo[Long], step.convertTo[Long]) case "Float" => SchemaColumnSequentialNumeric(name, start.convertTo[Float], step.convertTo[Float]) case "Double" => SchemaColumnSequentialNumeric(name, start.convertTo[Double], step.convertTo[Double]) case "Date" => SchemaColumnSequentialDate(name, start.convertTo[Date], step.convertTo[Int]) case "Timestamp" => SchemaColumnSequentialTimestamp(name, start.convertTo[Timestamp], step.convertTo[Int]) case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Sequential}") } } override def write(obj: SchemaColumnSequential[_]): YamlValue = ??? } }
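A usage sketch for the sequential column (hypothetical name, start, and step). The rowID is passed explicitly here: when the value is typed as the base SchemaColumn, the default argument resolved at the call site is None, so the numeric subclass's Some(monotonically_increasing_id) default would not apply.

import com.dunnhumby.datafaker.schema.table.columns.SchemaColumnSequential
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.monotonically_increasing_id

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Produces start + rowID * step; monotonically_increasing_id is unique but not
// guaranteed consecutive across partitions, so gaps between values can occur.
val userId = SchemaColumnSequential("user_id", 100L, 10L)
spark.range(5)
  .select(userId.column(Some(monotonically_increasing_id())).as(userId.name))
  .show()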
Example 71
Source File: SchemaColumnRandom.scala From data-faker with MIT License | 5 votes |
package com.dunnhumby.datafaker.schema.table.columns import java.sql.{Date, Timestamp} import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol import org.apache.spark.sql.Column import org.apache.spark.sql.functions.{to_utc_timestamp, round, rand, from_unixtime, to_date} import org.apache.spark.sql.types.{IntegerType, LongType} trait SchemaColumnRandom[T] extends SchemaColumn object SchemaColumnRandom { val FloatDP = 3 val DoubleDP = 3 def apply(name: String, min: Int, max: Int): SchemaColumn = SchemaColumnRandomNumeric(name, min, max) def apply(name: String, min: Long, max: Long): SchemaColumn = SchemaColumnRandomNumeric(name, min, max) def apply(name: String, min: Float, max: Float): SchemaColumn = SchemaColumnRandomNumeric(name, min, max) def apply(name: String, min: Double, max: Double): SchemaColumn = SchemaColumnRandomNumeric(name, min, max) def apply(name: String, min: Date, max: Date): SchemaColumn = SchemaColumnRandomDate(name, min, max) def apply(name: String, min: Timestamp, max: Timestamp): SchemaColumn = SchemaColumnRandomTimestamp(name, min, max) def apply(name: String): SchemaColumn = SchemaColumnRandomBoolean(name) } private case class SchemaColumnRandomNumeric[T: Numeric](override val name: String, min: T, max: T) extends SchemaColumnRandom[T] { override def column(rowID: Option[Column] = None): Column = { import Numeric.Implicits._ (min, max) match { case (_: Int, _: Int) => round(rand() * (max - min) + min, 0).cast(IntegerType) case (_: Long, _: Long) => round(rand() * (max - min) + min, 0).cast(LongType) case (_: Float, _: Float) => round(rand() * (max - min) + min, SchemaColumnRandom.FloatDP) case (_: Double, _: Double) => round(rand() * (max - min) + min, SchemaColumnRandom.DoubleDP) } } } private case class SchemaColumnRandomTimestamp(override val name: String, min: Timestamp, max: Timestamp) extends SchemaColumnRandom[Timestamp] { override def column(rowID: Option[Column] = None): Column = { val minTime = min.getTime / 1000 val maxTime = max.getTime / 1000 to_utc_timestamp(from_unixtime(rand() * (maxTime - minTime) + minTime), "UTC") } } private case class SchemaColumnRandomDate(override val name: String, min: Date, max: Date) extends SchemaColumnRandom[Date] { val timestamp = SchemaColumnRandomTimestamp(name, new Timestamp(min.getTime), new Timestamp(max.getTime + 86400000)) override def column(rowID: Option[Column] = None): Column = to_date(timestamp.column()) } private case class SchemaColumnRandomBoolean(override val name: String) extends SchemaColumnRandom[Boolean] { override def column(rowID: Option[Column] = None): Column = rand() < 0.5f } object SchemaColumnRandomProtocol extends SchemaColumnRandomProtocol trait SchemaColumnRandomProtocol extends YamlParserProtocol { import net.jcazevedo.moultingyaml._ implicit object SchemaColumnRandomFormat extends YamlFormat[SchemaColumnRandom[_]] { override def read(yaml: YamlValue): SchemaColumnRandom[_] = { val fields = yaml.asYamlObject.fields val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set")) val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError(s"data_type not set for $name")) if (dataType == SchemaColumnDataType.Boolean) { SchemaColumnRandomBoolean(name) } else { val min = fields.getOrElse(YamlString("min"), deserializationError(s"min not set for $name")) val max = fields.getOrElse(YamlString("max"), deserializationError(s"max not set for $name")) dataType match { case SchemaColumnDataType.Int => SchemaColumnRandomNumeric(name, min.convertTo[Int], max.convertTo[Int]) case SchemaColumnDataType.Long => SchemaColumnRandomNumeric(name, min.convertTo[Long], max.convertTo[Long]) case SchemaColumnDataType.Float => SchemaColumnRandomNumeric(name, min.convertTo[Float], max.convertTo[Float]) case SchemaColumnDataType.Double => SchemaColumnRandomNumeric(name, min.convertTo[Double], max.convertTo[Double]) case SchemaColumnDataType.Date => SchemaColumnRandomDate(name, min.convertTo[Date], max.convertTo[Date]) case SchemaColumnDataType.Timestamp => SchemaColumnRandomTimestamp(name, min.convertTo[Timestamp], max.convertTo[Timestamp]) case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Random}") } } } override def write(obj: SchemaColumnRandom[_]): YamlValue = ??? } }
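A sketch of the random columns with hypothetical names and ranges; integral types are rounded to whole numbers and cast, fractional types keep three decimal places, and the single-argument overload produces a Boolean.

import com.dunnhumby.datafaker.schema.table.columns.SchemaColumnRandom
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

val age   = SchemaColumnRandom("age", 18, 65)
val price = SchemaColumnRandom("price", 0.99, 99.99)
val optIn = SchemaColumnRandom("opt_in")

spark.range(5)
  .select(age.column().as(age.name),
          price.column().as(price.name),
          optIn.column().as(optIn.name))
  .show()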
Example 72
Source File: Maximum.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.functions.max import org.apache.spark.sql.types.{DoubleType, StructType} import Analyzers._ case class MaxState(maxValue: Double) extends DoubleValuedState[MaxState] { override def sum(other: MaxState): MaxState = { MaxState(math.max(maxValue, other.maxValue)) } override def metricValue(): Double = { maxValue } } case class Maximum(column: String, where: Option[String] = None) extends StandardScanShareableAnalyzer[MaxState]("Maximum", column) with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { max(conditionalSelection(column, where)).cast(DoubleType) :: Nil } override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] = { ifNoNullsIn(result, offset) { _ => MaxState(result.getDouble(offset)) } } override protected def additionalPreconditions(): Seq[StructType => Unit] = { hasColumn(column) :: isNumeric(column) :: Nil } override def filterCondition: Option[String] = where }
Example 73
Source File: SchemaColumn.scala From data-faker with MIT License | 5 votes |
package com.dunnhumby.datafaker.schema.table.columns import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol import org.apache.spark.sql.Column abstract class SchemaColumn { def name: String def column(rowID: Option[Column] = None): Column } object SchemaColumnDataType { val Int = "Int" val Long = "Long" val Float = "Float" val Double = "Double" val Date = "Date" val Timestamp = "Timestamp" val String = "String" val Boolean = "Boolean" } object SchemaColumnType { val Fixed = "Fixed" val Random = "Random" val Selection = "Selection" val Sequential = "Sequential" val Expression = "Expression" } object SchemaColumnProtocol extends YamlParserProtocol with SchemaColumnFixedProtocol with SchemaColumnRandomProtocol with SchemaColumnSelectionProtocol with SchemaColumnSequentialProtocol with SchemaColumnExpressionProtocol { import net.jcazevedo.moultingyaml._ implicit object SchemaColumnFormat extends YamlFormat[SchemaColumn] { override def read(yaml: YamlValue): SchemaColumn = { val fields = yaml.asYamlObject.fields val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set")) val YamlString(columnType) = fields.getOrElse(YamlString("column_type"), deserializationError(s"column_type not set for $name")) columnType match { case SchemaColumnType.Fixed => yaml.convertTo[SchemaColumnFixed[_]] case SchemaColumnType.Random => yaml.convertTo[SchemaColumnRandom[_]] case SchemaColumnType.Selection => yaml.convertTo[SchemaColumnSelection[_]] case SchemaColumnType.Sequential => yaml.convertTo[SchemaColumnSequential[_]] case SchemaColumnType.Expression => yaml.convertTo[SchemaColumnExpression] case _ => deserializationError(s"unsupported column_type: $columnType") } } override def write(obj: SchemaColumn): YamlValue = ??? } }
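SchemaColumnFormat dispatches on column_type to the matching per-type protocol, so a YAML snippet can be deserialized straight into a SchemaColumn. A sketch assuming moultingyaml's parseYaml string syntax and the Fixed column from the earlier example:

import com.dunnhumby.datafaker.schema.table.columns.{SchemaColumn, SchemaColumnProtocol}
import net.jcazevedo.moultingyaml._
import SchemaColumnProtocol._

val yaml =
  """name: country
    |column_type: Fixed
    |data_type: String
    |value: GB
    |""".stripMargin

// Dispatches to SchemaColumnFixedProtocol and yields a SchemaColumnFixed[String].
val column: SchemaColumn = yaml.parseYaml.convertTo[SchemaColumn]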
Example 74
Source File: FrequentItems.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ object FrequentItems extends Logging { def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
Example 75
Source File: YelpHelpers.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.integration.yelp import org.apache.spark.sql.types.{ArrayType, DateType, IntegerType, LongType} import org.apache.spark.sql.{Column, DataFrame, SparkSession, functions} import org.opencypher.morpheus.api.io.GraphElement.sourceIdKey import org.opencypher.morpheus.api.io.Relationship.{sourceEndNodeKey, sourceStartNodeKey} import org.opencypher.morpheus.impl.table.SparkTable._ import org.opencypher.morpheus.integration.yelp.YelpConstants._ object YelpHelpers { case class YelpTables( userDf: DataFrame, businessDf: DataFrame, reviewDf: DataFrame ) def loadYelpTables(inputPath: String)(implicit spark: SparkSession): YelpTables = { import spark.implicits._ log("read business.json", 2) val rawBusinessDf = spark.read.json(s"$inputPath/business.json") log("read review.json", 2) val rawReviewDf = spark.read.json(s"$inputPath/review.json") log("read user.json", 2) val rawUserDf = spark.read.json(s"$inputPath/user.json") val businessDf = rawBusinessDf.select($"business_id".as(sourceIdKey), $"business_id", $"name", $"address", $"city", $"state") val reviewDf = rawReviewDf.select($"review_id".as(sourceIdKey), $"user_id".as(sourceStartNodeKey), $"business_id".as(sourceEndNodeKey), $"stars", $"date".cast(DateType)) val userDf = rawUserDf.select( $"user_id".as(sourceIdKey), $"name", $"yelping_since".cast(DateType), functions.split($"elite", ",").cast(ArrayType(LongType)).as("elite")) YelpTables(userDf, businessDf, reviewDf) } def printYelpStats(inputPath: String)(implicit spark: SparkSession): Unit = { val rawBusinessDf = spark.read.json(s"$inputPath/business.json") val rawReviewDf = spark.read.json(s"$inputPath/review.json") import spark.implicits._ rawBusinessDf.select($"city", $"state").distinct().show() rawBusinessDf.withColumnRenamed("business_id", "id") .join(rawReviewDf, $"id" === $"business_id") .groupBy($"city", $"state") .count().as("count") .orderBy($"count".desc, $"state".asc) .show(100) } def extractYelpCitySubset(inputPath: String, outputPath: String, city: String)(implicit spark: SparkSession): Unit = { import spark.implicits._ def emailColumn(userId: String): Column = functions.concat($"$userId", functions.lit("@yelp.com")) val rawUserDf = spark.read.json(s"$inputPath/user.json") val rawReviewDf = spark.read.json(s"$inputPath/review.json") val rawBusinessDf = spark.read.json(s"$inputPath/business.json") val businessDf = rawBusinessDf.filter($"city" === city) val reviewDf = rawReviewDf .join(businessDf, Seq("business_id"), "left_semi") .withColumn("user_email", emailColumn("user_id")) .withColumnRenamed("stars", "stars_tmp") .withColumn("stars", $"stars_tmp".cast(IntegerType)) .drop("stars_tmp") val userDf = rawUserDf .join(reviewDf, Seq("user_id"), "left_semi") .withColumn("email", emailColumn("user_id")) val friendDf = userDf .select($"email".as("user1_email"), functions.explode(functions.split($"friends", ", ")).as("user2_id")) .withColumn("user2_email", emailColumn("user2_id")) .select(s"user1_email", s"user2_email") businessDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/business.json") reviewDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/review.json") userDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/user.json") friendDf.write.json(s"$outputPath/$cityGraphName/$yelpBookDB/friend.json") } implicit class DataFrameOps(df: DataFrame) { def prependIdColumn(idColumn: String, prefix: String): DataFrame = df.transformColumns(idColumn)(column => functions.concat(functions.lit(prefix), column).as(idColumn)) } }
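A sketch of driving the helpers end to end; the input/output paths and city are placeholders, and the raw Yelp JSON dumps (business.json, review.json, user.json) are assumed to exist under the input path.

import org.apache.spark.sql.SparkSession
import org.opencypher.morpheus.integration.yelp.YelpHelpers._

implicit val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()

// Load the three raw tables with the projections defined above.
val tables = loadYelpTables("/data/yelp")
tables.reviewDf.printSchema()

// Write out a per-city subset (business/review/user plus a friend edge list).
extractYelpCitySubset("/data/yelp", "/data/yelp-subset", city = "Boulder")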
Example 76
Source File: EncodeLong.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.impl.expressions import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, NullIntolerant, UnaryExpression} import org.apache.spark.sql.types.{BinaryType, DataType, LongType} import org.opencypher.morpheus.api.value.MorpheusElement._ case class EncodeLong(child: Expression) extends UnaryExpression with NullIntolerant with ExpectsInputTypes { override val dataType: DataType = BinaryType override val inputTypes: Seq[LongType] = Seq(LongType) override protected def nullSafeEval(input: Any): Any = EncodeLong.encodeLong(input.asInstanceOf[Long]) override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = defineCodeGen(ctx, ev, c => s"(byte[])(${EncodeLong.getClass.getName.dropRight(1)}.encodeLong($c))") } object EncodeLong { private final val moreBytesBitMask: Long = Integer.parseInt("10000000", 2) private final val varLength7BitMask: Long = Integer.parseInt("01111111", 2) private final val otherBitsMask = ~varLength7BitMask private final val maxBytesForLongVarEncoding = 10 // Same encoding as as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding @inline final def encodeLong(l: Long): Array[Byte] = { val tempResult = new Array[Byte](maxBytesForLongVarEncoding) var remainder = l var index = 0 while ((remainder & otherBitsMask) != 0) { tempResult(index) = ((remainder & varLength7BitMask) | moreBytesBitMask).toByte remainder >>>= 7 index += 1 } tempResult(index) = remainder.toByte val result = new Array[Byte](index + 1) System.arraycopy(tempResult, 0, result, 0, index + 1) result } // Same encoding as as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding @inline final def decodeLong(input: Array[Byte]): Long = { assert(input.nonEmpty, "`decodeLong` requires a non-empty array as its input") var index = 0 var currentByte = input(index) var decoded = currentByte & varLength7BitMask var nextLeftShift = 7 while ((currentByte & moreBytesBitMask) != 0) { index += 1 currentByte = input(index) decoded |= (currentByte & varLength7BitMask) << nextLeftShift nextLeftShift += 7 } assert(index == input.length - 1, s"`decodeLong` received an input array ${input.toSeq.toHex} with extra bytes that could not be decoded.") decoded } implicit class ColumnLongOps(val c: Column) extends AnyVal { def encodeLongAsMorpheusId(name: String): Column = encodeLongAsMorpheusId.as(name) def encodeLongAsMorpheusId: Column = new Column(EncodeLong(c.expr)) } }
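A round-trip sketch of the varint helpers plus the Column-level syntax; the DataFrame and column names are illustrative.

import org.apache.spark.sql.SparkSession
import org.opencypher.morpheus.impl.expressions.EncodeLong._

// Plain JVM round trip of the Base 128 varint encoding.
assert(decodeLong(encodeLong(300L)) == 300L)

// Column-level usage: wraps the column in the EncodeLong Catalyst expression.
val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

Seq(1L, 300L, Long.MaxValue).toDF("id")
  .select($"id".encodeLongAsMorpheusId("id_bytes"))
  .printSchema() // id_bytes is binary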
Example 77
Source File: DatasetFilter.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.util import org.apache.spark.sql.{Column, DataFrame, Dataset} import org.apache.spark.sql.functions.substring case class DatasetFilter(lastModifiedGte: Option[java.sql.Timestamp] = None, current: Option[Boolean] = None, pathPrefix: Option[String] = None) { def applyFilter[T](ds: Dataset[T], forAnalysis: Boolean): Dataset[T] = { // Apply filters if a filter value was supplied, and the corresponding column exists in ds val lastModifiedOptionalPredicate: Option[Column] = if (ds.columns.contains("lastModified")) lastModifiedGte.map(ds("lastModified") >= _) else None val currentOptionalPredicate = if (ds.columns.contains("current")) current.map(ds("current") > _) else None val pathOptionalPredicate = if (ds.columns.contains("path")) pathPrefix.map { pathPrefix => substring(ds("path"), 0, pathPrefix.length) === pathPrefix } else None val temporalOptionalPredicate = (lastModifiedOptionalPredicate, currentOptionalPredicate) match { case (Some(lastModifiedPredicate), Some(currentPredicate)) if forAnalysis => Some(lastModifiedPredicate && currentPredicate) case (Some(lastModifiedPredicate), Some(currentPredicate)) if !forAnalysis => Some(lastModifiedPredicate || currentPredicate) case (Some(lastModifiedPredicate), _) => Some(lastModifiedPredicate) case (_, Some(currentPredicate)) => Some(currentPredicate) case _ => None } val overallOptionalPredicate = (pathOptionalPredicate, temporalOptionalPredicate) match { case (Some(pathPredicate), Some(currentPredicate)) => Some(pathPredicate && currentPredicate) case (Some(pathPredicate), _) => Some(pathPredicate) case (_, Some(temporalPredicate)) => Some(temporalPredicate) case _ => None } overallOptionalPredicate.fold(ds)(ds.filter) } }
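A sketch with a hypothetical infoton DataFrame; predicates are only built when both the filter value is supplied and the corresponding column exists, and when both temporal predicates apply they are ANDed for analysis and ORed otherwise.

import cmwell.analytics.util.DatasetFilter
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val infotons = Seq(
  ("/example.org/a", java.sql.Timestamp.valueOf("2023-01-02 00:00:00"), true),
  ("/other.org/b",   java.sql.Timestamp.valueOf("2022-06-01 00:00:00"), false)
).toDF("path", "lastModified", "current")

val filter = DatasetFilter(
  lastModifiedGte = Some(java.sql.Timestamp.valueOf("2023-01-01 00:00:00")),
  pathPrefix = Some("/example.org"))

// Keeps only recent rows whose path starts with the prefix.
filter.applyFilter(infotons, forAnalysis = true).show(false)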
Example 78
Source File: PathWithKeyFields.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.data import cmwell.analytics.util.{CassandraSystem, DatasetFilter, KeyFields} import com.datastax.spark.connector._ import com.datastax.spark.connector.rdd.CassandraTableScanRDD import org.apache.spark.sql.{Column, DataFrame, Dataset, SparkSession} object PathWithKeyFields extends EstimateDatasetSize { private val BytesPerRow = 8 + (3 * 8) + (16 + 8 + 32) // bit mask, fixed, variable override def estimateDatasetSize(implicit spark: SparkSession): Long = CassandraSystem.rowCount(table = "path") * BytesPerRow case class Columns(path: Column, lastModified: Column, uuid: Column) { def this(dataset: DataFrame, prefix: String = "") = this( path = dataset(prefix + "path"), lastModified = dataset(prefix + "lastModified"), uuid = dataset(prefix + "uuid")) } def isWellFormed(dataset: DataFrame, prefix: String = ""): Column = { val columns = new Columns(dataset, prefix) Constraints.isPathWellFormed(columns.path) && Constraints.isLastModifiedCasWellFormed(columns.lastModified) && Constraints.isUuidWellFormed(columns.uuid) } def apply(datasetFilter: Option[DatasetFilter] = None) (implicit spark: SparkSession): Dataset[KeyFields] = { // We can push filters on last_modified down to Cassandra. // CQL doesn't support filtering on path prefix. def pushDownDatasetFilter(scan: CassandraTableScanRDD[CassandraRow]): CassandraTableScanRDD[CassandraRow] = datasetFilter.fold(scan)(_.lastModifiedGte.fold(scan)(scan.where("last_modified >= ?", _))) val infotonRdd = pushDownDatasetFilter(spark.sparkContext.cassandraTable("data2", "path")) .select("path", "last_modified", "uuid") val objectRDD = infotonRdd.map { cassandraRow => KeyFields( path = cassandraRow.getString("path"), lastModified = new java.sql.Timestamp(cassandraRow.getDateTime("last_modified").getMillis), uuid = cassandraRow.getString("uuid")) } import spark.implicits._ val ds = spark.createDataset(objectRDD) datasetFilter.fold(ds)(_.applyFilter(ds, forAnalysis = false)) } }
Example 79
Source File: InfotonType.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.data import org.apache.spark.sql.Column object InfotonType { // In Elasticsearch indexes, the infoton type string is a full word in Pascal casing. private val ElasticsearchRepresentation = Seq( "ObjectInfoton", "FileInfoton", "LinkInfoton", "DeletedInfoton", "CompoundInfoton", "GhostInfoton") // In the infoton table, the type field is the first letter of the infoton type name in lower case. private val CassandraRepresentation: Seq[String] = ElasticsearchRepresentation.map(_.substring(0, 1).toLowerCase) def isWellFormedCas(column: Column): Column = column.isin(CassandraRepresentation: _*) def isWellFormedEs(column: Column): Column = column.isin(ElasticsearchRepresentation: _*) }
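A quick sketch of the Cassandra-side check with made-up type codes; "o" and "f" are valid single-letter codes, while "x" is not.

import cmwell.analytics.data.InfotonType
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

Seq("o", "f", "x").toDF("casType")
  .withColumn("wellFormed", InfotonType.isWellFormedCas($"casType"))
  .show()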
Example 80
Source File: AnalyzeInconsistenciesResult.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import java.io.File import java.nio.charset.StandardCharsets.UTF_8 import cmwell.analytics.data.InfotonAndIndexWithSystemFields import cmwell.analytics.util.Connector import org.apache.commons.io.FileUtils import org.apache.log4j.LogManager import org.apache.spark.sql.{Column, DataFrame, Row} import org.rogach.scallop.{ScallopConf, ScallopOption} import scala.collection.breakOut object AnalyzeInconsistenciesResult { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(AnalyzeInconsistenciesResult.getClass) try { object Opts extends ScallopConf(args) { val in: ScallopOption[String] = opt[String]("in", short = 'i', descr = "The path to read the (parquet) inconsistencies dataset from", required = true) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the (csv) output to", required = true) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) verify() } Connector( appName = "Analyze InfotonAndIndexWithSystemFields Output", sparkShell = Opts.shell() ).withSparkSessionDo { spark => val ds: DataFrame = spark.read.parquet(Opts.in()) import org.apache.spark.sql.functions._ // A column expression that counts the number of failures for each constraint. // This will also include null counts, needed to interpret the results. val constraints: Seq[(String, Column)] = InfotonAndIndexWithSystemFields.constraints(ds).map { case (name, predicate) => name -> sum(when(predicate, 0L).otherwise(1L)).as(name) }(breakOut) // Compute the failure counts val failureCounts: Row = ds.agg(constraints.head._2, constraints.tail.map(_._2): _*).head val results = for { i <- constraints.indices constraintName = constraints(i)._1 failureCount = if (failureCounts.isNullAt(i)) 0 else failureCounts.getAs[Long](i) } yield s"$constraintName,$failureCount" FileUtils.write(new File(Opts.out()), "constraint,failures\n" + results.mkString("\n"), UTF_8) } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
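The core of the job is the constraint-to-failure-count aggregation; a standalone sketch of that pattern with made-up constraints and data:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{sum, when}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val ds = Seq(("a", 1), ("b", -2), (null, 3)).toDF("name", "score")

// Rows where a predicate is false (or null) count as failures for that constraint.
val constraints = Seq(
  "nameNotNull"   -> ds("name").isNotNull,
  "scorePositive" -> (ds("score") > 0)
).map { case (name, predicate) => sum(when(predicate, 0L).otherwise(1L)).as(name) }

ds.agg(constraints.head, constraints.tail: _*).show()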