org.apache.spark.sql.functions.col Scala Examples
The following examples show how to use org.apache.spark.sql.functions.col.
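Before the project examples, here is a minimal, self-contained sketch of the basic pattern (not taken from any of the projects below; the SparkSession setup, column names, and data are illustrative assumptions): col turns a column name into a Column object that can be used in select, filter, ordering, and expressions.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

val spark = SparkSession.builder().master("local[*]").appName("col-basics").getOrCreate()
import spark.implicits._

val people = Seq(("alice", 34), ("bob", 19)).toDF("name", "age")

people
  .select(col("name"), (col("age") + 1).as("age_next_year")) // col builds a Column; Columns support arithmetic and aliases
  .filter(col("age_next_year") > 21)                         // and comparisons, usable as filter predicates
  .orderBy(col("name"))
  .show()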
Example 1
Source File: SparkBindingsTest.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.schema

import com.microsoft.ml.spark.core.test.base.TestBase
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.{col, udf}

case class Foo(a: Int, b: String, c: Seq[Bar])

object Foo extends SparkBindings[Foo]

case class Bar(a: Int, c: Seq[Byte])

object Bar extends SparkBindings[Bar]

class SparkBindingsTest2 extends TestBase {

  import session.implicits._

  test("Test to make sure there are no strange memory leaks") {
    (1 to 40).foreach { i =>
      val foos = (0 to 40).map(i =>
        Tuple1(Foo(i, i.toString, Seq(Bar(i, "foo".getBytes)))))
      val converter = Foo.makeFromRowConverter
      val df = foos.toDF("foos")
        .repartition(2)
        .withColumn("mapped2",
          udf({ r: Row => converter(r) }, Foo.schema)(col("foos")))
      val results = df.collect().toList
      println(results.head)
    }
  }

}
Example 2
Source File: HashingTF.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
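The file above is the HashingTF transformer itself; its transform method uses dataset.select(col("*"), ...) to append the hashed output column while keeping every existing column. As a rough usage sketch (not part of the quoted source; the data and column names are hypothetical), the transformer can be applied to a tokenized column like this:

import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("hashing-tf-usage").getOrCreate()
import spark.implicits._

val sentences = Seq(
  Seq("spark", "sql", "functions"),
  Seq("hashing", "tf", "example")
).toDF("words")                 // input must be an ArrayType column, as transformSchema requires

val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("features")
  .setNumFeatures(1 << 10)

// Internally this is the select(col("*"), udf(...)) shown above:
// the "features" column is appended and all existing columns are kept.
hashingTF.transform(sentences).show(false)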
Example 3
Source File: VectorAssemblerSuite.scala From iolap with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row import org.apache.spark.sql.functions.col class VectorAssemblerSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new VectorAssembler) } test("assemble") { import org.apache.spark.ml.feature.VectorAssembler.assemble assert(assemble(0.0) === Vectors.sparse(1, Array.empty, Array.empty)) assert(assemble(0.0, 1.0) === Vectors.sparse(2, Array(1), Array(1.0))) val dv = Vectors.dense(2.0, 0.0) assert(assemble(0.0, dv, 1.0) === Vectors.sparse(4, Array(1, 3), Array(2.0, 1.0))) val sv = Vectors.sparse(2, Array(0, 1), Array(3.0, 4.0)) assert(assemble(0.0, dv, 1.0, sv) === Vectors.sparse(6, Array(1, 3, 4, 5), Array(2.0, 1.0, 3.0, 4.0))) for (v <- Seq(1, "a", null)) { intercept[SparkException](assemble(v)) intercept[SparkException](assemble(1.0, v)) } } test("assemble should compress vectors") { import org.apache.spark.ml.feature.VectorAssembler.assemble val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0)) assert(v1.isInstanceOf[SparseVector]) val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0))) assert(v2.isInstanceOf[DenseVector]) } test("VectorAssembler") { val df = sqlContext.createDataFrame(Seq( (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L) )).toDF("id", "x", "y", "name", "z", "n") val assembler = new VectorAssembler() .setInputCols(Array("x", "y", "z", "n")) .setOutputCol("features") assembler.transform(df).select("features").collect().foreach { case Row(v: Vector) => assert(v === Vectors.sparse(6, Array(1, 2, 4, 5), Array(1.0, 2.0, 3.0, 10.0))) } } test("ML attributes") { val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", "safari") val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0) val user = new AttributeGroup("user", Array( NominalAttribute.defaultAttr.withName("gender").withValues("male", "female"), NumericAttribute.defaultAttr.withName("salary"))) val row = (1.0, 0.5, 1, Vectors.dense(1.0, 1000.0), Vectors.sparse(2, Array(1), Array(2.0))) val df = sqlContext.createDataFrame(Seq(row)).toDF("browser", "hour", "count", "user", "ad") .select( col("browser").as("browser", browser.toMetadata()), col("hour").as("hour", hour.toMetadata()), col("count"), // "count" is an integer column without ML attribute col("user").as("user", user.toMetadata()), col("ad")) // "ad" is a vector column without ML attribute val assembler = new VectorAssembler() .setInputCols(Array("browser", "hour", "count", "user", "ad")) .setOutputCol("features") val output = assembler.transform(df) val schema = output.schema val features = AttributeGroup.fromStructField(schema("features")) assert(features.size === 7) val browserOut = features.getAttr(0) assert(browserOut === browser.withIndex(0).withName("browser")) val hourOut = features.getAttr(1) assert(hourOut === hour.withIndex(1).withName("hour")) val countOut = features.getAttr(2) assert(countOut === NumericAttribute.defaultAttr.withName("count").withIndex(2)) val userGenderOut = features.getAttr(3) assert(userGenderOut === user.getAttr("gender").withName("user_gender").withIndex(3)) val userSalaryOut = features.getAttr(4) 
assert(userSalaryOut === user.getAttr("salary").withName("user_salary").withIndex(4)) assert(features.getAttr(5) === NumericAttribute.defaultAttr.withIndex(5)) assert(features.getAttr(6) === NumericAttribute.defaultAttr.withIndex(6)) } }
Example 4
Source File: OneHotEncoderSuite.scala From iolap with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext { def stringIndexed(): DataFrame = { val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) val df = sqlContext.createDataFrame(data).toDF("id", "label") val indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex") .fit(df) indexer.transform(df) } test("params") { ParamsSuite.checkParams(new OneHotEncoder) } test("OneHotEncoder dropLast = false") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") .setDropLast(false) val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1), vec(2)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0), (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0)) assert(output === expected) } test("OneHotEncoder dropLast = true") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0), (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0)) assert(output === expected) } test("input column with ML attribute") { val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large") val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("size") .select(col("size").as("size", attr.toMetadata())) val encoder = new OneHotEncoder() .setInputCol("size") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("size_is_small").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("size_is_medium").withIndex(1)) } test("input column without ML attribute") { val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("index") val encoder = new OneHotEncoder() .setInputCol("index") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("index_is_0").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("index_is_1").withIndex(1)) } }
Example 5
Source File: HashingTF.scala From iolap with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
Example 6
Source File: WholeStageCodegenSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution import org.apache.spark.sql.Row import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions.{avg, broadcast, col, max} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{IntegerType, StringType, StructType} class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext { test("range/filter should be combined") { val df = spark.range(10).filter("id = 1").selectExpr("id + 1") val plan = df.queryExecution.executedPlan assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined) assert(df.collect() === Array(Row(2))) } test("Aggregate should be included in WholeStageCodegen") { val df = spark.range(10).groupBy().agg(max(col("id")), avg(col("id"))) val plan = df.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) assert(df.collect() === Array(Row(9, 4.5))) } test("Aggregate with grouping keys should be included in WholeStageCodegen") { val df = spark.range(3).groupBy("id").count().orderBy("id") val plan = df.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) assert(df.collect() === Array(Row(0, 1), Row(1, 1), Row(2, 1))) } test("BroadcastHashJoin should be included in WholeStageCodegen") { val rdd = spark.sparkContext.makeRDD(Seq(Row(1, "1"), Row(1, "1"), Row(2, "2"))) val schema = new StructType().add("k", IntegerType).add("v", StringType) val smallDF = spark.createDataFrame(rdd, schema) val df = spark.range(10).join(broadcast(smallDF), col("k") === col("id")) assert(df.queryExecution.executedPlan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[BroadcastHashJoinExec]).isDefined) assert(df.collect() === Array(Row(1, 1, "1"), Row(1, 1, "1"), Row(2, 2, "2"))) } test("Sort should be included in WholeStageCodegen") { val df = spark.range(3, 0, -1).toDF().sort(col("id")) val plan = df.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SortExec]).isDefined) assert(df.collect() === Array(Row(1), Row(2), Row(3))) } test("MapElements should be included in WholeStageCodegen") { import testImplicits._ val ds = spark.range(10).map(_.toString) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SerializeFromObjectExec]).isDefined) assert(ds.collect() === 0.until(10).map(_.toString).toArray) } test("typed filter should be included in WholeStageCodegen") { val ds = spark.range(10).filter(_ % 2 == 0) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined) assert(ds.collect() === Array(0, 2, 4, 6, 8)) } test("back-to-back typed filter should be included in WholeStageCodegen") { val ds = spark.range(10).filter(_ % 2 == 0).filter(_ % 3 == 0) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined) 
assert(ds.collect() === Array(0, 6)) } test("simple typed UDAF should be included in WholeStageCodegen") { import testImplicits._ val ds = Seq(("a", 10), ("b", 1), ("b", 2), ("c", 1)).toDS() .groupByKey(_._1).agg(typed.sum(_._2)) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0))) } }
Example 7
Source File: HashingTF.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
Example 8
Source File: ParserSuite.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.io.split1 import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import com.microsoft.ml.spark.io.http._ import org.apache.http.client.methods.HttpPost import org.apache.spark.ml.Transformer import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, SparkSession} trait ParserUtils extends WithServer { def sampleDf(spark: SparkSession): DataFrame = { val df = spark.createDataFrame((1 to 10).map(Tuple1(_))) .toDF("data") val df2 = new JSONInputParser().setInputCol("data") .setOutputCol("parsedInput").setUrl(url) .transform(df) .withColumn("unparsedOutput", udf({ x: Int => HTTPResponseData( Array(), Some(EntityData( "{\"foo\": \"here\"}".getBytes, None, None, None, false, false, false)), StatusLineData(ProtocolVersionData("foo", 1, 1), 200, "bar"), "en") }).apply(col("data")) ) new JSONOutputParser() .setDataType(new StructType().add("foo", StringType)) .setInputCol("unparsedOutput") .setOutputCol("parsedOutput") .transform(df2) } def makeTestObject[T <: Transformer](t: T, session: SparkSession): Seq[TestObject[T]] = { Seq(new TestObject(t, sampleDf(session))) } } class JsonInputParserSuite extends TransformerFuzzing[JSONInputParser] with ParserUtils { override def testObjects(): Seq[TestObject[JSONInputParser]] = makeTestObject( new JSONInputParser().setInputCol("data").setOutputCol("out") .setUrl(url), session) override def reader: MLReadable[_] = JSONInputParser } class JsonOutputParserSuite extends TransformerFuzzing[JSONOutputParser] with ParserUtils { override def testObjects(): Seq[TestObject[JSONOutputParser]] = makeTestObject( new JSONOutputParser().setInputCol("unparsedOutput").setOutputCol("out") .setDataType(new StructType().add("foo", StringType)), session) override def reader: MLReadable[_] = JSONOutputParser } class StringOutputParserSuite extends TransformerFuzzing[StringOutputParser] with ParserUtils { override def testObjects(): Seq[TestObject[StringOutputParser]] = makeTestObject( new StringOutputParser().setInputCol("unparsedOutput").setOutputCol("out"), session) override def reader: MLReadable[_] = StringOutputParser } class CustomInputParserSuite extends TransformerFuzzing[CustomInputParser] with ParserUtils { override def testObjects(): Seq[TestObject[CustomInputParser]] = makeTestObject( new CustomInputParser().setInputCol("data").setOutputCol("out") .setUDF({ x: Int => new HttpPost(s"http://$x") }), session) override def reader: MLReadable[_] = CustomInputParser } class CustomOutputParserSuite extends TransformerFuzzing[CustomOutputParser] with ParserUtils { override def testObjects(): Seq[TestObject[CustomOutputParser]] = makeTestObject( new CustomOutputParser().setInputCol("unparsedOutput").setOutputCol("out") .setUDF({ x: HTTPResponseData => x.locale }), session) override def reader: MLReadable[_] = CustomOutputParser }
Example 9
Source File: HTTPSuite.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.io.split2

import java.io.File

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.io.http.HTTPSchema.string_to_response
import org.apache.http.impl.client.HttpClientBuilder
import org.apache.spark.sql.execution.streaming.{HTTPSinkProvider, HTTPSourceProvider}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StringType

class HTTPSuite extends TestBase with HTTPTestUtils {

  test("stream from HTTP", TestBase.Extended) {
    val q1 = session.readStream.format(classOf[HTTPSourceProvider].getName)
      .option("host", host)
      .option("port", port.toString)
      .option("path", apiPath)
      .load()
      .withColumn("contentLength", col("request.entity.contentLength"))
      .withColumn("reply", string_to_response(col("contentLength").cast(StringType)))
      .writeStream
      .format(classOf[HTTPSinkProvider].getName)
      .option("name", "foo")
      .queryName("foo")
      .option("replyCol", "reply")
      .option("checkpointLocation", new File(tmpDir.toFile, "checkpoints").toString)
      .start()

    Thread.sleep(5000)
    val client = HttpClientBuilder.create().build()
    val p1 = sendJsonRequest(client, Map("foo" -> 1, "bar" -> "here"), url)
    val p2 = sendJsonRequest(client, Map("foo" -> 1, "bar" -> "heree"), url)
    val p3 = sendJsonRequest(client, Map("foo" -> 1, "bar" -> "hereee"), url)
    val p4 = sendJsonRequest(client, Map("foo" -> 1, "bar" -> "hereeee"), url)
    val posts = List(p1, p2, p3, p4)
    val correctResponses = List(27, 28, 29, 30)
    posts.zip(correctResponses).foreach { p =>
      assert(p._1 === p._2.toString)
    }
    q1.stop()
    client.close()
  }
}
Example 10
Source File: ColumnCondition.scala From deequ with Apache License 2.0
package com.amazon.deequ.checks

import org.apache.spark.sql.functions.col

private[checks] object ColumnCondition {

  def isEachNotNull(cols: Seq[String]): String = {
    cols
      .map(col(_).isNotNull)
      .reduce(_ and _)
      .toString()
  }

  def isAnyNotNull(cols: Seq[String]): String = {
    cols
      .map(col(_).isNotNull)
      .reduce(_ or _)
      .toString()
  }
}
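The two helpers fold a per-column col(_).isNotNull check into a single condition with and/or and then render it as a string. Since ColumnCondition is private[checks], the sketch below applies the same fold directly to a DataFrame and keeps the result as a Column; the data and column names are illustrative assumptions.

import org.apache.spark.sql.{Column, SparkSession}
import org.apache.spark.sql.functions.col

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val orders = Seq(
  (Some(1), Some("a"), Some(10.0)),
  (Some(2), None, Some(5.0)),
  (None, None, None)
).toDF("customer_id", "order_id", "amount")

val required = Seq("customer_id", "order_id", "amount")

// Same fold as isEachNotNull / isAnyNotNull, but kept as Columns instead of strings.
val eachNotNull: Column = required.map(col(_).isNotNull).reduce(_ and _)
val anyNotNull: Column  = required.map(col(_).isNotNull).reduce(_ or _)

orders.filter(eachNotNull).show() // only rows where every required field is present
orders.filter(anyNotNull).show()  // rows where at least one field is present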
Example 11
Source File: VowpalWabbitInteractions.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.vw import com.microsoft.ml.spark.core.contracts.{HasInputCols, HasOutputCol, Wrappable} import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.{col, struct, udf} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.ml.linalg.SQLDataTypes.VectorType object VowpalWabbitInteractions extends ComplexParamsReadable[VowpalWabbitInteractions] class VowpalWabbitInteractions(override val uid: String) extends Transformer with HasInputCols with HasOutputCol with HasNumBits with HasSumCollisions with Wrappable with ComplexParamsWritable { def this() = this(Identifiable.randomUID("VowpalWabbitInteractions")) override def transform(dataset: Dataset[_]): DataFrame = { val fieldSubset = dataset.schema.fields .filter(f => getInputCols.contains(f.name)) val mask = getMask val mode = udf((r: Row) => { // compute the final number of features val numElems = (0 until r.length) .map(r.getAs[Vector](_).numNonzeros).product val newIndices = new Array[Int](numElems) val newValues = new Array[Double](numElems) // build interaction features using FNV-1 val fnvPrime = 16777619 var i = 0 def interact(idx: Int, value: Double, ns: Int): Unit = { if (ns == r.size) { newIndices(i) += mask & idx newValues(i) += value i += 1 } else { val idx1 = idx * fnvPrime r.getAs[Vector](ns).foreachActive { case (idx2, value2) => interact(idx1 ^ idx2, value * value2, ns + 1) } } } // start the recursion interact(0, 1, 0) val (indicesSorted, valuesSorted) = VectorUtils.sortAndDistinct(newIndices, newValues, getSumCollisions) Vectors.sparse(1 << getNumBits, indicesSorted, valuesSorted) }) dataset.toDF.withColumn(getOutputCol, mode.apply(struct(fieldSubset.map(f => col(f.name)): _*))) } override def transformSchema(schema: StructType): StructType = { val fieldNames = schema.fields.map(_.name) for (f <- getInputCols) if (!fieldNames.contains(f)) throw new IllegalArgumentException("missing input column " + f) else { val fieldType = schema.fields(schema.fieldIndex(f)).dataType if (fieldType != VectorType) throw new IllegalArgumentException("column " + f + " must be of type Vector but is " + fieldType.typeName) } schema.add(StructField(getOutputCol, VectorType, true)) } override def copy(extra: ParamMap): VowpalWabbitFeaturizer = defaultCopy(extra) }
Example 12
Source File: VowpalWabbitRegressor.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.env.InternalWrapper
import com.microsoft.ml.spark.core.serialize.ConstructorReadable
import org.apache.spark.ml.{BaseRegressor, ComplexParamsReadable, ComplexParamsWritable}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql._
import org.apache.spark.sql.functions.col
import org.apache.spark.ml.regression.RegressionModel

object VowpalWabbitRegressor extends DefaultParamsReadable[VowpalWabbitRegressor]

@InternalWrapper
class VowpalWabbitRegressor(override val uid: String)
  extends BaseRegressor[Row, VowpalWabbitRegressor, VowpalWabbitRegressionModel]
    with VowpalWabbitBase {

  def this() = this(Identifiable.randomUID("VowpalWabbitRegressor"))

  override def train(dataset: Dataset[_]): VowpalWabbitRegressionModel = {
    val model = new VowpalWabbitRegressionModel(uid)
      .setFeaturesCol(getFeaturesCol)
      .setAdditionalFeatures(getAdditionalFeatures)
      .setPredictionCol(getPredictionCol)

    trainInternal(dataset, model)
  }

  override def copy(extra: ParamMap): VowpalWabbitRegressor = defaultCopy(extra)
}

@InternalWrapper
class VowpalWabbitRegressionModel(override val uid: String)
  extends RegressionModel[Row, VowpalWabbitRegressionModel]
    with VowpalWabbitBaseModel with ComplexParamsWritable {

  protected override def transformImpl(dataset: Dataset[_]): DataFrame = {
    transformImplInternal(dataset)
      .withColumn($(predictionCol), col($(rawPredictionCol)))
  }

  override def predict(features: Row): Double = {
    throw new NotImplementedError("Not implement")
  }

  override def copy(extra: ParamMap): this.type = defaultCopy(extra)
}

object VowpalWabbitRegressionModel extends ComplexParamsReadable[VowpalWabbitRegressionModel]
Example 13
Source File: VowpalWabbitClassifier.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.vw import com.microsoft.ml.spark.core.env.InternalWrapper import com.microsoft.ml.spark.core.schema.DatasetExtensions import org.apache.spark.ml.ComplexParamsReadable import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.ml.classification.{ProbabilisticClassificationModel, ProbabilisticClassifier} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.sql._ import org.apache.spark.sql.functions.{col, udf} import org.vowpalwabbit.spark.VowpalWabbitExample import com.microsoft.ml.spark.core.schema.DatasetExtensions._ import scala.math.exp object VowpalWabbitClassifier extends DefaultParamsReadable[VowpalWabbitClassifier] @InternalWrapper class VowpalWabbitClassifier(override val uid: String) extends ProbabilisticClassifier[Row, VowpalWabbitClassifier, VowpalWabbitClassificationModel] with VowpalWabbitBase { def this() = this(Identifiable.randomUID("VowpalWabbitClassifier")) // to support Grid search we need to replicate the parameters here... val labelConversion = new BooleanParam(this, "labelConversion", "Convert 0/1 Spark ML style labels to -1/1 VW style labels. Defaults to true.") setDefault(labelConversion -> true) def getLabelConversion: Boolean = $(labelConversion) def setLabelConversion(value: Boolean): this.type = set(labelConversion, value) override protected def train(dataset: Dataset[_]): VowpalWabbitClassificationModel = { val model = new VowpalWabbitClassificationModel(uid) .setFeaturesCol(getFeaturesCol) .setAdditionalFeatures(getAdditionalFeatures) .setPredictionCol(getPredictionCol) .setProbabilityCol(getProbabilityCol) .setRawPredictionCol(getRawPredictionCol) val finalDataset = if (!getLabelConversion) dataset else { val inputLabelCol = dataset.withDerivativeCol("label") dataset .withColumnRenamed(getLabelCol, inputLabelCol) .withColumn(getLabelCol, col(inputLabelCol) * 2 - 1) } trainInternal(finalDataset, model) } override def copy(extra: ParamMap): VowpalWabbitClassifier = defaultCopy(extra) } // Preparation for multi-class learning, though it no fun as numClasses is spread around multiple reductions @InternalWrapper class VowpalWabbitClassificationModel(override val uid: String) extends ProbabilisticClassificationModel[Row, VowpalWabbitClassificationModel] with VowpalWabbitBaseModel { def numClasses: Int = 2 override def transform(dataset: Dataset[_]): DataFrame = { val df = transformImplInternal(dataset) // which mode one wants to use depends a bit on how this should be deployed // 1. if you stay in spark w/o link=logistic is probably more convenient as it also returns the raw prediction // 2. 
if you want to export the model *and* get probabilities at scoring term w/ link=logistic is preferable // convert raw prediction to probability (if needed) val probabilityUdf = if (vwArgs.getArgs.contains("--link logistic")) udf { (pred: Double) => Vectors.dense(Array(1 - pred, pred)) } else udf { (pred: Double) => { val prob = 1.0 / (1.0 + exp(-pred)) Vectors.dense(Array(1 - prob, prob)) } } val df2 = df.withColumn($(probabilityCol), probabilityUdf(col($(rawPredictionCol)))) // convert probability to prediction val probability2predictionUdf = udf(probability2prediction _) df2.withColumn($(predictionCol), probability2predictionUdf(col($(probabilityCol)))) } override def copy(extra: ParamMap): this.type = defaultCopy(extra) protected override def predictRaw(features: Row): Vector = { throw new NotImplementedError("Not implemented") } protected override def raw2probabilityInPlace(rawPrediction: Vector): Vector= { throw new NotImplementedError("Not implemented") } } object VowpalWabbitClassificationModel extends ComplexParamsReadable[VowpalWabbitClassificationModel]
Example 14
Source File: udfs.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Column
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.DoubleType

import scala.collection.mutable

//scalastyle:off
object udfs {

  def get_value_at(colName: String, i: Int): Column = {
    udf({ vec: org.apache.spark.ml.linalg.Vector => vec(i) }, DoubleType)(col(colName))
  }

  val to_vector: UserDefinedFunction = udf({ arr: Seq[Double] =>
    Vectors.dense(arr.toArray)
  }, VectorType)

  def to_vector(colName: String): Column = to_vector(col(colName))
}
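A hedged usage sketch for the helpers above; the DataFrame contents and column names are hypothetical:

import com.microsoft.ml.spark.stages.udfs
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq(
  (Vectors.dense(1.0, 2.0), Seq(0.5, 0.7)),
  (Vectors.dense(3.0, 4.0), Seq(0.1, 0.9))
).toDF("vec", "arr")

df.select(
  udfs.get_value_at("vec", 0).as("first_component"), // element 0 of the Vector column, as DoubleType
  udfs.to_vector("arr").as("arr_as_vector")          // the double array wrapped into a dense Vector
).show(false)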
Example 15
Source File: UDFTransformer.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasInputCols, HasOutputCol, Wrappable}
import com.microsoft.ml.spark.core.env.InternalWrapper
import com.microsoft.ml.spark.core.serialize.ComplexParam
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.{ParamMap, UDFParam, UDPyFParam}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.execution.python.UserDefinedPythonFunction
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.types.{DataType, StructField, StructType}
import org.apache.spark.sql.{Column, DataFrame, Dataset}
import org.apache.spark.sql.functions.col

object UDFTransformer extends ComplexParamsReadable[UDFTransformer]

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    if (isSet(inputCol)) {
      dataset.withColumn(getOutputCol, applyUDF(dataset.col(getInputCol)))
    } else {
      dataset.withColumn(getOutputCol, applyUDFOnCols(getInputCols.map(col): _*))
    }
  }

  def validateAndTransformSchema(schema: StructType): StructType = {
    if (isSet(inputCol)) schema(getInputCol) else schema(Set(getInputCols: _*))
    schema.add(StructField(getOutputCol, getDataType))
  }

  def transformSchema(schema: StructType): StructType = validateAndTransformSchema(schema)

  def copy(extra: ParamMap): UDFTransformer = defaultCopy(extra)
}
Example 16
Source File: PageSplitter.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.featurize.text import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol, Wrappable} import org.apache.spark.ml._ import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} object PageSplitter extends DefaultParamsReadable[PageSplitter] class PageSplitter(override val uid: String) extends Transformer with HasInputCol with HasOutputCol with Wrappable with DefaultParamsWritable { def this() = this(Identifiable.randomUID("PageSplitter")) setDefault(outputCol, uid + "_output") val maximumPageLength = new IntParam(this, "maximumPageLength", "the maximum number of characters to be in a page") def setMaximumPageLength(v: Int): this.type = set(maximumPageLength, v) def getMaximumPageLength: Int = $(maximumPageLength) val minimumPageLength = new IntParam(this, "minimumPageLength", "the the minimum number of characters " + "to have on a page in order to preserve work boundaries") def setMinimumPageLength(v: Int): this.type = set(minimumPageLength, v) def getMinimumPageLength: Int = $(minimumPageLength) val boundaryRegex = new Param[String](this, "boundaryRegex", "how to split into words") def setBoundaryRegex(v: String): this.type = set(boundaryRegex, v) def getBoundaryRegex: String = $(boundaryRegex) setDefault(maximumPageLength -> 5000, minimumPageLength -> 4500, boundaryRegex -> "\\s") def split(textOpt: String): Seq[String] = { Option(textOpt).map { text => if (text.length < getMaximumPageLength) { Seq(text) } else { val lengths = text .split(getBoundaryRegex) .map(_.length) .flatMap(l => List(l, 1)) .dropRight(1) val indicies = lengths.scanLeft((0, 0, Nil: List[Int])) { case ((total, count, _), l) => if (count + l < getMaximumPageLength) { (total + l, count + l, Nil) } else if (count > getMinimumPageLength) { (total + l, l, List(total)) } else { val firstPageChars = getMaximumPageLength - count val firstPage = firstPageChars + total val remainingChars = l - firstPageChars val numPages = remainingChars / getMaximumPageLength val remainder = remainingChars - getMaximumPageLength * numPages val pages = List(firstPage) ::: (1 to numPages).map(i => total + firstPageChars + getMaximumPageLength * i).toList (total + l, remainder, pages) } }.flatMap(_._3) val words = (List(0) ::: indicies.toList ::: List(text.length)) .sliding(2) .map { case List(start, end) => text.substring(start, end) } .toSeq words } }.orNull } override def transform(dataset: Dataset[_]): DataFrame = { dataset.toDF().withColumn(getOutputCol, udf(split _, ArrayType(StringType))(col(getInputCol))) } override def copy(extra: ParamMap): MultiNGram = defaultCopy(extra) def transformSchema(schema: StructType): StructType = { assert(schema(getInputCol).dataType == StringType) schema.add(getOutputCol, ArrayType(StringType)) } }
Example 17
Source File: FunctionsTestSpec.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.annotator.{PerceptronApproach, Tokenizer} import com.johnsnowlabs.nlp.training.POS import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper} import org.apache.spark.ml.Pipeline import org.apache.spark.sql.types.ArrayType import org.scalatest._ class FunctionsTestSpec extends FlatSpec { "functions in functions" should "work successfully" in { import com.johnsnowlabs.nlp.util.io.ResourceHelper.spark.implicits._ val trainingPerceptronDF = POS().readDataset(ResourceHelper.spark, "src/test/resources/anc-pos-corpus-small/", "|", "tags") val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val tokenizer = new Tokenizer() .setInputCols(Array("document")) .setOutputCol("token") val pos = new PerceptronApproach() .setInputCols("document", "token") .setOutputCol("pos") .setPosColumn("tags") .setNIterations(3) val pipeline = new Pipeline() .setStages(Array( documentAssembler, tokenizer, pos )) val model = pipeline.fit(trainingPerceptronDF) val data = model.transform(Seq("Peter is a very good and compromised person.").toDF("text")) import functions._ val mapped = data.mapAnnotationsCol("pos", "modpos", (annotations: Seq[Annotation]) => { annotations.filter(_.result == "JJ") }) val modified = data.mapAnnotationsCol("pos", "modpos", (_: Seq[Annotation]) => { "hello world" }) val filtered = data.filterByAnnotationsCol("pos", (annotations: Seq[Annotation]) => { annotations.exists(_.result == "JJ") }) import org.apache.spark.sql.functions.col val udfed = data.select(mapAnnotations((annotations: Seq[Annotation]) => { annotations.filter(_.result == "JJ") }, ArrayType(Annotation.dataType))(col("pos"))) val udfed2 = data.select(mapAnnotationsStrict((annotations: Seq[Annotation]) => { annotations.filter(_.result == "JJ") })(col("pos"))) mapped.show(truncate = false) modified.show(truncate = false) filtered.show(truncate = false) udfed.show(truncate = false) udfed2.show(truncate = false) } }
Example 18
Source File: functions.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions.{array, col, explode, udf} import org.apache.spark.sql.types.DataType import scala.reflect.runtime.universe._ object functions { implicit class FilterAnnotations(dataset: DataFrame) { def filterByAnnotationsCol(column: String, function: Seq[Annotation] => Boolean): DataFrame = { val meta = dataset.schema(column).metadata val func = udf { annotatorProperties: Seq[Row] => function(annotatorProperties.map(Annotation(_))) } dataset.filter(func(col(column)).as(column, meta)) } } def mapAnnotations[T](function: Seq[Annotation] => T, outputType: DataType): UserDefinedFunction = udf ( { annotatorProperties: Seq[Row] => function(annotatorProperties.map(Annotation(_))) }, outputType) def mapAnnotationsStrict(function: Seq[Annotation] => Seq[Annotation]): UserDefinedFunction = udf { annotatorProperties: Seq[Row] => function(annotatorProperties.map(Annotation(_))) } implicit class MapAnnotations(dataset: DataFrame) { def mapAnnotationsCol[T: TypeTag](column: String, outputCol: String, function: Seq[Annotation] => T): DataFrame = { val meta = dataset.schema(column).metadata val func = udf { annotatorProperties: Seq[Row] => function(annotatorProperties.map(Annotation(_))) } dataset.withColumn(outputCol, func(col(column)).as(outputCol, meta)) } } implicit class EachAnnotations(dataset: DataFrame) { import dataset.sparkSession.implicits._ def eachAnnotationsCol[T: TypeTag](column: String, function: Seq[Annotation] => Unit): Unit = { dataset.select(column).as[Array[Annotation]].foreach(function(_)) } } implicit class ExplodeAnnotations(dataset: DataFrame) { def explodeAnnotationsCol[T: TypeTag](column: String, outputCol: String): DataFrame = { val meta = dataset.schema(column).metadata dataset. withColumn(outputCol, explode(col(column))). withColumn(outputCol, array(col(outputCol)).as(outputCol, meta)) } } }
Example 19
Source File: ChiSquareTest.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.stat

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.util.SchemaUtils
import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
import org.apache.spark.mllib.stat.{Statistics => OldStatistics}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

  @Since("2.2.0")
  def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = {
    val spark = dataset.sparkSession
    import spark.implicits._

    SchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT)
    SchemaUtils.checkNumericType(dataset.schema, labelCol)
    val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol))
      .as[(Double, Vector)]
      .rdd.map { case (label, features) => OldLabeledPoint(label, OldVectors.fromML(features)) }
    val testResults = OldStatistics.chiSqTest(rdd)
    val pValues: Vector = Vectors.dense(testResults.map(_.pValue))
    val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom)
    val statistics: Vector = Vectors.dense(testResults.map(_.statistic))
    spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics)))
  }
}
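The test method above wraps the old MLlib chi-squared statistics in a DataFrame API. A short usage sketch follows; the labels and feature values are made up for illustration.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.stat.ChiSquareTest
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val data = Seq(
  (0.0, Vectors.dense(0.5, 10.0)),
  (0.0, Vectors.dense(1.5, 20.0)),
  (1.0, Vectors.dense(1.5, 30.0)),
  (1.0, Vectors.dense(3.5, 40.0))
).toDF("label", "features")

// Returns a single-row DataFrame with pValues, degreesOfFreedom and statistics,
// one entry per feature, matching the ChiSquareResult case class above.
ChiSquareTest.test(data, "features", "label").show(false)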
Example 20
Source File: ChiSquareTest.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.stat import com.tencent.angel.sona.ml.feature.LabeledPoint import org.apache.spark.linalg import org.apache.spark.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.util.SONASchemaUtils import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col /** * :: Experimental :: * * Chi-square hypothesis testing for categorical data. * * See <a href="http://en.wikipedia.org/wiki/Chi-squared_test">Wikipedia</a> for more information * on the Chi-squared test. */ object ChiSquareTest { private case class ChiSquareResult( pValues: linalg.Vector, degreesOfFreedom: Array[Int], statistics: linalg.Vector) /** * Conduct Pearson's independence test for every feature against the label. For each feature, the * (feature, label) pairs are converted into a contingency matrix for which the Chi-squared * statistic is computed. All label and feature values must be categorical. * * The null hypothesis is that the occurrence of the outcomes is statistically independent. * * @param dataset DataFrame of categorical labels and categorical features. * Real-valued features will be treated as categorical for each distinct value. * @param featuresCol Name of features column in dataset, of type `Vector` (`VectorUDT`) * @param labelCol Name of label column in dataset, of any numerical type * @return DataFrame containing the test result for every feature against the label. * This DataFrame will contain a single Row with the following fields: * - `pValues: Vector` * - `degreesOfFreedom: Array[Int]` * - `statistics: Vector` * Each of these fields has one value per feature. */ def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = { val spark = dataset.sparkSession import spark.implicits._ SONASchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT) SONASchemaUtils.checkNumericType(dataset.schema, labelCol) val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol)).as[(Double, linalg.Vector)] .rdd.map { case (label, features) => LabeledPoint(label, features) } val testResults = Statistics.chiSqTest(rdd) val pValues: linalg.Vector = Vectors.dense(testResults.map(_.pValue)) val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom) val statistics: linalg.Vector = Vectors.dense(testResults.map(_.statistic)) spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics))) } }
Example 21
Source File: DatasetUtil.scala From sona with Apache License 2.0
package org.apache.spark.util import org.apache.spark.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata} import org.apache.spark.sql.{Column, DataFrame, Dataset} object DatasetUtil { def withColumns[T](ds: Dataset[T], colNames: Seq[String], cols: Seq[Column], metadata: Seq[Metadata]): DataFrame = { require(colNames.size == cols.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of columns: ${cols.size}") require(colNames.size == metadata.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of metadata elements: ${metadata.size}") val sparkSession = ds.sparkSession val queryExecution = ds.queryExecution val resolver = sparkSession.sessionState.analyzer.resolver val output = queryExecution.analyzed.output checkColumnNameDuplication(colNames, "in given column names", sparkSession.sessionState.conf.caseSensitiveAnalysis) val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) => colName -> col.as(colName, metadata) }.toMap val replacedAndExistingColumns = output.map { field => columnMap.find { case (colName, _) => resolver(field.name, colName) } match { case Some((colName: String, col: Column)) => col.as(colName) case _ => new Column(field) } } val newColumns = columnMap.filter { case (colName, col) => !output.exists(f => resolver(f.name, colName)) }.map { case (colName, col) => col.as(colName) } ds.select(replacedAndExistingColumns ++ newColumns: _*) } def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = { withColumns(ds, Seq(colName), Seq(col), Seq(metadata)) } private def checkColumnNameDuplication(columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = { val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase) if (names.distinct.length != names.length) { val duplicateColumns = names.groupBy(identity).collect { case (x, ys) if ys.length > 1 => s"`$x`" } throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}") } } /** * Cast a column in a Dataset to Vector type. * * The supported data types of the input column are * - Vector * - float/double type Array. * * Note: The returned column does not have Metadata. * * @param dataset input DataFrame * @param colName column name. * @return Vector column */ def columnToVector(dataset: Dataset[_], colName: String): Column = { val columnDataType = dataset.schema(colName).dataType columnDataType match { case _: VectorUDT => col(colName) case fdt: ArrayType => val transferUDF = fdt.elementType match { case _: FloatType => udf(f = (vector: Seq[Float]) => { val inputArray = Array.fill[Double](vector.size)(0.0) vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble) Vectors.dense(inputArray) }) case _: DoubleType => udf((vector: Seq[Double]) => { Vectors.dense(vector.toArray) }) case other => throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector") } transferUDF(col(colName)) case other => throw new IllegalArgumentException(s"$other column cannot be cast to Vector") } } }
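Among other things, DatasetUtil.columnToVector above casts a float or double array column to a Vector column. A tentative usage sketch, assuming this sona utility class is on the classpath; the data and column names are hypothetical:

import org.apache.spark.sql.SparkSession
import org.apache.spark.util.DatasetUtil

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq(
  (1L, Seq(0.1, 0.2, 0.3)),
  (2L, Seq(0.4, 0.5, 0.6))
).toDF("id", "embedding")

// The returned Column carries no metadata, as the method's doc comment notes.
val asVector = DatasetUtil.columnToVector(df, "embedding")
df.withColumn("embedding_vec", asVector).show(false)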
Example 22
Source File: OilPriceFunc.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.geomesa import java.text.SimpleDateFormat import java.util.Calendar import org.apache.spark.sql.SparkSession import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions.{udf, window, last, col, lag} object OilPriceFunc { // use this if the window function misbehaves due to timezone e.g. BST // ./spark-shell --driver-java-options "-Duser.timezone=UTC" // ./spark-submit --conf 'spark.driver.extraJavaOptions=-Duser.timezone=UTC' // define a function to reformat the date field def convert(date:String) : String = { val df1 = new SimpleDateFormat("dd/MM/yyyy") val dt = df1.parse(date) val df2 = new SimpleDateFormat("yyyy-MM-dd") df2.format(dt) } // create and save oil price changes def createOilPriceDF(inputfile: String, outputfile: String, spark: SparkSession) = { val oilPriceDF = spark. read. option("header", "true"). option("inferSchema", "true"). csv(inputfile) val convertDateUDF = udf { (Date: String) => convert(Date) } val oilPriceDatedDF = oilPriceDF.withColumn("DATE", convertDateUDF(oilPriceDF("DATE"))) // offset to start at beginning of week val windowDF = oilPriceDatedDF.groupBy(window(oilPriceDatedDF.col("DATE"), "7 days", "7 days", "4 days")) val windowLastDF = windowDF.agg(last("PRICE") as "last(PRICE)").sort("window") // windowLastDF.show(20, false) val sortedWindow = Window.orderBy("window.start") val lagLastCol = lag(col("last(PRICE)"), 1).over(sortedWindow) val lagLastColDF = windowLastDF.withColumn("lastPrev(PRICE)", lagLastCol) // lagLastColDF.show(20, false) val simplePriceChangeFunc = udf { (last: Double, prevLast: Double) => var change = ((last - prevLast) compare 0).signum if (change == -1) change = 0 change.toDouble } val findDateTwoDaysAgoUDF = udf { (date: String) => val dateFormat = new SimpleDateFormat("yyyy-MM-dd") val cal = Calendar.getInstance cal.setTime(dateFormat.parse(date)) cal.add(Calendar.DATE, -3) dateFormat.format(cal.getTime) } val oilPriceChangeDF = lagLastColDF.withColumn("label", simplePriceChangeFunc( lagLastColDF("last(PRICE)"), lagLastColDF("lastPrev(PRICE)") )).withColumn("commonFriday", findDateTwoDaysAgoUDF(lagLastColDF("window.end"))) // oilPriceChangeDF.show(20, false) oilPriceChangeDF.select("label", "commonFriday"). write. format("com.databricks.spark.csv"). option("header", "true"). //.option("codec", "org.apache.hadoop.io.compress.GzipCodec") save(outputfile) } }
Example 23
Source File: StructuralProfiler.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.profilers.raw

import au.com.bytecode.opencsv.CSVParser
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Row}

case class StructuralProfiler(delimiter: String = ",") {

  def profile(df: Dataset[String]): Dataset[StructuralReport] = {

    import df.sparkSession.implicits._

    val rows = df.mapPartitions({ lines =>
      val parser = new CSVParser(delimiter.charAt(0))
      lines.map(line => (parser.parseLine(line).length, line))
    })

    val fieldCount = rows.groupByKey({ case (fields, line) =>
      fields
    }).count()
      .withColumnRenamed("value", "fields")
      .withColumnRenamed("count(1)", "count")

    val fieldLine = rows.groupByKey({ case (fields, line) =>
      fields
    }).reduceGroups({ (v1, v2) =>
      v1
    }).map({ case (fields, (_, line)) =>
      (fields, line)
    })
      .withColumnRenamed("_1", "_fieldLine_")
      .withColumnRenamed("_2", "line")

    fieldCount.join(fieldLine, col("fields") === col("_fieldLine_"))
      .drop("_fieldLine_")
      .map({ case Row(columns: Int, count: Long, line: String) =>
        StructuralReport(
          columns,
          count,
          line
        )
      })
  }
}

case class StructuralReport(
    fields: Int,
    metricValue: Double,
    description: String
)
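The profiler counts, for each distinct number of parsed fields, how many raw lines had that field count and keeps one sample line per group. A hedged usage sketch follows; the input path and data are hypothetical.

import io.gzet.profilers.raw.StructuralProfiler
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Each record is one raw, unparsed CSV line.
val rawLines = spark.read.textFile("data/transactions.csv")

// One StructuralReport row per distinct field count: how many lines had that many
// fields, plus a sample line, which helps spot malformed rows.
StructuralProfiler(delimiter = ",").profile(rawLines).show(false)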
Example 24
Source File: CardinalityProfiler.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.profilers.field import io.gzet.profilers.Utils import org.apache.spark.sql.functions.col import org.apache.spark.sql.{Dataset, Row} import scala.collection.mutable import scalaz.Scalaz._ case class CardinalityProfiler(topN: Int = 5) { def profile(df: Dataset[Array[String]]): Dataset[CardinalityReport] = { val total = df.sparkSession.sparkContext.broadcast(df.count()) import df.sparkSession.implicits._ val features = Utils.buildColumns(df) val topNValues = features.groupByKey({ field => field }).count().map({ case (field, count) => (field.idx, Map(field.value -> count)) }).groupByKey({ case (column, map) => column }).reduceGroups({ (v1, v2) => val m1 = v1._2 val m2 = v2._2 val m = (m1 |+| m2).toSeq.sortBy(_._2).reverse (v1._1, m.take(math.min(m.size, topN)).toMap) }).map({ case (column, (_, map)) => val top = map.keySet.toArray (column, top) }) .withColumnRenamed("_1", "_topNValues_") .withColumnRenamed("_2", "description") val cardinalities = features.distinct().groupByKey(_.idx).count().map({ case (column, distinctValues) => val cardinality = distinctValues / total.value.toDouble (column, cardinality) }) .withColumnRenamed("_1", "column") .withColumnRenamed("_2", "cardinality") cardinalities.join(topNValues, col("column") === col("_topNValues_")) .drop("_topNValues_") .map({ case Row(column: Int, cardinality: Double, description: mutable.WrappedArray[String]) => CardinalityReport( column, cardinality, description.toArray ) }) } } case class CardinalityReport( field: Int, metricValue: Double, description: Array[String] )
Example 25
Source File: OneHotEncoderSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { def stringIndexed(): DataFrame = { val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) val df = sqlContext.createDataFrame(data).toDF("id", "label") val indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex") .fit(df) indexer.transform(df) } test("params") { ParamsSuite.checkParams(new OneHotEncoder) } test("OneHotEncoder dropLast = false") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") .setDropLast(false) val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1), vec(2)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0), (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0)) assert(output === expected) } test("OneHotEncoder dropLast = true") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0), (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0)) assert(output === expected) } test("input column with ML attribute") { val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large") val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("size") .select(col("size").as("size", attr.toMetadata())) val encoder = new OneHotEncoder() .setInputCol("size") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("small").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("medium").withIndex(1)) } test("input column without ML attribute") { val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("index") val encoder = new OneHotEncoder() .setInputCol("index") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("0").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("1").withIndex(1)) } test("read/write") { val t = new OneHotEncoder() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setDropLast(false) testDefaultReadWrite(t) } }
Example 26
Source File: HashingTF.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} def setNumFeatures(value: Int): this.type = set(numFeatures, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)) val t = udf { terms: Seq[_] => hashingTF.transform(terms) } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
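Note: transform above relies on the select(col("*"), expr.as(outputCol, metadata)) idiom to append one derived column while preserving every existing column. A hedged, self-contained sketch of that pattern with a plain UDF and assumed column names (a term counter stands in for hashingTF.transform):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object AppendColumnSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("append-column-sketch").getOrCreate()
  import spark.implicits._
  val df = Seq(("doc1", Seq("a", "b", "a")), ("doc2", Seq("c"))).toDF("id", "terms")
  // Stand-in for the feature transform: here we just count the terms
  val termCount = udf { terms: Seq[String] => terms.size }
  // Keep all original columns and append the derived one, as HashingTF.transform does
  df.select(col("*"), termCount(col("terms")).as("numTerms")).show()
  spark.stop()
}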
Example 27
Source File: SampleStratifiedOn.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.algebra.sampling import org.apache.spark.sql.functions.{rand, udf, col} import org.apache.spark.sql.catalyst.plans.logical.{ LogicalPlan, Filter } import play.api.libs.json._ import mimir.algebra._ import mimir.exec.spark.RAToSpark import mimir.serialization.{ Json => MimirJson } case class SampleStratifiedOn(column:ID, t:Type, strata:Map[PrimitiveValue,Double]) extends SamplingMode { val sparkStrata = strata.map { case (v, p) => RAToSpark.getNative(v, t) -> p } .toMap override def toString = s"ON $column WITH STRATA ${strata.map { case (v,p) => s"$v -> $p"}.mkString(" | ")}" def apply(plan: LogicalPlan, seed: Long): LogicalPlan = { // Adapted from Spark's df.stat.sampleBy method val c = col(column.id) val r = rand(seed) val f = udf { (stratum: Any, x: Double) => x < sparkStrata.getOrElse(stratum, 0.0) } Filter( f(c, r).expr, plan ) } def expressions: Seq[Expression] = Seq(Var(column)) def rebuildExpressions(x: Seq[Expression]): SamplingMode = { x(0) match { case Var(newColumn) => SampleStratifiedOn(newColumn, t, strata) case _ => throw new RAException("Internal Error: Rewriting stratification variable with arbitrary expression") } } def toJson: JsValue = JsObject(Map[String,JsValue]( "mode" -> JsString(SampleStratifiedOn.MODE), "column" -> JsString(column.id), "type" -> MimirJson.ofType(t), "strata" -> JsArray( strata .toSeq .map { case (v, p) => JsObject(Map[String,JsValue]( "value" -> MimirJson.ofPrimitive(v), "probability" -> JsNumber(p) )) } ) )) } object SampleStratifiedOn { val MODE = "stratified_on" def parseJson(json:Map[String, JsValue]): Option[SampleStratifiedOn] = { if(json("mode").as[String].equals(MODE)){ val t = MimirJson.toType(json("type")) Some(SampleStratifiedOn( ID(json("column").as[String]), t, json("strata") .as[Seq[Map[String,JsValue]]] .map { stratum => MimirJson.toPrimitive(t, stratum("value")) -> stratum("probability").as[Double] } .toMap )) } else { None } } }
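Note: stratified sampling here is a filter comparing rand(seed) against a per-stratum probability looked up by a UDF on the stratification column. The same idea expressed directly on a DataFrame (column names and rates are assumptions; Spark's built-in df.stat.sampleBy does the same job):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, rand, udf}

object StratifiedSampleSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("stratified-sample-sketch").getOrCreate()
  val df = spark.range(0, 1000).selectExpr("id", "id % 2 as stratum")
  // Assumed per-stratum keep probabilities
  val rates = Map(0L -> 0.1, 1L -> 0.5)
  val keep = udf { (stratum: Long, x: Double) => x < rates.getOrElse(stratum, 0.0) }
  // Keep a row when a fresh random draw falls under its stratum's probability
  val sampled = df.filter(keep(col("stratum"), rand(42L)))
  sampled.groupBy("stratum").count().show()
  spark.stop()
}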
Example 28
Source File: ClassifierDatasetEncoder.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.ml.tensorflow import com.johnsnowlabs.nlp.Annotation import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{size, explode, col} import scala.collection.mutable class ClassifierDatasetEncoder(val params: ClassifierDatasetEncoderParams) extends Serializable { val tags2Id: Map[String, Int] = params.tags.zipWithIndex .map(p => (p._1, p._2)) .toMap val tags: Array[String] = tags2Id .map(p => (p._2, p._1)) .toArray .sortBy(p => p._1) .map(p => p._2) def encodeTags(labels: Array[String]): Array[Array[Int]] = { labels.map { t => val labelIDsArray = Array.fill(tags.length)(0) labelIDsArray(tags2Id(t)) = 1 labelIDsArray } } def decodeOutputData(tagIds: Array[Array[Float]]): Array[Array[(String, Float)]] = { val scoresMetadata = tagIds.map { scores => scores.zipWithIndex.flatMap { case (score, idx) => val tag = tags2Id.find(_._2 == idx).map(_._1).getOrElse("NA") Map(tag -> score) } } scoresMetadata } } case class ClassifierDatasetEncoderParams(tags: Array[String])
Example 29
Source File: HashingTF.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
Example 30
Source File: UnaryTransformerExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.DoubleParam import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.col import org.apache.spark.sql.types.{DataType, DataTypes} import org.apache.spark.util.Utils // $example off$ object MyTransformer extends DefaultParamsReadable[MyTransformer] // $example off$ def main(args: Array[String]) { val spark = SparkSession .builder() .appName("UnaryTransformerExample") .getOrCreate() // $example on$ val myTransformer = new MyTransformer() .setShift(0.5) .setInputCol("input") .setOutputCol("output") // Create data, transform, and display it. val data = spark.range(0, 5).toDF("input") .select(col("input").cast("double").as("input")) val result = myTransformer.transform(data) println("Transformed by adding constant value") result.show() // Save and load the Transformer. val tmpDir = Utils.createTempDir() val dirName = tmpDir.getCanonicalPath myTransformer.write.overwrite().save(dirName) val sameTransformer = MyTransformer.load(dirName) // Transform the data to show the results are identical. println("Same transform applied from loaded model") val sameResult = sameTransformer.transform(data) sameResult.show() Utils.deleteRecursively(tmpDir) // $example off$ spark.stop() } } // scalastyle:on println
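Note: beyond the custom transformer, this example uses col("input").cast("double") to coerce the LongType output of range() into doubles before transforming. That cast-and-rename step on its own, as a minimal sketch:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object CastSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("cast-sketch").getOrCreate()
  // range() produces LongType; cast to double and keep the same column name
  val data = spark.range(0, 5).toDF("input")
    .select(col("input").cast("double").as("input"))
  data.printSchema()
  data.show()
  spark.stop()
}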
Example 31
Source File: Distinctness.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.functions.{col, sum}
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.Column

case class Distinctness(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Distinctness", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    (sum(col(COUNT_COL).geq(1).cast(DoubleType)) / numRows) :: Nil
  }

  override def filterCondition: Option[String] = where
}

object Distinctness {

  def apply(column: String): Distinctness = {
    new Distinctness(column :: Nil)
  }
}
Example 32
Source File: Uniqueness.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.types.DoubleType

case class Uniqueness(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Uniqueness", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    (sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) / numRows) :: Nil
  }

  override def filterCondition: Option[String] = where
}

object Uniqueness {

  def apply(column: String): Uniqueness = {
    new Uniqueness(column :: Nil)
  }

  def apply(column: String, where: Option[String]): Uniqueness = {
    new Uniqueness(column :: Nil, where)
  }
}
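Note: Distinctness and Uniqueness differ only in the predicate applied to the per-value frequency column, count >= 1 versus count == 1, each cast to DoubleType, summed, and divided by the row count. A rough standalone sketch of the uniqueness ratio computed from scratch (COUNT_COL is deequ-internal, so here the frequencies are rebuilt with a plain groupBy; data is assumed):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, count, lit, sum}
import org.apache.spark.sql.types.DoubleType

object UniquenessSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("uniqueness-sketch").getOrCreate()
  import spark.implicits._
  val df = Seq("a", "a", "b", "c", "c", "d").toDF("value")
  val numRows = df.count()
  // Frequencies per value, then the share of values that occur exactly once
  val frequencies = df.groupBy("value").agg(count(lit(1)).as("cnt"))
  frequencies
    .agg((sum(col("cnt").equalTo(lit(1)).cast(DoubleType)) / numRows).as("uniqueness"))
    .show()
  spark.stop()
}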
Example 33
Source File: Entropy.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, sum, udf}

case class Entropy(column: String, where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Entropy", column :: Nil)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    val summands = udf { (count: Double) =>
      if (count == 0.0) {
        0.0
      } else {
        -(count / numRows) * math.log(count / numRows)
      }
    }

    sum(summands(col(COUNT_COL))) :: Nil
  }

  override def filterCondition: Option[String] = where
}
Example 34
Source File: MutualInformation.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Analyzers._ import com.amazon.deequ.metrics.{DoubleMetric, Entity} import org.apache.spark.sql.functions.{col, sum, udf} import org.apache.spark.sql.types.StructType import Analyzers.COUNT_COL import com.amazon.deequ.analyzers.runners.MetricCalculationException override def preconditions: Seq[StructType => Unit] = { Preconditions.exactlyNColumns(columns, 2) +: super.preconditions } override def toFailureMetric(exception: Exception): DoubleMetric = { metricFromFailure(exception, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn) } override def filterCondition: Option[String] = where } object MutualInformation { def apply(columnA: String, columnB: String): MutualInformation = { new MutualInformation(columnA :: columnB :: Nil) } }
Example 35
Source File: ApproxQuantiles.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} import com.amazon.deequ.analyzers.runners.{IllegalAnalyzerParameterException, MetricCalculationException} import com.amazon.deequ.metrics.{Entity, KeyedDoubleMetric} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile import org.apache.spark.sql.functions.col import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DeequFunctions, Row} import scala.util.{Failure, Success} case class ApproxQuantiles(column: String, quantiles: Seq[Double], relativeError: Double = 0.01) extends ScanShareableAnalyzer[ApproxQuantileState, KeyedDoubleMetric] { val PARAM_CHECKS: StructType => Unit = { _ => quantiles.foreach { quantile => if (quantile < 0.0 || quantile > 1.0) { throw new IllegalAnalyzerParameterException(MetricCalculationException .getApproxQuantileIllegalParamMessage(quantile)) } } if (relativeError < 0.0 || relativeError > 1.0) { throw new IllegalAnalyzerParameterException(MetricCalculationException .getApproxQuantileIllegalErrorParamMessage(relativeError)) } } override private[deequ] def aggregationFunctions() = { DeequFunctions.stateful_approx_quantile(col(column), relativeError) :: Nil } override private[deequ] def fromAggregationResult( result: Row, offset: Int) : Option[ApproxQuantileState] = { if (result.isNullAt(offset)) { None } else { val percentileDigest = ApproximatePercentile.serializer.deserialize( result.getAs[Array[Byte]](offset)) Some(ApproxQuantileState(percentileDigest)) } } override def computeMetricFrom(state: Option[ApproxQuantileState]): KeyedDoubleMetric = { state match { case Some(theState) => val digest = theState.percentileDigest val computedQuantiles = digest.getPercentiles(quantiles.toArray) val results = quantiles.zip(computedQuantiles) .map { case (quantile, result) => quantile.toString -> result } .toMap KeyedDoubleMetric(Entity.Column, "ApproxQuantiles", column, Success(results)) case _ => toFailureMetric(Analyzers.emptyStateException(this)) } } override def toFailureMetric(exception: Exception): KeyedDoubleMetric = { KeyedDoubleMetric(Entity.Column, "ApproxQuantiles", column, Failure( MetricCalculationException.wrapIfNecessary(exception))) } override def preconditions: Seq[StructType => Unit] = { PARAM_CHECKS :: hasColumn(column) :: isNumeric(column) :: Nil } }
Example 36
Source File: UniqueValueRatio.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Analyzers.COUNT_COL import com.amazon.deequ.metrics.DoubleMetric import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.functions.{col, count, lit, sum} import org.apache.spark.sql.types.DoubleType case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None) extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns) with FilterableAnalyzer { override def aggregationFunctions(numRows: Long): Seq[Column] = { sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) :: count("*") :: Nil } override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = { val numUniqueValues = result.getDouble(offset) val numDistinctValues = result.getLong(offset + 1).toDouble toSuccessMetric(numUniqueValues / numDistinctValues) } override def filterCondition: Option[String] = where } object UniqueValueRatio { def apply(column: String): UniqueValueRatio = { new UniqueValueRatio(column :: Nil) } def apply(column: String, where: Option[String]): UniqueValueRatio = { new UniqueValueRatio(column :: Nil, where) } }
Example 37
Source File: WholeStageCodegenSparkSubmitSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.scalatest.{Assertions, BeforeAndAfterEach, Matchers} import org.scalatest.concurrent.TimeLimits import org.apache.spark.{SparkFunSuite, TestUtils} import org.apache.spark.deploy.SparkSubmitSuite import org.apache.spark.internal.Logging import org.apache.spark.sql.{LocalSparkSession, QueryTest, Row, SparkSession} import org.apache.spark.sql.functions.{array, col, count, lit} import org.apache.spark.sql.types.IntegerType import org.apache.spark.unsafe.Platform import org.apache.spark.util.ResetSystemProperties // Due to the need to set driver's extraJavaOptions, this test needs to use actual SparkSubmit. class WholeStageCodegenSparkSubmitSuite extends SparkFunSuite with Matchers with BeforeAndAfterEach with ResetSystemProperties { test("Generated code on driver should not embed platform-specific constant") { val unusedJar = TestUtils.createJarWithClasses(Seq.empty) // HotSpot JVM specific: Set up a local cluster with the driver/executor using mismatched // settings of UseCompressedOops JVM option. val argsForSparkSubmit = Seq( "--class", WholeStageCodegenSparkSubmitSuite.getClass.getName.stripSuffix("$"), "--master", "local-cluster[1,1,1024]", "--driver-memory", "1g", "--conf", "spark.ui.enabled=false", "--conf", "spark.master.rest.enabled=false", "--conf", "spark.driver.extraJavaOptions=-XX:-UseCompressedOops", "--conf", "spark.executor.extraJavaOptions=-XX:+UseCompressedOops", unusedJar.toString) SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..") } } object WholeStageCodegenSparkSubmitSuite extends Assertions with Logging { var spark: SparkSession = _ def main(args: Array[String]): Unit = { TestUtils.configTestLog4j("INFO") spark = SparkSession.builder().getOrCreate() // Make sure the test is run where the driver and the executors uses different object layouts val driverArrayHeaderSize = Platform.BYTE_ARRAY_OFFSET val executorArrayHeaderSize = spark.sparkContext.range(0, 1).map(_ => Platform.BYTE_ARRAY_OFFSET).collect.head.toInt assert(driverArrayHeaderSize > executorArrayHeaderSize) val df = spark.range(71773).select((col("id") % lit(10)).cast(IntegerType) as "v") .groupBy(array(col("v"))).agg(count(col("*"))) val plan = df.queryExecution.executedPlan assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined) val expectedAnswer = Row(Array(0), 7178) :: Row(Array(1), 7178) :: Row(Array(2), 7178) :: Row(Array(3), 7177) :: Row(Array(4), 7177) :: Row(Array(5), 7177) :: Row(Array(6), 7177) :: Row(Array(7), 7177) :: Row(Array(8), 7177) :: Row(Array(9), 7177) :: Nil val result = df.collect QueryTest.sameRows(result.toSeq, expectedAnswer) match { case Some(errMsg) => fail(errMsg) case _ => } } }
Example 38
Source File: MultiStreamHandler.scala From structured-streaming-application with Apache License 2.0 | 5 votes |
package knolx.spark import knolx.Config._ import knolx.KnolXLogger import org.apache.spark.sql.functions.col import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode} import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{Encoders, SparkSession} case class CurrentPowerConsumption(kwh: Double) case class PowerConsumptionStatus(numOfReadings: Long, total: Double, avg: Double, status: String) { def compute(newReadings: List[Double]) = { val newTotal = newReadings.sum + total val newNumOfReadings = numOfReadings + newReadings.size val newAvg = newTotal / newNumOfReadings.toDouble PowerConsumptionStatus(newNumOfReadings, newTotal, newAvg, "ON") } } object MultiStreamHandler extends App with KnolXLogger { info("Creating Spark Session") val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() spark.sparkContext.setLogLevel("WARN") val updateStateFunc = (deviceId: String, newReadings: Iterator[(String, CurrentPowerConsumption)], state: GroupState[PowerConsumptionStatus]) => { val data = newReadings.toList.map { case(_, reading) => reading }.map(_.kwh) lazy val initialPowerConsumptionStatus = PowerConsumptionStatus(0L, 0D, 0D, "OFF") val currentState = state.getOption.fold(initialPowerConsumptionStatus.compute(data))(_.compute(data)) val currentStatus = if(state.hasTimedOut) { // If we do not receive any reading, for a device, we will assume that it is OFF. currentState.copy(status = "OFF") } else { state.setTimeoutDuration("10 seconds") currentState } state.update(currentStatus) (deviceId, currentStatus) } info("Creating Streaming DF...") val dataStream = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServer) .option("subscribe", topic) .option("failOnDataLoss", false) .option("includeTimestamp", true) .load() info("Writing data to Console...") import spark.implicits._ implicit val currentPowerConsumptionEncoder = Encoders.kryo[CurrentPowerConsumption] implicit val powerConsumptionStatusEncoder = Encoders.kryo[PowerConsumptionStatus] val query = dataStream .select(col("key").cast(StringType).as("key"), col("value").cast(StringType).as("value")) .as[(String, String)] .map { case(deviceId, unit) => (deviceId, CurrentPowerConsumption(Option(unit).fold(0D)(_.toDouble))) } .groupByKey { case(deviceId, _) => deviceId } .mapGroupsWithState[PowerConsumptionStatus, (String, PowerConsumptionStatus)](GroupStateTimeout.ProcessingTimeTimeout())(updateStateFunc) .toDF("deviceId", "current_status") .writeStream .format("console") .option("truncate", false) .outputMode(OutputMode.Update()) .option("checkpointLocation", checkPointDir) .start() info("Waiting for the query to terminate...") query.awaitTermination() query.stop() }
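Note: before the stateful processing, the Kafka source's binary key and value columns are cast to strings with col("key").cast(StringType).as("key") and the same for value. That projection in isolation (the broker address and topic are placeholders, and the spark-sql-kafka connector is assumed on the classpath):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StringType

object KafkaCastSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("kafka-cast-sketch").getOrCreate()
  // Placeholder broker/topic; requires the spark-sql-kafka connector dependency
  val raw = spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "readings")
    .load()
  // Kafka delivers key/value as binary; cast both to strings and keep the names
  val keyed = raw.select(
    col("key").cast(StringType).as("key"),
    col("value").cast(StringType).as("value"))
  keyed.printSchema()
}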
Example 39
Source File: StreamStreamOuterJoiner.scala From structured-streaming-application with Apache License 2.0 | 5 votes |
package knolx.spark import knolx.Config._ import knolx.KnolXLogger import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.functions.{col, expr, from_json} import org.apache.spark.sql.types.StructType object StreamStreamOuterJoiner extends App with KnolXLogger { info("Creating Spark Session") val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() spark.sparkContext.setLogLevel("WARN") info("Streaming companies Dataframe") val companiesDF = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServer) .option("subscribe", companiesTopic) .load() .select(col("value").cast("string").as("companyName"), col("timestamp").as("companyTradingTime")) .withWatermark("companyTradingTime", "10 seconds") companiesDF.writeStream.format("console").option("truncate", false).start() info("Original Streaming Dataframe") val schema = ScalaReflection.schemaFor[Stock].dataType.asInstanceOf[StructType] val stockStreamDF = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServer) .option("subscribe", stocksTopic) .load() .select(from_json(col("value").cast("string"), schema).as("value"), col("timestamp").as("stockInputTime")) .select("value.*", "stockInputTime") .withWatermark("stockInputTime", "10 seconds") info("Filtered Streaming Dataframe") val filteredStockStreamDF = stockStreamDF.join(companiesDF, expr("companyName = stockName AND stockInputTime >= companyTradingTime AND stockInputTime <= companyTradingTime + interval 20 seconds"), joinType = "leftOuter") val filteredStockStreamingQuery = filteredStockStreamDF.writeStream.format("console").option("truncate", false).start() info("Waiting for the query to terminate...") filteredStockStreamingQuery.awaitTermination() filteredStockStreamingQuery.stop() }
Example 40
Source File: StreamStaticJoiner.scala From structured-streaming-application with Apache License 2.0 | 5 votes |
package knolx.spark import knolx.Config._ import knolx.KnolXLogger import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.functions.{col, from_json} import org.apache.spark.sql.types.StructType object StreamStaticJoiner extends App with KnolXLogger { info("Creating Spark Session") val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() spark.sparkContext.setLogLevel("WARN") info("Static Dataframe") val companiesDF = spark.read.option("header", "true").csv("src/main/resources/companies.csv") companiesDF.show(false) info("Original Streaming Dataframe") val schema = ScalaReflection.schemaFor[Stock].dataType.asInstanceOf[StructType] val stockStreamDF = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServer) .option("subscribe", topic) .load() .select(from_json(col("value").cast("string"), schema).as("value")) .select("value.*") stockStreamDF.printSchema() stockStreamDF.writeStream.format("console").start() info("Filtered Streaming Dataframe") val filteredStockStreamDF = stockStreamDF.join(companiesDF, "companyName") val filteredStockStreamingQuery = filteredStockStreamDF.writeStream.format("console").start() info("Waiting for the query to terminate...") filteredStockStreamingQuery.awaitTermination() filteredStockStreamingQuery.stop() }
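Note: both streaming joiners parse the Kafka payload with from_json(col("value").cast("string"), schema) and then flatten the struct via select("value.*"). A reduced sketch of that parsing step on a static DataFrame, with an assumed JSON payload and a hypothetical case class standing in for Stock:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.functions.{col, from_json}
import org.apache.spark.sql.types.StructType

case class StockSketch(stockName: String, price: Double)

object FromJsonSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("from-json-sketch").getOrCreate()
  import spark.implicits._
  // Derive the struct schema from the case class, as the joiners do for Stock
  val schema = ScalaReflection.schemaFor[StockSketch].dataType.asInstanceOf[StructType]
  val raw = Seq("""{"stockName":"acme","price":12.5}""").toDF("value")
  raw
    .select(from_json(col("value").cast("string"), schema).as("value"))
    .select("value.*")
    .show()
  spark.stop()
}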
Example 41
Source File: MultinomialLabeler.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.MultinomialLabelerModel import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasFeaturesCol import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types._ import org.apache.spark.sql.functions.{udf, col} import ml.combust.mleap.core.util.VectorConverters._ class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"), val model: MultinomialLabelerModel) extends Transformer with HasFeaturesCol with HasProbabilitiesCol with HasLabelsCol { def setFeaturesCol(value: String): this.type = set(featuresCol, value) def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value) def setLabelsCol(value: String): this.type = set(labelsCol, value) @org.apache.spark.annotation.Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val probabilitiesUdf = udf { (vector: Vector) => model.top(vector).map(_._1).toArray } val labelsUdf = udf { (vector: Vector) => model.topLabels(vector).toArray } dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))). withColumn($(labelsCol), labelsUdf(col($(featuresCol)))) } override def copy(extra: ParamMap): Transformer = copyValues(new MultinomialLabeler(uid, model), extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT], s"Features column must be of type NumericType but got ${schema($(featuresCol)).dataType}") val inputFields = schema.fields require(!inputFields.exists(_.name == $(probabilitiesCol)), s"Output column ${$(probabilitiesCol)} already exists.") require(!inputFields.exists(_.name == $(labelsCol)), s"Output column ${$(labelsCol)} already exists.") StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)), StructField($(labelsCol), ArrayType(StringType)))) } }
Example 42
Source File: CoverTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.functions.{col, explode, udf} import org.scalatest.{PropSpec, Matchers, GivenWhenThen} import org.scalatest.prop.GeneratorDrivenPropertyChecks class CoverTest extends FeaturePropSpec with GivenWhenThen with GeneratorDrivenPropertyChecks with Matchers { val assembler = new VectorAssembler() .setInputCols(Array("double", "integer")) .setOutputCol("vector") property("argument numSplits must be positive") { intercept[IllegalArgumentException] { val cover = new Cover() .setInputCols("double") .setOutputCol("cover_ids") .setNumSplits(0) } } property("argument overlapRatio must be positive") { intercept[IllegalArgumentException] { val cover = new Cover() .setInputCols("double") .setOutputCol("cover_ids") .setOverlapRatio(0.0) } } property("cover estimator changes nothing with the original dataframe") { val cover = new Cover() .setInputCols("double", "integer", "vector") .setOutputCol("cover_ids") forAll(dataframeGen.arbitrary) { df => val transformed = assembler.transform(df) whenever( transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) { val covered = cover .fit(transformed) .transform(transformed) .drop("cover_ids") .except(transformed) .count() should be(0) } } } property("generated cover covers all range of specified columns") { val cover = new Cover() .setInputCols("double", "integer", "vector") .setOutputCol("cover_ids") val uncovered = udf { xs: Seq[Long] => xs.length == 0 } forAll(dataframeGen.arbitrary) { df => val transformed = assembler.transform(df) whenever( transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) { cover .fit(transformed) .transform(transformed) .where(uncovered(col("cover_ids"))) .count() should be(0) } } } property("Cover is readable/writable") { val cover = new Cover() .setInputCols("double", "integer") .setOutputCol("cover_ids") testDefaultReadWrite(cover) } property("CoverModel is readable/writable") { val model = new CoverModel("myCoverModel", Vectors.dense(-1.0, 0.0), Vectors.dense(1.0, 10.0)) .setInputCols("double", "integer") .setOutputCol("cover_ids") val newModel = testDefaultReadWrite(model) assert(newModel.min === model.min) assert(newModel.max === model.max) } }
Example 43
Source File: ReebDiagramTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{Vectors, EuclideanDistance, Vector} import org.apache.spark.sql.functions.{col, explode, udf} import org.scalatest.{PropSpec, Matchers, GivenWhenThen} import org.scalatest.prop.GeneratorDrivenPropertyChecks class ReebDiagramTest extends FeaturePropSpec with GivenWhenThen with GeneratorDrivenPropertyChecks with Matchers { val assembler = new VectorAssembler() .setInputCols(Array("double", "integer")) .setOutputCol("vector") val cover = new Cover() .setExploding(true) .setInputCols("double", "integer") .setOutputCol("cover_id") property("argument topTreeSize must be positive") { intercept[IllegalArgumentException] { val reeb = new ReebDiagram() // .setIdCol("id") // .setCoverCol("cover_id") // .setFeaturesCol("vector") // .setOutputCol("cluster_id") .setTopTreeSize(0) } } property("placeholder") { val reeb = new ReebDiagram() .setK(15) .setIdCol("id") .setCoverCol("cover_id") .setFeaturesCol("vector") .setOutputCol("cluster_id") forAll(dataframeGen.arbitrary) { df => val assembled = assembler.transform(df) whenever( assembled.count() > 0 && hasDistinctValues(assembled, "double", "integer")) { val transformed = cover .fit(assembled) .transform(assembled) val result = reeb .setTopTreeSize(1) .fit(transformed) .transform(transformed) // result.show() } } } }
Example 44
Source File: DebugConfluentSparkAvroUtils.scala From confluent-spark-avro with Apache License 2.0 | 5 votes |
package com.databricks.spark.avro

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object DebugConfluentSparkAvroUtils {
  def main(args: Array[String]): Unit = {
    val kafkaUrl = args(0)
    val schemaRegistryUrl = args(1)
    val topic = args(2)

    val spark = SparkSession.builder().master("local[2]").getOrCreate()
    val df = spark.read.format("kafka")
      .option("kafka.bootstrap.servers", kafkaUrl)
      .option("subscribe", topic)
      .load()

    val utils = new ConfluentSparkAvroUtils(schemaRegistryUrl)
    val keyDes = utils.deserializerForSubject(topic + "-key")
    val valDes = utils.deserializerForSubject(topic + "-value")

    df.select(
      keyDes(col("key")).alias("key"),
      valDes(col("value")).alias("value")
    ).show(10)

    spark.stop()
  }
}
Example 45
Source File: ExecutorSideSQLConfSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.internal import org.apache.spark.SparkFunSuite import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.execution.debug.codegenStringSeq import org.apache.spark.sql.functions.col import org.apache.spark.sql.test.SQLTestUtils class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { import testImplicits._ protected var spark: SparkSession = null // Create a new [[SparkSession]] running in local-cluster mode. override def beforeAll(): Unit = { super.beforeAll() spark = SparkSession.builder() .master("local-cluster[2,1,1024]") .appName("testing") .getOrCreate() } override def afterAll(): Unit = { spark.stop() spark = null } override def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { pairs.foreach { case (k, v) => SQLConf.get.setConfString(k, v) } try f finally { pairs.foreach { case (k, _) => SQLConf.get.unsetConf(k) } } } test("ReadOnlySQLConf is correctly created at the executor side") { withSQLConf("spark.sql.x" -> "a") { val checks = spark.range(10).mapPartitions { _ => val conf = SQLConf.get Iterator(conf.isInstanceOf[ReadOnlySQLConf] && conf.getConfString("spark.sql.x") == "a") }.collect() assert(checks.forall(_ == true)) } } test("case-sensitive config should work for json schema inference") { withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { withTempPath { path => val pathString = path.getCanonicalPath spark.range(10).select('id.as("ID")).write.json(pathString) spark.range(10).write.mode("append").json(pathString) assert(spark.read.json(pathString).columns.toSet == Set("id", "ID")) } } } test("SPARK-24727 CODEGEN_CACHE_MAX_ENTRIES is correctly referenced at the executor side") { withSQLConf(StaticSQLConf.CODEGEN_CACHE_MAX_ENTRIES.key -> "300") { val checks = spark.range(10).mapPartitions { _ => val conf = SQLConf.get Iterator(conf.isInstanceOf[ReadOnlySQLConf] && conf.getConfString(StaticSQLConf.CODEGEN_CACHE_MAX_ENTRIES.key) == "300") }.collect() assert(checks.forall(_ == true)) } } test("SPARK-22219: refactor to control to generate comment") { Seq(true, false).foreach { flag => withSQLConf(StaticSQLConf.CODEGEN_COMMENTS.key -> flag.toString) { val res = codegenStringSeq(spark.range(10).groupBy(col("id") * 2).count() .queryExecution.executedPlan) assert(res.length == 2) assert(res.forall { case (_, code) => (code.contains("* Codegend pipeline") == flag) && (code.contains("// input[") == flag) }) } } } }
Example 46
Source File: StreamStreamJoiner.scala From structured-streaming-application with Apache License 2.0 | 5 votes |
package knolx.spark import knolx.Config._ import knolx.KnolXLogger import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.functions.{col, expr, from_json} import org.apache.spark.sql.types.StructType object StreamStreamJoiner extends App with KnolXLogger { info("Creating Spark Session") val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() spark.sparkContext.setLogLevel("WARN") info("Streaming companies Dataframe") val companiesDF = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServer) .option("subscribe", companiesTopic) .load() .select(col("value").cast("string").as("companyName"), col("timestamp").as("companyTradingTime")) companiesDF.writeStream.format("console").option("truncate", false).start() info("Original Streaming Dataframe") val schema = ScalaReflection.schemaFor[Stock].dataType.asInstanceOf[StructType] val stockStreamDF = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServer) .option("subscribe", stocksTopic) .load() .select(from_json(col("value").cast("string"), schema).as("value"), col("timestamp").as("stockInputTime")) .select("value.*", "stockInputTime") info("Filtered Streaming Dataframe") val filteredStockStreamDF = stockStreamDF.join(companiesDF, expr("companyName = stockName AND stockInputTime >= companyTradingTime AND stockInputTime <= companyTradingTime + interval 20 seconds")) val filteredStockStreamingQuery = filteredStockStreamDF.writeStream.format("console").option("truncate", false).start() info("Waiting for the query to terminate...") filteredStockStreamingQuery.awaitTermination() filteredStockStreamingQuery.stop() }
Example 47
Source File: ExtractPythonUDFsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import org.apache.spark.sql.execution.{SparkPlan, SparkPlanTest} import org.apache.spark.sql.functions.col import org.apache.spark.sql.test.SharedSQLContext class ExtractPythonUDFsSuite extends SparkPlanTest with SharedSQLContext { import testImplicits.newProductEncoder import testImplicits.localSeqToDatasetHolder val batchedPythonUDF = new MyDummyPythonUDF val scalarPandasUDF = new MyDummyScalarPandasUDF private def collectBatchExec(plan: SparkPlan): Seq[BatchEvalPythonExec] = plan.collect { case b: BatchEvalPythonExec => b } private def collectArrowExec(plan: SparkPlan): Seq[ArrowEvalPythonExec] = plan.collect { case b: ArrowEvalPythonExec => b } test("Chained Batched Python UDFs should be combined to a single physical node") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c", batchedPythonUDF(col("a"))) .withColumn("d", batchedPythonUDF(col("c"))) val pythonEvalNodes = collectBatchExec(df2.queryExecution.executedPlan) assert(pythonEvalNodes.size == 1) } test("Chained Scalar Pandas UDFs should be combined to a single physical node") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c", scalarPandasUDF(col("a"))) .withColumn("d", scalarPandasUDF(col("c"))) val arrowEvalNodes = collectArrowExec(df2.queryExecution.executedPlan) assert(arrowEvalNodes.size == 1) } test("Mixed Batched Python UDFs and Pandas UDF should be separate physical node") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c", batchedPythonUDF(col("a"))) .withColumn("d", scalarPandasUDF(col("b"))) val pythonEvalNodes = collectBatchExec(df2.queryExecution.executedPlan) val arrowEvalNodes = collectArrowExec(df2.queryExecution.executedPlan) assert(pythonEvalNodes.size == 1) assert(arrowEvalNodes.size == 1) } test("Independent Batched Python UDFs and Scalar Pandas UDFs should be combined separately") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c1", batchedPythonUDF(col("a"))) .withColumn("c2", batchedPythonUDF(col("c1"))) .withColumn("d1", scalarPandasUDF(col("a"))) .withColumn("d2", scalarPandasUDF(col("d1"))) val pythonEvalNodes = collectBatchExec(df2.queryExecution.executedPlan) val arrowEvalNodes = collectArrowExec(df2.queryExecution.executedPlan) assert(pythonEvalNodes.size == 1) assert(arrowEvalNodes.size == 1) } test("Dependent Batched Python UDFs and Scalar Pandas UDFs should not be combined") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c1", batchedPythonUDF(col("a"))) .withColumn("d1", scalarPandasUDF(col("c1"))) .withColumn("c2", batchedPythonUDF(col("d1"))) .withColumn("d2", scalarPandasUDF(col("c2"))) val pythonEvalNodes = collectBatchExec(df2.queryExecution.executedPlan) val arrowEvalNodes = collectArrowExec(df2.queryExecution.executedPlan) assert(pythonEvalNodes.size == 2) assert(arrowEvalNodes.size == 2) } }
Example 48
Source File: GroupSortedDataset.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted.sql import scala.reflect.ClassTag import org.apache.spark.sql.{ Column, Dataset, Encoder } import org.apache.spark.sql.functions.col import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder } import com.tresata.spark.sorted.{ mapStreamIterator, mapStreamIteratorWithContext, newWCreate } object GroupSortedDataset { private[sql] def apply[K: Encoder, V](dataset: Dataset[(K, V)], numPartitions: Option[Int], reverse: Boolean, sortBy: Column => Column): GroupSortedDataset[K, V] = { val key = col(dataset.columns.head) val valueSort = { val sort = sortBy(col(dataset.columns.last)) if (reverse) sort.desc else sort.asc } new GroupSortedDataset(numPartitions.map(dataset.repartition(_, key)).getOrElse(dataset.repartition(key)).sortWithinPartitions(key, valueSort)) } } class GroupSortedDataset[K: Encoder, V] private (dataset: Dataset[(K, V)]) extends Serializable { def toDS: Dataset[(K, V)] = dataset def mapStreamByKey[W: Encoder, C](c: () => C)(f: (C, Iterator[V]) => TraversableOnce[W]): Dataset[(K, W)] = { implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W]) dataset.mapPartitions(mapStreamIteratorWithContext(_)(c, f)) } def mapStreamByKey[W: Encoder](f: Iterator[V] => TraversableOnce[W]): Dataset[(K, W)] = { implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W]) dataset.mapPartitions(mapStreamIterator(_)(f)) } def foldLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = { val wCreate = newWCreate(w) mapStreamByKey(iter => Iterator(iter.foldLeft(wCreate())(f))) } def reduceLeftByKey[W >: V: Encoder](f: (W, V) => W): Dataset[(K, W)] = mapStreamByKey(iter => Iterator(iter.reduceLeft(f))) def scanLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = { val wCreate = newWCreate(w) mapStreamByKey(_.scanLeft(wCreate())(f)) } }
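Note: the GroupSortedDataset constructor shows col used to build expressions from column names: the key comes from dataset.columns.head and the secondary sort flips between .asc and .desc. A small sketch of the repartition-then-sortWithinPartitions idiom driven by col, on a toy tuple Dataset (column names "_1"/"_2" are Spark's defaults):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object SortWithinPartitionsSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("sort-within-partitions-sketch").getOrCreate()
  import spark.implicits._
  val ds = Seq(("k1", 3), ("k2", 1), ("k1", 1), ("k2", 2)).toDS()
  val key = col(ds.columns.head)           // "_1"
  val valueSort = col(ds.columns.last).asc // or .desc to reverse the secondary sort
  // Co-locate each key in a partition, then sort values within it
  ds.repartition(2, key).sortWithinPartitions(key, valueSort).show()
  spark.stop()
}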
Example 49
Source File: ConcatArrowAndExplodeSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries import java.io.ByteArrayOutputStream import java.nio.channels.Channels import java.util.concurrent.TimeUnit import com.twosigma.flint.arrow.ArrowUtils import org.apache.arrow.memory.RootAllocator import org.apache.arrow.vector.ipc.ArrowFileWriter import org.apache.arrow.vector.{ BigIntVector, Float8Vector, VectorSchemaRoot } import org.apache.spark.sql.functions.{ array, col, lit, struct } import org.apache.spark.sql.types._ class ConcatArrowAndExplodeSpec extends TimeSeriesSuite { "ConcatArrowAndExplode" should "work" in { val batchSize = 10 var df = spark.range(1000, 2000, 1000).toDF("time") val columns = (0 until batchSize).map(v => struct((df("time") + v).as("time"), lit(v.toDouble).as("v"))) df = df.withColumn("base_rows", array(columns: _*)) val allocator = new RootAllocator(Long.MaxValue) val schema1 = StructType(Seq(StructField("v1", DoubleType))) val root1 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema1), allocator) val vector1 = root1.getVector("v1").asInstanceOf[Float8Vector] vector1.allocateNew() for (i <- 0 until batchSize) { vector1.set(i, i + 10.0) } vector1.setValueCount(batchSize) val out1 = new ByteArrayOutputStream() val arrowWriter1 = new ArrowFileWriter(root1, null, Channels.newChannel(out1)) arrowWriter1.writeBatch() arrowWriter1.close() root1.close() df = df.withColumn("f1_schema", struct(lit(0.0).as("v1"))) df = df.withColumn("f1_data", lit(out1.toByteArray)) val schema2 = StructType(Seq(StructField("v2", DoubleType), StructField("v3", LongType))) val root2 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema2), allocator) val vector2 = root2.getVector("v2").asInstanceOf[Float8Vector] val vector3 = root2.getVector("v3").asInstanceOf[BigIntVector] vector2.allocateNew() vector3.allocateNew() for (i <- 0 until batchSize) { vector2.set(i, i + 20.0) } vector2.setValueCount(batchSize) for (i <- 0 until batchSize) { vector3.set(i, i + 30L) } vector3.setValueCount(batchSize) val out2 = new ByteArrayOutputStream() val arrowWriter2 = new ArrowFileWriter(root2, null, Channels.newChannel(out2)) arrowWriter2.writeBatch() arrowWriter2.close() root2.close() df = df.withColumn("f2_schema", struct(lit(0.0).as("v2"), lit(0L).as("v3"))) df = df.withColumn("f2_data", lit(out2.toByteArray)) var tsrdd = TimeSeriesRDD.fromDF(df)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS) tsrdd = tsrdd.concatArrowAndExplode("base_rows", Seq("f1_schema", "f2_schema"), Seq("f1_data", "f2_data")) tsrdd.toDF.show() var expected = spark.range(1000, 1000 + batchSize).toDF("time") expected = expected.withColumn("v", col("time") - 1000.0) expected = expected.withColumn("v1", col("time") - 1000 + 10.0) expected = expected.withColumn("v2", col("time") - 1000 + 20.0) expected = expected.withColumn("v3", col("time") - 1000 + 30) val expectedTsrdd = TimeSeriesRDD.fromDF(expected)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS) assertEquals(tsrdd, expectedTsrdd) } }
Example 50
Source File: RateSource.scala From spark-structured-streaming-examples with Apache License 2.0 | 5 votes |
package com.phylosoft.spark.learning.sql.streaming.source.rate

import com.phylosoft.spark.learning.sql.streaming.source.StreamingSource
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{DataFrame, SparkSession}

trait RateSource extends StreamingSource {

  val spark: SparkSession
  val rowsPerSecond: String
  val numPartitions: String

  override def readStream(): DataFrame = {
    spark.readStream
      .format("rate")
      .option("rowsPerSecond", rowsPerSecond)
      .option("numPartitions", numPartitions)
      .load()
      .select(col("*"))
  }
}
Example 51
Source File: AdRateSources.scala From spark-structured-streaming-examples with Apache License 2.0 | 5 votes |
package com.phylosoft.spark.learning.sql.streaming.source.rate

import org.apache.spark.sql.functions.{col, rand}
import org.apache.spark.sql.{DataFrame, SparkSession}

class AdRateSources(val spark: SparkSession,
                    val rowsPerSecond: String = "5",
                    val numPartitions: String = "1")
  extends RateSource {

  def loadImpressions(): DataFrame = {
    readStream()
      .select(
        col("value").as("adId"),
        col("timestamp").as("impressionTime"))
  }

  def loadClicks(): DataFrame = {
    readStream()
      .where((rand() * 100).cast("integer") < 10) // 10 out of every 100 impressions result in a click
      .select((col("value") - 50).as("adId"), col("timestamp").as("clickTime")) // -50 so that a click with same id as impression is generated much later (i.e. delayed data).
      .where("adId > 0")
  }
}
Example 52
Source File: UserActionsRateSource.scala From spark-structured-streaming-examples with Apache License 2.0 | 5 votes |
package com.phylosoft.spark.learning.sql.streaming.source.rate

import org.apache.spark.sql.functions.{col, lit, pmod, rand}
import org.apache.spark.sql.{DataFrame, SparkSession}

class UserActionsRateSource(val spark: SparkSession,
                            val rowsPerSecond: String = "5",
                            val numPartitions: String = "1")
  extends RateSource {

  def loadUserActions(): DataFrame = {
    readStream()
      .where((rand() * 100).cast("integer") < 30) // 30 out of every 100 user actions
      .select(pmod(col("value"), lit(9)).as("userId"), col("timestamp").as("actionTime"))
  }
}
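Note: these rate sources derive synthetic IDs from the rate stream's value column: pmod(col("value"), lit(9)) buckets users and (rand() * 100).cast("integer") < 30 thins the stream. A sketch of the same transformation on a bounded batch DataFrame (assumed data), so the result can be inspected without starting a streaming query:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, current_timestamp, lit, pmod, rand}

object UserActionsBatchSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("user-actions-batch-sketch").getOrCreate()
  // Stand-in for the rate source: a bounded "value" column plus a timestamp
  val events = spark.range(0, 100).toDF("value")
    .withColumn("timestamp", current_timestamp())
  events
    .where((rand() * 100).cast("integer") < 30) // keep roughly 30% of rows
    .select(pmod(col("value"), lit(9)).as("userId"),
            col("timestamp").as("actionTime"))
    .show(10)
  spark.stop()
}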
Example 53
Source File: FilterTopFeaturesProcess.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.s2jobs.wal.process import org.apache.s2graph.s2jobs.task.TaskConf import org.apache.s2graph.s2jobs.wal.WalLogAgg import org.apache.s2graph.s2jobs.wal.transformer.{DefaultTransformer, Transformer} import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import play.api.libs.json.{JsObject, Json} object FilterTopFeaturesProcess { private var validFeatureHashKeys: Set[Long] = null def getValidFeatureHashKeys(validFeatureHashKeysBCast: Broadcast[Array[Long]]): Set[Long] = { if (validFeatureHashKeys == null) { validFeatureHashKeys = validFeatureHashKeysBCast.value.toSet } validFeatureHashKeys } def collectDistinctFeatureHashes(ss: SparkSession, filteredDict: DataFrame): Array[Long] = { import ss.implicits._ val featureHashUDF = udf((dim: String, value: String) => WalLogAgg.toFeatureHash(dim, value)) filteredDict.withColumn("featureHash", featureHashUDF(col("dim"), col("value"))) .select("featureHash") .distinct().as[Long].collect() } def filterTopKsPerDim(dict: DataFrame, maxRankPerDim: Broadcast[Map[String, Int]], defaultMaxRank: Int): DataFrame = { val filterUDF = udf((dim: String, rank: Long) => { rank < maxRankPerDim.value.getOrElse(dim, defaultMaxRank) }) dict.filter(filterUDF(col("dim"), col("rank"))) } def filterWalLogAgg(ss: SparkSession, walLogAgg: Dataset[WalLogAgg], transformers: Seq[Transformer], validFeatureHashKeysBCast: Broadcast[Array[Long]]) = { import ss.implicits._ walLogAgg.mapPartitions { iter => val validFeatureHashKeys = getValidFeatureHashKeys(validFeatureHashKeysBCast) iter.map { walLogAgg => WalLogAgg.filterProps(walLogAgg, transformers, validFeatureHashKeys) } } } } class FilterTopFeaturesProcess(taskConf: TaskConf) extends org.apache.s2graph.s2jobs.task.Process(taskConf) { import FilterTopFeaturesProcess._ override def execute(ss: SparkSession, inputMap: Map[String, DataFrame]): DataFrame = { import ss.implicits._ val maxRankPerDim = taskConf.options.get("maxRankPerDim").map { s => Json.parse(s).as[JsObject].fields.map { case (k, jsValue) => k -> jsValue.as[Int] }.toMap } val maxRankPerDimBCast = ss.sparkContext.broadcast(maxRankPerDim.getOrElse(Map.empty)) val defaultMaxRank = taskConf.options.get("defaultMaxRank").map(_.toInt) val featureDict = inputMap(taskConf.options("featureDict")) val walLogAgg = inputMap(taskConf.options("walLogAgg")).as[WalLogAgg] val transformers = TaskConf.parseTransformers(taskConf) val filteredDict = filterTopKsPerDim(featureDict, maxRankPerDimBCast, defaultMaxRank.getOrElse(Int.MaxValue)) val validFeatureHashKeys = collectDistinctFeatureHashes(ss, filteredDict) val validFeatureHashKeysBCast = ss.sparkContext.broadcast(validFeatureHashKeys) filterWalLogAgg(ss, walLogAgg, transformers, validFeatureHashKeysBCast).toDF() } override def mandatoryOptions: Set[String] = Set("featureDict", "walLogAgg") }
Example 54
Source File: WholeStageCodegenSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.Row import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions.{avg, broadcast, col, max} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{IntegerType, StringType, StructType} class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext { test("range/filter should be combined") { val df = spark.range(10).filter("id = 1").selectExpr("id + 1") val plan = df.queryExecution.executedPlan assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined) assert(df.collect() === Array(Row(2))) } test("Aggregate should be included in WholeStageCodegen") { val df = spark.range(10).groupBy().agg(max(col("id")), avg(col("id"))) val plan = df.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) assert(df.collect() === Array(Row(9, 4.5))) } test("Aggregate with grouping keys should be included in WholeStageCodegen") { val df = spark.range(3).groupBy("id").count().orderBy("id") val plan = df.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) assert(df.collect() === Array(Row(0, 1), Row(1, 1), Row(2, 1))) } test("BroadcastHashJoin should be included in WholeStageCodegen") { val rdd = spark.sparkContext.makeRDD(Seq(Row(1, "1"), Row(1, "1"), Row(2, "2"))) val schema = new StructType().add("k", IntegerType).add("v", StringType) val smallDF = spark.createDataFrame(rdd, schema) val df = spark.range(10).join(broadcast(smallDF), col("k") === col("id")) assert(df.queryExecution.executedPlan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[BroadcastHashJoinExec]).isDefined) assert(df.collect() === Array(Row(1, 1, "1"), Row(1, 1, "1"), Row(2, 2, "2"))) } test("Sort should be included in WholeStageCodegen") { val df = spark.range(3, 0, -1).toDF().sort(col("id")) val plan = df.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SortExec]).isDefined) assert(df.collect() === Array(Row(1), Row(2), Row(3))) } test("MapElements should be included in WholeStageCodegen") { import testImplicits._ val ds = spark.range(10).map(_.toString) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SerializeFromObjectExec]).isDefined) assert(ds.collect() === 0.until(10).map(_.toString).toArray) } test("typed filter should be included in WholeStageCodegen") { val ds = spark.range(10).filter(_ % 2 == 0) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined) assert(ds.collect() === Array(0, 2, 4, 6, 8)) } test("back-to-back typed filter should be included in WholeStageCodegen") { val ds = spark.range(10).filter(_ % 2 == 0).filter(_ % 3 == 0) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined) 
assert(ds.collect() === Array(0, 6)) } test("simple typed UDAF should be included in WholeStageCodegen") { import testImplicits._ val ds = Seq(("a", 10), ("b", 1), ("b", 2), ("c", 1)).toDS() .groupByKey(_._1).agg(typed.sum(_._2)) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0))) } }
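Note: several of these whole-stage-codegen tests use col purely to build aggregate and join expressions, e.g. agg(max(col("id")), avg(col("id"))) and join(broadcast(smallDF), col("k") === col("id")). A compact sketch of those two usages outside the test harness (toy data assumed):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{avg, broadcast, col, max}

object AggAndJoinSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("agg-and-join-sketch").getOrCreate()
  import spark.implicits._
  // Aggregates built from col expressions
  spark.range(10).groupBy().agg(max(col("id")), avg(col("id"))).show()
  // Broadcast hash join keyed on a col equality
  val small = Seq((1, "1"), (2, "2")).toDF("k", "v")
  spark.range(10).join(broadcast(small), col("k") === col("id")).show()
  spark.stop()
}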
Example 55
Source File: SuiteKickoff.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.workload import com.ibm.sparktc.sparkbench.utils.SparkFuncs._ import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.functions.{col, lit} import scala.collection.parallel.ForkJoinTaskSupport object SuiteKickoff { private val log = org.slf4j.LoggerFactory.getLogger(getClass) def run(s: Suite, spark: SparkSession): Unit = { verifyOutput(s.benchmarkOutput, s.saveMode, spark) // Translate the maps into runnable workloads val workloads: Seq[Workload] = s.workloadConfigs.map(ConfigCreator.mapToConf) val dataframes: Seq[DataFrame] = (0 until s.repeat).flatMap { i => // This will produce one DataFrame of one row for each workload in the sequence. // We're going to produce one coherent DF later from these val dfSeqFromOneRun: Seq[DataFrame] = { if (s.parallel) runParallel(workloads, spark) else runSerially(workloads, spark) } // Indicate which run of this suite this was. dfSeqFromOneRun.map(_.withColumn("run", lit(i))) } // getting the Spark confs so we can output them in the results. val strSparkConfs = spark.conf.getAll // Ah, see, here's where we're joining that series of one-row DFs val singleDF = joinDataFrames(dataframes, spark) s.description.foreach(log.info) // And now we're going to curry in the results val plusSparkConf = addConfToResults(singleDF, strSparkConfs) val plusDescription = addConfToResults(plusSparkConf, Map("description" -> s.description)).coalesce(1) // And write to disk. We're done with this suite! if(s.benchmarkOutput.nonEmpty) writeToDisk(s.benchmarkOutput.get, s.saveMode, plusDescription, spark) } private def runParallel(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = { val confSeqPar = workloadConfigs.par confSeqPar.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(confSeqPar.size)) confSeqPar.map(_.run(spark)).seq } private def runSerially(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = { workloadConfigs.map(_.run(spark)) } private def joinDataFrames(seq: Seq[DataFrame], spark: SparkSession): DataFrame = { if (seq.length == 1) seq.head else { val seqOfColNames = seq.map(_.columns.toSet) val allTheColumns = seqOfColNames.foldLeft(Set[String]())(_ ++ _) def expr(myCols: Set[String], allCols: Set[String]) = { allCols.toList.map { case x if myCols.contains(x) => col(x) case x => lit(null).as(x) } } val seqFixedDfs = seq.map(df => df.select(expr(df.columns.toSet, allTheColumns): _*)) // Folding left across this sequence should be fine because each DF should only have 1 row // Nevarr Evarr do this to legit dataframes that are all like big and stuff seqFixedDfs.foldLeft(spark.createDataFrame(spark.sparkContext.emptyRDD[Row], seqFixedDfs.head.schema))(_ union _) } } }
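Note: the joinDataFrames helper unions one-row result DataFrames that may not share the same columns: for each frame it selects every column in the combined set, substituting lit(null).as(name) where a column is missing. A sketch of that alignment trick in isolation, with assumed toy frames and a hypothetical helper name:

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.{col, lit}

object UnionByColumnsSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("union-by-columns-sketch").getOrCreate()
  import spark.implicits._
  val a = Seq(("wl1", 12L)).toDF("name", "runtime")
  val b = Seq(("wl2", 0.97)).toDF("name", "accuracy")
  val allCols = a.columns.toSet ++ b.columns.toSet
  // Select every column in the combined set, filling the ones a frame lacks with nulls
  def aligned(df: DataFrame): DataFrame =
    df.select(allCols.toList.map { c =>
      if (df.columns.contains(c)) col(c) else lit(null).as(c)
    }: _*)
  aligned(a).union(aligned(b)).show()
  spark.stop()
}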
Example 56
Source File: Cleaner.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions.clean import com.hankcs.hanlp.HanLP import config.paramconf.{HasOutputCol, HasInputCol} import functions.MySchemaUtils import functions.clean.chinese.BCConvert import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, Dataset} setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1) override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema, logging = true) val cleanFunc = udf {line: String => var cleaned = "" getFanJian match { case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line) case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line) case _ => cleaned = line } getQuanBan match { case "q2b" => cleaned = BCConvert.qj2bj(cleaned) case "b2q" => cleaned = BCConvert.bj2qj(cleaned) case _ => cleaned = cleaned } cleaned } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record => val outputIndex = record.fieldIndex($(outputCol)) record.getString(outputIndex).length >= getMinLineLen } } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.typeName.equals(StringType.typeName), s"Input type must be StringType but got $inputType.") MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable) } } object Cleaner extends DefaultParamsReadable[Cleaner] { override def load(path: String): Cleaner = super.load(path) }
Example 57
Source File: implicits.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb

import com.audienceproject.spark.dynamodb.reflect.SchemaAnalysis
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StructField

import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

object implicits {

  implicit class DynamoDBDataFrameReader(reader: DataFrameReader) {

    def dynamodb(tableName: String): DataFrame =
      getDynamoDBSource(tableName).load()

    def dynamodb(tableName: String, indexName: String): DataFrame =
      getDynamoDBSource(tableName).option("indexName", indexName).load()

    def dynamodbAs[T <: Product : ClassTag : TypeTag](tableName: String): Dataset[T] = {
      implicit val encoder: Encoder[T] = ExpressionEncoder()
      getColumnsAlias(getDynamoDBSource(tableName)
        .schema(SchemaAnalysis[T]).load()).as
    }

    def dynamodbAs[T <: Product : ClassTag : TypeTag](tableName: String, indexName: String): Dataset[T] = {
      implicit val encoder: Encoder[T] = ExpressionEncoder()
      getColumnsAlias(getDynamoDBSource(tableName)
        .option("indexName", indexName)
        .schema(SchemaAnalysis[T]).load()).as
    }

    private def getDynamoDBSource(tableName: String): DataFrameReader =
      reader.format("com.audienceproject.spark.dynamodb.datasource").option("tableName", tableName)

    private def getColumnsAlias(dataFrame: DataFrame): DataFrame = {
      val columnsAlias = dataFrame.schema.collect({
        case StructField(name, _, _, metadata) if metadata.contains("alias") =>
          col(name).as(metadata.getString("alias"))
        case StructField(name, _, _, _) =>
          col(name)
      })
      dataFrame.select(columnsAlias: _*)
    }
  }

  implicit class DynamoDBDataFrameWriter[T](writer: DataFrameWriter[T]) {

    def dynamodb(tableName: String): Unit =
      writer.format("com.audienceproject.spark.dynamodb.datasource").option("tableName", tableName).save()
  }
}
Example 58
Source File: TestUtils.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.utils

import org.apache.spark.sql.functions.{col, count, lit}
import org.apache.spark.sql.{DataFrame, Row}

object TestUtils {

  implicit class ExtendedDataFrame(df: DataFrame) {

    def hasDiff(anotherDf: DataFrame): Boolean = {
      def printDiff(incoming: Boolean)(row: Row): Unit = {
        if (incoming) print("+ ") else print("- ")
        println(row)
      }

      val groupedDf = df.groupBy(df.columns.map(col): _*).agg(count(lit(1))).collect().toSet
      val groupedAnotherDf = anotherDf.groupBy(anotherDf.columns.map(col): _*).agg(count(lit(1))).collect().toSet

      groupedDf.diff(groupedAnotherDf).foreach(printDiff(incoming = true))
      groupedAnotherDf.diff(groupedDf).foreach(printDiff(incoming = false))

      groupedDf.diff(groupedAnotherDf).nonEmpty || groupedAnotherDf.diff(groupedDf).nonEmpty
    }
  }
}
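A possible caller-side sketch for the hasDiff helper above, assuming the TestUtils object is available on the classpath as shown; the groupBy over df.columns.map(col) makes the comparison order-insensitive but sensitive to duplicate rows.

import org.apache.spark.sql.SparkSession
import com.adidas.utils.TestUtils._  // assumes the helper above is on the classpath

object HasDiffExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("has-diff").getOrCreate()
    import spark.implicits._

    val expected = Seq((1, "a"), (2, "b")).toDF("id", "value")
    val actual   = Seq((2, "b"), (1, "a")).toDF("id", "value")

    // Row order does not matter, duplicate counts do: both sides are grouped before comparison.
    assert(!expected.hasDiff(actual))
    spark.stop()
  }
}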
Example 59
Source File: PartitionHelpers.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.algo.core

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}

trait PartitionHelpers {

  protected def getDistinctPartitions(outputDataFrame: DataFrame, targetPartitions: Seq[String]): Dataset[Row] = {
    val targetPartitionsColumns: Seq[Column] = targetPartitions.map(partitionString => col(partitionString))
    outputDataFrame.select(targetPartitionsColumns: _*).distinct
  }

  protected def getParameterValue(row: Row, partitionString: String): String =
    createParameterValue(row.get(row.fieldIndex(partitionString)))

  protected def createParameterValue(partitionRawValue: Any): String = partitionRawValue match {
    case value: java.lang.Short => value.toString
    case value: java.lang.Integer => value.toString
    case value: scala.Predef.String => "'" + value + "'"
    case null => throw new Exception("Partition Value is null. No support for null partitions!")
    case value => throw new Exception("Unsupported partition DataType: " + value.getClass)
  }
}
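getDistinctPartitions reduces to a select over col-wrapped partition names followed by distinct. A self-contained sketch of the same operation with illustrative column names and data:

import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.functions.col

object DistinctPartitionsExample {
  // Select only the partition columns and deduplicate, as the trait above does.
  def distinctPartitions(df: DataFrame, partitionCols: Seq[String]): Dataset[Row] =
    df.select(partitionCols.map(col): _*).distinct()

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("distinct-partitions").getOrCreate()
    import spark.implicits._
    val df = Seq(("2020", "01", 1.0), ("2020", "01", 2.0), ("2020", "02", 3.0))
      .toDF("year", "month", "amount")
    distinctPartitions(df, Seq("year", "month")).show()  // two rows: (2020, 01) and (2020, 02)
    spark.stop()
  }
}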
Example 60
Source File: Example3_7.scala From LearningSparkV2 with Apache License 2.0 | 5 votes |
package main.scala.chapter3

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{col, expr}

object Example3_7 {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("Example-3_7")
      .getOrCreate()

    if (args.length <= 0) {
      println("usage Example3_7 <file path to blogs.json>")
      System.exit(1)
    }

    // Get the path to the JSON file
    val jsonFile = args(0)

    // Define our schema as before
    val schema = StructType(Array(
      StructField("Id", IntegerType, false),
      StructField("First", StringType, false),
      StructField("Last", StringType, false),
      StructField("Url", StringType, false),
      StructField("Published", StringType, false),
      StructField("Hits", IntegerType, false),
      StructField("Campaigns", ArrayType(StringType), false)))

    // Create a DataFrame by reading the JSON file with the predefined schema
    val blogsDF = spark.read.schema(schema).json(jsonFile)

    // Show the DataFrame contents
    blogsDF.show(truncate = false)

    // Print the schema
    print(blogsDF.printSchema)
    print(blogsDF.schema)

    // Show columns and expressions
    blogsDF.select(expr("Hits") * 2).show(2)
    blogsDF.select(col("Hits") * 2).show(2)
    blogsDF.select(expr("Hits * 2")).show(2)

    // Show heavy hitters
    blogsDF.withColumn("Big Hitters", (expr("Hits > 10000"))).show()
  }
}
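The three select calls above are interchangeable ways of building the same column expression: col constructs it programmatically, expr parses a SQL fragment, and the $ interpolator (from spark.implicits) is shorthand for col. A small sketch with made-up data:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, expr}

object ColVsExpr {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("col-vs-expr").getOrCreate()
    import spark.implicits._
    val df = Seq(("blog1", 4535), ("blog2", 8908)).toDF("Title", "Hits")

    // Three equivalent column expressions: programmatic col, parsed SQL expr, and the $ shorthand.
    df.select(col("Hits") * 2).show()
    df.select(expr("Hits * 2")).show()
    df.select($"Hits" * 2).show()
    spark.stop()
  }
}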
Example 61
Source File: Word2Vec.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature

import scala.util.Random

import org.apache.spark.ml
import org.apache.spark.ml.{PipelineStage, Transformer}
import org.apache.spark.ml.feature.Word2VecModel
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, split}

import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator

object Word2Vec extends BenchmarkAlgorithm with TestFromTraining {

  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._

    val df = DataGenerator.generateDoc(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      vocabSize,
      docLength,
      "text")
    df.select(split(col("text"), " ").as("text"))
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    new ml.feature.Word2Vec().setInputCol("text")
  }

  override def testAdditionalMethods(
      ctx: MLBenchContext,
      model: Transformer): Map[String, () => _] = {
    import ctx.params._

    val rng = new Random(ctx.seed())
    val word2vecModel = model.asInstanceOf[Word2VecModel]
    val testWord = Vectors.dense(Array.fill(word2vecModel.getVectorSize)(rng.nextGaussian()))

    Map("findSynonyms" -> (() => {
      word2vecModel.findSynonyms(testWord, numSynonymsToFind)
    }))
  }
}
Example 62
Source File: WholeStageCodegenSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.aggregate.HashAggregateExec
import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec
import org.apache.spark.sql.expressions.scalalang.typed
import org.apache.spark.sql.functions.{avg, broadcast, col, max}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext {

  test("range/filter should be combined") {
    val df = spark.range(10).filter("id = 1").selectExpr("id + 1")
    val plan = df.queryExecution.executedPlan
    assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined)
    assert(df.collect() === Array(Row(2)))
  }

  test("Aggregate should be included in WholeStageCodegen") {
    val df = spark.range(10).groupBy().agg(max(col("id")), avg(col("id")))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(df.collect() === Array(Row(9, 4.5)))
  }

  test("Aggregate with grouping keys should be included in WholeStageCodegen") {
    val df = spark.range(3).groupBy("id").count().orderBy("id")
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(df.collect() === Array(Row(0, 1), Row(1, 1), Row(2, 1)))
  }

  test("BroadcastHashJoin should be included in WholeStageCodegen") {
    val rdd = spark.sparkContext.makeRDD(Seq(Row(1, "1"), Row(1, "1"), Row(2, "2")))
    val schema = new StructType().add("k", IntegerType).add("v", StringType)
    val smallDF = spark.createDataFrame(rdd, schema)
    val df = spark.range(10).join(broadcast(smallDF), col("k") === col("id"))
    assert(df.queryExecution.executedPlan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[BroadcastHashJoinExec]).isDefined)
    assert(df.collect() === Array(Row(1, 1, "1"), Row(1, 1, "1"), Row(2, 2, "2")))
  }

  test("Sort should be included in WholeStageCodegen") {
    val df = spark.range(3, 0, -1).toDF().sort(col("id"))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SortExec]).isDefined)
    assert(df.collect() === Array(Row(1), Row(2), Row(3)))
  }

  test("MapElements should be included in WholeStageCodegen") {
    import testImplicits._
    val ds = spark.range(10).map(_.toString)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SerializeFromObjectExec]).isDefined)
    assert(ds.collect() === 0.until(10).map(_.toString).toArray)
  }

  test("typed filter should be included in WholeStageCodegen") {
    val ds = spark.range(10).filter(_ % 2 == 0)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined)
    assert(ds.collect() === Array(0, 2, 4, 6, 8))
  }

  test("back-to-back typed filter should be included in WholeStageCodegen") {
    val ds = spark.range(10).filter(_ % 2 == 0).filter(_ % 3 == 0)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined)
    assert(ds.collect() === Array(0, 6))
  }

  test("simple typed UDAF should be included in WholeStageCodegen") {
    import testImplicits._
    val ds = Seq(("a", 10), ("b", 1), ("b", 2), ("c", 1)).toDS()
      .groupByKey(_._1).agg(typed.sum(_._2))
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0)))
  }
}
Example 63
Source File: HashingTF.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
}
Example 64
Source File: HashingTF.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
}
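Both HashingTF copies (this example and the previous one) share the select(col("*"), ...) idiom, which appends the hashed feature vector while preserving every input column. A hedged usage sketch from the caller's side, using the stock Spark ML Tokenizer and HashingTF on illustrative sentences:

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

object HashingTFUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("hashing-tf").getOrCreate()
    import spark.implicits._
    val sentences = Seq("spark makes big data simple", "spark sql uses columns").toDF("sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features").setNumFeatures(32)

    // transform() keeps the existing columns (the col("*") part) and appends the hashed vector.
    val featurized = hashingTF.transform(tokenizer.transform(sentences))
    featurized.select("words", "features").show(truncate = false)
    spark.stop()
  }
}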
Example 65
Source File: PredicatePushdownSuite.scala From spark-exasol-connector with Apache License 2.0 | 5 votes |
package com.exasol.spark

import java.sql.Timestamp

import org.apache.spark.sql.functions.col

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.scalatest.funsuite.AnyFunSuite

class PredicatePushdownSuite extends AnyFunSuite with BaseDockerSuite with DataFrameSuiteBase {

  test("with where clause built from filters: filter") {
    createDummyTable()

    import spark.implicits._

    val df = spark.read
      .format("exasol")
      .option("host", container.host)
      .option("port", s"${container.port}")
      .option("query", s"SELECT * FROM $EXA_SCHEMA.$EXA_TABLE")
      .load()
      .filter($"id" < 3)
      .filter(col("city").like("Ber%"))
      .select("id", "city")

    val result = df.collect().map(x => (x.getLong(0), x.getString(1))).toSet
    assert(result.size === 1)
    assert(result === Set((1, "Berlin")))
  }

  test("with where clause built from filters: createTempView and spark.sql") {
    createDummyTable()

    val df = spark.read
      .format("exasol")
      .option("host", container.host)
      .option("port", s"${container.port}")
      .option("query", s"SELECT * FROM $EXA_SCHEMA.$EXA_TABLE")
      .load()

    df.createOrReplaceTempView("myTable")

    val myDF = spark
      .sql("SELECT id, city FROM myTable WHERE id BETWEEN 1 AND 3 AND name < 'Japan'")

    val result = myDF.collect().map(x => (x.getLong(0), x.getString(1))).toSet
    assert(result.size === 2)
    assert(result === Set((1, "Berlin"), (2, "Paris")))
  }

  test("date and timestamp should be read and filtered correctly") {
    import java.sql.Date

    createDummyTable()
    val df = spark.read
      .format("exasol")
      .option("host", container.host)
      .option("port", s"${container.port}")
      .option("query", s"SELECT date_info, updated_at FROM $EXA_SCHEMA.$EXA_TABLE")
      .load()

    val minTimestamp = Timestamp.valueOf("2017-12-30 00:00:00.0000")
    val testDate = Date.valueOf("2017-12-31")

    val resultDate = df.collect().map(_.getDate(0))
    assert(resultDate.contains(testDate))

    val resultTimestamp = df.collect().map(_.getTimestamp(1)).map(x => x.after(minTimestamp))
    assert(!resultTimestamp.contains(false))

    val filteredByDateDF = df.filter(col("date_info") === testDate)
    assert(filteredByDateDF.count() === 1)

    val filteredByTimestampDF = df.filter(col("updated_at") < minTimestamp)
    assert(filteredByTimestampDF.count() === 0)
  }

  test("count should be performed successfully") {
    createDummyTable()
    val df = spark.read
      .format("exasol")
      .option("host", container.host)
      .option("port", s"${container.port}")
      .option("query", s"SELECT * FROM $EXA_SCHEMA.$EXA_TABLE")
      .load()
    val result = df.count()
    assert(result === 3)
  }
}
Example 66
Source File: GaussianProcessCommons.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons

import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
import breeze.optimize.LBFGSB
import org.apache.spark.ml.commons.kernel.{EyeKernel, Kernel, _}
import org.apache.spark.ml.commons.util.DiffFunctionMemoized
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.util.Instrumentation
import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Row}

private[ml] trait GaussianProcessCommons[F, E <: Predictor[F, E, M], M <: PredictionModel[F, M]]
  extends ProjectedGaussianProcessHelper {
  this: Predictor[F, E, M] with GaussianProcessParams =>

  protected val getKernel: () => Kernel = () => $(kernel)() + $(sigma2).const * new EyeKernel

  protected def getPoints(dataset: Dataset[_]) = {
    dataset.select(col($(labelCol)), col($(featuresCol))).rdd.map {
      case Row(label: Double, features: Vector) => LabeledPoint(label, features)
    }
  }

  protected def groupForExperts(points: RDD[LabeledPoint]) = {
    val numberOfExperts = Math.round(points.count().toDouble / $(datasetSizeForExpert))
    points.zipWithIndex.map { case (instance, index) =>
      (index % numberOfExperts, instance)
    }.groupByKey().map(_._2)
  }

  protected def getExpertLabelsAndKernels(points: RDD[LabeledPoint]): RDD[(BDV[Double], Kernel)] = {
    groupForExperts(points).map { chunk =>
      val (labels, trainingVectors) = chunk.map(lp => (lp.label, lp.features)).toArray.unzip
      (BDV(labels: _*), getKernel().setTrainingVectors(trainingVectors))
    }
  }

  protected def projectedProcess(expertLabelsAndKernels: RDD[(BDV[Double], Kernel)],
                                 points: RDD[LabeledPoint],
                                 optimalHyperparameters: BDV[Double]) = {
    val activeSet = $(activeSetProvider)($(activeSetSize), expertLabelsAndKernels, points,
      getKernel, optimalHyperparameters, $(seed))

    points.unpersist()

    val (matrixKmnKnm, vectorKmny) = getMatrixKmnKnmAndVectorKmny(expertLabelsAndKernels, activeSet)
    expertLabelsAndKernels.unpersist()

    val optimalKernel = getKernel().setHyperparameters(optimalHyperparameters).setTrainingVectors(activeSet)

    // inv(sigma^2 K_mm + K_mn * K_nm) * K_mn * y
    val (magicVector, magicMatrix) = getMagicVector(optimalKernel,
      matrixKmnKnm, vectorKmny, activeSet, optimalHyperparameters)

    new GaussianProjectedProcessRawPredictor(magicVector, magicMatrix, optimalKernel)
  }

  protected def createModel(uid: String, rawPredictor: GaussianProjectedProcessRawPredictor): M
}

class GaussianProjectedProcessRawPredictor private[commons] (val magicVector: BDV[Double],
                                                             val magicMatrix: BDM[Double],
                                                             val kernel: Kernel) extends Serializable {
  def predict(features: Vector): (Double, Double) = {
    val cross = kernel.crossKernel(features)
    val selfKernel = kernel.selfKernel(features)
    (cross * magicVector, selfKernel + cross * magicMatrix * cross.t)
  }
}
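The getPoints method above is the standard bridge from a Dataset to an RDD[LabeledPoint]: project the label and features columns with col and pattern-match each Row. An isolated sketch of just that step; the column names and sample data are illustrative, not part of spark-gp:

import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions.col

object RowsToLabeledPoints {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("labeled-points").getOrCreate()
    import spark.implicits._
    val df = Seq((1.0, Vectors.dense(0.1, 0.2)), (0.0, Vectors.dense(0.3, 0.4)))
      .toDF("label", "features")

    // Project the two columns of interest, then convert each Row into an ml LabeledPoint.
    val points = df.select(col("label"), col("features")).rdd.map {
      case Row(label: Double, features: Vector) => LabeledPoint(label, features)
    }
    points.collect().foreach(println)
    spark.stop()
  }
}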
Example 67
Source File: PageRankSuite.scala From graphframes with Apache License 2.0 | 5 votes |
package org.graphframes.lib

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.DataTypes

import org.graphframes.examples.Graphs
import org.graphframes.{GraphFrameTestSparkContext, SparkFunSuite, TestUtils}

class PageRankSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  val n = 100

  test("Star example") {
    val g = Graphs.star(n)
    val resetProb = 0.15
    val errorTol = 1.0e-5
    val pr = g.pageRank
      .resetProbability(resetProb)
      .tol(errorTol).run()
    TestUtils.testSchemaInvariants(g, pr)
    TestUtils.checkColumnType(pr.vertices.schema, "pagerank", DataTypes.DoubleType)
    TestUtils.checkColumnType(pr.edges.schema, "weight", DataTypes.DoubleType)
  }

  test("friends graph with personalized PageRank") {
    val results = Graphs.friends.pageRank.resetProbability(0.15).maxIter(10).sourceId("a").run()
    val gRank = results.vertices.filter(col("id") === "g").select("pagerank").first().getDouble(0)
    assert(gRank === 0.0,
      s"User g (Gabby) doesn't connect with a. So its pagerank should be 0 but we got $gRank.")
  }
}
Example 68
Source File: ParallelPersonalizedPageRankSuite.scala From graphframes with Apache License 2.0 | 5 votes |
package org.graphframes.lib

import com.github.zafarkhaja.semver.Version

import org.apache.spark.ml.linalg.{SQLDataTypes, SparseVector}
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.DataTypes

import org.graphframes.examples.Graphs
import org.graphframes.{GraphFrameTestSparkContext, SparkFunSuite, TestUtils}

class ParallelPersonalizedPageRankSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  val n = 100

  test("Illegal function call argument setting") {
    val g = Graphs.star(n)
    val vertexIds: Array[Any] = Array(1L, 2L, 3L)

    // Not providing number of iterations
    intercept[IllegalArgumentException] {
      g.parallelPersonalizedPageRank.sourceIds(vertexIds).run()
    }
    // Not providing sourceIds
    intercept[IllegalArgumentException] {
      g.parallelPersonalizedPageRank.maxIter(15).run()
    }
    // Provided empty sourceIds
    intercept[IllegalArgumentException] {
      g.parallelPersonalizedPageRank.maxIter(15).sourceIds(Array()).run()
    }
  }

  test("Star example parallel personalized PageRank") {
    val g = Graphs.star(n)
    val resetProb = 0.15
    val maxIter = 10
    val vertexIds: Array[Any] = Array(1L, 2L, 3L)

    lazy val prc = g.parallelPersonalizedPageRank
      .maxIter(maxIter)
      .sourceIds(vertexIds)
      .resetProbability(resetProb)

    val pr = prc.run()
    TestUtils.testSchemaInvariants(g, pr)
    TestUtils.checkColumnType(pr.vertices.schema, "pageranks", SQLDataTypes.VectorType)
    TestUtils.checkColumnType(pr.edges.schema, "weight", DataTypes.DoubleType)
  }

  // In Spark <2.4, sourceIds must be smaller than Int.MaxValue,
  // which might not be the case for LONG_ID in graph.indexedVertices.
  if (Version.valueOf(org.apache.spark.SPARK_VERSION)
      .greaterThanOrEqualTo(Version.valueOf("2.4.0"))) {
    test("friends graph with parallel personalized PageRank") {
      val g = Graphs.friends
      val resetProb = 0.15
      val maxIter = 10
      val vertexIds: Array[Any] = Array("a")

      lazy val prc = g.parallelPersonalizedPageRank
        .maxIter(maxIter)
        .sourceIds(vertexIds)
        .resetProbability(resetProb)

      val pr = prc.run()

      val prInvalid = pr.vertices
        .select("pageranks")
        .collect()
        .filter { row: Row => vertexIds.size != row.getAs[SparseVector](0).size }
      assert(prInvalid.size === 0,
        s"found ${prInvalid.size} entries with invalid number of returned personalized pagerank vector")

      val gRank = pr.vertices
        .filter(col("id") === "g")
        .select("pageranks")
        .first().getAs[SparseVector](0)
      assert(gRank.numNonzeros === 0,
        s"User g (Gabby) doesn't connect with a. So its pagerank should be 0 but we got ${gRank.numNonzeros}.")
    }
  }
}
Example 69
Source File: ShortestPaths.scala From graphframes with Apache License 2.0 | 5 votes |
package org.graphframes.lib

import java.util

import scala.collection.JavaConverters._

import org.apache.spark.graphx.{lib => graphxlib}
import org.apache.spark.sql.{Column, DataFrame, Row}
import org.apache.spark.sql.api.java.UDF1
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{IntegerType, MapType}

import org.graphframes.GraphFrame

  def landmarks(value: util.ArrayList[Any]): this.type = {
    landmarks(value.asScala)
  }

  def run(): DataFrame = {
    ShortestPaths.run(graph, check(lmarks, "landmarks"))
  }
}

private object ShortestPaths {

  private def run(graph: GraphFrame, landmarks: Seq[Any]): DataFrame = {
    val idType = graph.vertices.schema(GraphFrame.ID).dataType
    val longIdToLandmark = landmarks.map(l => GraphXConversions.integralId(graph, l) -> l).toMap
    val gx = graphxlib.ShortestPaths.run(
      graph.cachedTopologyGraphX,
      longIdToLandmark.keys.toSeq.sorted).mapVertices { case (_, m) => m.toSeq }
    val g = GraphXConversions.fromGraphX(graph, gx, vertexNames = Seq(DISTANCE_ID))
    val distanceCol: Column = if (graph.hasIntegralIdType) {
      // There is no easy way to convert a sequence of pairs into a map directly,
      // so a UDF builds the map from the collected (landmark, distance) pairs.
      val mapToLandmark = udf { distances: Seq[Row] =>
        distances.map { case Row(k: Long, v: Int) => k -> v }.toMap
      }
      mapToLandmark(g.vertices(DISTANCE_ID))
    } else {
      val func = new UDF1[Seq[Row], Map[Any, Int]] {
        override def call(t1: Seq[Row]): Map[Any, Int] = {
          t1.map { case Row(k: Long, v: Int) => longIdToLandmark(k) -> v }.toMap
        }
      }
      val mapToLandmark = udf(func, MapType(idType, IntegerType, false))
      mapToLandmark(col(DISTANCE_ID))
    }
    val cols = graph.vertices.columns.map(col) :+ distanceCol.as(DISTANCE_ID)
    g.vertices.select(cols: _*)
  }

  private val DISTANCE_ID = "distances"
}
Example 70
Source File: TriangleCount.scala From graphframes with Apache License 2.0 | 5 votes |
package org.graphframes.lib

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{array, col, explode, when}

import org.graphframes.GraphFrame
import org.graphframes.GraphFrame.{DST, ID, LONG_DST, LONG_SRC, SRC}

class TriangleCount private[graphframes] (private val graph: GraphFrame) extends Arguments {

  def run(): DataFrame = {
    TriangleCount.run(graph)
  }
}

private object TriangleCount {

  private def run(graph: GraphFrame): DataFrame = {
    // Dedup edges by flipping them to have LONG_SRC < LONG_DST
    // TODO (when we drop support for Spark 1.4): Use functions greatest, smallest instead of UDFs
    val dedupedE = graph.indexedEdges
      .filter(s"$LONG_SRC != $LONG_DST")
      .selectExpr(
        s"if($LONG_SRC < $LONG_DST, $SRC, $DST) as $SRC",
        s"if($LONG_SRC < $LONG_DST, $DST, $SRC) as $DST")
      .dropDuplicates(Seq(SRC, DST))
    val g2 = GraphFrame(graph.vertices, dedupedE)

    // Because SRC < DST, there exists only one type of triangles:
    // - Non-cycle with one edge flipped. These are counted 1 time each by motif finding.
    val triangles = g2.find("(a)-[]->(b); (b)-[]->(c); (a)-[]->(c)")

    val triangleCounts = triangles
      .select(explode(array(col("a.id"), col("b.id"), col("c.id"))).as(ID))
      .groupBy(ID)
      .count()

    val v = graph.vertices
    val countsCol = when(col("count").isNull, 0L).otherwise(col("count"))
    val newV = v.join(triangleCounts, v(ID) === triangleCounts(ID), "left_outer")
      .select(countsCol.as(COUNT_ID) +: v.columns.map(v.apply): _*)
    newV
  }

  private val COUNT_ID = "count"
}
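The final join above relies on a null-to-zero pattern: vertices that never appear in triangleCounts come back from the left outer join with a null count, which when(col("count").isNull, 0L) coalesces to zero. A minimal sketch of just that pattern, with made-up vertices and counts:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, when}

object LeftJoinCountExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("left-join-count").getOrCreate()
    import spark.implicits._
    val vertices = Seq("a", "b", "c").toDF("id")
    val counts = Seq(("a", 2L)).toDF("id", "count")

    // After the left outer join, unmatched vertices have a null count; coalesce it to 0.
    val withCounts = vertices.join(counts, vertices("id") === counts("id"), "left_outer")
      .select(vertices("id"), when(col("count").isNull, 0L).otherwise(col("count")).as("count"))
    withCounts.show()  // a -> 2, b -> 0, c -> 0
    spark.stop()
  }
}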
Example 71
Source File: BaseSparkSpec.scala From gemini with GNU General Public License v3.0 | 5 votes |
package tech.sourced.gemini

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.col
import org.scalatest.{BeforeAndAfterAll, Suite}
import org.slf4j.{Logger => Slf4jLogger}

trait BaseSparkSpec extends BeforeAndAfterAll {
  this: Suite =>

  @transient var sparkSession: SparkSession = _
  private var _conf: SparkConf = _

  def useSparkConf(conf: SparkConf): SparkConf = {
    _conf = conf
    _conf
  }

  def useDefaultSparkConf(): SparkConf = {
    val defaultConf: SparkConf = new SparkConf(true)
      .setAppName(this.getClass.getSimpleName)
      .set("spark.cassandra.connection.host", Gemini.defaultCassandraHost)
      .set("spark.cassandra.connection.port", Gemini.defaultCassandraPort.toString)
      .set("spark.cassandra.connection.keep_alive_ms", "5000")
      .set("spark.cassandra.connection.timeout_ms", "30000")
      .set("spark.tech.sourced.bblfsh.grpc.host", "127.0.0.1")
      .set("spark.ui.showConsoleProgress", "false")
      .set("spark.ui.enabled", "false")
      .set("spark.cleaner.ttl", "3600")

    useSparkConf(defaultConf)
  }

  override protected def beforeAll(): Unit = {
    super.beforeAll()
    if (_conf == null) {
      useDefaultSparkConf()
    }
    sparkSession = SparkSession.builder()
      .master("local[*]")
      .config(_conf)
      .config("spark.driver.host", "localhost")
      .getOrCreate()
  }

  override protected def afterAll(): Unit = {
    // Commented out due to "Cannot call methods on a stopped SparkContext".
    // For tests we don't really need to stop Spark here:
    // it is stopped automatically when the tests exit.
    // resetSparkContext()

    // Make sure different suites don't use the same cache.
    cleanSparkCache()
    super.afterAll()
  }

  def resetSparkContext(): Unit = {
    if (sparkSession != null) {
      sparkSession.stop()
    }
    sparkSession = null
  }

  def cleanSparkCache(): Unit = {
    if (sparkSession != null) {
      sparkSession.sqlContext.clearCache()
    }
  }

  // Don't process all content of the repos, to speed up tests.
  class LimitedHash(s: SparkSession, log: Slf4jLogger, mode: String, filePaths: Seq[String])
    extends Hash(s, log, mode) {
    override def filesForRepos(repos: DataFrame): DataFrame =
      super.filesForRepos(repos).filter(col("path").isin(filePaths: _*))
  }

  object LimitedHash {
    def apply(s: SparkSession, log: Slf4jLogger, mode: String, paths: Seq[String]): LimitedHash =
      new LimitedHash(s, log, mode, paths)
  }
}
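LimitedHash narrows the files DataFrame with col("path").isin(filePaths: _*), the usual way to filter a column against an explicit whitelist of values. A small standalone sketch; the paths are illustrative and not gemini's actual schema:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object IsinFilterExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("isin-filter").getOrCreate()
    import spark.implicits._
    val files = Seq("README.md", "src/Main.scala", "build.sbt").toDF("path")
    val keep = Seq("README.md", "build.sbt")

    // isin takes a varargs whitelist; only rows whose path is in the list survive.
    files.filter(col("path").isin(keep: _*)).show()
    spark.stop()
  }
}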
Example 72
Source File: StructuredStreamingWordCount.scala From structured-streaming-application with Apache License 2.0 | 5 votes |
package knolx.spark

import com.datastax.driver.core.Cluster
import knolx.Config._
import knolx.KnolXLogger
import knolx.spark.CassandraForeachWriter.writeToCassandra
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StringType

object StructuredStreamingWordCount extends App with KnolXLogger {
  val cluster = Cluster.builder.addContactPoints(cassandraHosts).build
  val session = cluster.newSession()

  info("Creating keyspace and tables in Cassandra...")
  session.execute(s"CREATE KEYSPACE IF NOT EXISTS $keyspace WITH " +
    "replication = {'class':'SimpleStrategy','replication_factor':1};")
  session.execute(s"CREATE TABLE IF NOT EXISTS $keyspace.wordcount ( word text PRIMARY KEY, count int );")

  info("Closing DB connection...")
  session.close()
  session.getCluster.close()

  info("Creating Spark Session")
  val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  info("Creating Streaming DF...")
  val dataStream = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", bootstrapServer)
    .option("subscribe", topic)
    .load()

  info("Writing data to Cassandra...")
  val query = dataStream
    .select(col("value").cast(StringType).as("word"), lit(1).as("count"))
    .groupBy(col("word"))
    .agg(sum("count").as("count"))
    .writeStream
    .outputMode(OutputMode.Update())
    .foreach(writeToCassandra)
    .option("checkpointLocation", checkPointDir)
    .start()

  info("Waiting for the query to terminate...")
  query.awaitTermination()
  query.stop()
}