org.apache.spark.sql.functions.col Scala Examples
The following examples show how to use org.apache.spark.sql.functions.col.
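Before the project examples, here is a minimal, self-contained sketch of the basic pattern (not taken from any of the projects below; the SparkSession setup, column names, and data are illustrative assumptions): col turns a column name into a Column object that can be used in select, filter, ordering, and expressions.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

val spark = SparkSession.builder().master("local[*]").appName("col-basics").getOrCreate()
import spark.implicits._

val people = Seq(("alice", 34), ("bob", 19)).toDF("name", "age")

people
  .select(col("name"), (col("age") + 1).as("age_next_year")) // col builds a Column; Columns support arithmetic and aliases
  .filter(col("age_next_year") > 21)                         // and comparisons, usable as filter predicates
  .orderBy(col("name"))
  .show()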
Example 1
Source File: SparkBindingsTest.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.schema

import com.microsoft.ml.spark.core.test.base.TestBase
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.{col, udf}

case class Foo(a: Int, b: String, c: Seq[Bar])

object Foo extends SparkBindings[Foo]

case class Bar(a: Int, c: Seq[Byte])

object Bar extends SparkBindings[Bar]

class SparkBindingsTest2 extends TestBase {

  import session.implicits._

  test("Test to make sure there are no strange memory leaks") {
    (1 to 40).foreach { i =>
      val foos = (0 to 40).map(i =>
        Tuple1(Foo(i, i.toString, Seq(Bar(i, "foo".getBytes)))))
      val converter = Foo.makeFromRowConverter
      val df = foos.toDF("foos")
        .repartition(2)
        .withColumn("mapped2",
          udf({ r: Row => converter(r) }, Foo.schema)(col("foos")))
      val results = df.collect().toList
      println(results.head)
    }
  }

}
Example 2
Source File: HashingTF.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
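The file above is the HashingTF transformer itself; its transform method uses dataset.select(col("*"), ...) to append the hashed output column while keeping every existing column. As a rough usage sketch (not part of the quoted source; the data and column names are hypothetical), the transformer can be applied to a tokenized column like this:

import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("hashing-tf-usage").getOrCreate()
import spark.implicits._

val sentences = Seq(
  Seq("spark", "sql", "functions"),
  Seq("hashing", "tf", "example")
).toDF("words")                 // input must be an ArrayType column, as transformSchema requires

val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("features")
  .setNumFeatures(1 << 10)

// Internally this is the select(col("*"), udf(...)) shown above:
// the "features" column is appended and all existing columns are kept.
hashingTF.transform(sentences).show(false)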
Example 3
Source File: VectorAssemblerSuite.scala From iolap with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row import org.apache.spark.sql.functions.col class VectorAssemblerSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new VectorAssembler) } test("assemble") { import org.apache.spark.ml.feature.VectorAssembler.assemble assert(assemble(0.0) === Vectors.sparse(1, Array.empty, Array.empty)) assert(assemble(0.0, 1.0) === Vectors.sparse(2, Array(1), Array(1.0))) val dv = Vectors.dense(2.0, 0.0) assert(assemble(0.0, dv, 1.0) === Vectors.sparse(4, Array(1, 3), Array(2.0, 1.0))) val sv = Vectors.sparse(2, Array(0, 1), Array(3.0, 4.0)) assert(assemble(0.0, dv, 1.0, sv) === Vectors.sparse(6, Array(1, 3, 4, 5), Array(2.0, 1.0, 3.0, 4.0))) for (v <- Seq(1, "a", null)) { intercept[SparkException](assemble(v)) intercept[SparkException](assemble(1.0, v)) } } test("assemble should compress vectors") { import org.apache.spark.ml.feature.VectorAssembler.assemble val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0)) assert(v1.isInstanceOf[SparseVector]) val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0))) assert(v2.isInstanceOf[DenseVector]) } test("VectorAssembler") { val df = sqlContext.createDataFrame(Seq( (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L) )).toDF("id", "x", "y", "name", "z", "n") val assembler = new VectorAssembler() .setInputCols(Array("x", "y", "z", "n")) .setOutputCol("features") assembler.transform(df).select("features").collect().foreach { case Row(v: Vector) => assert(v === Vectors.sparse(6, Array(1, 2, 4, 5), Array(1.0, 2.0, 3.0, 10.0))) } } test("ML attributes") { val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", "safari") val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0) val user = new AttributeGroup("user", Array( NominalAttribute.defaultAttr.withName("gender").withValues("male", "female"), NumericAttribute.defaultAttr.withName("salary"))) val row = (1.0, 0.5, 1, Vectors.dense(1.0, 1000.0), Vectors.sparse(2, Array(1), Array(2.0))) val df = sqlContext.createDataFrame(Seq(row)).toDF("browser", "hour", "count", "user", "ad") .select( col("browser").as("browser", browser.toMetadata()), col("hour").as("hour", hour.toMetadata()), col("count"), // "count" is an integer column without ML attribute col("user").as("user", user.toMetadata()), col("ad")) // "ad" is a vector column without ML attribute val assembler = new VectorAssembler() .setInputCols(Array("browser", "hour", "count", "user", "ad")) .setOutputCol("features") val output = assembler.transform(df) val schema = output.schema val features = AttributeGroup.fromStructField(schema("features")) assert(features.size === 7) val browserOut = features.getAttr(0) assert(browserOut === browser.withIndex(0).withName("browser")) val hourOut = features.getAttr(1) assert(hourOut === hour.withIndex(1).withName("hour")) val countOut = features.getAttr(2) assert(countOut === NumericAttribute.defaultAttr.withName("count").withIndex(2)) val userGenderOut = features.getAttr(3) assert(userGenderOut === user.getAttr("gender").withName("user_gender").withIndex(3)) val userSalaryOut = features.getAttr(4) 
assert(userSalaryOut === user.getAttr("salary").withName("user_salary").withIndex(4)) assert(features.getAttr(5) === NumericAttribute.defaultAttr.withIndex(5)) assert(features.getAttr(6) === NumericAttribute.defaultAttr.withIndex(6)) } }
Example 4
Source File: OneHotEncoderSuite.scala From iolap with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext { def stringIndexed(): DataFrame = { val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) val df = sqlContext.createDataFrame(data).toDF("id", "label") val indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex") .fit(df) indexer.transform(df) } test("params") { ParamsSuite.checkParams(new OneHotEncoder) } test("OneHotEncoder dropLast = false") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") .setDropLast(false) val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1), vec(2)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0), (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0)) assert(output === expected) } test("OneHotEncoder dropLast = true") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0), (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0)) assert(output === expected) } test("input column with ML attribute") { val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large") val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("size") .select(col("size").as("size", attr.toMetadata())) val encoder = new OneHotEncoder() .setInputCol("size") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("size_is_small").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("size_is_medium").withIndex(1)) } test("input column without ML attribute") { val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("index") val encoder = new OneHotEncoder() .setInputCol("index") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("index_is_0").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("index_is_1").withIndex(1)) } }
Example 5
Source File: HashingTF.scala From iolap with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
Example 6
Source File: WholeStageCodegenSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution import org.apache.spark.sql.Row import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions.{avg, broadcast, col, max} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{IntegerType, StringType, StructType} class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext { test("range/filter should be combined") { val df = spark.range(10).filter("id = 1").selectExpr("id + 1") val plan = df.queryExecution.executedPlan assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined) assert(df.collect() === Array(Row(2))) } test("Aggregate should be included in WholeStageCodegen") { val df = spark.range(10).groupBy().agg(max(col("id")), avg(col("id"))) val plan = df.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) assert(df.collect() === Array(Row(9, 4.5))) } test("Aggregate with grouping keys should be included in WholeStageCodegen") { val df = spark.range(3).groupBy("id").count().orderBy("id") val plan = df.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) assert(df.collect() === Array(Row(0, 1), Row(1, 1), Row(2, 1))) } test("BroadcastHashJoin should be included in WholeStageCodegen") { val rdd = spark.sparkContext.makeRDD(Seq(Row(1, "1"), Row(1, "1"), Row(2, "2"))) val schema = new StructType().add("k", IntegerType).add("v", StringType) val smallDF = spark.createDataFrame(rdd, schema) val df = spark.range(10).join(broadcast(smallDF), col("k") === col("id")) assert(df.queryExecution.executedPlan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[BroadcastHashJoinExec]).isDefined) assert(df.collect() === Array(Row(1, 1, "1"), Row(1, 1, "1"), Row(2, 2, "2"))) } test("Sort should be included in WholeStageCodegen") { val df = spark.range(3, 0, -1).toDF().sort(col("id")) val plan = df.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SortExec]).isDefined) assert(df.collect() === Array(Row(1), Row(2), Row(3))) } test("MapElements should be included in WholeStageCodegen") { import testImplicits._ val ds = spark.range(10).map(_.toString) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SerializeFromObjectExec]).isDefined) assert(ds.collect() === 0.until(10).map(_.toString).toArray) } test("typed filter should be included in WholeStageCodegen") { val ds = spark.range(10).filter(_ % 2 == 0) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined) assert(ds.collect() === Array(0, 2, 4, 6, 8)) } test("back-to-back typed filter should be included in WholeStageCodegen") { val ds = spark.range(10).filter(_ % 2 == 0).filter(_ % 3 == 0) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined) 
assert(ds.collect() === Array(0, 6)) } test("simple typed UDAF should be included in WholeStageCodegen") { import testImplicits._ val ds = Seq(("a", 10), ("b", 1), ("b", 2), ("c", 1)).toDS() .groupByKey(_._1).agg(typed.sum(_._2)) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0))) } }
Example 7
Source File: HashingTF.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
Example 8
Source File: ParserSuite.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.io.split1 import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import com.microsoft.ml.spark.io.http._ import org.apache.http.client.methods.HttpPost import org.apache.spark.ml.Transformer import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, SparkSession} trait ParserUtils extends WithServer { def sampleDf(spark: SparkSession): DataFrame = { val df = spark.createDataFrame((1 to 10).map(Tuple1(_))) .toDF("data") val df2 = new JSONInputParser().setInputCol("data") .setOutputCol("parsedInput").setUrl(url) .transform(df) .withColumn("unparsedOutput", udf({ x: Int => HTTPResponseData( Array(), Some(EntityData( "{\"foo\": \"here\"}".getBytes, None, None, None, false, false, false)), StatusLineData(ProtocolVersionData("foo", 1, 1), 200, "bar"), "en") }).apply(col("data")) ) new JSONOutputParser() .setDataType(new StructType().add("foo", StringType)) .setInputCol("unparsedOutput") .setOutputCol("parsedOutput") .transform(df2) } def makeTestObject[T <: Transformer](t: T, session: SparkSession): Seq[TestObject[T]] = { Seq(new TestObject(t, sampleDf(session))) } } class JsonInputParserSuite extends TransformerFuzzing[JSONInputParser] with ParserUtils { override def testObjects(): Seq[TestObject[JSONInputParser]] = makeTestObject( new JSONInputParser().setInputCol("data").setOutputCol("out") .setUrl(url), session) override def reader: MLReadable[_] = JSONInputParser } class JsonOutputParserSuite extends TransformerFuzzing[JSONOutputParser] with ParserUtils { override def testObjects(): Seq[TestObject[JSONOutputParser]] = makeTestObject( new JSONOutputParser().setInputCol("unparsedOutput").setOutputCol("out") .setDataType(new StructType().add("foo", StringType)), session) override def reader: MLReadable[_] = JSONOutputParser } class StringOutputParserSuite extends TransformerFuzzing[StringOutputParser] with ParserUtils { override def testObjects(): Seq[TestObject[StringOutputParser]] = makeTestObject( new StringOutputParser().setInputCol("unparsedOutput").setOutputCol("out"), session) override def reader: MLReadable[_] = StringOutputParser } class CustomInputParserSuite extends TransformerFuzzing[CustomInputParser] with ParserUtils { override def testObjects(): Seq[TestObject[CustomInputParser]] = makeTestObject( new CustomInputParser().setInputCol("data").setOutputCol("out") .setUDF({ x: Int => new HttpPost(s"http://$x") }), session) override def reader: MLReadable[_] = CustomInputParser } class CustomOutputParserSuite extends TransformerFuzzing[CustomOutputParser] with ParserUtils { override def testObjects(): Seq[TestObject[CustomOutputParser]] = makeTestObject( new CustomOutputParser().setInputCol("unparsedOutput").setOutputCol("out") .setUDF({ x: HTTPResponseData => x.locale }), session) override def reader: MLReadable[_] = CustomOutputParser }
Example 9
Source File: HTTPSuite.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.io.split2

import java.io.File

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.io.http.HTTPSchema.string_to_response
import org.apache.http.impl.client.HttpClientBuilder
import org.apache.spark.sql.execution.streaming.{HTTPSinkProvider, HTTPSourceProvider}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StringType

class HTTPSuite extends TestBase with HTTPTestUtils {

  test("stream from HTTP", TestBase.Extended) {
    val q1 = session.readStream.format(classOf[HTTPSourceProvider].getName)
      .option("host", host)
      .option("port", port.toString)
      .option("path", apiPath)
      .load()
      .withColumn("contentLength", col("request.entity.contentLength"))
      .withColumn("reply", string_to_response(col("contentLength").cast(StringType)))
      .writeStream
      .format(classOf[HTTPSinkProvider].getName)
      .option("name", "foo")
      .queryName("foo")
      .option("replyCol", "reply")
      .option("checkpointLocation", new File(tmpDir.toFile, "checkpoints").toString)
      .start()

    Thread.sleep(5000)
    val client = HttpClientBuilder.create().build()
    val p1 = sendJsonRequest(client, Map("foo" -> 1, "bar" -> "here"), url)
    val p2 = sendJsonRequest(client, Map("foo" -> 1, "bar" -> "heree"), url)
    val p3 = sendJsonRequest(client, Map("foo" -> 1, "bar" -> "hereee"), url)
    val p4 = sendJsonRequest(client, Map("foo" -> 1, "bar" -> "hereeee"), url)
    val posts = List(p1, p2, p3, p4)
    val correctResponses = List(27, 28, 29, 30)
    posts.zip(correctResponses).foreach { p =>
      assert(p._1 === p._2.toString)
    }
    q1.stop()
    client.close()
  }
}
Example 10
Source File: ColumnCondition.scala From deequ with Apache License 2.0
package com.amazon.deequ.checks

import org.apache.spark.sql.functions.col

private[checks] object ColumnCondition {

  def isEachNotNull(cols: Seq[String]): String = {
    cols
      .map(col(_).isNotNull)
      .reduce(_ and _)
      .toString()
  }

  def isAnyNotNull(cols: Seq[String]): String = {
    cols
      .map(col(_).isNotNull)
      .reduce(_ or _)
      .toString()
  }
}
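The two helpers fold a per-column col(_).isNotNull check into a single condition with and/or and then render it as a string. Since ColumnCondition is private[checks], the sketch below applies the same fold directly to a DataFrame and keeps the result as a Column; the data and column names are illustrative assumptions.

import org.apache.spark.sql.{Column, SparkSession}
import org.apache.spark.sql.functions.col

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val orders = Seq(
  (Some(1), Some("a"), Some(10.0)),
  (Some(2), None, Some(5.0)),
  (None, None, None)
).toDF("customer_id", "order_id", "amount")

val required = Seq("customer_id", "order_id", "amount")

// Same fold as isEachNotNull / isAnyNotNull, but kept as Columns instead of strings.
val eachNotNull: Column = required.map(col(_).isNotNull).reduce(_ and _)
val anyNotNull: Column  = required.map(col(_).isNotNull).reduce(_ or _)

orders.filter(eachNotNull).show() // only rows where every required field is present
orders.filter(anyNotNull).show()  // rows where at least one field is present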
Example 11
Source File: VowpalWabbitInteractions.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.vw import com.microsoft.ml.spark.core.contracts.{HasInputCols, HasOutputCol, Wrappable} import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.{col, struct, udf} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.ml.linalg.SQLDataTypes.VectorType object VowpalWabbitInteractions extends ComplexParamsReadable[VowpalWabbitInteractions] class VowpalWabbitInteractions(override val uid: String) extends Transformer with HasInputCols with HasOutputCol with HasNumBits with HasSumCollisions with Wrappable with ComplexParamsWritable { def this() = this(Identifiable.randomUID("VowpalWabbitInteractions")) override def transform(dataset: Dataset[_]): DataFrame = { val fieldSubset = dataset.schema.fields .filter(f => getInputCols.contains(f.name)) val mask = getMask val mode = udf((r: Row) => { // compute the final number of features val numElems = (0 until r.length) .map(r.getAs[Vector](_).numNonzeros).product val newIndices = new Array[Int](numElems) val newValues = new Array[Double](numElems) // build interaction features using FNV-1 val fnvPrime = 16777619 var i = 0 def interact(idx: Int, value: Double, ns: Int): Unit = { if (ns == r.size) { newIndices(i) += mask & idx newValues(i) += value i += 1 } else { val idx1 = idx * fnvPrime r.getAs[Vector](ns).foreachActive { case (idx2, value2) => interact(idx1 ^ idx2, value * value2, ns + 1) } } } // start the recursion interact(0, 1, 0) val (indicesSorted, valuesSorted) = VectorUtils.sortAndDistinct(newIndices, newValues, getSumCollisions) Vectors.sparse(1 << getNumBits, indicesSorted, valuesSorted) }) dataset.toDF.withColumn(getOutputCol, mode.apply(struct(fieldSubset.map(f => col(f.name)): _*))) } override def transformSchema(schema: StructType): StructType = { val fieldNames = schema.fields.map(_.name) for (f <- getInputCols) if (!fieldNames.contains(f)) throw new IllegalArgumentException("missing input column " + f) else { val fieldType = schema.fields(schema.fieldIndex(f)).dataType if (fieldType != VectorType) throw new IllegalArgumentException("column " + f + " must be of type Vector but is " + fieldType.typeName) } schema.add(StructField(getOutputCol, VectorType, true)) } override def copy(extra: ParamMap): VowpalWabbitFeaturizer = defaultCopy(extra) }
Example 12
Source File: VowpalWabbitRegressor.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.env.InternalWrapper
import com.microsoft.ml.spark.core.serialize.ConstructorReadable
import org.apache.spark.ml.{BaseRegressor, ComplexParamsReadable, ComplexParamsWritable}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql._
import org.apache.spark.sql.functions.col
import org.apache.spark.ml.regression.RegressionModel

object VowpalWabbitRegressor extends DefaultParamsReadable[VowpalWabbitRegressor]

@InternalWrapper
class VowpalWabbitRegressor(override val uid: String)
  extends BaseRegressor[Row, VowpalWabbitRegressor, VowpalWabbitRegressionModel]
    with VowpalWabbitBase {

  def this() = this(Identifiable.randomUID("VowpalWabbitRegressor"))

  override def train(dataset: Dataset[_]): VowpalWabbitRegressionModel = {
    val model = new VowpalWabbitRegressionModel(uid)
      .setFeaturesCol(getFeaturesCol)
      .setAdditionalFeatures(getAdditionalFeatures)
      .setPredictionCol(getPredictionCol)

    trainInternal(dataset, model)
  }

  override def copy(extra: ParamMap): VowpalWabbitRegressor = defaultCopy(extra)
}

@InternalWrapper
class VowpalWabbitRegressionModel(override val uid: String)
  extends RegressionModel[Row, VowpalWabbitRegressionModel]
    with VowpalWabbitBaseModel with ComplexParamsWritable {

  protected override def transformImpl(dataset: Dataset[_]): DataFrame = {
    transformImplInternal(dataset)
      .withColumn($(predictionCol), col($(rawPredictionCol)))
  }

  override def predict(features: Row): Double = {
    throw new NotImplementedError("Not implement")
  }

  override def copy(extra: ParamMap): this.type = defaultCopy(extra)
}

object VowpalWabbitRegressionModel extends ComplexParamsReadable[VowpalWabbitRegressionModel]
Example 13
Source File: VowpalWabbitClassifier.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.vw import com.microsoft.ml.spark.core.env.InternalWrapper import com.microsoft.ml.spark.core.schema.DatasetExtensions import org.apache.spark.ml.ComplexParamsReadable import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.ml.classification.{ProbabilisticClassificationModel, ProbabilisticClassifier} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.sql._ import org.apache.spark.sql.functions.{col, udf} import org.vowpalwabbit.spark.VowpalWabbitExample import com.microsoft.ml.spark.core.schema.DatasetExtensions._ import scala.math.exp object VowpalWabbitClassifier extends DefaultParamsReadable[VowpalWabbitClassifier] @InternalWrapper class VowpalWabbitClassifier(override val uid: String) extends ProbabilisticClassifier[Row, VowpalWabbitClassifier, VowpalWabbitClassificationModel] with VowpalWabbitBase { def this() = this(Identifiable.randomUID("VowpalWabbitClassifier")) // to support Grid search we need to replicate the parameters here... val labelConversion = new BooleanParam(this, "labelConversion", "Convert 0/1 Spark ML style labels to -1/1 VW style labels. Defaults to true.") setDefault(labelConversion -> true) def getLabelConversion: Boolean = $(labelConversion) def setLabelConversion(value: Boolean): this.type = set(labelConversion, value) override protected def train(dataset: Dataset[_]): VowpalWabbitClassificationModel = { val model = new VowpalWabbitClassificationModel(uid) .setFeaturesCol(getFeaturesCol) .setAdditionalFeatures(getAdditionalFeatures) .setPredictionCol(getPredictionCol) .setProbabilityCol(getProbabilityCol) .setRawPredictionCol(getRawPredictionCol) val finalDataset = if (!getLabelConversion) dataset else { val inputLabelCol = dataset.withDerivativeCol("label") dataset .withColumnRenamed(getLabelCol, inputLabelCol) .withColumn(getLabelCol, col(inputLabelCol) * 2 - 1) } trainInternal(finalDataset, model) } override def copy(extra: ParamMap): VowpalWabbitClassifier = defaultCopy(extra) } // Preparation for multi-class learning, though it no fun as numClasses is spread around multiple reductions @InternalWrapper class VowpalWabbitClassificationModel(override val uid: String) extends ProbabilisticClassificationModel[Row, VowpalWabbitClassificationModel] with VowpalWabbitBaseModel { def numClasses: Int = 2 override def transform(dataset: Dataset[_]): DataFrame = { val df = transformImplInternal(dataset) // which mode one wants to use depends a bit on how this should be deployed // 1. if you stay in spark w/o link=logistic is probably more convenient as it also returns the raw prediction // 2. 
if you want to export the model *and* get probabilities at scoring term w/ link=logistic is preferable // convert raw prediction to probability (if needed) val probabilityUdf = if (vwArgs.getArgs.contains("--link logistic")) udf { (pred: Double) => Vectors.dense(Array(1 - pred, pred)) } else udf { (pred: Double) => { val prob = 1.0 / (1.0 + exp(-pred)) Vectors.dense(Array(1 - prob, prob)) } } val df2 = df.withColumn($(probabilityCol), probabilityUdf(col($(rawPredictionCol)))) // convert probability to prediction val probability2predictionUdf = udf(probability2prediction _) df2.withColumn($(predictionCol), probability2predictionUdf(col($(probabilityCol)))) } override def copy(extra: ParamMap): this.type = defaultCopy(extra) protected override def predictRaw(features: Row): Vector = { throw new NotImplementedError("Not implemented") } protected override def raw2probabilityInPlace(rawPrediction: Vector): Vector= { throw new NotImplementedError("Not implemented") } } object VowpalWabbitClassificationModel extends ComplexParamsReadable[VowpalWabbitClassificationModel]
Example 14
Source File: udfs.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Column
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.DoubleType

import scala.collection.mutable

//scalastyle:off
object udfs {

  def get_value_at(colName: String, i: Int): Column = {
    udf({ vec: org.apache.spark.ml.linalg.Vector => vec(i) }, DoubleType)(col(colName))
  }

  val to_vector: UserDefinedFunction = udf({ arr: Seq[Double] =>
    Vectors.dense(arr.toArray)
  }, VectorType)

  def to_vector(colName: String): Column = to_vector(col(colName))
}
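A hedged usage sketch for the helpers above; the DataFrame contents and column names are hypothetical:

import com.microsoft.ml.spark.stages.udfs
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq(
  (Vectors.dense(1.0, 2.0), Seq(0.5, 0.7)),
  (Vectors.dense(3.0, 4.0), Seq(0.1, 0.9))
).toDF("vec", "arr")

df.select(
  udfs.get_value_at("vec", 0).as("first_component"), // element 0 of the Vector column, as DoubleType
  udfs.to_vector("arr").as("arr_as_vector")          // the double array wrapped into a dense Vector
).show(false)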
Example 15
Source File: UDFTransformer.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasInputCols, HasOutputCol, Wrappable}
import com.microsoft.ml.spark.core.env.InternalWrapper
import com.microsoft.ml.spark.core.serialize.ComplexParam
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.{ParamMap, UDFParam, UDPyFParam}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.execution.python.UserDefinedPythonFunction
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.types.{DataType, StructField, StructType}
import org.apache.spark.sql.{Column, DataFrame, Dataset}
import org.apache.spark.sql.functions.col

object UDFTransformer extends ComplexParamsReadable[UDFTransformer]

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    if (isSet(inputCol)) {
      dataset.withColumn(getOutputCol, applyUDF(dataset.col(getInputCol)))
    } else {
      dataset.withColumn(getOutputCol, applyUDFOnCols(getInputCols.map(col): _*))
    }
  }

  def validateAndTransformSchema(schema: StructType): StructType = {
    if (isSet(inputCol)) schema(getInputCol) else schema(Set(getInputCols: _*))
    schema.add(StructField(getOutputCol, getDataType))
  }

  def transformSchema(schema: StructType): StructType = validateAndTransformSchema(schema)

  def copy(extra: ParamMap): UDFTransformer = defaultCopy(extra)
}
Example 16
Source File: PageSplitter.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.featurize.text import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol, Wrappable} import org.apache.spark.ml._ import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} object PageSplitter extends DefaultParamsReadable[PageSplitter] class PageSplitter(override val uid: String) extends Transformer with HasInputCol with HasOutputCol with Wrappable with DefaultParamsWritable { def this() = this(Identifiable.randomUID("PageSplitter")) setDefault(outputCol, uid + "_output") val maximumPageLength = new IntParam(this, "maximumPageLength", "the maximum number of characters to be in a page") def setMaximumPageLength(v: Int): this.type = set(maximumPageLength, v) def getMaximumPageLength: Int = $(maximumPageLength) val minimumPageLength = new IntParam(this, "minimumPageLength", "the the minimum number of characters " + "to have on a page in order to preserve work boundaries") def setMinimumPageLength(v: Int): this.type = set(minimumPageLength, v) def getMinimumPageLength: Int = $(minimumPageLength) val boundaryRegex = new Param[String](this, "boundaryRegex", "how to split into words") def setBoundaryRegex(v: String): this.type = set(boundaryRegex, v) def getBoundaryRegex: String = $(boundaryRegex) setDefault(maximumPageLength -> 5000, minimumPageLength -> 4500, boundaryRegex -> "\\s") def split(textOpt: String): Seq[String] = { Option(textOpt).map { text => if (text.length < getMaximumPageLength) { Seq(text) } else { val lengths = text .split(getBoundaryRegex) .map(_.length) .flatMap(l => List(l, 1)) .dropRight(1) val indicies = lengths.scanLeft((0, 0, Nil: List[Int])) { case ((total, count, _), l) => if (count + l < getMaximumPageLength) { (total + l, count + l, Nil) } else if (count > getMinimumPageLength) { (total + l, l, List(total)) } else { val firstPageChars = getMaximumPageLength - count val firstPage = firstPageChars + total val remainingChars = l - firstPageChars val numPages = remainingChars / getMaximumPageLength val remainder = remainingChars - getMaximumPageLength * numPages val pages = List(firstPage) ::: (1 to numPages).map(i => total + firstPageChars + getMaximumPageLength * i).toList (total + l, remainder, pages) } }.flatMap(_._3) val words = (List(0) ::: indicies.toList ::: List(text.length)) .sliding(2) .map { case List(start, end) => text.substring(start, end) } .toSeq words } }.orNull } override def transform(dataset: Dataset[_]): DataFrame = { dataset.toDF().withColumn(getOutputCol, udf(split _, ArrayType(StringType))(col(getInputCol))) } override def copy(extra: ParamMap): MultiNGram = defaultCopy(extra) def transformSchema(schema: StructType): StructType = { assert(schema(getInputCol).dataType == StringType) schema.add(getOutputCol, ArrayType(StringType)) } }
Example 17
Source File: FunctionsTestSpec.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.annotator.{PerceptronApproach, Tokenizer} import com.johnsnowlabs.nlp.training.POS import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper} import org.apache.spark.ml.Pipeline import org.apache.spark.sql.types.ArrayType import org.scalatest._ class FunctionsTestSpec extends FlatSpec { "functions in functions" should "work successfully" in { import com.johnsnowlabs.nlp.util.io.ResourceHelper.spark.implicits._ val trainingPerceptronDF = POS().readDataset(ResourceHelper.spark, "src/test/resources/anc-pos-corpus-small/", "|", "tags") val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val tokenizer = new Tokenizer() .setInputCols(Array("document")) .setOutputCol("token") val pos = new PerceptronApproach() .setInputCols("document", "token") .setOutputCol("pos") .setPosColumn("tags") .setNIterations(3) val pipeline = new Pipeline() .setStages(Array( documentAssembler, tokenizer, pos )) val model = pipeline.fit(trainingPerceptronDF) val data = model.transform(Seq("Peter is a very good and compromised person.").toDF("text")) import functions._ val mapped = data.mapAnnotationsCol("pos", "modpos", (annotations: Seq[Annotation]) => { annotations.filter(_.result == "JJ") }) val modified = data.mapAnnotationsCol("pos", "modpos", (_: Seq[Annotation]) => { "hello world" }) val filtered = data.filterByAnnotationsCol("pos", (annotations: Seq[Annotation]) => { annotations.exists(_.result == "JJ") }) import org.apache.spark.sql.functions.col val udfed = data.select(mapAnnotations((annotations: Seq[Annotation]) => { annotations.filter(_.result == "JJ") }, ArrayType(Annotation.dataType))(col("pos"))) val udfed2 = data.select(mapAnnotationsStrict((annotations: Seq[Annotation]) => { annotations.filter(_.result == "JJ") })(col("pos"))) mapped.show(truncate = false) modified.show(truncate = false) filtered.show(truncate = false) udfed.show(truncate = false) udfed2.show(truncate = false) } }
Example 18
Source File: functions.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions.{array, col, explode, udf} import org.apache.spark.sql.types.DataType import scala.reflect.runtime.universe._ object functions { implicit class FilterAnnotations(dataset: DataFrame) { def filterByAnnotationsCol(column: String, function: Seq[Annotation] => Boolean): DataFrame = { val meta = dataset.schema(column).metadata val func = udf { annotatorProperties: Seq[Row] => function(annotatorProperties.map(Annotation(_))) } dataset.filter(func(col(column)).as(column, meta)) } } def mapAnnotations[T](function: Seq[Annotation] => T, outputType: DataType): UserDefinedFunction = udf ( { annotatorProperties: Seq[Row] => function(annotatorProperties.map(Annotation(_))) }, outputType) def mapAnnotationsStrict(function: Seq[Annotation] => Seq[Annotation]): UserDefinedFunction = udf { annotatorProperties: Seq[Row] => function(annotatorProperties.map(Annotation(_))) } implicit class MapAnnotations(dataset: DataFrame) { def mapAnnotationsCol[T: TypeTag](column: String, outputCol: String, function: Seq[Annotation] => T): DataFrame = { val meta = dataset.schema(column).metadata val func = udf { annotatorProperties: Seq[Row] => function(annotatorProperties.map(Annotation(_))) } dataset.withColumn(outputCol, func(col(column)).as(outputCol, meta)) } } implicit class EachAnnotations(dataset: DataFrame) { import dataset.sparkSession.implicits._ def eachAnnotationsCol[T: TypeTag](column: String, function: Seq[Annotation] => Unit): Unit = { dataset.select(column).as[Array[Annotation]].foreach(function(_)) } } implicit class ExplodeAnnotations(dataset: DataFrame) { def explodeAnnotationsCol[T: TypeTag](column: String, outputCol: String): DataFrame = { val meta = dataset.schema(column).metadata dataset. withColumn(outputCol, explode(col(column))). withColumn(outputCol, array(col(outputCol)).as(outputCol, meta)) } } }
Example 19
Source File: ChiSquareTest.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.stat

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.util.SchemaUtils
import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
import org.apache.spark.mllib.stat.{Statistics => OldStatistics}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

  @Since("2.2.0")
  def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = {
    val spark = dataset.sparkSession
    import spark.implicits._

    SchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT)
    SchemaUtils.checkNumericType(dataset.schema, labelCol)
    val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol))
      .as[(Double, Vector)]
      .rdd.map { case (label, features) => OldLabeledPoint(label, OldVectors.fromML(features)) }
    val testResults = OldStatistics.chiSqTest(rdd)
    val pValues: Vector = Vectors.dense(testResults.map(_.pValue))
    val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom)
    val statistics: Vector = Vectors.dense(testResults.map(_.statistic))
    spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics)))
  }
}
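The test method above wraps the old MLlib chi-squared statistics in a DataFrame API. A short usage sketch follows; the labels and feature values are made up for illustration.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.stat.ChiSquareTest
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val data = Seq(
  (0.0, Vectors.dense(0.5, 10.0)),
  (0.0, Vectors.dense(1.5, 20.0)),
  (1.0, Vectors.dense(1.5, 30.0)),
  (1.0, Vectors.dense(3.5, 40.0))
).toDF("label", "features")

// Returns a single-row DataFrame with pValues, degreesOfFreedom and statistics,
// one entry per feature, matching the ChiSquareResult case class above.
ChiSquareTest.test(data, "features", "label").show(false)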
Example 20
Source File: ChiSquareTest.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.stat import com.tencent.angel.sona.ml.feature.LabeledPoint import org.apache.spark.linalg import org.apache.spark.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.util.SONASchemaUtils import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col /** * :: Experimental :: * * Chi-square hypothesis testing for categorical data. * * See <a href="http://en.wikipedia.org/wiki/Chi-squared_test">Wikipedia</a> for more information * on the Chi-squared test. */ object ChiSquareTest { private case class ChiSquareResult( pValues: linalg.Vector, degreesOfFreedom: Array[Int], statistics: linalg.Vector) /** * Conduct Pearson's independence test for every feature against the label. For each feature, the * (feature, label) pairs are converted into a contingency matrix for which the Chi-squared * statistic is computed. All label and feature values must be categorical. * * The null hypothesis is that the occurrence of the outcomes is statistically independent. * * @param dataset DataFrame of categorical labels and categorical features. * Real-valued features will be treated as categorical for each distinct value. * @param featuresCol Name of features column in dataset, of type `Vector` (`VectorUDT`) * @param labelCol Name of label column in dataset, of any numerical type * @return DataFrame containing the test result for every feature against the label. * This DataFrame will contain a single Row with the following fields: * - `pValues: Vector` * - `degreesOfFreedom: Array[Int]` * - `statistics: Vector` * Each of these fields has one value per feature. */ def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = { val spark = dataset.sparkSession import spark.implicits._ SONASchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT) SONASchemaUtils.checkNumericType(dataset.schema, labelCol) val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol)).as[(Double, linalg.Vector)] .rdd.map { case (label, features) => LabeledPoint(label, features) } val testResults = Statistics.chiSqTest(rdd) val pValues: linalg.Vector = Vectors.dense(testResults.map(_.pValue)) val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom) val statistics: linalg.Vector = Vectors.dense(testResults.map(_.statistic)) spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics))) } }
Example 21
Source File: DatasetUtil.scala From sona with Apache License 2.0
package org.apache.spark.util import org.apache.spark.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata} import org.apache.spark.sql.{Column, DataFrame, Dataset} object DatasetUtil { def withColumns[T](ds: Dataset[T], colNames: Seq[String], cols: Seq[Column], metadata: Seq[Metadata]): DataFrame = { require(colNames.size == cols.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of columns: ${cols.size}") require(colNames.size == metadata.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of metadata elements: ${metadata.size}") val sparkSession = ds.sparkSession val queryExecution = ds.queryExecution val resolver = sparkSession.sessionState.analyzer.resolver val output = queryExecution.analyzed.output checkColumnNameDuplication(colNames, "in given column names", sparkSession.sessionState.conf.caseSensitiveAnalysis) val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) => colName -> col.as(colName, metadata) }.toMap val replacedAndExistingColumns = output.map { field => columnMap.find { case (colName, _) => resolver(field.name, colName) } match { case Some((colName: String, col: Column)) => col.as(colName) case _ => new Column(field) } } val newColumns = columnMap.filter { case (colName, col) => !output.exists(f => resolver(f.name, colName)) }.map { case (colName, col) => col.as(colName) } ds.select(replacedAndExistingColumns ++ newColumns: _*) } def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = { withColumns(ds, Seq(colName), Seq(col), Seq(metadata)) } private def checkColumnNameDuplication(columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = { val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase) if (names.distinct.length != names.length) { val duplicateColumns = names.groupBy(identity).collect { case (x, ys) if ys.length > 1 => s"`$x`" } throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}") } } /** * Cast a column in a Dataset to Vector type. * * The supported data types of the input column are * - Vector * - float/double type Array. * * Note: The returned column does not have Metadata. * * @param dataset input DataFrame * @param colName column name. * @return Vector column */ def columnToVector(dataset: Dataset[_], colName: String): Column = { val columnDataType = dataset.schema(colName).dataType columnDataType match { case _: VectorUDT => col(colName) case fdt: ArrayType => val transferUDF = fdt.elementType match { case _: FloatType => udf(f = (vector: Seq[Float]) => { val inputArray = Array.fill[Double](vector.size)(0.0) vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble) Vectors.dense(inputArray) }) case _: DoubleType => udf((vector: Seq[Double]) => { Vectors.dense(vector.toArray) }) case other => throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector") } transferUDF(col(colName)) case other => throw new IllegalArgumentException(s"$other column cannot be cast to Vector") } } }
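Among other things, DatasetUtil.columnToVector above casts a float or double array column to a Vector column. A tentative usage sketch, assuming this sona utility class is on the classpath; the data and column names are hypothetical:

import org.apache.spark.sql.SparkSession
import org.apache.spark.util.DatasetUtil

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq(
  (1L, Seq(0.1, 0.2, 0.3)),
  (2L, Seq(0.4, 0.5, 0.6))
).toDF("id", "embedding")

// The returned Column carries no metadata, as the method's doc comment notes.
val asVector = DatasetUtil.columnToVector(df, "embedding")
df.withColumn("embedding_vec", asVector).show(false)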
Example 22
Source File: OilPriceFunc.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.geomesa import java.text.SimpleDateFormat import java.util.Calendar import org.apache.spark.sql.SparkSession import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions.{udf, window, last, col, lag} object OilPriceFunc { // use this if the window function misbehaves due to timezone e.g. BST // ./spark-shell --driver-java-options "-Duser.timezone=UTC" // ./spark-submit --conf 'spark.driver.extraJavaOptions=-Duser.timezone=UTC' // define a function to reformat the date field def convert(date:String) : String = { val df1 = new SimpleDateFormat("dd/MM/yyyy") val dt = df1.parse(date) val df2 = new SimpleDateFormat("yyyy-MM-dd") df2.format(dt) } // create and save oil price changes def createOilPriceDF(inputfile: String, outputfile: String, spark: SparkSession) = { val oilPriceDF = spark. read. option("header", "true"). option("inferSchema", "true"). csv(inputfile) val convertDateUDF = udf { (Date: String) => convert(Date) } val oilPriceDatedDF = oilPriceDF.withColumn("DATE", convertDateUDF(oilPriceDF("DATE"))) // offset to start at beginning of week val windowDF = oilPriceDatedDF.groupBy(window(oilPriceDatedDF.col("DATE"), "7 days", "7 days", "4 days")) val windowLastDF = windowDF.agg(last("PRICE") as "last(PRICE)").sort("window") // windowLastDF.show(20, false) val sortedWindow = Window.orderBy("window.start") val lagLastCol = lag(col("last(PRICE)"), 1).over(sortedWindow) val lagLastColDF = windowLastDF.withColumn("lastPrev(PRICE)", lagLastCol) // lagLastColDF.show(20, false) val simplePriceChangeFunc = udf { (last: Double, prevLast: Double) => var change = ((last - prevLast) compare 0).signum if (change == -1) change = 0 change.toDouble } val findDateTwoDaysAgoUDF = udf { (date: String) => val dateFormat = new SimpleDateFormat("yyyy-MM-dd") val cal = Calendar.getInstance cal.setTime(dateFormat.parse(date)) cal.add(Calendar.DATE, -3) dateFormat.format(cal.getTime) } val oilPriceChangeDF = lagLastColDF.withColumn("label", simplePriceChangeFunc( lagLastColDF("last(PRICE)"), lagLastColDF("lastPrev(PRICE)") )).withColumn("commonFriday", findDateTwoDaysAgoUDF(lagLastColDF("window.end"))) // oilPriceChangeDF.show(20, false) oilPriceChangeDF.select("label", "commonFriday"). write. format("com.databricks.spark.csv"). option("header", "true"). //.option("codec", "org.apache.hadoop.io.compress.GzipCodec") save(outputfile) } }
Example 23
Source File: StructuralProfiler.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.profilers.raw

import au.com.bytecode.opencsv.CSVParser
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Row}

case class StructuralProfiler(delimiter: String = ",") {

  def profile(df: Dataset[String]): Dataset[StructuralReport] = {

    import df.sparkSession.implicits._

    val rows = df.mapPartitions({ lines =>
      val parser = new CSVParser(delimiter.charAt(0))
      lines.map(line => (parser.parseLine(line).length, line))
    })

    val fieldCount = rows.groupByKey({ case (fields, line) =>
      fields
    }).count()
      .withColumnRenamed("value", "fields")
      .withColumnRenamed("count(1)", "count")

    val fieldLine = rows.groupByKey({ case (fields, line) =>
      fields
    }).reduceGroups({ (v1, v2) =>
      v1
    }).map({ case (fields, (_, line)) =>
      (fields, line)
    })
      .withColumnRenamed("_1", "_fieldLine_")
      .withColumnRenamed("_2", "line")

    fieldCount.join(fieldLine, col("fields") === col("_fieldLine_"))
      .drop("_fieldLine_")
      .map({ case Row(columns: Int, count: Long, line: String) =>
        StructuralReport(
          columns,
          count,
          line
        )
      })
  }
}

case class StructuralReport(
    fields: Int,
    metricValue: Double,
    description: String
)
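The profiler counts, for each distinct number of parsed fields, how many raw lines had that field count and keeps one sample line per group. A hedged usage sketch follows; the input path and data are hypothetical.

import io.gzet.profilers.raw.StructuralProfiler
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Each record is one raw, unparsed CSV line.
val rawLines = spark.read.textFile("data/transactions.csv")

// One StructuralReport row per distinct field count: how many lines had that many
// fields, plus a sample line, which helps spot malformed rows.
StructuralProfiler(delimiter = ",").profile(rawLines).show(false)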
Example 24
Source File: CardinalityProfiler.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.profilers.field import io.gzet.profilers.Utils import org.apache.spark.sql.functions.col import org.apache.spark.sql.{Dataset, Row} import scala.collection.mutable import scalaz.Scalaz._ case class CardinalityProfiler(topN: Int = 5) { def profile(df: Dataset[Array[String]]): Dataset[CardinalityReport] = { val total = df.sparkSession.sparkContext.broadcast(df.count()) import df.sparkSession.implicits._ val features = Utils.buildColumns(df) val topNValues = features.groupByKey({ field => field }).count().map({ case (field, count) => (field.idx, Map(field.value -> count)) }).groupByKey({ case (column, map) => column }).reduceGroups({ (v1, v2) => val m1 = v1._2 val m2 = v2._2 val m = (m1 |+| m2).toSeq.sortBy(_._2).reverse (v1._1, m.take(math.min(m.size, topN)).toMap) }).map({ case (column, (_, map)) => val top = map.keySet.toArray (column, top) }) .withColumnRenamed("_1", "_topNValues_") .withColumnRenamed("_2", "description") val cardinalities = features.distinct().groupByKey(_.idx).count().map({ case (column, distinctValues) => val cardinality = distinctValues / total.value.toDouble (column, cardinality) }) .withColumnRenamed("_1", "column") .withColumnRenamed("_2", "cardinality") cardinalities.join(topNValues, col("column") === col("_topNValues_")) .drop("_topNValues_") .map({ case Row(column: Int, cardinality: Double, description: mutable.WrappedArray[String]) => CardinalityReport( column, cardinality, description.toArray ) }) } } case class CardinalityReport( field: Int, metricValue: Double, description: Array[String] )
Example 25
Source File: OneHotEncoderSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { def stringIndexed(): DataFrame = { val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) val df = sqlContext.createDataFrame(data).toDF("id", "label") val indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex") .fit(df) indexer.transform(df) } test("params") { ParamsSuite.checkParams(new OneHotEncoder) } test("OneHotEncoder dropLast = false") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") .setDropLast(false) val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1), vec(2)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0), (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0)) assert(output === expected) } test("OneHotEncoder dropLast = true") { val transformed = stringIndexed() val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").map { r => val vec = r.getAs[Vector](1) (r.getInt(0), vec(0), vec(1)) }.collect().toSet // a -> 0, b -> 2, c -> 1 val expected = Set((0, 1.0, 0.0), (1, 0.0, 0.0), (2, 0.0, 1.0), (3, 1.0, 0.0), (4, 1.0, 0.0), (5, 0.0, 1.0)) assert(output === expected) } test("input column with ML attribute") { val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large") val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("size") .select(col("size").as("size", attr.toMetadata())) val encoder = new OneHotEncoder() .setInputCol("size") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("small").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("medium").withIndex(1)) } test("input column without ML attribute") { val df = sqlContext.createDataFrame(Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply)).toDF("index") val encoder = new OneHotEncoder() .setInputCol("index") .setOutputCol("encoded") val output = encoder.transform(df) val group = AttributeGroup.fromStructField(output.schema("encoded")) assert(group.size === 2) assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("0").withIndex(0)) assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("1").withIndex(1)) } test("read/write") { val t = new OneHotEncoder() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setDropLast(false) testDefaultReadWrite(t) } }
Example 26
Source File: HashingTF.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} def setNumFeatures(value: Int): this.type = set(numFeatures, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)) val t = udf { terms: Seq[_] => hashingTF.transform(terms) } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
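Note: transform above relies on the select(col("*"), expr.as(outputCol, metadata)) idiom to append one derived column while preserving every existing column. A hedged, self-contained sketch of that pattern with a plain UDF and assumed column names (a term counter stands in for hashingTF.transform):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object AppendColumnSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("append-column-sketch").getOrCreate()
  import spark.implicits._
  val df = Seq(("doc1", Seq("a", "b", "a")), ("doc2", Seq("c"))).toDF("id", "terms")
  // Stand-in for the feature transform: here we just count the terms
  val termCount = udf { terms: Seq[String] => terms.size }
  // Keep all original columns and append the derived one, as HashingTF.transform does
  df.select(col("*"), termCount(col("terms")).as("numTerms")).show()
  spark.stop()
}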
Example 27
Source File: SampleStratifiedOn.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.algebra.sampling import org.apache.spark.sql.functions.{rand, udf, col} import org.apache.spark.sql.catalyst.plans.logical.{ LogicalPlan, Filter } import play.api.libs.json._ import mimir.algebra._ import mimir.exec.spark.RAToSpark import mimir.serialization.{ Json => MimirJson } case class SampleStratifiedOn(column:ID, t:Type, strata:Map[PrimitiveValue,Double]) extends SamplingMode { val sparkStrata = strata.map { case (v, p) => RAToSpark.getNative(v, t) -> p } .toMap override def toString = s"ON $column WITH STRATA ${strata.map { case (v,p) => s"$v -> $p"}.mkString(" | ")}" def apply(plan: LogicalPlan, seed: Long): LogicalPlan = { // Adapted from Spark's df.stat.sampleBy method val c = col(column.id) val r = rand(seed) val f = udf { (stratum: Any, x: Double) => x < sparkStrata.getOrElse(stratum, 0.0) } Filter( f(c, r).expr, plan ) } def expressions: Seq[Expression] = Seq(Var(column)) def rebuildExpressions(x: Seq[Expression]): SamplingMode = { x(0) match { case Var(newColumn) => SampleStratifiedOn(newColumn, t, strata) case _ => throw new RAException("Internal Error: Rewriting stratification variable with arbitrary expression") } } def toJson: JsValue = JsObject(Map[String,JsValue]( "mode" -> JsString(SampleStratifiedOn.MODE), "column" -> JsString(column.id), "type" -> MimirJson.ofType(t), "strata" -> JsArray( strata .toSeq .map { case (v, p) => JsObject(Map[String,JsValue]( "value" -> MimirJson.ofPrimitive(v), "probability" -> JsNumber(p) )) } ) )) } object SampleStratifiedOn { val MODE = "stratified_on" def parseJson(json:Map[String, JsValue]): Option[SampleStratifiedOn] = { if(json("mode").as[String].equals(MODE)){ val t = MimirJson.toType(json("type")) Some(SampleStratifiedOn( ID(json("column").as[String]), t, json("strata") .as[Seq[Map[String,JsValue]]] .map { stratum => MimirJson.toPrimitive(t, stratum("value")) -> stratum("probability").as[Double] } .toMap )) } else { None } } }
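Note: stratified sampling here is a filter comparing rand(seed) against a per-stratum probability looked up by a UDF on the stratification column. The same idea expressed directly on a DataFrame (column names and rates are assumptions; Spark's built-in df.stat.sampleBy does the same job):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, rand, udf}

object StratifiedSampleSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("stratified-sample-sketch").getOrCreate()
  val df = spark.range(0, 1000).selectExpr("id", "id % 2 as stratum")
  // Assumed per-stratum keep probabilities
  val rates = Map(0L -> 0.1, 1L -> 0.5)
  val keep = udf { (stratum: Long, x: Double) => x < rates.getOrElse(stratum, 0.0) }
  // Keep a row when a fresh random draw falls under its stratum's probability
  val sampled = df.filter(keep(col("stratum"), rand(42L)))
  sampled.groupBy("stratum").count().show()
  spark.stop()
}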
Example 28
Source File: ClassifierDatasetEncoder.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.ml.tensorflow import com.johnsnowlabs.nlp.Annotation import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{size, explode, col} import scala.collection.mutable class ClassifierDatasetEncoder(val params: ClassifierDatasetEncoderParams) extends Serializable { val tags2Id: Map[String, Int] = params.tags.zipWithIndex .map(p => (p._1, p._2)) .toMap val tags: Array[String] = tags2Id .map(p => (p._2, p._1)) .toArray .sortBy(p => p._1) .map(p => p._2) def encodeTags(labels: Array[String]): Array[Array[Int]] = { labels.map { t => val labelIDsArray = Array.fill(tags.length)(0) labelIDsArray(tags2Id(t)) = 1 labelIDsArray } } def decodeOutputData(tagIds: Array[Array[Float]]): Array[Array[(String, Float)]] = { val scoresMetadata = tagIds.map { scores => scores.zipWithIndex.flatMap { case (score, idx) => val tag = tags2Id.find(_._2 == idx).map(_._1).getOrElse("NA") Map(tag -> score) } } scoresMetadata } } case class ClassifierDatasetEncoderParams(tags: Array[String])
Example 29
Source File: HashingTF.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
Example 30
Source File: UnaryTransformerExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.DoubleParam import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.col import org.apache.spark.sql.types.{DataType, DataTypes} import org.apache.spark.util.Utils // $example off$ object MyTransformer extends DefaultParamsReadable[MyTransformer] // $example off$ def main(args: Array[String]) { val spark = SparkSession .builder() .appName("UnaryTransformerExample") .getOrCreate() // $example on$ val myTransformer = new MyTransformer() .setShift(0.5) .setInputCol("input") .setOutputCol("output") // Create data, transform, and display it. val data = spark.range(0, 5).toDF("input") .select(col("input").cast("double").as("input")) val result = myTransformer.transform(data) println("Transformed by adding constant value") result.show() // Save and load the Transformer. val tmpDir = Utils.createTempDir() val dirName = tmpDir.getCanonicalPath myTransformer.write.overwrite().save(dirName) val sameTransformer = MyTransformer.load(dirName) // Transform the data to show the results are identical. println("Same transform applied from loaded model") val sameResult = sameTransformer.transform(data) sameResult.show() Utils.deleteRecursively(tmpDir) // $example off$ spark.stop() } } // scalastyle:on println
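Note: beyond the custom transformer, this example uses col("input").cast("double") to coerce the LongType output of range() into doubles before transforming. That cast-and-rename step on its own, as a minimal sketch:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object CastSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("cast-sketch").getOrCreate()
  // range() produces LongType; cast to double and keep the same column name
  val data = spark.range(0, 5).toDF("input")
    .select(col("input").cast("double").as("input"))
  data.printSchema()
  data.show()
  spark.stop()
}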
Example 31
Source File: Distinctness.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.functions.{col, sum}
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.Column

case class Distinctness(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Distinctness", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    (sum(col(COUNT_COL).geq(1).cast(DoubleType)) / numRows) :: Nil
  }

  override def filterCondition: Option[String] = where
}

object Distinctness {

  def apply(column: String): Distinctness = {
    new Distinctness(column :: Nil)
  }
}
Example 32
Source File: Uniqueness.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.types.DoubleType

case class Uniqueness(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Uniqueness", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    (sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) / numRows) :: Nil
  }

  override def filterCondition: Option[String] = where
}

object Uniqueness {

  def apply(column: String): Uniqueness = {
    new Uniqueness(column :: Nil)
  }

  def apply(column: String, where: Option[String]): Uniqueness = {
    new Uniqueness(column :: Nil, where)
  }
}
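Note: Distinctness and Uniqueness differ only in the predicate applied to the per-value frequency column, count >= 1 versus count == 1, each cast to DoubleType, summed, and divided by the row count. A rough standalone sketch of the uniqueness ratio computed from scratch (COUNT_COL is deequ-internal, so here the frequencies are rebuilt with a plain groupBy; data is assumed):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, count, lit, sum}
import org.apache.spark.sql.types.DoubleType

object UniquenessSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("uniqueness-sketch").getOrCreate()
  import spark.implicits._
  val df = Seq("a", "a", "b", "c", "c", "d").toDF("value")
  val numRows = df.count()
  // Frequencies per value, then the share of values that occur exactly once
  val frequencies = df.groupBy("value").agg(count(lit(1)).as("cnt"))
  frequencies
    .agg((sum(col("cnt").equalTo(lit(1)).cast(DoubleType)) / numRows).as("uniqueness"))
    .show()
  spark.stop()
}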
Example 33
Source File: Entropy.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, sum, udf}

case class Entropy(column: String, where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Entropy", column :: Nil)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    val summands = udf { (count: Double) =>
      if (count == 0.0) {
        0.0
      } else {
        -(count / numRows) * math.log(count / numRows)
      }
    }

    sum(summands(col(COUNT_COL))) :: Nil
  }

  override def filterCondition: Option[String] = where
}
Example 34
Source File: MutualInformation.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Analyzers._ import com.amazon.deequ.metrics.{DoubleMetric, Entity} import org.apache.spark.sql.functions.{col, sum, udf} import org.apache.spark.sql.types.StructType import Analyzers.COUNT_COL import com.amazon.deequ.analyzers.runners.MetricCalculationException override def preconditions: Seq[StructType => Unit] = { Preconditions.exactlyNColumns(columns, 2) +: super.preconditions } override def toFailureMetric(exception: Exception): DoubleMetric = { metricFromFailure(exception, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn) } override def filterCondition: Option[String] = where } object MutualInformation { def apply(columnA: String, columnB: String): MutualInformation = { new MutualInformation(columnA :: columnB :: Nil) } }
Example 35
Source File: ApproxQuantiles.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} import com.amazon.deequ.analyzers.runners.{IllegalAnalyzerParameterException, MetricCalculationException} import com.amazon.deequ.metrics.{Entity, KeyedDoubleMetric} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile import org.apache.spark.sql.functions.col import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DeequFunctions, Row} import scala.util.{Failure, Success} case class ApproxQuantiles(column: String, quantiles: Seq[Double], relativeError: Double = 0.01) extends ScanShareableAnalyzer[ApproxQuantileState, KeyedDoubleMetric] { val PARAM_CHECKS: StructType => Unit = { _ => quantiles.foreach { quantile => if (quantile < 0.0 || quantile > 1.0) { throw new IllegalAnalyzerParameterException(MetricCalculationException .getApproxQuantileIllegalParamMessage(quantile)) } } if (relativeError < 0.0 || relativeError > 1.0) { throw new IllegalAnalyzerParameterException(MetricCalculationException .getApproxQuantileIllegalErrorParamMessage(relativeError)) } } override private[deequ] def aggregationFunctions() = { DeequFunctions.stateful_approx_quantile(col(column), relativeError) :: Nil } override private[deequ] def fromAggregationResult( result: Row, offset: Int) : Option[ApproxQuantileState] = { if (result.isNullAt(offset)) { None } else { val percentileDigest = ApproximatePercentile.serializer.deserialize( result.getAs[Array[Byte]](offset)) Some(ApproxQuantileState(percentileDigest)) } } override def computeMetricFrom(state: Option[ApproxQuantileState]): KeyedDoubleMetric = { state match { case Some(theState) => val digest = theState.percentileDigest val computedQuantiles = digest.getPercentiles(quantiles.toArray) val results = quantiles.zip(computedQuantiles) .map { case (quantile, result) => quantile.toString -> result } .toMap KeyedDoubleMetric(Entity.Column, "ApproxQuantiles", column, Success(results)) case _ => toFailureMetric(Analyzers.emptyStateException(this)) } } override def toFailureMetric(exception: Exception): KeyedDoubleMetric = { KeyedDoubleMetric(Entity.Column, "ApproxQuantiles", column, Failure( MetricCalculationException.wrapIfNecessary(exception))) } override def preconditions: Seq[StructType => Unit] = { PARAM_CHECKS :: hasColumn(column) :: isNumeric(column) :: Nil } }
Example 36
Source File: UniqueValueRatio.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Analyzers.COUNT_COL import com.amazon.deequ.metrics.DoubleMetric import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.functions.{col, count, lit, sum} import org.apache.spark.sql.types.DoubleType case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None) extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns) with FilterableAnalyzer { override def aggregationFunctions(numRows: Long): Seq[Column] = { sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) :: count("*") :: Nil } override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = { val numUniqueValues = result.getDouble(offset) val numDistinctValues = result.getLong(offset + 1).toDouble toSuccessMetric(numUniqueValues / numDistinctValues) } override def filterCondition: Option[String] = where } object UniqueValueRatio { def apply(column: String): UniqueValueRatio = { new UniqueValueRatio(column :: Nil) } def apply(column: String, where: Option[String]): UniqueValueRatio = { new UniqueValueRatio(column :: Nil, where) } }
Example 37
Source File: WholeStageCodegenSparkSubmitSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.scalatest.{Assertions, BeforeAndAfterEach, Matchers} import org.scalatest.concurrent.TimeLimits import org.apache.spark.{SparkFunSuite, TestUtils} import org.apache.spark.deploy.SparkSubmitSuite import org.apache.spark.internal.Logging import org.apache.spark.sql.{LocalSparkSession, QueryTest, Row, SparkSession} import org.apache.spark.sql.functions.{array, col, count, lit} import org.apache.spark.sql.types.IntegerType import org.apache.spark.unsafe.Platform import org.apache.spark.util.ResetSystemProperties // Due to the need to set driver's extraJavaOptions, this test needs to use actual SparkSubmit. class WholeStageCodegenSparkSubmitSuite extends SparkFunSuite with Matchers with BeforeAndAfterEach with ResetSystemProperties { test("Generated code on driver should not embed platform-specific constant") { val unusedJar = TestUtils.createJarWithClasses(Seq.empty) // HotSpot JVM specific: Set up a local cluster with the driver/executor using mismatched // settings of UseCompressedOops JVM option. val argsForSparkSubmit = Seq( "--class", WholeStageCodegenSparkSubmitSuite.getClass.getName.stripSuffix("$"), "--master", "local-cluster[1,1,1024]", "--driver-memory", "1g", "--conf", "spark.ui.enabled=false", "--conf", "spark.master.rest.enabled=false", "--conf", "spark.driver.extraJavaOptions=-XX:-UseCompressedOops", "--conf", "spark.executor.extraJavaOptions=-XX:+UseCompressedOops", unusedJar.toString) SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..") } } object WholeStageCodegenSparkSubmitSuite extends Assertions with Logging { var spark: SparkSession = _ def main(args: Array[String]): Unit = { TestUtils.configTestLog4j("INFO") spark = SparkSession.builder().getOrCreate() // Make sure the test is run where the driver and the executors uses different object layouts val driverArrayHeaderSize = Platform.BYTE_ARRAY_OFFSET val executorArrayHeaderSize = spark.sparkContext.range(0, 1).map(_ => Platform.BYTE_ARRAY_OFFSET).collect.head.toInt assert(driverArrayHeaderSize > executorArrayHeaderSize) val df = spark.range(71773).select((col("id") % lit(10)).cast(IntegerType) as "v") .groupBy(array(col("v"))).agg(count(col("*"))) val plan = df.queryExecution.executedPlan assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined) val expectedAnswer = Row(Array(0), 7178) :: Row(Array(1), 7178) :: Row(Array(2), 7178) :: Row(Array(3), 7177) :: Row(Array(4), 7177) :: Row(Array(5), 7177) :: Row(Array(6), 7177) :: Row(Array(7), 7177) :: Row(Array(8), 7177) :: Row(Array(9), 7177) :: Nil val result = df.collect QueryTest.sameRows(result.toSeq, expectedAnswer) match { case Some(errMsg) => fail(errMsg) case _ => } } }
Example 38
Source File: MultiStreamHandler.scala From structured-streaming-application with Apache License 2.0 | 5 votes |
package knolx.spark import knolx.Config._ import knolx.KnolXLogger import org.apache.spark.sql.functions.col import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode} import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{Encoders, SparkSession} case class CurrentPowerConsumption(kwh: Double) case class PowerConsumptionStatus(numOfReadings: Long, total: Double, avg: Double, status: String) { def compute(newReadings: List[Double]) = { val newTotal = newReadings.sum + total val newNumOfReadings = numOfReadings + newReadings.size val newAvg = newTotal / newNumOfReadings.toDouble PowerConsumptionStatus(newNumOfReadings, newTotal, newAvg, "ON") } } object MultiStreamHandler extends App with KnolXLogger { info("Creating Spark Session") val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() spark.sparkContext.setLogLevel("WARN") val updateStateFunc = (deviceId: String, newReadings: Iterator[(String, CurrentPowerConsumption)], state: GroupState[PowerConsumptionStatus]) => { val data = newReadings.toList.map { case(_, reading) => reading }.map(_.kwh) lazy val initialPowerConsumptionStatus = PowerConsumptionStatus(0L, 0D, 0D, "OFF") val currentState = state.getOption.fold(initialPowerConsumptionStatus.compute(data))(_.compute(data)) val currentStatus = if(state.hasTimedOut) { // If we do not receive any reading, for a device, we will assume that it is OFF. currentState.copy(status = "OFF") } else { state.setTimeoutDuration("10 seconds") currentState } state.update(currentStatus) (deviceId, currentStatus) } info("Creating Streaming DF...") val dataStream = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServer) .option("subscribe", topic) .option("failOnDataLoss", false) .option("includeTimestamp", true) .load() info("Writing data to Console...") import spark.implicits._ implicit val currentPowerConsumptionEncoder = Encoders.kryo[CurrentPowerConsumption] implicit val powerConsumptionStatusEncoder = Encoders.kryo[PowerConsumptionStatus] val query = dataStream .select(col("key").cast(StringType).as("key"), col("value").cast(StringType).as("value")) .as[(String, String)] .map { case(deviceId, unit) => (deviceId, CurrentPowerConsumption(Option(unit).fold(0D)(_.toDouble))) } .groupByKey { case(deviceId, _) => deviceId } .mapGroupsWithState[PowerConsumptionStatus, (String, PowerConsumptionStatus)](GroupStateTimeout.ProcessingTimeTimeout())(updateStateFunc) .toDF("deviceId", "current_status") .writeStream .format("console") .option("truncate", false) .outputMode(OutputMode.Update()) .option("checkpointLocation", checkPointDir) .start() info("Waiting for the query to terminate...") query.awaitTermination() query.stop() }
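Note: before the stateful processing, the Kafka source's binary key and value columns are cast to strings with col("key").cast(StringType).as("key") and the same for value. That projection in isolation (the broker address and topic are placeholders, and the spark-sql-kafka connector is assumed on the classpath):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StringType

object KafkaCastSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("kafka-cast-sketch").getOrCreate()
  // Placeholder broker/topic; requires the spark-sql-kafka connector dependency
  val raw = spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "readings")
    .load()
  // Kafka delivers key/value as binary; cast both to strings and keep the names
  val keyed = raw.select(
    col("key").cast(StringType).as("key"),
    col("value").cast(StringType).as("value"))
  keyed.printSchema()
}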
Example 39
Source File: StreamStreamOuterJoiner.scala From structured-streaming-application with Apache License 2.0 | 5 votes |
package knolx.spark import knolx.Config._ import knolx.KnolXLogger import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.functions.{col, expr, from_json} import org.apache.spark.sql.types.StructType object StreamStreamOuterJoiner extends App with KnolXLogger { info("Creating Spark Session") val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() spark.sparkContext.setLogLevel("WARN") info("Streaming companies Dataframe") val companiesDF = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServer) .option("subscribe", companiesTopic) .load() .select(col("value").cast("string").as("companyName"), col("timestamp").as("companyTradingTime")) .withWatermark("companyTradingTime", "10 seconds") companiesDF.writeStream.format("console").option("truncate", false).start() info("Original Streaming Dataframe") val schema = ScalaReflection.schemaFor[Stock].dataType.asInstanceOf[StructType] val stockStreamDF = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServer) .option("subscribe", stocksTopic) .load() .select(from_json(col("value").cast("string"), schema).as("value"), col("timestamp").as("stockInputTime")) .select("value.*", "stockInputTime") .withWatermark("stockInputTime", "10 seconds") info("Filtered Streaming Dataframe") val filteredStockStreamDF = stockStreamDF.join(companiesDF, expr("companyName = stockName AND stockInputTime >= companyTradingTime AND stockInputTime <= companyTradingTime + interval 20 seconds"), joinType = "leftOuter") val filteredStockStreamingQuery = filteredStockStreamDF.writeStream.format("console").option("truncate", false).start() info("Waiting for the query to terminate...") filteredStockStreamingQuery.awaitTermination() filteredStockStreamingQuery.stop() }
Example 40
Source File: StreamStaticJoiner.scala From structured-streaming-application with Apache License 2.0 | 5 votes |
package knolx.spark import knolx.Config._ import knolx.KnolXLogger import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.functions.{col, from_json} import org.apache.spark.sql.types.StructType object StreamStaticJoiner extends App with KnolXLogger { info("Creating Spark Session") val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() spark.sparkContext.setLogLevel("WARN") info("Static Dataframe") val companiesDF = spark.read.option("header", "true").csv("src/main/resources/companies.csv") companiesDF.show(false) info("Original Streaming Dataframe") val schema = ScalaReflection.schemaFor[Stock].dataType.asInstanceOf[StructType] val stockStreamDF = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServer) .option("subscribe", topic) .load() .select(from_json(col("value").cast("string"), schema).as("value")) .select("value.*") stockStreamDF.printSchema() stockStreamDF.writeStream.format("console").start() info("Filtered Streaming Dataframe") val filteredStockStreamDF = stockStreamDF.join(companiesDF, "companyName") val filteredStockStreamingQuery = filteredStockStreamDF.writeStream.format("console").start() info("Waiting for the query to terminate...") filteredStockStreamingQuery.awaitTermination() filteredStockStreamingQuery.stop() }
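Note: both streaming joiners parse the Kafka payload with from_json(col("value").cast("string"), schema) and then flatten the struct via select("value.*"). A reduced sketch of that parsing step on a static DataFrame, with an assumed JSON payload and a hypothetical case class standing in for Stock:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.functions.{col, from_json}
import org.apache.spark.sql.types.StructType

case class StockSketch(stockName: String, price: Double)

object FromJsonSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("from-json-sketch").getOrCreate()
  import spark.implicits._
  // Derive the struct schema from the case class, as the joiners do for Stock
  val schema = ScalaReflection.schemaFor[StockSketch].dataType.asInstanceOf[StructType]
  val raw = Seq("""{"stockName":"acme","price":12.5}""").toDF("value")
  raw
    .select(from_json(col("value").cast("string"), schema).as("value"))
    .select("value.*")
    .show()
  spark.stop()
}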
Example 41
Source File: MultinomialLabeler.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.MultinomialLabelerModel import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasFeaturesCol import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types._ import org.apache.spark.sql.functions.{udf, col} import ml.combust.mleap.core.util.VectorConverters._ class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"), val model: MultinomialLabelerModel) extends Transformer with HasFeaturesCol with HasProbabilitiesCol with HasLabelsCol { def setFeaturesCol(value: String): this.type = set(featuresCol, value) def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value) def setLabelsCol(value: String): this.type = set(labelsCol, value) @org.apache.spark.annotation.Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val probabilitiesUdf = udf { (vector: Vector) => model.top(vector).map(_._1).toArray } val labelsUdf = udf { (vector: Vector) => model.topLabels(vector).toArray } dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))). withColumn($(labelsCol), labelsUdf(col($(featuresCol)))) } override def copy(extra: ParamMap): Transformer = copyValues(new MultinomialLabeler(uid, model), extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT], s"Features column must be of type NumericType but got ${schema($(featuresCol)).dataType}") val inputFields = schema.fields require(!inputFields.exists(_.name == $(probabilitiesCol)), s"Output column ${$(probabilitiesCol)} already exists.") require(!inputFields.exists(_.name == $(labelsCol)), s"Output column ${$(labelsCol)} already exists.") StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)), StructField($(labelsCol), ArrayType(StringType)))) } }
Example 42
Source File: CoverTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.functions.{col, explode, udf} import org.scalatest.{PropSpec, Matchers, GivenWhenThen} import org.scalatest.prop.GeneratorDrivenPropertyChecks class CoverTest extends FeaturePropSpec with GivenWhenThen with GeneratorDrivenPropertyChecks with Matchers { val assembler = new VectorAssembler() .setInputCols(Array("double", "integer")) .setOutputCol("vector") property("argument numSplits must be positive") { intercept[IllegalArgumentException] { val cover = new Cover() .setInputCols("double") .setOutputCol("cover_ids") .setNumSplits(0) } } property("argument overlapRatio must be positive") { intercept[IllegalArgumentException] { val cover = new Cover() .setInputCols("double") .setOutputCol("cover_ids") .setOverlapRatio(0.0) } } property("cover estimator changes nothing with the original dataframe") { val cover = new Cover() .setInputCols("double", "integer", "vector") .setOutputCol("cover_ids") forAll(dataframeGen.arbitrary) { df => val transformed = assembler.transform(df) whenever( transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) { val covered = cover .fit(transformed) .transform(transformed) .drop("cover_ids") .except(transformed) .count() should be(0) } } } property("generated cover covers all range of specified columns") { val cover = new Cover() .setInputCols("double", "integer", "vector") .setOutputCol("cover_ids") val uncovered = udf { xs: Seq[Long] => xs.length == 0 } forAll(dataframeGen.arbitrary) { df => val transformed = assembler.transform(df) whenever( transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) { cover .fit(transformed) .transform(transformed) .where(uncovered(col("cover_ids"))) .count() should be(0) } } } property("Cover is readable/writable") { val cover = new Cover() .setInputCols("double", "integer") .setOutputCol("cover_ids") testDefaultReadWrite(cover) } property("CoverModel is readable/writable") { val model = new CoverModel("myCoverModel", Vectors.dense(-1.0, 0.0), Vectors.dense(1.0, 10.0)) .setInputCols("double", "integer") .setOutputCol("cover_ids") val newModel = testDefaultReadWrite(model) assert(newModel.min === model.min) assert(newModel.max === model.max) } }
Example 43
Source File: ReebDiagramTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{Vectors, EuclideanDistance, Vector} import org.apache.spark.sql.functions.{col, explode, udf} import org.scalatest.{PropSpec, Matchers, GivenWhenThen} import org.scalatest.prop.GeneratorDrivenPropertyChecks class ReebDiagramTest extends FeaturePropSpec with GivenWhenThen with GeneratorDrivenPropertyChecks with Matchers { val assembler = new VectorAssembler() .setInputCols(Array("double", "integer")) .setOutputCol("vector") val cover = new Cover() .setExploding(true) .setInputCols("double", "integer") .setOutputCol("cover_id") property("argument topTreeSize must be positive") { intercept[IllegalArgumentException] { val reeb = new ReebDiagram() // .setIdCol("id") // .setCoverCol("cover_id") // .setFeaturesCol("vector") // .setOutputCol("cluster_id") .setTopTreeSize(0) } } property("placeholder") { val reeb = new ReebDiagram() .setK(15) .setIdCol("id") .setCoverCol("cover_id") .setFeaturesCol("vector") .setOutputCol("cluster_id") forAll(dataframeGen.arbitrary) { df => val assembled = assembler.transform(df) whenever( assembled.count() > 0 && hasDistinctValues(assembled, "double", "integer")) { val transformed = cover .fit(assembled) .transform(assembled) val result = reeb .setTopTreeSize(1) .fit(transformed) .transform(transformed) // result.show() } } } }
Example 44
Source File: DebugConfluentSparkAvroUtils.scala From confluent-spark-avro with Apache License 2.0 | 5 votes |
package com.databricks.spark.avro

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object DebugConfluentSparkAvroUtils {
  def main(args: Array[String]): Unit = {
    val kafkaUrl = args(0)
    val schemaRegistryUrl = args(1)
    val topic = args(2)

    val spark = SparkSession.builder().master("local[2]").getOrCreate()
    val df = spark.read.format("kafka")
      .option("kafka.bootstrap.servers", kafkaUrl)
      .option("subscribe", topic)
      .load()

    val utils = new ConfluentSparkAvroUtils(schemaRegistryUrl)
    val keyDes = utils.deserializerForSubject(topic + "-key")
    val valDes = utils.deserializerForSubject(topic + "-value")

    df.select(
      keyDes(col("key")).alias("key"),
      valDes(col("value")).alias("value")
    ).show(10)

    spark.stop()
  }
}
Example 45
Source File: ExecutorSideSQLConfSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.internal import org.apache.spark.SparkFunSuite import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.execution.debug.codegenStringSeq import org.apache.spark.sql.functions.col import org.apache.spark.sql.test.SQLTestUtils class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { import testImplicits._ protected var spark: SparkSession = null // Create a new [[SparkSession]] running in local-cluster mode. override def beforeAll(): Unit = { super.beforeAll() spark = SparkSession.builder() .master("local-cluster[2,1,1024]") .appName("testing") .getOrCreate() } override def afterAll(): Unit = { spark.stop() spark = null } override def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { pairs.foreach { case (k, v) => SQLConf.get.setConfString(k, v) } try f finally { pairs.foreach { case (k, _) => SQLConf.get.unsetConf(k) } } } test("ReadOnlySQLConf is correctly created at the executor side") { withSQLConf("spark.sql.x" -> "a") { val checks = spark.range(10).mapPartitions { _ => val conf = SQLConf.get Iterator(conf.isInstanceOf[ReadOnlySQLConf] && conf.getConfString("spark.sql.x") == "a") }.collect() assert(checks.forall(_ == true)) } } test("case-sensitive config should work for json schema inference") { withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { withTempPath { path => val pathString = path.getCanonicalPath spark.range(10).select('id.as("ID")).write.json(pathString) spark.range(10).write.mode("append").json(pathString) assert(spark.read.json(pathString).columns.toSet == Set("id", "ID")) } } } test("SPARK-24727 CODEGEN_CACHE_MAX_ENTRIES is correctly referenced at the executor side") { withSQLConf(StaticSQLConf.CODEGEN_CACHE_MAX_ENTRIES.key -> "300") { val checks = spark.range(10).mapPartitions { _ => val conf = SQLConf.get Iterator(conf.isInstanceOf[ReadOnlySQLConf] && conf.getConfString(StaticSQLConf.CODEGEN_CACHE_MAX_ENTRIES.key) == "300") }.collect() assert(checks.forall(_ == true)) } } test("SPARK-22219: refactor to control to generate comment") { Seq(true, false).foreach { flag => withSQLConf(StaticSQLConf.CODEGEN_COMMENTS.key -> flag.toString) { val res = codegenStringSeq(spark.range(10).groupBy(col("id") * 2).count() .queryExecution.executedPlan) assert(res.length == 2) assert(res.forall { case (_, code) => (code.contains("* Codegend pipeline") == flag) && (code.contains("// input[") == flag) }) } } } }
Example 46
Source File: StreamStreamJoiner.scala From structured-streaming-application with Apache License 2.0 | 5 votes |
package knolx.spark import knolx.Config._ import knolx.KnolXLogger import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.functions.{col, expr, from_json} import org.apache.spark.sql.types.StructType object StreamStreamJoiner extends App with KnolXLogger { info("Creating Spark Session") val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() spark.sparkContext.setLogLevel("WARN") info("Streaming companies Dataframe") val companiesDF = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServer) .option("subscribe", companiesTopic) .load() .select(col("value").cast("string").as("companyName"), col("timestamp").as("companyTradingTime")) companiesDF.writeStream.format("console").option("truncate", false).start() info("Original Streaming Dataframe") val schema = ScalaReflection.schemaFor[Stock].dataType.asInstanceOf[StructType] val stockStreamDF = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServer) .option("subscribe", stocksTopic) .load() .select(from_json(col("value").cast("string"), schema).as("value"), col("timestamp").as("stockInputTime")) .select("value.*", "stockInputTime") info("Filtered Streaming Dataframe") val filteredStockStreamDF = stockStreamDF.join(companiesDF, expr("companyName = stockName AND stockInputTime >= companyTradingTime AND stockInputTime <= companyTradingTime + interval 20 seconds")) val filteredStockStreamingQuery = filteredStockStreamDF.writeStream.format("console").option("truncate", false).start() info("Waiting for the query to terminate...") filteredStockStreamingQuery.awaitTermination() filteredStockStreamingQuery.stop() }
Example 47
Source File: ExtractPythonUDFsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import org.apache.spark.sql.execution.{SparkPlan, SparkPlanTest} import org.apache.spark.sql.functions.col import org.apache.spark.sql.test.SharedSQLContext class ExtractPythonUDFsSuite extends SparkPlanTest with SharedSQLContext { import testImplicits.newProductEncoder import testImplicits.localSeqToDatasetHolder val batchedPythonUDF = new MyDummyPythonUDF val scalarPandasUDF = new MyDummyScalarPandasUDF private def collectBatchExec(plan: SparkPlan): Seq[BatchEvalPythonExec] = plan.collect { case b: BatchEvalPythonExec => b } private def collectArrowExec(plan: SparkPlan): Seq[ArrowEvalPythonExec] = plan.collect { case b: ArrowEvalPythonExec => b } test("Chained Batched Python UDFs should be combined to a single physical node") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c", batchedPythonUDF(col("a"))) .withColumn("d", batchedPythonUDF(col("c"))) val pythonEvalNodes = collectBatchExec(df2.queryExecution.executedPlan) assert(pythonEvalNodes.size == 1) } test("Chained Scalar Pandas UDFs should be combined to a single physical node") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c", scalarPandasUDF(col("a"))) .withColumn("d", scalarPandasUDF(col("c"))) val arrowEvalNodes = collectArrowExec(df2.queryExecution.executedPlan) assert(arrowEvalNodes.size == 1) } test("Mixed Batched Python UDFs and Pandas UDF should be separate physical node") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c", batchedPythonUDF(col("a"))) .withColumn("d", scalarPandasUDF(col("b"))) val pythonEvalNodes = collectBatchExec(df2.queryExecution.executedPlan) val arrowEvalNodes = collectArrowExec(df2.queryExecution.executedPlan) assert(pythonEvalNodes.size == 1) assert(arrowEvalNodes.size == 1) } test("Independent Batched Python UDFs and Scalar Pandas UDFs should be combined separately") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c1", batchedPythonUDF(col("a"))) .withColumn("c2", batchedPythonUDF(col("c1"))) .withColumn("d1", scalarPandasUDF(col("a"))) .withColumn("d2", scalarPandasUDF(col("d1"))) val pythonEvalNodes = collectBatchExec(df2.queryExecution.executedPlan) val arrowEvalNodes = collectArrowExec(df2.queryExecution.executedPlan) assert(pythonEvalNodes.size == 1) assert(arrowEvalNodes.size == 1) } test("Dependent Batched Python UDFs and Scalar Pandas UDFs should not be combined") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c1", batchedPythonUDF(col("a"))) .withColumn("d1", scalarPandasUDF(col("c1"))) .withColumn("c2", batchedPythonUDF(col("d1"))) .withColumn("d2", scalarPandasUDF(col("c2"))) val pythonEvalNodes = collectBatchExec(df2.queryExecution.executedPlan) val arrowEvalNodes = collectArrowExec(df2.queryExecution.executedPlan) assert(pythonEvalNodes.size == 2) assert(arrowEvalNodes.size == 2) } }
Example 48
Source File: GroupSortedDataset.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted.sql import scala.reflect.ClassTag import org.apache.spark.sql.{ Column, Dataset, Encoder } import org.apache.spark.sql.functions.col import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder } import com.tresata.spark.sorted.{ mapStreamIterator, mapStreamIteratorWithContext, newWCreate } object GroupSortedDataset { private[sql] def apply[K: Encoder, V](dataset: Dataset[(K, V)], numPartitions: Option[Int], reverse: Boolean, sortBy: Column => Column): GroupSortedDataset[K, V] = { val key = col(dataset.columns.head) val valueSort = { val sort = sortBy(col(dataset.columns.last)) if (reverse) sort.desc else sort.asc } new GroupSortedDataset(numPartitions.map(dataset.repartition(_, key)).getOrElse(dataset.repartition(key)).sortWithinPartitions(key, valueSort)) } } class GroupSortedDataset[K: Encoder, V] private (dataset: Dataset[(K, V)]) extends Serializable { def toDS: Dataset[(K, V)] = dataset def mapStreamByKey[W: Encoder, C](c: () => C)(f: (C, Iterator[V]) => TraversableOnce[W]): Dataset[(K, W)] = { implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W]) dataset.mapPartitions(mapStreamIteratorWithContext(_)(c, f)) } def mapStreamByKey[W: Encoder](f: Iterator[V] => TraversableOnce[W]): Dataset[(K, W)] = { implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W]) dataset.mapPartitions(mapStreamIterator(_)(f)) } def foldLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = { val wCreate = newWCreate(w) mapStreamByKey(iter => Iterator(iter.foldLeft(wCreate())(f))) } def reduceLeftByKey[W >: V: Encoder](f: (W, V) => W): Dataset[(K, W)] = mapStreamByKey(iter => Iterator(iter.reduceLeft(f))) def scanLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = { val wCreate = newWCreate(w) mapStreamByKey(_.scanLeft(wCreate())(f)) } }
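Note: the GroupSortedDataset constructor shows col used to build expressions from column names: the key comes from dataset.columns.head and the secondary sort flips between .asc and .desc. A small sketch of the repartition-then-sortWithinPartitions idiom driven by col, on a toy tuple Dataset (column names "_1"/"_2" are Spark's defaults):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object SortWithinPartitionsSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("sort-within-partitions-sketch").getOrCreate()
  import spark.implicits._
  val ds = Seq(("k1", 3), ("k2", 1), ("k1", 1), ("k2", 2)).toDS()
  val key = col(ds.columns.head)           // "_1"
  val valueSort = col(ds.columns.last).asc // or .desc to reverse the secondary sort
  // Co-locate each key in a partition, then sort values within it
  ds.repartition(2, key).sortWithinPartitions(key, valueSort).show()
  spark.stop()
}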
Example 49
Source File: ConcatArrowAndExplodeSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries import java.io.ByteArrayOutputStream import java.nio.channels.Channels import java.util.concurrent.TimeUnit import com.twosigma.flint.arrow.ArrowUtils import org.apache.arrow.memory.RootAllocator import org.apache.arrow.vector.ipc.ArrowFileWriter import org.apache.arrow.vector.{ BigIntVector, Float8Vector, VectorSchemaRoot } import org.apache.spark.sql.functions.{ array, col, lit, struct } import org.apache.spark.sql.types._ class ConcatArrowAndExplodeSpec extends TimeSeriesSuite { "ConcatArrowAndExplode" should "work" in { val batchSize = 10 var df = spark.range(1000, 2000, 1000).toDF("time") val columns = (0 until batchSize).map(v => struct((df("time") + v).as("time"), lit(v.toDouble).as("v"))) df = df.withColumn("base_rows", array(columns: _*)) val allocator = new RootAllocator(Long.MaxValue) val schema1 = StructType(Seq(StructField("v1", DoubleType))) val root1 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema1), allocator) val vector1 = root1.getVector("v1").asInstanceOf[Float8Vector] vector1.allocateNew() for (i <- 0 until batchSize) { vector1.set(i, i + 10.0) } vector1.setValueCount(batchSize) val out1 = new ByteArrayOutputStream() val arrowWriter1 = new ArrowFileWriter(root1, null, Channels.newChannel(out1)) arrowWriter1.writeBatch() arrowWriter1.close() root1.close() df = df.withColumn("f1_schema", struct(lit(0.0).as("v1"))) df = df.withColumn("f1_data", lit(out1.toByteArray)) val schema2 = StructType(Seq(StructField("v2", DoubleType), StructField("v3", LongType))) val root2 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema2), allocator) val vector2 = root2.getVector("v2").asInstanceOf[Float8Vector] val vector3 = root2.getVector("v3").asInstanceOf[BigIntVector] vector2.allocateNew() vector3.allocateNew() for (i <- 0 until batchSize) { vector2.set(i, i + 20.0) } vector2.setValueCount(batchSize) for (i <- 0 until batchSize) { vector3.set(i, i + 30L) } vector3.setValueCount(batchSize) val out2 = new ByteArrayOutputStream() val arrowWriter2 = new ArrowFileWriter(root2, null, Channels.newChannel(out2)) arrowWriter2.writeBatch() arrowWriter2.close() root2.close() df = df.withColumn("f2_schema", struct(lit(0.0).as("v2"), lit(0L).as("v3"))) df = df.withColumn("f2_data", lit(out2.toByteArray)) var tsrdd = TimeSeriesRDD.fromDF(df)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS) tsrdd = tsrdd.concatArrowAndExplode("base_rows", Seq("f1_schema", "f2_schema"), Seq("f1_data", "f2_data")) tsrdd.toDF.show() var expected = spark.range(1000, 1000 + batchSize).toDF("time") expected = expected.withColumn("v", col("time") - 1000.0) expected = expected.withColumn("v1", col("time") - 1000 + 10.0) expected = expected.withColumn("v2", col("time") - 1000 + 20.0) expected = expected.withColumn("v3", col("time") - 1000 + 30) val expectedTsrdd = TimeSeriesRDD.fromDF(expected)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS) assertEquals(tsrdd, expectedTsrdd) } }
Example 50
Source File: RateSource.scala From spark-structured-streaming-examples with Apache License 2.0 | 5 votes |
package com.phylosoft.spark.learning.sql.streaming.source.rate

import com.phylosoft.spark.learning.sql.streaming.source.StreamingSource
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{DataFrame, SparkSession}

trait RateSource extends StreamingSource {

  val spark: SparkSession
  val rowsPerSecond: String
  val numPartitions: String

  override def readStream(): DataFrame = {
    spark.readStream
      .format("rate")
      .option("rowsPerSecond", rowsPerSecond)
      .option("numPartitions", numPartitions)
      .load()
      .select(col("*"))
  }
}
Example 51
Source File: AdRateSources.scala From spark-structured-streaming-examples with Apache License 2.0 | 5 votes |
package com.phylosoft.spark.learning.sql.streaming.source.rate

import org.apache.spark.sql.functions.{col, rand}
import org.apache.spark.sql.{DataFrame, SparkSession}

class AdRateSources(val spark: SparkSession,
                    val rowsPerSecond: String = "5",
                    val numPartitions: String = "1")
  extends RateSource {

  def loadImpressions(): DataFrame = {
    readStream()
      .select(
        col("value").as("adId"),
        col("timestamp").as("impressionTime"))
  }

  def loadClicks(): DataFrame = {
    readStream()
      .where((rand() * 100).cast("integer") < 10) // 10 out of every 100 impressions result in a click
      .select((col("value") - 50).as("adId"), col("timestamp").as("clickTime")) // -50 so that a click with same id as impression is generated much later (i.e. delayed data).
      .where("adId > 0")
  }
}
Example 52
Source File: UserActionsRateSource.scala From spark-structured-streaming-examples with Apache License 2.0 | 5 votes |
package com.phylosoft.spark.learning.sql.streaming.source.rate

import org.apache.spark.sql.functions.{col, lit, pmod, rand}
import org.apache.spark.sql.{DataFrame, SparkSession}

class UserActionsRateSource(val spark: SparkSession,
                            val rowsPerSecond: String = "5",
                            val numPartitions: String = "1")
  extends RateSource {

  def loadUserActions(): DataFrame = {
    readStream()
      .where((rand() * 100).cast("integer") < 30) // 30 out of every 100 user actions
      .select(pmod(col("value"), lit(9)).as("userId"), col("timestamp").as("actionTime"))
  }
}
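Note: these rate sources derive synthetic IDs from the rate stream's value column: pmod(col("value"), lit(9)) buckets users and (rand() * 100).cast("integer") < 30 thins the stream. A sketch of the same transformation on a bounded batch DataFrame (assumed data), so the result can be inspected without starting a streaming query:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, current_timestamp, lit, pmod, rand}

object UserActionsBatchSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("user-actions-batch-sketch").getOrCreate()
  // Stand-in for the rate source: a bounded "value" column plus a timestamp
  val events = spark.range(0, 100).toDF("value")
    .withColumn("timestamp", current_timestamp())
  events
    .where((rand() * 100).cast("integer") < 30) // keep roughly 30% of rows
    .select(pmod(col("value"), lit(9)).as("userId"),
            col("timestamp").as("actionTime"))
    .show(10)
  spark.stop()
}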
Example 53
Source File: FilterTopFeaturesProcess.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.s2jobs.wal.process import org.apache.s2graph.s2jobs.task.TaskConf import org.apache.s2graph.s2jobs.wal.WalLogAgg import org.apache.s2graph.s2jobs.wal.transformer.{DefaultTransformer, Transformer} import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import play.api.libs.json.{JsObject, Json} object FilterTopFeaturesProcess { private var validFeatureHashKeys: Set[Long] = null def getValidFeatureHashKeys(validFeatureHashKeysBCast: Broadcast[Array[Long]]): Set[Long] = { if (validFeatureHashKeys == null) { validFeatureHashKeys = validFeatureHashKeysBCast.value.toSet } validFeatureHashKeys } def collectDistinctFeatureHashes(ss: SparkSession, filteredDict: DataFrame): Array[Long] = { import ss.implicits._ val featureHashUDF = udf((dim: String, value: String) => WalLogAgg.toFeatureHash(dim, value)) filteredDict.withColumn("featureHash", featureHashUDF(col("dim"), col("value"))) .select("featureHash") .distinct().as[Long].collect() } def filterTopKsPerDim(dict: DataFrame, maxRankPerDim: Broadcast[Map[String, Int]], defaultMaxRank: Int): DataFrame = { val filterUDF = udf((dim: String, rank: Long) => { rank < maxRankPerDim.value.getOrElse(dim, defaultMaxRank) }) dict.filter(filterUDF(col("dim"), col("rank"))) } def filterWalLogAgg(ss: SparkSession, walLogAgg: Dataset[WalLogAgg], transformers: Seq[Transformer], validFeatureHashKeysBCast: Broadcast[Array[Long]]) = { import ss.implicits._ walLogAgg.mapPartitions { iter => val validFeatureHashKeys = getValidFeatureHashKeys(validFeatureHashKeysBCast) iter.map { walLogAgg => WalLogAgg.filterProps(walLogAgg, transformers, validFeatureHashKeys) } } } } class FilterTopFeaturesProcess(taskConf: TaskConf) extends org.apache.s2graph.s2jobs.task.Process(taskConf) { import FilterTopFeaturesProcess._ override def execute(ss: SparkSession, inputMap: Map[String, DataFrame]): DataFrame = { import ss.implicits._ val maxRankPerDim = taskConf.options.get("maxRankPerDim").map { s => Json.parse(s).as[JsObject].fields.map { case (k, jsValue) => k -> jsValue.as[Int] }.toMap } val maxRankPerDimBCast = ss.sparkContext.broadcast(maxRankPerDim.getOrElse(Map.empty)) val defaultMaxRank = taskConf.options.get("defaultMaxRank").map(_.toInt) val featureDict = inputMap(taskConf.options("featureDict")) val walLogAgg = inputMap(taskConf.options("walLogAgg")).as[WalLogAgg] val transformers = TaskConf.parseTransformers(taskConf) val filteredDict = filterTopKsPerDim(featureDict, maxRankPerDimBCast, defaultMaxRank.getOrElse(Int.MaxValue)) val validFeatureHashKeys = collectDistinctFeatureHashes(ss, filteredDict) val validFeatureHashKeysBCast = ss.sparkContext.broadcast(validFeatureHashKeys) filterWalLogAgg(ss, walLogAgg, transformers, validFeatureHashKeysBCast).toDF() } override def mandatoryOptions: Set[String] = Set("featureDict", "walLogAgg") }
Example 54
Source File: WholeStageCodegenSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.Row import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions.{avg, broadcast, col, max} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{IntegerType, StringType, StructType} class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext { test("range/filter should be combined") { val df = spark.range(10).filter("id = 1").selectExpr("id + 1") val plan = df.queryExecution.executedPlan assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined) assert(df.collect() === Array(Row(2))) } test("Aggregate should be included in WholeStageCodegen") { val df = spark.range(10).groupBy().agg(max(col("id")), avg(col("id"))) val plan = df.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) assert(df.collect() === Array(Row(9, 4.5))) } test("Aggregate with grouping keys should be included in WholeStageCodegen") { val df = spark.range(3).groupBy("id").count().orderBy("id") val plan = df.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) assert(df.collect() === Array(Row(0, 1), Row(1, 1), Row(2, 1))) } test("BroadcastHashJoin should be included in WholeStageCodegen") { val rdd = spark.sparkContext.makeRDD(Seq(Row(1, "1"), Row(1, "1"), Row(2, "2"))) val schema = new StructType().add("k", IntegerType).add("v", StringType) val smallDF = spark.createDataFrame(rdd, schema) val df = spark.range(10).join(broadcast(smallDF), col("k") === col("id")) assert(df.queryExecution.executedPlan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[BroadcastHashJoinExec]).isDefined) assert(df.collect() === Array(Row(1, 1, "1"), Row(1, 1, "1"), Row(2, 2, "2"))) } test("Sort should be included in WholeStageCodegen") { val df = spark.range(3, 0, -1).toDF().sort(col("id")) val plan = df.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SortExec]).isDefined) assert(df.collect() === Array(Row(1), Row(2), Row(3))) } test("MapElements should be included in WholeStageCodegen") { import testImplicits._ val ds = spark.range(10).map(_.toString) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SerializeFromObjectExec]).isDefined) assert(ds.collect() === 0.until(10).map(_.toString).toArray) } test("typed filter should be included in WholeStageCodegen") { val ds = spark.range(10).filter(_ % 2 == 0) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined) assert(ds.collect() === Array(0, 2, 4, 6, 8)) } test("back-to-back typed filter should be included in WholeStageCodegen") { val ds = spark.range(10).filter(_ % 2 == 0).filter(_ % 3 == 0) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined) 
assert(ds.collect() === Array(0, 6)) } test("simple typed UDAF should be included in WholeStageCodegen") { import testImplicits._ val ds = Seq(("a", 10), ("b", 1), ("b", 2), ("c", 1)).toDS() .groupByKey(_._1).agg(typed.sum(_._2)) val plan = ds.queryExecution.executedPlan assert(plan.find(p => p.isInstanceOf[WholeStageCodegenExec] && p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0))) } }
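Note: several of these whole-stage-codegen tests use col purely to build aggregate and join expressions, e.g. agg(max(col("id")), avg(col("id"))) and join(broadcast(smallDF), col("k") === col("id")). A compact sketch of those two usages outside the test harness (toy data assumed):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{avg, broadcast, col, max}

object AggAndJoinSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("agg-and-join-sketch").getOrCreate()
  import spark.implicits._
  // Aggregates built from col expressions
  spark.range(10).groupBy().agg(max(col("id")), avg(col("id"))).show()
  // Broadcast hash join keyed on a col equality
  val small = Seq((1, "1"), (2, "2")).toDF("k", "v")
  spark.range(10).join(broadcast(small), col("k") === col("id")).show()
  spark.stop()
}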
Example 55
Source File: SuiteKickoff.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.workload import com.ibm.sparktc.sparkbench.utils.SparkFuncs._ import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.functions.{col, lit} import scala.collection.parallel.ForkJoinTaskSupport object SuiteKickoff { private val log = org.slf4j.LoggerFactory.getLogger(getClass) def run(s: Suite, spark: SparkSession): Unit = { verifyOutput(s.benchmarkOutput, s.saveMode, spark) // Translate the maps into runnable workloads val workloads: Seq[Workload] = s.workloadConfigs.map(ConfigCreator.mapToConf) val dataframes: Seq[DataFrame] = (0 until s.repeat).flatMap { i => // This will produce one DataFrame of one row for each workload in the sequence. // We're going to produce one coherent DF later from these val dfSeqFromOneRun: Seq[DataFrame] = { if (s.parallel) runParallel(workloads, spark) else runSerially(workloads, spark) } // Indicate which run of this suite this was. dfSeqFromOneRun.map(_.withColumn("run", lit(i))) } // getting the Spark confs so we can output them in the results. val strSparkConfs = spark.conf.getAll // Ah, see, here's where we're joining that series of one-row DFs val singleDF = joinDataFrames(dataframes, spark) s.description.foreach(log.info) // And now we're going to curry in the results val plusSparkConf = addConfToResults(singleDF, strSparkConfs) val plusDescription = addConfToResults(plusSparkConf, Map("description" -> s.description)).coalesce(1) // And write to disk. We're done with this suite! if(s.benchmarkOutput.nonEmpty) writeToDisk(s.benchmarkOutput.get, s.saveMode, plusDescription, spark) } private def runParallel(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = { val confSeqPar = workloadConfigs.par confSeqPar.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(confSeqPar.size)) confSeqPar.map(_.run(spark)).seq } private def runSerially(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = { workloadConfigs.map(_.run(spark)) } private def joinDataFrames(seq: Seq[DataFrame], spark: SparkSession): DataFrame = { if (seq.length == 1) seq.head else { val seqOfColNames = seq.map(_.columns.toSet) val allTheColumns = seqOfColNames.foldLeft(Set[String]())(_ ++ _) def expr(myCols: Set[String], allCols: Set[String]) = { allCols.toList.map { case x if myCols.contains(x) => col(x) case x => lit(null).as(x) } } val seqFixedDfs = seq.map(df => df.select(expr(df.columns.toSet, allTheColumns): _*)) // Folding left across this sequence should be fine because each DF should only have 1 row // Nevarr Evarr do this to legit dataframes that are all like big and stuff seqFixedDfs.foldLeft(spark.createDataFrame(spark.sparkContext.emptyRDD[Row], seqFixedDfs.head.schema))(_ union _) } } }
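Note: the joinDataFrames helper unions one-row result DataFrames that may not share the same columns: for each frame it selects every column in the combined set, substituting lit(null).as(name) where a column is missing. A sketch of that alignment trick in isolation, with assumed toy frames and a hypothetical helper name:

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.{col, lit}

object UnionByColumnsSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("union-by-columns-sketch").getOrCreate()
  import spark.implicits._
  val a = Seq(("wl1", 12L)).toDF("name", "runtime")
  val b = Seq(("wl2", 0.97)).toDF("name", "accuracy")
  val allCols = a.columns.toSet ++ b.columns.toSet
  // Select every column in the combined set, filling the ones a frame lacks with nulls
  def aligned(df: DataFrame): DataFrame =
    df.select(allCols.toList.map { c =>
      if (df.columns.contains(c)) col(c) else lit(null).as(c)
    }: _*)
  aligned(a).union(aligned(b)).show()
  spark.stop()
}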
Example 56
Source File: Cleaner.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions.clean import com.hankcs.hanlp.HanLP import config.paramconf.{HasOutputCol, HasInputCol} import functions.MySchemaUtils import functions.clean.chinese.BCConvert import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, Dataset} setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1) override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema, logging = true) val cleanFunc = udf {line: String => var cleaned = "" getFanJian match { case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line) case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line) case _ => cleaned = line } getQuanBan match { case "q2b" => cleaned = BCConvert.qj2bj(cleaned) case "b2q" => cleaned = BCConvert.bj2qj(cleaned) case _ => cleaned = cleaned } cleaned } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record => val outputIndex = record.fieldIndex($(outputCol)) record.getString(outputIndex).length >= getMinLineLen } } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.typeName.equals(StringType.typeName), s"Input type must be StringType but got $inputType.") MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable) } } object Cleaner extends DefaultParamsReadable[Cleaner] { override def load(path: String): Cleaner = super.load(path) }
Example 57
Source File: implicits.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb

import com.audienceproject.spark.dynamodb.reflect.SchemaAnalysis
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StructField

import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

object implicits {

  implicit class DynamoDBDataFrameReader(reader: DataFrameReader) {

    def dynamodb(tableName: String): DataFrame =
      getDynamoDBSource(tableName).load()

    def dynamodb(tableName: String, indexName: String): DataFrame =
      getDynamoDBSource(tableName).option("indexName", indexName).load()

    def dynamodbAs[T <: Product : ClassTag : TypeTag](tableName: String): Dataset[T] = {
      implicit val encoder: Encoder[T] = ExpressionEncoder()
      getColumnsAlias(getDynamoDBSource(tableName)
        .schema(SchemaAnalysis[T]).load()).as
    }

    def dynamodbAs[T <: Product : ClassTag : TypeTag](tableName: String, indexName: String): Dataset[T] = {
      implicit val encoder: Encoder[T] = ExpressionEncoder()
      getColumnsAlias(getDynamoDBSource(tableName)
        .option("indexName", indexName)
        .schema(SchemaAnalysis[T]).load()).as
    }

    private def getDynamoDBSource(tableName: String): DataFrameReader =
      reader.format("com.audienceproject.spark.dynamodb.datasource").option("tableName", tableName)

    private def getColumnsAlias(dataFrame: DataFrame): DataFrame = {
      val columnsAlias = dataFrame.schema.collect({
        case StructField(name, _, _, metadata) if metadata.contains("alias") =>
          col(name).as(metadata.getString("alias"))
        case StructField(name, _, _, _) =>
          col(name)
      })
      dataFrame.select(columnsAlias: _*)
    }
  }

  implicit class DynamoDBDataFrameWriter[T](writer: DataFrameWriter[T]) {

    def dynamodb(tableName: String): Unit =
      writer.format("com.audienceproject.spark.dynamodb.datasource").option("tableName", tableName).save()
  }
}
Example 58
Source File: TestUtils.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.utils

import org.apache.spark.sql.functions.{col, count, lit}
import org.apache.spark.sql.{DataFrame, Row}

object TestUtils {

  implicit class ExtendedDataFrame(df: DataFrame) {

    def hasDiff(anotherDf: DataFrame): Boolean = {
      def printDiff(incoming: Boolean)(row: Row): Unit = {
        if (incoming) print("+ ") else print("- ")
        println(row)
      }

      val groupedDf = df.groupBy(df.columns.map(col): _*).agg(count(lit(1))).collect().toSet
      val groupedAnotherDf = anotherDf.groupBy(anotherDf.columns.map(col): _*).agg(count(lit(1))).collect().toSet

      groupedDf.diff(groupedAnotherDf).foreach(printDiff(incoming = true))
      groupedAnotherDf.diff(groupedDf).foreach(printDiff(incoming = false))

      groupedDf.diff(groupedAnotherDf).nonEmpty || groupedAnotherDf.diff(groupedDf).nonEmpty
    }
  }
}
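A possible caller-side sketch for the hasDiff helper above, assuming the TestUtils object is available on the classpath as shown; the groupBy over df.columns.map(col) makes the comparison order-insensitive but sensitive to duplicate rows.

import org.apache.spark.sql.SparkSession
import com.adidas.utils.TestUtils._  // assumes the helper above is on the classpath

object HasDiffExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("has-diff").getOrCreate()
    import spark.implicits._

    val expected = Seq((1, "a"), (2, "b")).toDF("id", "value")
    val actual   = Seq((2, "b"), (1, "a")).toDF("id", "value")

    // Row order does not matter, duplicate counts do: both sides are grouped before comparison.
    assert(!expected.hasDiff(actual))
    spark.stop()
  }
}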
Example 59
Source File: PartitionHelpers.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.algo.core

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}

trait PartitionHelpers {

  protected def getDistinctPartitions(outputDataFrame: DataFrame, targetPartitions: Seq[String]): Dataset[Row] = {
    val targetPartitionsColumns: Seq[Column] = targetPartitions.map(partitionString => col(partitionString))
    outputDataFrame.select(targetPartitionsColumns: _*).distinct
  }

  protected def getParameterValue(row: Row, partitionString: String): String =
    createParameterValue(row.get(row.fieldIndex(partitionString)))

  protected def createParameterValue(partitionRawValue: Any): String = partitionRawValue match {
    case value: java.lang.Short => value.toString
    case value: java.lang.Integer => value.toString
    case value: scala.Predef.String => "'" + value + "'"
    case null => throw new Exception("Partition Value is null. No support for null partitions!")
    case value => throw new Exception("Unsupported partition DataType: " + value.getClass)
  }
}
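getDistinctPartitions reduces to a select over col-wrapped partition names followed by distinct. A self-contained sketch of the same operation with illustrative column names and data:

import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.functions.col

object DistinctPartitionsExample {
  // Select only the partition columns and deduplicate, as the trait above does.
  def distinctPartitions(df: DataFrame, partitionCols: Seq[String]): Dataset[Row] =
    df.select(partitionCols.map(col): _*).distinct()

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("distinct-partitions").getOrCreate()
    import spark.implicits._
    val df = Seq(("2020", "01", 1.0), ("2020", "01", 2.0), ("2020", "02", 3.0))
      .toDF("year", "month", "amount")
    distinctPartitions(df, Seq("year", "month")).show()  // two rows: (2020, 01) and (2020, 02)
    spark.stop()
  }
}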
Example 60
Source File: Example3_7.scala From LearningSparkV2 with Apache License 2.0 | 5 votes |
package main.scala.chapter3

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{col, expr}

object Example3_7 {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("Example-3_7")
      .getOrCreate()

    if (args.length <= 0) {
      println("usage Example3_7 <file path to blogs.json>")
      System.exit(1)
    }

    // Get the path to the JSON file
    val jsonFile = args(0)

    // Define our schema as before
    val schema = StructType(Array(
      StructField("Id", IntegerType, false),
      StructField("First", StringType, false),
      StructField("Last", StringType, false),
      StructField("Url", StringType, false),
      StructField("Published", StringType, false),
      StructField("Hits", IntegerType, false),
      StructField("Campaigns", ArrayType(StringType), false)))

    // Create a DataFrame by reading the JSON file with the predefined schema
    val blogsDF = spark.read.schema(schema).json(jsonFile)

    // Show the DataFrame contents
    blogsDF.show(truncate = false)

    // Print the schema
    print(blogsDF.printSchema)
    print(blogsDF.schema)

    // Show columns and expressions
    blogsDF.select(expr("Hits") * 2).show(2)
    blogsDF.select(col("Hits") * 2).show(2)
    blogsDF.select(expr("Hits * 2")).show(2)

    // Show heavy hitters
    blogsDF.withColumn("Big Hitters", (expr("Hits > 10000"))).show()
  }
}
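The three select calls above are interchangeable ways of building the same column expression: col constructs it programmatically, expr parses a SQL fragment, and the $ interpolator (from spark.implicits) is shorthand for col. A small sketch with made-up data:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, expr}

object ColVsExpr {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("col-vs-expr").getOrCreate()
    import spark.implicits._
    val df = Seq(("blog1", 4535), ("blog2", 8908)).toDF("Title", "Hits")

    // Three equivalent column expressions: programmatic col, parsed SQL expr, and the $ shorthand.
    df.select(col("Hits") * 2).show()
    df.select(expr("Hits * 2")).show()
    df.select($"Hits" * 2).show()
    spark.stop()
  }
}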
Example 61
Source File: Word2Vec.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature

import scala.util.Random

import org.apache.spark.ml
import org.apache.spark.ml.{PipelineStage, Transformer}
import org.apache.spark.ml.feature.Word2VecModel
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, split}

import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator

object Word2Vec extends BenchmarkAlgorithm with TestFromTraining {

  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._

    val df = DataGenerator.generateDoc(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      vocabSize,
      docLength,
      "text")
    df.select(split(col("text"), " ").as("text"))
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    new ml.feature.Word2Vec().setInputCol("text")
  }

  override def testAdditionalMethods(
      ctx: MLBenchContext,
      model: Transformer): Map[String, () => _] = {
    import ctx.params._

    val rng = new Random(ctx.seed())
    val word2vecModel = model.asInstanceOf[Word2VecModel]
    val testWord = Vectors.dense(Array.fill(word2vecModel.getVectorSize)(rng.nextGaussian()))

    Map("findSynonyms" -> (() => {
      word2vecModel.findSynonyms(testWord, numSynonymsToFind)
    }))
  }
}
Example 62
Source File: WholeStageCodegenSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.aggregate.HashAggregateExec
import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec
import org.apache.spark.sql.expressions.scalalang.typed
import org.apache.spark.sql.functions.{avg, broadcast, col, max}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext {

  test("range/filter should be combined") {
    val df = spark.range(10).filter("id = 1").selectExpr("id + 1")
    val plan = df.queryExecution.executedPlan
    assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined)
    assert(df.collect() === Array(Row(2)))
  }

  test("Aggregate should be included in WholeStageCodegen") {
    val df = spark.range(10).groupBy().agg(max(col("id")), avg(col("id")))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(df.collect() === Array(Row(9, 4.5)))
  }

  test("Aggregate with grouping keys should be included in WholeStageCodegen") {
    val df = spark.range(3).groupBy("id").count().orderBy("id")
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(df.collect() === Array(Row(0, 1), Row(1, 1), Row(2, 1)))
  }

  test("BroadcastHashJoin should be included in WholeStageCodegen") {
    val rdd = spark.sparkContext.makeRDD(Seq(Row(1, "1"), Row(1, "1"), Row(2, "2")))
    val schema = new StructType().add("k", IntegerType).add("v", StringType)
    val smallDF = spark.createDataFrame(rdd, schema)
    val df = spark.range(10).join(broadcast(smallDF), col("k") === col("id"))
    assert(df.queryExecution.executedPlan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[BroadcastHashJoinExec]).isDefined)
    assert(df.collect() === Array(Row(1, 1, "1"), Row(1, 1, "1"), Row(2, 2, "2")))
  }

  test("Sort should be included in WholeStageCodegen") {
    val df = spark.range(3, 0, -1).toDF().sort(col("id"))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SortExec]).isDefined)
    assert(df.collect() === Array(Row(1), Row(2), Row(3)))
  }

  test("MapElements should be included in WholeStageCodegen") {
    import testImplicits._
    val ds = spark.range(10).map(_.toString)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SerializeFromObjectExec]).isDefined)
    assert(ds.collect() === 0.until(10).map(_.toString).toArray)
  }

  test("typed filter should be included in WholeStageCodegen") {
    val ds = spark.range(10).filter(_ % 2 == 0)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined)
    assert(ds.collect() === Array(0, 2, 4, 6, 8))
  }

  test("back-to-back typed filter should be included in WholeStageCodegen") {
    val ds = spark.range(10).filter(_ % 2 == 0).filter(_ % 3 == 0)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined)
    assert(ds.collect() === Array(0, 6))
  }

  test("simple typed UDAF should be included in WholeStageCodegen") {
    import testImplicits._
    val ds = Seq(("a", 10), ("b", 1), ("b", 2), ("c", 1)).toDS()
      .groupByKey(_._1).agg(typed.sum(_._2))
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0)))
  }
}
Example 63
Source File: HashingTF.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
}
Example 64
Source File: HashingTF.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
}
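Both HashingTF copies (this example and the previous one) share the select(col("*"), ...) idiom, which appends the hashed feature vector while preserving every input column. A hedged usage sketch from the caller's side, using the stock Spark ML Tokenizer and HashingTF on illustrative sentences:

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

object HashingTFUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("hashing-tf").getOrCreate()
    import spark.implicits._
    val sentences = Seq("spark makes big data simple", "spark sql uses columns").toDF("sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features").setNumFeatures(32)

    // transform() keeps the existing columns (the col("*") part) and appends the hashed vector.
    val featurized = hashingTF.transform(tokenizer.transform(sentences))
    featurized.select("words", "features").show(truncate = false)
    spark.stop()
  }
}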
Example 65
Source File: PredicatePushdownSuite.scala From spark-exasol-connector with Apache License 2.0 | 5 votes |
package com.exasol.spark

import java.sql.Timestamp

import org.apache.spark.sql.functions.col

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.scalatest.funsuite.AnyFunSuite

class PredicatePushdownSuite extends AnyFunSuite with BaseDockerSuite with DataFrameSuiteBase {

  test("with where clause built from filters: filter") {
    createDummyTable()

    import spark.implicits._

    val df = spark.read
      .format("exasol")
      .option("host", container.host)
      .option("port", s"${container.port}")
      .option("query", s"SELECT * FROM $EXA_SCHEMA.$EXA_TABLE")
      .load()
      .filter($"id" < 3)
      .filter(col("city").like("Ber%"))
      .select("id", "city")

    val result = df.collect().map(x => (x.getLong(0), x.getString(1))).toSet
    assert(result.size === 1)
    assert(result === Set((1, "Berlin")))
  }

  test("with where clause built from filters: createTempView and spark.sql") {
    createDummyTable()

    val df = spark.read
      .format("exasol")
      .option("host", container.host)
      .option("port", s"${container.port}")
      .option("query", s"SELECT * FROM $EXA_SCHEMA.$EXA_TABLE")
      .load()

    df.createOrReplaceTempView("myTable")

    val myDF = spark
      .sql("SELECT id, city FROM myTable WHERE id BETWEEN 1 AND 3 AND name < 'Japan'")

    val result = myDF.collect().map(x => (x.getLong(0), x.getString(1))).toSet
    assert(result.size === 2)
    assert(result === Set((1, "Berlin"), (2, "Paris")))
  }

  test("date and timestamp should be read and filtered correctly") {
    import java.sql.Date

    createDummyTable()
    val df = spark.read
      .format("exasol")
      .option("host", container.host)
      .option("port", s"${container.port}")
      .option("query", s"SELECT date_info, updated_at FROM $EXA_SCHEMA.$EXA_TABLE")
      .load()

    val minTimestamp = Timestamp.valueOf("2017-12-30 00:00:00.0000")
    val testDate = Date.valueOf("2017-12-31")

    val resultDate = df.collect().map(_.getDate(0))
    assert(resultDate.contains(testDate))

    val resultTimestamp = df.collect().map(_.getTimestamp(1)).map(x => x.after(minTimestamp))
    assert(!resultTimestamp.contains(false))

    val filteredByDateDF = df.filter(col("date_info") === testDate)
    assert(filteredByDateDF.count() === 1)

    val filteredByTimestampDF = df.filter(col("updated_at") < minTimestamp)
    assert(filteredByTimestampDF.count() === 0)
  }

  test("count should be performed successfully") {
    createDummyTable()
    val df = spark.read
      .format("exasol")
      .option("host", container.host)
      .option("port", s"${container.port}")
      .option("query", s"SELECT * FROM $EXA_SCHEMA.$EXA_TABLE")
      .load()
    val result = df.count()
    assert(result === 3)
  }
}
Example 66
Source File: GaussianProcessCommons.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons

import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
import breeze.optimize.LBFGSB
import org.apache.spark.ml.commons.kernel.{EyeKernel, Kernel, _}
import org.apache.spark.ml.commons.util.DiffFunctionMemoized
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.util.Instrumentation
import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Row}

private[ml] trait GaussianProcessCommons[F, E <: Predictor[F, E, M], M <: PredictionModel[F, M]]
  extends ProjectedGaussianProcessHelper {
  this: Predictor[F, E, M] with GaussianProcessParams =>

  protected val getKernel: () => Kernel = () => $(kernel)() + $(sigma2).const * new EyeKernel

  protected def getPoints(dataset: Dataset[_]) = {
    dataset.select(col($(labelCol)), col($(featuresCol))).rdd.map {
      case Row(label: Double, features: Vector) => LabeledPoint(label, features)
    }
  }

  protected def groupForExperts(points: RDD[LabeledPoint]) = {
    val numberOfExperts = Math.round(points.count().toDouble / $(datasetSizeForExpert))
    points.zipWithIndex.map { case (instance, index) =>
      (index % numberOfExperts, instance)
    }.groupByKey().map(_._2)
  }

  protected def getExpertLabelsAndKernels(points: RDD[LabeledPoint]): RDD[(BDV[Double], Kernel)] = {
    groupForExperts(points).map { chunk =>
      val (labels, trainingVectors) = chunk.map(lp => (lp.label, lp.features)).toArray.unzip
      (BDV(labels: _*), getKernel().setTrainingVectors(trainingVectors))
    }
  }

  protected def projectedProcess(expertLabelsAndKernels: RDD[(BDV[Double], Kernel)],
                                 points: RDD[LabeledPoint],
                                 optimalHyperparameters: BDV[Double]) = {
    val activeSet = $(activeSetProvider)($(activeSetSize), expertLabelsAndKernels, points,
      getKernel, optimalHyperparameters, $(seed))

    points.unpersist()

    val (matrixKmnKnm, vectorKmny) = getMatrixKmnKnmAndVectorKmny(expertLabelsAndKernels, activeSet)
    expertLabelsAndKernels.unpersist()

    val optimalKernel = getKernel().setHyperparameters(optimalHyperparameters).setTrainingVectors(activeSet)

    // inv(sigma^2 K_mm + K_mn * K_nm) * K_mn * y
    val (magicVector, magicMatrix) = getMagicVector(optimalKernel,
      matrixKmnKnm, vectorKmny, activeSet, optimalHyperparameters)

    new GaussianProjectedProcessRawPredictor(magicVector, magicMatrix, optimalKernel)
  }

  protected def createModel(uid: String, rawPredictor: GaussianProjectedProcessRawPredictor): M
}

class GaussianProjectedProcessRawPredictor private[commons] (val magicVector: BDV[Double],
                                                             val magicMatrix: BDM[Double],
                                                             val kernel: Kernel) extends Serializable {
  def predict(features: Vector): (Double, Double) = {
    val cross = kernel.crossKernel(features)
    val selfKernel = kernel.selfKernel(features)
    (cross * magicVector, selfKernel + cross * magicMatrix * cross.t)
  }
}
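The getPoints method above is the standard bridge from a Dataset to an RDD[LabeledPoint]: project the label and features columns with col and pattern-match each Row. An isolated sketch of just that step; the column names and sample data are illustrative, not part of spark-gp:

import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions.col

object RowsToLabeledPoints {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("labeled-points").getOrCreate()
    import spark.implicits._
    val df = Seq((1.0, Vectors.dense(0.1, 0.2)), (0.0, Vectors.dense(0.3, 0.4)))
      .toDF("label", "features")

    // Project the two columns of interest, then convert each Row into an ml LabeledPoint.
    val points = df.select(col("label"), col("features")).rdd.map {
      case Row(label: Double, features: Vector) => LabeledPoint(label, features)
    }
    points.collect().foreach(println)
    spark.stop()
  }
}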
Example 67
Source File: PageRankSuite.scala From graphframes with Apache License 2.0 | 5 votes |
package org.graphframes.lib

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.DataTypes

import org.graphframes.examples.Graphs
import org.graphframes.{GraphFrameTestSparkContext, SparkFunSuite, TestUtils}

class PageRankSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  val n = 100

  test("Star example") {
    val g = Graphs.star(n)
    val resetProb = 0.15
    val errorTol = 1.0e-5
    val pr = g.pageRank
      .resetProbability(resetProb)
      .tol(errorTol).run()
    TestUtils.testSchemaInvariants(g, pr)
    TestUtils.checkColumnType(pr.vertices.schema, "pagerank", DataTypes.DoubleType)
    TestUtils.checkColumnType(pr.edges.schema, "weight", DataTypes.DoubleType)
  }

  test("friends graph with personalized PageRank") {
    val results = Graphs.friends.pageRank.resetProbability(0.15).maxIter(10).sourceId("a").run()
    val gRank = results.vertices.filter(col("id") === "g").select("pagerank").first().getDouble(0)
    assert(gRank === 0.0,
      s"User g (Gabby) doesn't connect with a. So its pagerank should be 0 but we got $gRank.")
  }
}
Example 68
Source File: ParallelPersonalizedPageRankSuite.scala From graphframes with Apache License 2.0 | 5 votes |
package org.graphframes.lib

import com.github.zafarkhaja.semver.Version

import org.apache.spark.ml.linalg.{SQLDataTypes, SparseVector}
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.DataTypes

import org.graphframes.examples.Graphs
import org.graphframes.{GraphFrameTestSparkContext, SparkFunSuite, TestUtils}

class ParallelPersonalizedPageRankSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  val n = 100

  test("Illegal function call argument setting") {
    val g = Graphs.star(n)
    val vertexIds: Array[Any] = Array(1L, 2L, 3L)

    // Not providing number of iterations
    intercept[IllegalArgumentException] {
      g.parallelPersonalizedPageRank.sourceIds(vertexIds).run()
    }
    // Not providing sourceIds
    intercept[IllegalArgumentException] {
      g.parallelPersonalizedPageRank.maxIter(15).run()
    }
    // Provided empty sourceIds
    intercept[IllegalArgumentException] {
      g.parallelPersonalizedPageRank.maxIter(15).sourceIds(Array()).run()
    }
  }

  test("Star example parallel personalized PageRank") {
    val g = Graphs.star(n)
    val resetProb = 0.15
    val maxIter = 10
    val vertexIds: Array[Any] = Array(1L, 2L, 3L)

    lazy val prc = g.parallelPersonalizedPageRank
      .maxIter(maxIter)
      .sourceIds(vertexIds)
      .resetProbability(resetProb)

    val pr = prc.run()
    TestUtils.testSchemaInvariants(g, pr)
    TestUtils.checkColumnType(pr.vertices.schema, "pageranks", SQLDataTypes.VectorType)
    TestUtils.checkColumnType(pr.edges.schema, "weight", DataTypes.DoubleType)
  }

  // In Spark <2.4, sourceIds must be smaller than Int.MaxValue,
  // which might not be the case for LONG_ID in graph.indexedVertices.
  if (Version.valueOf(org.apache.spark.SPARK_VERSION)
      .greaterThanOrEqualTo(Version.valueOf("2.4.0"))) {
    test("friends graph with parallel personalized PageRank") {
      val g = Graphs.friends
      val resetProb = 0.15
      val maxIter = 10
      val vertexIds: Array[Any] = Array("a")

      lazy val prc = g.parallelPersonalizedPageRank
        .maxIter(maxIter)
        .sourceIds(vertexIds)
        .resetProbability(resetProb)

      val pr = prc.run()

      val prInvalid = pr.vertices
        .select("pageranks")
        .collect()
        .filter { row: Row => vertexIds.size != row.getAs[SparseVector](0).size }
      assert(prInvalid.size === 0,
        s"found ${prInvalid.size} entries with invalid number of returned personalized pagerank vector")

      val gRank = pr.vertices
        .filter(col("id") === "g")
        .select("pageranks")
        .first().getAs[SparseVector](0)
      assert(gRank.numNonzeros === 0,
        s"User g (Gabby) doesn't connect with a. So its pagerank should be 0 but we got ${gRank.numNonzeros}.")
    }
  }
}
Example 69
Source File: ShortestPaths.scala From graphframes with Apache License 2.0 | 5 votes |
package org.graphframes.lib

import java.util

import scala.collection.JavaConverters._

import org.apache.spark.graphx.{lib => graphxlib}
import org.apache.spark.sql.{Column, DataFrame, Row}
import org.apache.spark.sql.api.java.UDF1
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{IntegerType, MapType}

import org.graphframes.GraphFrame

  def landmarks(value: util.ArrayList[Any]): this.type = {
    landmarks(value.asScala)
  }

  def run(): DataFrame = {
    ShortestPaths.run(graph, check(lmarks, "landmarks"))
  }
}

private object ShortestPaths {

  private def run(graph: GraphFrame, landmarks: Seq[Any]): DataFrame = {
    val idType = graph.vertices.schema(GraphFrame.ID).dataType
    val longIdToLandmark = landmarks.map(l => GraphXConversions.integralId(graph, l) -> l).toMap
    val gx = graphxlib.ShortestPaths.run(
      graph.cachedTopologyGraphX,
      longIdToLandmark.keys.toSeq.sorted).mapVertices { case (_, m) => m.toSeq }
    val g = GraphXConversions.fromGraphX(graph, gx, vertexNames = Seq(DISTANCE_ID))
    val distanceCol: Column = if (graph.hasIntegralIdType) {
      // There is no easy way to convert a sequence of pairs into a map directly,
      // so a UDF builds the map from the collected (landmark, distance) pairs.
      val mapToLandmark = udf { distances: Seq[Row] =>
        distances.map { case Row(k: Long, v: Int) => k -> v }.toMap
      }
      mapToLandmark(g.vertices(DISTANCE_ID))
    } else {
      val func = new UDF1[Seq[Row], Map[Any, Int]] {
        override def call(t1: Seq[Row]): Map[Any, Int] = {
          t1.map { case Row(k: Long, v: Int) => longIdToLandmark(k) -> v }.toMap
        }
      }
      val mapToLandmark = udf(func, MapType(idType, IntegerType, false))
      mapToLandmark(col(DISTANCE_ID))
    }
    val cols = graph.vertices.columns.map(col) :+ distanceCol.as(DISTANCE_ID)
    g.vertices.select(cols: _*)
  }

  private val DISTANCE_ID = "distances"
}
Example 70
Source File: TriangleCount.scala From graphframes with Apache License 2.0 | 5 votes |
package org.graphframes.lib

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{array, col, explode, when}

import org.graphframes.GraphFrame
import org.graphframes.GraphFrame.{DST, ID, LONG_DST, LONG_SRC, SRC}

class TriangleCount private[graphframes] (private val graph: GraphFrame) extends Arguments {

  def run(): DataFrame = {
    TriangleCount.run(graph)
  }
}

private object TriangleCount {

  private def run(graph: GraphFrame): DataFrame = {
    // Dedup edges by flipping them to have LONG_SRC < LONG_DST
    // TODO (when we drop support for Spark 1.4): Use functions greatest, smallest instead of UDFs
    val dedupedE = graph.indexedEdges
      .filter(s"$LONG_SRC != $LONG_DST")
      .selectExpr(
        s"if($LONG_SRC < $LONG_DST, $SRC, $DST) as $SRC",
        s"if($LONG_SRC < $LONG_DST, $DST, $SRC) as $DST")
      .dropDuplicates(Seq(SRC, DST))
    val g2 = GraphFrame(graph.vertices, dedupedE)

    // Because SRC < DST, there exists only one type of triangles:
    // - Non-cycle with one edge flipped. These are counted 1 time each by motif finding.
    val triangles = g2.find("(a)-[]->(b); (b)-[]->(c); (a)-[]->(c)")

    val triangleCounts = triangles
      .select(explode(array(col("a.id"), col("b.id"), col("c.id"))).as(ID))
      .groupBy(ID)
      .count()

    val v = graph.vertices
    val countsCol = when(col("count").isNull, 0L).otherwise(col("count"))
    val newV = v.join(triangleCounts, v(ID) === triangleCounts(ID), "left_outer")
      .select(countsCol.as(COUNT_ID) +: v.columns.map(v.apply): _*)
    newV
  }

  private val COUNT_ID = "count"
}
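The final join above relies on a null-to-zero pattern: vertices that never appear in triangleCounts come back from the left outer join with a null count, which when(col("count").isNull, 0L) coalesces to zero. A minimal sketch of just that pattern, with made-up vertices and counts:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, when}

object LeftJoinCountExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("left-join-count").getOrCreate()
    import spark.implicits._
    val vertices = Seq("a", "b", "c").toDF("id")
    val counts = Seq(("a", 2L)).toDF("id", "count")

    // After the left outer join, unmatched vertices have a null count; coalesce it to 0.
    val withCounts = vertices.join(counts, vertices("id") === counts("id"), "left_outer")
      .select(vertices("id"), when(col("count").isNull, 0L).otherwise(col("count")).as("count"))
    withCounts.show()  // a -> 2, b -> 0, c -> 0
    spark.stop()
  }
}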
Example 71
Source File: BaseSparkSpec.scala From gemini with GNU General Public License v3.0 | 5 votes |
package tech.sourced.gemini

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.col
import org.scalatest.{BeforeAndAfterAll, Suite}
import org.slf4j.{Logger => Slf4jLogger}

trait BaseSparkSpec extends BeforeAndAfterAll {
  this: Suite =>

  @transient var sparkSession: SparkSession = _
  private var _conf: SparkConf = _

  def useSparkConf(conf: SparkConf): SparkConf = {
    _conf = conf
    _conf
  }

  def useDefaultSparkConf(): SparkConf = {
    val defaultConf: SparkConf = new SparkConf(true)
      .setAppName(this.getClass.getSimpleName)
      .set("spark.cassandra.connection.host", Gemini.defaultCassandraHost)
      .set("spark.cassandra.connection.port", Gemini.defaultCassandraPort.toString)
      .set("spark.cassandra.connection.keep_alive_ms", "5000")
      .set("spark.cassandra.connection.timeout_ms", "30000")
      .set("spark.tech.sourced.bblfsh.grpc.host", "127.0.0.1")
      .set("spark.ui.showConsoleProgress", "false")
      .set("spark.ui.enabled", "false")
      .set("spark.cleaner.ttl", "3600")

    useSparkConf(defaultConf)
  }

  override protected def beforeAll(): Unit = {
    super.beforeAll()
    if (_conf == null) {
      useDefaultSparkConf()
    }
    sparkSession = SparkSession.builder()
      .master("local[*]")
      .config(_conf)
      .config("spark.driver.host", "localhost")
      .getOrCreate()
  }

  override protected def afterAll(): Unit = {
    // Commented out due to "Cannot call methods on a stopped SparkContext".
    // For tests we don't really need to stop Spark here:
    // it is stopped automatically when the tests exit.
    // resetSparkContext()

    // Make sure different suites don't use the same cache.
    cleanSparkCache()
    super.afterAll()
  }

  def resetSparkContext(): Unit = {
    if (sparkSession != null) {
      sparkSession.stop()
    }
    sparkSession = null
  }

  def cleanSparkCache(): Unit = {
    if (sparkSession != null) {
      sparkSession.sqlContext.clearCache()
    }
  }

  // Don't process all content of the repos, to speed up tests.
  class LimitedHash(s: SparkSession, log: Slf4jLogger, mode: String, filePaths: Seq[String])
    extends Hash(s, log, mode) {
    override def filesForRepos(repos: DataFrame): DataFrame =
      super.filesForRepos(repos).filter(col("path").isin(filePaths: _*))
  }

  object LimitedHash {
    def apply(s: SparkSession, log: Slf4jLogger, mode: String, paths: Seq[String]): LimitedHash =
      new LimitedHash(s, log, mode, paths)
  }
}
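LimitedHash narrows the files DataFrame with col("path").isin(filePaths: _*), the usual way to filter a column against an explicit whitelist of values. A small standalone sketch; the paths are illustrative and not gemini's actual schema:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object IsinFilterExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("isin-filter").getOrCreate()
    import spark.implicits._
    val files = Seq("README.md", "src/Main.scala", "build.sbt").toDF("path")
    val keep = Seq("README.md", "build.sbt")

    // isin takes a varargs whitelist; only rows whose path is in the list survive.
    files.filter(col("path").isin(keep: _*)).show()
    spark.stop()
  }
}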
Example 72
Source File: StructuredStreamingWordCount.scala From structured-streaming-application with Apache License 2.0 | 5 votes |
package knolx.spark

import com.datastax.driver.core.Cluster
import knolx.Config._
import knolx.KnolXLogger
import knolx.spark.CassandraForeachWriter.writeToCassandra
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StringType

object StructuredStreamingWordCount extends App with KnolXLogger {
  val cluster = Cluster.builder.addContactPoints(cassandraHosts).build
  val session = cluster.newSession()

  info("Creating keyspace and tables in Cassandra...")
  session.execute(s"CREATE KEYSPACE IF NOT EXISTS $keyspace WITH " +
    "replication = {'class':'SimpleStrategy','replication_factor':1};")
  session.execute(s"CREATE TABLE IF NOT EXISTS $keyspace.wordcount ( word text PRIMARY KEY, count int );")

  info("Closing DB connection...")
  session.close()
  session.getCluster.close()

  info("Creating Spark Session")
  val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  info("Creating Streaming DF...")
  val dataStream = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", bootstrapServer)
    .option("subscribe", topic)
    .load()

  info("Writing data to Cassandra...")
  val query = dataStream
    .select(col("value").cast(StringType).as("word"), lit(1).as("count"))
    .groupBy(col("word"))
    .agg(sum("count").as("count"))
    .writeStream
    .outputMode(OutputMode.Update())
    .foreach(writeToCassandra)
    .option("checkpointLocation", checkPointDir)
    .start()

  info("Waiting for the query to terminate...")
  query.awaitTermination()
  query.stop()
}