org.apache.spark.sql.Encoders Scala Examples

The following examples show how to use org.apache.spark.sql.Encoders. Each example is taken from an open-source project; the source file, project, and license are noted in the header above the code.
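
Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the most commonly used Encoders factory methods: Encoders.product for case classes, Encoders.STRING for strings, and Encoders.scalaInt for primitives. The EncodersQuickStart object and the Person case class are illustrative names only.

import org.apache.spark.sql.{Dataset, Encoders, SparkSession}

object EncodersQuickStart {
  // Hypothetical case class used only for this sketch.
  case class Person(name: String, age: Int)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("EncodersQuickStart").getOrCreate()

    // Encoder for a case class (a Product type), derived from its fields.
    val people: Dataset[Person] =
      spark.createDataset(Seq(Person("Alice", 29), Person("Bob", 31)))(Encoders.product[Person])

    // Encoder for plain strings, e.g. when mapping a Dataset to its string column.
    val names: Dataset[String] = people.map(_.name)(Encoders.STRING)

    // Encoder for a primitive type.
    val ages: Dataset[Int] = people.map(_.age)(Encoders.scalaInt)

    names.show()
    ages.show()
    spark.stop()
  }
}
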
Example 1
Source File: RecoverPartitionsCustomIntegrationTest.scala    From m3d-engine   with Apache License 2.0
package com.adidas.analytics.integration

import com.adidas.utils.TestUtils._
import com.adidas.analytics.algo.AppendLoad
import com.adidas.utils.FileReader
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.sql.{Dataset, Encoders}
import org.scalatest.FeatureSpec
import org.scalatest.Matchers._

import scala.collection.JavaConverters._


class RecoverPartitionsCustomIntegrationTest extends FeatureSpec with BaseIntegrationTest {

  feature("Partitions can be updated programmatically using custom logic") {

    scenario("Using Append Load Algorithm with multiple source files") {
      val testResourceDir = "multiple_source_files"
      val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json")
      val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1")

      val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType]
      val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType]
      val dataReader = FileReader.newDSVFileReader(Some(targetSchema))
      val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema))

      val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema)
      setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader)
      prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv"))
      uploadParameters(testResourceDir)

      // checking pre-conditions
      spark.read.csv(sourceDirPath.toString).count() shouldBe 7
      targetTable.read().count() shouldBe 19

      fs.exists(targetPath20180101) shouldBe false
      fs.exists(headerPath20180101) shouldBe false

      // executing load
      AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run()

      // validating result
      val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true)
      val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true)
      val expectedDf = dataReader.read(spark, expectedDataLocation)
      val actualDf = targetTable.read()

      val producedPartitionsNumber: Dataset[String] = spark
        .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}")
        .as(Encoders.STRING)

      // MetaData Specific Tests
      val expectedPartitions: Dataset[String] = expectedPartitionsDataReader
        .read(spark, expectedPartitionsLocation)
        .as(Encoders.STRING)

      expectedPartitions.collectAsList().asScala.sorted.toSet should
        equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet)

      actualDf.hasDiff(expectedDf) shouldBe false

      spark
        .sql(s"DESCRIBE extended ${targetDatabase}.${tableName} PARTITION(year=2018,month=1,day=1)")
        .filter("col_name == 'Partition Statistics'")
        .head()
        .getAs[String]("data_type").contains("6 rows") shouldBe true

      fs.exists(targetPath20180101) shouldBe true
      fs.exists(headerPath20180101) shouldBe true
    }
  }


} 
Example 2
Source File: LikelihoodRatioTest.scala    From glow   with Apache License 2.0
package io.projectglow.sql.expressions

import breeze.linalg.{DenseMatrix, DenseVector}
import org.apache.spark.ml.linalg.{DenseMatrix => SparkDenseMatrix}
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.StructType

object LikelihoodRatioTest extends LogitTest {
  override type FitState = LRTFitState
  override def fitStatePerPhenotype: Boolean = true
  override val resultSchema: StructType = Encoders.product[LogitTestResults].schema

  override def init(phenotypes: Array[Double], covariates: SparkDenseMatrix): LRTFitState = {
    val nullX = new DenseMatrix(covariates.numRows, covariates.numCols, covariates.values)
    val y = new DenseVector(phenotypes)
    val nullFitState = new NewtonIterationsState(covariates.numRows, covariates.numCols)
    nullFitState.initFromMatrix(nullX, y)
    val nullFit = LogisticRegressionGwas.newtonIterations(nullX, y, nullX.copy, nullFitState)
    val fullFitState = new NewtonIterationsState(covariates.numRows, covariates.numCols + 1)
    val x = DenseMatrix.horzcat(nullX, DenseMatrix.zeros[Double](covariates.numRows, 1))
    LRTFitState(x, x.copy, nullFit, fullFitState)
  }

  override def runTest(
      genotypes: DenseVector[Double],
      phenotypes: DenseVector[Double],
      fitState: LRTFitState): InternalRow = {
    fitState.x(::, -1) := genotypes
    fitState.newtonState.initFromMatrixAndNullFit(fitState.x, phenotypes, fitState.nullFit.args)

    if (!fitState.nullFit.converged) {
      return LogitTestResults.nanRow
    }

    val fullFit =
      LogisticRegressionGwas.newtonIterations(
        fitState.x,
        phenotypes,
        fitState.hessian,
        fitState.newtonState)

    if (!fullFit.converged) {
      return LogitTestResults.nanRow
    }

    val beta = fullFit.args.b(-1)
    LogisticRegressionGwas.makeStats(
      beta,
      fullFit.args.fisher,
      fullFit.logLkhd,
      fitState.nullFit.logLkhd)
  }
}

case class LRTFitState(
    x: DenseMatrix[Double],
    hessian: DenseMatrix[Double],
    nullFit: NewtonResult,
    newtonState: NewtonIterationsState
) 
Example 3
Source File: LocalIntegrationTest.scala    From kafka-examples   with Apache License 2.0
package com.cloudera.streaming.refapp

import java.sql.Timestamp

import org.scalatest.Matchers._
import org.scalatest.concurrent.Eventually._
import org.scalatest.time.{Seconds, Span}

import org.apache.spark.sql.Encoders

class LocalIntegrationTest extends IntegrationTestBase {

  test("Integration test with one kafka and one spark instance embedded in the same JVM") {

    val inputDir = "src/test/resources/samples"

    val spark = EmbeddedSpark.sparkSession

    val fileSource = new FileSources(spark, inputDir)
    val kafkaConfig = EmbeddedKafkaBroker.defaultKafkaConfig
    val kafkaSource = new KafkaSource(spark, kafkaConfig)

    val application = new Application(
      spark,
      Sources(
        statesFromCluster = fileSource.jsonFile("states"),
        customersFromCluster = fileSource.jsonFile("customers"),
        vendorsFromCluster = fileSource.jsonFile("vendors"),
        customersFromStream = kafkaSource.jsonStreamWithKafkaTimestamp("customer"),
        vendorsFromStream = kafkaSource.jsonStreamWithTimestampFromMessage("vendor", "update_timestamp"),
        transactionsFromStream = kafkaSource.jsonStreamWithTimestampFromMessage("transaction", "event_timestamp")
      ),
      Sinks(
        invalidTransactions = Memory.memorySink("invalidTransactions"),
        validTransactions = Memory.memorySink("validTransactions"),
        customerOrphans = Memory.memorySink("customerOrphans"),
        vendorOrphans = Memory.memorySink("vendorOrphans"),
        customers = Memory.memorySink("customers"),
        vendors = Memory.memorySink("vendors"),
        transactionsOperationalMetadata = Memory.memorySink("transactionsOperationalMetadata")
      ))

    application.start()

    eventually(timeout(Span(20, Seconds)), interval(Span(5, Seconds))) {
      EmbeddedKafkaBroker.publishStringMessageToKafka(
        "transaction",
        """{
          "transaction_id": "1",
          "customer_id": 1,
          "vendor_id": 1,
          "event_state": "CREATED",
          "event_timestamp": "2018-11-12 09:42:00",
          "price": "100",
          "card_type": "Credit"}""")
      EmbeddedKafkaBroker.publishStringMessageToKafka(
        "transaction",
        """{
          "transaction_id": "21",
          "customer_id": 100,
          "vendor_id": 2,
          "event_state": "SWIPED",
          "event_timestamp": "2018-11-13 09:45:01",
          "price": "100",
          "card_type": "Debit"}""")

      val validTransactionsQuery = application.streamingQueries.validTransactions
      validTransactionsQuery.processAllAvailable()
      val currentContent = spark.table("validTransactions").as[Transaction](Encoders.product).collect()

      currentContent.shouldBe(
        Array(
          Transaction(
            transaction_id = "1",
            customer_id = Some(1),
            vendor_id = Some(1),
            event_state = Some("CREATED"),
            event_timestamp = Timestamp.valueOf("2018-11-12 09:42:00"),
            price = Some("100"),
            card_type = Some("Credit")),
          Transaction(
            transaction_id = "21",
            customer_id = Some(100),
            vendor_id = Some(2),
            event_state = Some("SWIPED"),
            event_timestamp = Timestamp.valueOf("2018-11-13 09:45:01"),
            price = Some("100"),
            card_type = Some("Debit"))
        ))
    }
  }
} 
Example 4
Source File: K-Centers.scala    From Clustering4Ever   with Apache License 2.0
package org.clustering4ever.clustering.kcenters.dataset
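		// Note: excerpt only - the enclosing KCenters class, its imports, and the referenced helpers
		// (obtainNearestCenterID, computeCenters, areCentersNotMovingEnough) are omitted by this listing.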

		@annotation.tailrec
		def go(cpt: Int, haveAllCentersConverged: Boolean, centers: List[(Int, V)]): List[(Int, V)] = {
			val preUpdatedCenters = data.groupByKey( cz => obtainNearestCenterID(cz.v, centers, metric) )(encoderInt)
				.mapGroups(computeCenters)(encoder)
				.collect
				.sortBy(_._1)
				.toList
			val alignedOldCenters = preUpdatedCenters.map{ case (oldClusterID, _) => centers(oldClusterID) }
			val updatedCenters = preUpdatedCenters.zipWithIndex.map{ case ((oldClusterID, center), newClusterID) => (newClusterID, center) }
			val shiftingEnough = areCentersNotMovingEnough(updatedCenters, alignedOldCenters, minShift, metric)
			if(cpt < maxIterations && !shiftingEnough) {
				go(cpt + 1, shiftingEnough, updatedCenters)
			}
			else {
				updatedCenters
			}
		}

		immutable.HashMap(go(0, false, centers):_*)

	}
} 
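
The excerpt above passes two encoders, encoderInt for the grouping key and encoder for the (clusterID, center) pairs, whose definitions are not shown. Below is a minimal, hypothetical sketch of how such encoders can be built with Encoders; it assumes Array[Double] as the center type and falls back to Encoders.kryo for the pair encoder, which may differ from what Clustering4Ever actually does.

import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}

object KCentersEncodersSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("KCentersEncodersSketch").getOrCreate()
    import spark.implicits._

    // Assumed stand-in for the library's center/point type.
    type V = Array[Double]

    // Encoder for the Int cluster IDs returned by the grouping function.
    val encoderInt: Encoder[Int] = Encoders.scalaInt
    // Encoder for the (clusterID, center) pairs produced by mapGroups; kryo is a
    // safe fallback for center types that are not Spark SQL product types.
    val encoder: Encoder[(Int, V)] = Encoders.kryo[(Int, V)]

    val data: Dataset[V] = spark.createDataset(Seq(Array(0.0, 0.0), Array(1.0, 1.0), Array(9.0, 9.0)))

    // Same groupByKey/mapGroups shape as in the excerpt, with a trivial center computation.
    val centers = data
      .groupByKey(v => if (v.sum < 5.0) 0 else 1)(encoderInt)
      .mapGroups { (id, points) =>
        val pts = points.toArray
        val dim = pts.head.length
        val mean = Array.tabulate(dim)(i => pts.map(_(i)).sum / pts.length)
        (id, mean)
      }(encoder)

    centers.collect().foreach { case (id, c) => println(s"cluster $id -> ${c.mkString("[", ", ", "]")}") }
    spark.stop()
  }
}

Encoders.kryo trades a queryable schema for the ability to serialize arbitrary JVM types, which is why it is also used for the intermediate aggregation buffer in the TopByKeyAggregator example further down.
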
Example 5
Source File: A_9_MyAverageByAggregator.scala    From wow-spark   with MIT License
package com.sev7e0.wow.sql

import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator



case class Employee(name:String, salary:Long)
case class Average(var sum:Long, var count:Long)
object A_9_MyAverageByAggregator extends Aggregator[Employee, Average, Double]{
  override def zero: Average = Average(0L,0L)

  override def reduce(b: Average, a: Employee): Average = {
    b.sum += a.salary
    b.count+=1
    b
  }

  override def merge(b1: Average, b2: Average): Average = {
    b1.count+=b2.count
    b1.sum+=b2.sum
    b1
  }

  override def finish(reduction: Average): Double = reduction.sum.toDouble/reduction.count

  override def bufferEncoder: Encoder[Average] = Encoders.product

  override def outputEncoder: Encoder[Double] = Encoders.scalaDouble

  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().master("local").appName("MyAverageByAggregator")
      .getOrCreate()
    // implicit conversions
    import sparkSession.implicits._
    val dataFrame = sparkSession.read.json("src/main/resources/sparkresource/employees.json").as[Employee]
    dataFrame.show()

    val salary_average = A_9_MyAverageByAggregator.toColumn.name("salary_average")

    val frame = dataFrame.select(salary_average)
    frame.show()
  }
} 
Example 6
Source File: BEDRelation.scala    From bdg-sequila   with Apache License 2.0
package org.biodatageeks.sequila.datasources.BED

import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoders, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
import org.biodatageeks.sequila.utils.{Columns, DataQualityFuncs}

class BEDRelation(path: String)(@transient val sqlContext: SQLContext)
  extends BaseRelation
    with PrunedFilteredScan
    with Serializable {

  @transient val logger = Logger.getLogger(this.getClass.getCanonicalName)
  override def schema: org.apache.spark.sql.types.StructType = Encoders.product[org.biodatageeks.formats.BrowserExtensibleData].schema

  private def getValueFromColumn(colName:String, r:Array[String]): Any = {
    colName match {
      case Columns.CONTIG       =>  DataQualityFuncs.cleanContig(r(0) )
      case Columns.START        =>  r(1).toInt + 1 //Convert interval to 1-based
      case Columns.END          =>  r(2).toInt
      case Columns.NAME         =>  if (r.length > 3) Some (r(3)) else None
      case Columns.SCORE        =>  if (r.length > 4) Some (r(4).toInt) else None
      case Columns.STRAND       =>  if (r.length > 5) Some (r(5)) else None
      case Columns.THICK_START  =>  if (r.length > 6) Some (r(6).toInt) else None
      case Columns.THICK_END    =>  if (r.length > 7) Some (r(7).toInt) else None
      case Columns.ITEM_RGB     =>  if (r.length > 8) Some (r(8).split(",").map(_.toInt)) else None
      case Columns.BLOCK_COUNT  =>  if (r.length > 9) Some (r(9).toInt) else None
      case Columns.BLOCK_SIZES  =>  if (r.length > 10) Some (r(10).split(",").map(_.toInt)) else None
      case Columns.BLOCK_STARTS =>  if (r.length > 11) Some (r(11).split(",").map(_.toInt)) else None
      case _                    =>  throw new Exception(s"Unknown column found: ${colName}")
    }
  }
  override def buildScan(requiredColumns:Array[String], filters:Array[Filter]): RDD[Row] = {
    sqlContext
      .sparkContext
      .textFile(path)
      .filter(!_.toLowerCase.startsWith("track"))
      .filter(!_.toLowerCase.startsWith("browser"))
      .map(_.split("\t"))
      .map { r =>
        val record = new Array[Any](requiredColumns.length)
        for (i <- requiredColumns.indices) {
          record(i) = getValueFromColumn(requiredColumns(i), r)
        }
        Row.fromSeq(record)
      }
  }

} 
Example 7
Source File: EncoderErrorMessageSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql.catalyst.encoders

import scala.reflect.ClassTag

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders

class NonEncodable(i: Int)

case class ComplexNonEncodable1(name1: NonEncodable)

case class ComplexNonEncodable2(name2: ComplexNonEncodable1)

case class ComplexNonEncodable3(name3: Option[NonEncodable])

case class ComplexNonEncodable4(name4: Array[NonEncodable])

case class ComplexNonEncodable5(name5: Option[Array[NonEncodable]])

class EncoderErrorMessageSuite extends SparkFunSuite {

  // Note: we also test error messages for encoders for private classes in JavaDatasetSuite.
  // That is done in Java because Scala cannot create truly private classes.

  test("primitive types in encoders using Kryo serialization") {
    intercept[UnsupportedOperationException] { Encoders.kryo[Int] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Long] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Char] }
  }

  test("primitive types in encoders using Java serialization") {
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Int] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Long] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Char] }
  }

  test("nice error message for missing encoder") {
    val errorMsg1 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable1]).getMessage
    assert(errorMsg1.contains(
      s"""root class: "${clsName[ComplexNonEncodable1]}""""))
    assert(errorMsg1.contains(
      s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg2 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable2]).getMessage
    assert(errorMsg2.contains(
      s"""root class: "${clsName[ComplexNonEncodable2]}""""))
    assert(errorMsg2.contains(
      s"""field (class: "${clsName[ComplexNonEncodable1]}", name: "name2")"""))
    assert(errorMsg1.contains(
      s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg3 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable3]).getMessage
    assert(errorMsg3.contains(
      s"""root class: "${clsName[ComplexNonEncodable3]}""""))
    assert(errorMsg3.contains(
      s"""field (class: "scala.Option", name: "name3")"""))
    assert(errorMsg3.contains(
      s"""option value class: "${clsName[NonEncodable]}""""))

    val errorMsg4 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable4]).getMessage
    assert(errorMsg4.contains(
      s"""root class: "${clsName[ComplexNonEncodable4]}""""))
    assert(errorMsg4.contains(
      s"""field (class: "scala.Array", name: "name4")"""))
    assert(errorMsg4.contains(
      s"""array element class: "${clsName[NonEncodable]}""""))

    val errorMsg5 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable5]).getMessage
    assert(errorMsg5.contains(
      s"""root class: "${clsName[ComplexNonEncodable5]}""""))
    assert(errorMsg5.contains(
      s"""field (class: "scala.Option", name: "name5")"""))
    assert(errorMsg5.contains(
      s"""option value class: "scala.Array""""))
    assert(errorMsg5.contains(
      s"""array element class: "${clsName[NonEncodable]}""""))
  }

  private def clsName[T : ClassTag]: String = implicitly[ClassTag[T]].runtimeClass.getName
} 
Example 8
Source File: ReduceAggregatorSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class ReduceAggregatorSuite extends SparkFunSuite {

  test("zero value") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)
    assert(aggregator.zero == (false, null).asInstanceOf[(Boolean, Int)])
  }

  test("reduce, merge and finish") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    val firstReduce = aggregator.reduce(aggregator.zero, 1)
    assert(firstReduce == ((true, 1)))

    val secondReduce = aggregator.reduce(firstReduce, 2)
    assert(secondReduce == ((true, 3)))

    val thirdReduce = aggregator.reduce(secondReduce, 3)
    assert(thirdReduce == ((true, 6)))

    val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce)
    assert(mergeWithZero1 == ((true, 1)))

    val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero)
    assert(mergeWithZero2 == ((true, 3)))

    val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce)
    assert(mergeTwoReduced == ((true, 4)))

    assert(aggregator.finish(firstReduce)== 1)
    assert(aggregator.finish(secondReduce) == 3)
    assert(aggregator.finish(thirdReduce) == 6)
    assert(aggregator.finish(mergeWithZero1) == 1)
    assert(aggregator.finish(mergeWithZero2) == 3)
    assert(aggregator.finish(mergeTwoReduced) == 4)
  }

  test("requires at least one input row") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    intercept[IllegalStateException] {
      aggregator.finish(aggregator.zero)
    }
  }
} 
Example 9
Source File: EncoderErrorMessageSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.catalyst.encoders

import scala.reflect.ClassTag

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders

class NonEncodable(i: Int)

case class ComplexNonEncodable1(name1: NonEncodable)

case class ComplexNonEncodable2(name2: ComplexNonEncodable1)

case class ComplexNonEncodable3(name3: Option[NonEncodable])

case class ComplexNonEncodable4(name4: Array[NonEncodable])

case class ComplexNonEncodable5(name5: Option[Array[NonEncodable]])

class EncoderErrorMessageSuite extends SparkFunSuite {

  // Note: we also test error messages for encoders for private classes in JavaDatasetSuite.
  // That is done in Java because Scala cannot create truly private classes.

  test("primitive types in encoders using Kryo serialization") {
    intercept[UnsupportedOperationException] { Encoders.kryo[Int] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Long] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Char] }
  }

  test("primitive types in encoders using Java serialization") {
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Int] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Long] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Char] }
  }

  test("nice error message for missing encoder") {
    val errorMsg1 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable1]).getMessage
    assert(errorMsg1.contains(
      s"""root class: "${clsName[ComplexNonEncodable1]}""""))
    assert(errorMsg1.contains(
      s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg2 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable2]).getMessage
    assert(errorMsg2.contains(
      s"""root class: "${clsName[ComplexNonEncodable2]}""""))
    assert(errorMsg2.contains(
      s"""field (class: "${clsName[ComplexNonEncodable1]}", name: "name2")"""))
    assert(errorMsg1.contains(
      s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg3 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable3]).getMessage
    assert(errorMsg3.contains(
      s"""root class: "${clsName[ComplexNonEncodable3]}""""))
    assert(errorMsg3.contains(
      s"""field (class: "scala.Option", name: "name3")"""))
    assert(errorMsg3.contains(
      s"""option value class: "${clsName[NonEncodable]}""""))

    val errorMsg4 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable4]).getMessage
    assert(errorMsg4.contains(
      s"""root class: "${clsName[ComplexNonEncodable4]}""""))
    assert(errorMsg4.contains(
      s"""field (class: "scala.Array", name: "name4")"""))
    assert(errorMsg4.contains(
      s"""array element class: "${clsName[NonEncodable]}""""))

    val errorMsg5 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable5]).getMessage
    assert(errorMsg5.contains(
      s"""root class: "${clsName[ComplexNonEncodable5]}""""))
    assert(errorMsg5.contains(
      s"""field (class: "scala.Option", name: "name5")"""))
    assert(errorMsg5.contains(
      s"""option value class: "scala.Array""""))
    assert(errorMsg5.contains(
      s"""array element class: "${clsName[NonEncodable]}""""))
  }

  private def clsName[T : ClassTag]: String = implicitly[ClassTag[T]].runtimeClass.getName
} 
Example 10
Source File: TopByKeyAggregator.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.recommendation

import scala.language.implicitConversions
import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.util.BoundedPriorityQueue



private[recommendation] class TopByKeyAggregator[K1: TypeTag, K2: TypeTag, V: TypeTag]
  (num: Int, ord: Ordering[(K2, V)])
  extends Aggregator[(K1, K2, V), BoundedPriorityQueue[(K2, V)], Array[(K2, V)]] {

  override def zero: BoundedPriorityQueue[(K2, V)] = new BoundedPriorityQueue[(K2, V)](num)(ord)

  override def reduce(
      q: BoundedPriorityQueue[(K2, V)],
      a: (K1, K2, V)): BoundedPriorityQueue[(K2, V)] = {
    q += {(a._2, a._3)}
  }

  override def merge(
      q1: BoundedPriorityQueue[(K2, V)],
      q2: BoundedPriorityQueue[(K2, V)]): BoundedPriorityQueue[(K2, V)] = {
    q1 ++= q2
  }

  override def finish(r: BoundedPriorityQueue[(K2, V)]): Array[(K2, V)] = {
    r.toArray.sorted(ord.reverse)
  }

  override def bufferEncoder: Encoder[BoundedPriorityQueue[(K2, V)]] = {
    Encoders.kryo[BoundedPriorityQueue[(K2, V)]]
  }

  override def outputEncoder: Encoder[Array[(K2, V)]] = ExpressionEncoder[Array[(K2, V)]]()
} 
Example 11
Source File: GeolocationMapVectorizer.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.sequence.{SequenceEstimator, SequenceModel}
import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.DoubleArrayParam
import org.apache.spark.sql.{Dataset, Encoders}


class GeolocationMapVectorizer
(
  operationName: String = "vecGeoMap",
  uid: String = UID[GeolocationMapVectorizer]
) extends SequenceEstimator[GeolocationMap, OPVector](operationName = operationName, uid = uid)
  with MapVectorizerFuns[Seq[Double], GeolocationMap] with TrackNullsParam {
  private implicit val seqArrayEncoder = Encoders.kryo[Seq[Array[Double]]]

  final val defaultValue = new DoubleArrayParam(
    parent = this, name = "defaultValue", doc = "value to give missing keys when pivoting"
  )
  setDefault(defaultValue, TransmogrifierDefaults.DefaultGeolocation.toArray)

  def setDefaultValue(value: Geolocation): this.type = set(defaultValue, value.toArray)

  override def makeVectorMetadata(allKeys: Seq[Seq[String]]): OpVectorMetadata = {
    val meta = vectorMetadataFromInputFeatures

    val cols = for {
      (keys, col) <- allKeys.zip(meta.columns)
      key <- keys
      nm <- Geolocation.Names
    } yield new OpVectorColumnMetadata(
      parentFeatureName = col.parentFeatureName,
      parentFeatureType = col.parentFeatureType,
      grouping = Option(key),
      descriptorValue = Option(nm)
    )
    meta.withColumns(cols.toArray)
  }

  override def makeVectorMetaWithNullIndicators(allKeys: Seq[Seq[String]]): OpVectorMetadata = {
    val vectorMeta = makeVectorMetadata(allKeys)
    val updatedCols = vectorMeta.columns.grouped(3).flatMap { col => {
      val head = col.head
      col :+ OpVectorColumnMetadata(
        parentFeatureName = head.parentFeatureName,
        parentFeatureType = head.parentFeatureType,
        grouping = head.grouping,
        indicatorValue = Some(TransmogrifierDefaults.NullString)
      )
    }
    }.toArray
    vectorMeta.withColumns(updatedCols)
  }

  def fitFn(dataset: Dataset[Seq[GeolocationMap#Value]]): SequenceModel[GeolocationMap, OPVector] = {
    val shouldClean = $(cleanKeys)
    val defValue = $(defaultValue).toSeq
    val allKeys = getKeyValues(dataset, shouldClean, shouldCleanValues = false)
    val trackNullsValue = $(trackNulls)

    val meta = if (trackNullsValue) makeVectorMetaWithNullIndicators(allKeys) else makeVectorMetadata(allKeys)
    setMetadata(meta.toMetadata)

    new GeolocationMapVectorizerModel(
      allKeys = allKeys, defaultValue = defValue, shouldClean = shouldClean, trackNulls = trackNullsValue,
      operationName = operationName, uid = uid
    )
  }

}

final class GeolocationMapVectorizerModel private[op]
(
  val allKeys: Seq[Seq[String]],
  val defaultValue: Seq[Double],
  val shouldClean: Boolean,
  val trackNulls: Boolean,
  operationName: String,
  uid: String
) extends SequenceModel[GeolocationMap, OPVector](operationName = operationName, uid = uid)
  with CleanTextMapFun {

  def transformFn: Seq[GeolocationMap] => OPVector = row => {
    val eachPivoted: Array[Array[Double]] =
      row.map(_.value).zip(allKeys).flatMap { case (map, keys) =>
        val cleanedMap = cleanMap(map, shouldClean, shouldCleanValue = false)
        keys.map(k => {
          val vOpt = cleanedMap.get(k)
          val isEmpty = vOpt.isEmpty
          val v = vOpt.getOrElse(defaultValue).toArray
          if (trackNulls) v :+ (if (isEmpty) 1.0 else 0.0) else v
        })
      }.toArray
    Vectors.dense(eachPivoted.flatten).compressed.toOPVector
  }
} 
Example 12
Source File: ReduceAggregatorSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class ReduceAggregatorSuite extends SparkFunSuite {

  test("zero value") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)
    assert(aggregator.zero == (false, null))
  }

  test("reduce, merge and finish") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    val firstReduce = aggregator.reduce(aggregator.zero, 1)
    assert(firstReduce == (true, 1))

    val secondReduce = aggregator.reduce(firstReduce, 2)
    assert(secondReduce == (true, 3))

    val thirdReduce = aggregator.reduce(secondReduce, 3)
    assert(thirdReduce == (true, 6))

    val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce)
    assert(mergeWithZero1 == (true, 1))

    val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero)
    assert(mergeWithZero2 == (true, 3))

    val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce)
    assert(mergeTwoReduced == (true, 4))

    assert(aggregator.finish(firstReduce)== 1)
    assert(aggregator.finish(secondReduce) == 3)
    assert(aggregator.finish(thirdReduce) == 6)
    assert(aggregator.finish(mergeWithZero1) == 1)
    assert(aggregator.finish(mergeWithZero2) == 3)
    assert(aggregator.finish(mergeTwoReduced) == 4)
  }

  test("requires at least one input row") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    intercept[IllegalStateException] {
      aggregator.finish(aggregator.zero)
    }
  }
} 
Example 13
Source File: EncoderErrorMessageSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.encoders

import scala.reflect.ClassTag

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders

class NonEncodable(i: Int)

case class ComplexNonEncodable1(name1: NonEncodable)

case class ComplexNonEncodable2(name2: ComplexNonEncodable1)

case class ComplexNonEncodable3(name3: Option[NonEncodable])

case class ComplexNonEncodable4(name4: Array[NonEncodable])

case class ComplexNonEncodable5(name5: Option[Array[NonEncodable]])

class EncoderErrorMessageSuite extends SparkFunSuite {

  // Note: we also test error messages for encoders for private classes in JavaDatasetSuite.
  // That is done in Java because Scala cannot create truly private classes.

  test("primitive types in encoders using Kryo serialization") {
    intercept[UnsupportedOperationException] { Encoders.kryo[Int] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Long] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Char] }
  }

  test("primitive types in encoders using Java serialization") {
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Int] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Long] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Char] }
  }

  test("nice error message for missing encoder") {
    val errorMsg1 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable1]).getMessage
    assert(errorMsg1.contains(
      s"""root class: "${clsName[ComplexNonEncodable1]}""""))
    assert(errorMsg1.contains(
      s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg2 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable2]).getMessage
    assert(errorMsg2.contains(
      s"""root class: "${clsName[ComplexNonEncodable2]}""""))
    assert(errorMsg2.contains(
      s"""field (class: "${clsName[ComplexNonEncodable1]}", name: "name2")"""))
    assert(errorMsg1.contains(
      s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg3 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable3]).getMessage
    assert(errorMsg3.contains(
      s"""root class: "${clsName[ComplexNonEncodable3]}""""))
    assert(errorMsg3.contains(
      s"""field (class: "scala.Option", name: "name3")"""))
    assert(errorMsg3.contains(
      s"""option value class: "${clsName[NonEncodable]}""""))

    val errorMsg4 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable4]).getMessage
    assert(errorMsg4.contains(
      s"""root class: "${clsName[ComplexNonEncodable4]}""""))
    assert(errorMsg4.contains(
      s"""field (class: "scala.Array", name: "name4")"""))
    assert(errorMsg4.contains(
      s"""array element class: "${clsName[NonEncodable]}""""))

    val errorMsg5 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable5]).getMessage
    assert(errorMsg5.contains(
      s"""root class: "${clsName[ComplexNonEncodable5]}""""))
    assert(errorMsg5.contains(
      s"""field (class: "scala.Option", name: "name5")"""))
    assert(errorMsg5.contains(
      s"""option value class: "scala.Array""""))
    assert(errorMsg5.contains(
      s"""array element class: "${clsName[NonEncodable]}""""))
  }

  private def clsName[T : ClassTag]: String = implicitly[ClassTag[T]].runtimeClass.getName
} 
Example 14
Source File: SimpleJsonIngestionJob.scala    From comet-data-pipeline   with Apache License 2.0
package com.ebiznext.comet.job.ingest

import com.ebiznext.comet.config.Settings
import com.ebiznext.comet.schema.handlers.{SchemaHandler, StorageHandler}
import com.ebiznext.comet.schema.model._
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.{DataFrame, Encoders}

import scala.util.{Failure, Success, Try}


class SimpleJsonIngestionJob(
  domain: Domain,
  schema: Schema,
  types: List[Type],
  path: List[Path],
  storageHandler: StorageHandler,
  schemaHandler: SchemaHandler
)(implicit settings: Settings)
    extends DsvIngestionJob(domain, schema, types, path, storageHandler, schemaHandler) {

  override def loadDataSet(): Try[DataFrame] = {
    try {

      val df =
        if (metadata.isArray()) {
          val jsonRDD =
            session.sparkContext.wholeTextFiles(path.map(_.toString).mkString(",")).map(_._2)

          session.read
            .json(session.createDataset(jsonRDD)(Encoders.STRING))
            .withColumn(
              //  Spark cannot detect the input file automatically, so we should add it explicitly
              Settings.cometInputFileNameColumn,
              if (settings.comet.grouped) lit(path.map(_.toString).mkString(","))
              else lit(path.head.toString)
            )

        } else {
          session.read
            .option("encoding", metadata.getEncoding())
            .option("multiline", metadata.getMultiline())
            .json(path.map(_.toString): _*)
            .withColumn(
              //  Spark here can detect the input file automatically, so we're just using the input_file_name spark function
              Settings.cometInputFileNameColumn,
              org.apache.spark.sql.functions.input_file_name()
            )
        }

      import session.implicits._
      val resDF = if (df.columns.contains("_corrupt_record")) {
        //TODO send rejected records to rejected area
        logger.whenDebugEnabled {
          df.filter($"_corrupt_record".isNotNull).show(1000, false)
        }
        throw new Exception(
          s"""Invalid JSON File: ${path
            .map(_.toString)
            .mkString(",")}. SIMPLE_JSON require a valid json file """
        )
      } else {
        df
      }
      Success(
        resDF
      )
    } catch {
      case e: Exception =>
        Failure(e)
    }
  }
} 
Example 15
Source File: JsonIngestionJob.scala    From comet-data-pipeline   with Apache License 2.0
package com.ebiznext.comet.job.ingest

import com.ebiznext.comet.config.{DatasetArea, Settings, StorageArea}
import com.ebiznext.comet.schema.handlers.{SchemaHandler, StorageHandler}
import com.ebiznext.comet.schema.model._
import org.apache.hadoop.fs.Path
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.datasources.json.JsonIngestionUtil
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Encoders, Row}

import scala.util.{Failure, Success, Try}
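// Note: this listing shows only an excerpt; the enclosing JsonIngestionJob class and its other members are omitted.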


  @deprecated("We let Spark compute the final schema", "")
  def saveAccepted(acceptedRDD: RDD[Row]): Path = {
    val writeMode = metadata.getWriteMode()
    val acceptedPath = new Path(DatasetArea.accepted(domain.name), schema.name)
    saveRows(
      session.createDataFrame(acceptedRDD, schemaSparkType),
      acceptedPath,
      writeMode,
      StorageArea.accepted,
      schema.merge.isDefined
    )
    acceptedPath
  }

  override def name: String = "JsonJob"
} 
Example 16
Source File: VerifyIsolationForest.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.isolationforest

import com.microsoft.ml.spark.build.BuildInfo
import com.microsoft.ml.spark.core.env.FileUtilities
import com.microsoft.ml.spark.core.metrics.MetricConstants
import com.microsoft.ml.spark.core.test.benchmarks.Benchmarks
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, Row}
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.scalactic.Tolerance._
import com.microsoft.ml.spark.train.ComputeModelStatistics

case class MammographyRecord(feature0: Double, feature1: Double, feature2: Double, feature3: Double,
                             feature4: Double, feature5: Double, label: Double)
case class ScoringResult(features: Vector, label: Double, predictedLabel: Double, outlierScore: Double)

class VerifyIsolationForest extends Benchmarks with EstimatorFuzzing[IsolationForest] {
  test ("Verify isolationForestMammographyDataTest") {
    import session.implicits._

    val data = loadMammographyData

    // Train a new isolation forest model
    val contamination = 0.02
    val isolationForest = new IsolationForest()
      .setNumEstimators(100)
      .setBootstrap(false)
      .setMaxSamples(256)
      .setMaxFeatures(1.0)
      .setFeaturesCol("features")
      .setPredictionCol("predictedLabel")
      .setScoreCol("outlierScore")
      .setContamination(0.02)
      .setContaminationError(contamination * 0.01)
      .setRandomSeed(1)

    // Score all training data instances using the new model
    val isolationForestModel = isolationForest.fit(data)

    // Calculate area under ROC curve and assert
    val scores = isolationForestModel.transform(data).as[ScoringResult]
    val metrics = new ComputeModelStatistics()
      .setEvaluationMetric(MetricConstants.AucSparkMetric)
      .setLabelCol("label")
      .setScoredLabelsCol("predictedLabel")
      .setScoresCol("outlierScore")
      .transform(scores)

    // Expectation from results in the 2008 "Isolation Forest" paper by F. T. Liu, et al.
    val aurocExpectation = 0.86
    val uncert = 0.02
    val auroc = metrics.first().getDouble(1)
    assert(auroc === aurocExpectation +- uncert, "expected area under ROC =" +
        s" $aurocExpectation +/- $uncert, but observed $auroc")
  }

  def loadMammographyData(): DataFrame = {

    import session.implicits._

    val mammographyRecordSchema = Encoders.product[MammographyRecord].schema

    val fileLocation = FileUtilities.join(BuildInfo.datasetDir,"IsolationForest", "mammography.csv").toString

    // Open source dataset from http://odds.cs.stonybrook.edu/mammography-dataset/
    val rawData = session.read
      .format("csv")
      .option("comment", "#")
      .option("header", "false")
      .schema(mammographyRecordSchema)
      .load(fileLocation)

    val assembler = new VectorAssembler()
      .setInputCols(Array("feature0", "feature1", "feature2", "feature3", "feature4", "feature5"))
      .setOutputCol("features")

    val data = assembler
      .transform(rawData)
      .select("features", "label")

    data
  }

  override def reader: MLReadable[_] = IsolationForest
  override def modelReader: MLReadable[_] = IsolationForestModel

  override def testObjects(): Seq[TestObject[IsolationForest]] = {
    val dataset = loadMammographyData.toDF

    Seq(new TestObject(
      new IsolationForest(),
      dataset))
  }
} 
Example 17
Source File: SparkRecoverPartitionsCustomIntegrationTest.scala    From m3d-engine   with Apache License 2.0
package com.adidas.analytics.integration

import com.adidas.utils.TestUtils._
import com.adidas.analytics.algo.AppendLoad
import com.adidas.utils.FileReader
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.sql.{Dataset, Encoders}
import org.scalatest.FeatureSpec
import org.scalatest.Matchers._

import scala.collection.JavaConverters._


class SparkRecoverPartitionsCustomIntegrationTest extends FeatureSpec with BaseIntegrationTest {

  feature("Partitions can be updated programmatically using custom logic") {

    scenario("Using Append Load Algorithm with multiple source files") {
      val testResourceDir = "multiple_source_files"
      val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json")
      val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1")

      val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType]
      val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType]
      val dataReader = FileReader.newDSVFileReader(Some(targetSchema))
      val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema))

      val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema)
      setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader)
      prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv"))
      uploadParameters(testResourceDir)

      // checking pre-conditions
      spark.read.csv(sourceDirPath.toString).count() shouldBe 7
      targetTable.read().count() shouldBe 19

      fs.exists(targetPath20180101) shouldBe false
      fs.exists(headerPath20180101) shouldBe false

      // executing load
      AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run()

      // validating result
      val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true)
      val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true)
      val expectedDf = dataReader.read(spark, expectedDataLocation)
      val actualDf = targetTable.read()

      val producedPartitionsNumber: Dataset[String] = spark
        .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}")
        .as(Encoders.STRING)

      // MetaData Specific Tests
      val expectedPartitions: Dataset[String] = expectedPartitionsDataReader
        .read(spark, expectedPartitionsLocation)
        .as(Encoders.STRING)

      expectedPartitions.collectAsList().asScala.sorted.toSet should
        equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet)

      actualDf.hasDiff(expectedDf) shouldBe false

      fs.exists(targetPath20180101) shouldBe true
      fs.exists(headerPath20180101) shouldBe true
    }
  }


} 
Example 18
Source File: SparkRecoverPartitionsNativeIntegrationTest.scala    From m3d-engine   with Apache License 2.0
package com.adidas.analytics.integration

import com.adidas.utils.TestUtils._
import com.adidas.analytics.algo.AppendLoad
import com.adidas.utils.FileReader
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.sql.{Dataset, Encoders}
import org.scalatest.FeatureSpec
import org.scalatest.Matchers._

import scala.collection.JavaConverters._


class SparkRecoverPartitionsNativeIntegrationTest extends FeatureSpec with BaseIntegrationTest {

  feature("Partitions can be updated with native spark.recoverPartitions()") {

    scenario("Using Append Load Algorithm with multiple source files") {
      val testResourceDir = "multiple_source_files"
      val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json")
      val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1")

      val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType]
      val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType]
      val dataReader = FileReader.newDSVFileReader(Some(targetSchema))
      val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema))

      val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema)
      setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader)
      prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv"))
      uploadParameters(testResourceDir)

      // checking pre-conditions
      spark.read.csv(sourceDirPath.toString).count() shouldBe 7
      targetTable.read().count() shouldBe 19

      fs.exists(targetPath20180101) shouldBe false
      fs.exists(headerPath20180101) shouldBe false

      // executing load
      AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run()

      // validating result
      val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true)
      val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true)
      val expectedDf = dataReader.read(spark, expectedDataLocation)
      val actualDf = targetTable.read()

      val producedPartitionsNumber: Dataset[String] = spark
        .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}")
        .as(Encoders.STRING)

      // MetaData Specific Tests
      val expectedPartitions: Dataset[String] = expectedPartitionsDataReader
        .read(spark, expectedPartitionsLocation)
        .as(Encoders.STRING)

      expectedPartitions.collectAsList().asScala.sorted.toSet should
        equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet)

      actualDf.hasDiff(expectedDf) shouldBe false

      fs.exists(targetPath20180101) shouldBe true
      fs.exists(headerPath20180101) shouldBe true
    }
  }


} 
Example 19
Source File: RecoverPartitionsNativeIntegrationTest.scala    From m3d-engine   with Apache License 2.0
package com.adidas.analytics.integration

import com.adidas.utils.TestUtils._
import com.adidas.analytics.algo.AppendLoad
import com.adidas.utils.FileReader
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.sql.{Dataset, Encoders}
import org.scalatest.FeatureSpec
import org.scalatest.Matchers._

import scala.collection.JavaConverters._


class RecoverPartitionsNativeIntegrationTest extends FeatureSpec with BaseIntegrationTest {

  feature("Partitions can be updated with native spark.recoverPartitions()") {

    scenario("Using Append Load Algorithm with multiple source files") {
      val testResourceDir = "multiple_source_files"
      val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json")
      val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1")

      val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType]
      val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType]
      val dataReader = FileReader.newDSVFileReader(Some(targetSchema))
      val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema))

      val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema)
      setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader)
      prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv"))
      uploadParameters(testResourceDir)

      // checking pre-conditions
      spark.read.csv(sourceDirPath.toString).count() shouldBe 7
      targetTable.read().count() shouldBe 19

      fs.exists(targetPath20180101) shouldBe false
      fs.exists(headerPath20180101) shouldBe false

      // executing load
      AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run()

      // validating result
      val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true)
      val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true)
      val expectedDf = dataReader.read(spark, expectedDataLocation)
      val actualDf = targetTable.read()

      val producedPartitionsNumber: Dataset[String] = spark
        .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}")
        .as(Encoders.STRING)

      // MetaData Specific Tests
      val expectedPartitions: Dataset[String] = expectedPartitionsDataReader
        .read(spark, expectedPartitionsLocation)
        .as(Encoders.STRING)

      expectedPartitions.collectAsList().asScala.sorted.toSet should
        equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet)

      actualDf.hasDiff(expectedDf) shouldBe false

      spark
        .sql(s"DESCRIBE extended ${targetDatabase}.${tableName} PARTITION(year=2018,month=1,day=1)")
        .filter("col_name == 'Partition Statistics'")
        .head()
        .getAs[String]("data_type").contains("6 rows") shouldBe true

      fs.exists(targetPath20180101) shouldBe true
      fs.exists(headerPath20180101) shouldBe true
    }
  }


} 
Example 20
Source File: ReduceAggregatorSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class ReduceAggregatorSuite extends SparkFunSuite {

  test("zero value") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)
    assert(aggregator.zero == (false, null))
  }

  test("reduce, merge and finish") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    val firstReduce = aggregator.reduce(aggregator.zero, 1)
    assert(firstReduce == (true, 1))

    val secondReduce = aggregator.reduce(firstReduce, 2)
    assert(secondReduce == (true, 3))

    val thirdReduce = aggregator.reduce(secondReduce, 3)
    assert(thirdReduce == (true, 6))

    val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce)
    assert(mergeWithZero1 == (true, 1))

    val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero)
    assert(mergeWithZero2 == (true, 3))

    val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce)
    assert(mergeTwoReduced == (true, 4))

    assert(aggregator.finish(firstReduce)== 1)
    assert(aggregator.finish(secondReduce) == 3)
    assert(aggregator.finish(thirdReduce) == 6)
    assert(aggregator.finish(mergeWithZero1) == 1)
    assert(aggregator.finish(mergeWithZero2) == 3)
    assert(aggregator.finish(mergeTwoReduced) == 4)
  }

  test("requires at least one input row") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    intercept[IllegalStateException] {
      aggregator.finish(aggregator.zero)
    }
  }
} 
Example 21
Source File: EncoderErrorMessageSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.catalyst.encoders

import scala.reflect.ClassTag

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders

class NonEncodable(i: Int)

case class ComplexNonEncodable1(name1: NonEncodable)

case class ComplexNonEncodable2(name2: ComplexNonEncodable1)

case class ComplexNonEncodable3(name3: Option[NonEncodable])

case class ComplexNonEncodable4(name4: Array[NonEncodable])

case class ComplexNonEncodable5(name5: Option[Array[NonEncodable]])

class EncoderErrorMessageSuite extends SparkFunSuite {

  // Note: we also test error messages for encoders for private classes in JavaDatasetSuite.
  // That is done in Java because Scala cannot create truly private classes.

  test("primitive types in encoders using Kryo serialization") {
    intercept[UnsupportedOperationException] { Encoders.kryo[Int] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Long] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Char] }
  }

  test("primitive types in encoders using Java serialization") {
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Int] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Long] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Char] }
  }

  test("nice error message for missing encoder") {
    val errorMsg1 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable1]).getMessage
    assert(errorMsg1.contains(
      s"""root class: "${clsName[ComplexNonEncodable1]}""""))
    assert(errorMsg1.contains(
      s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg2 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable2]).getMessage
    assert(errorMsg2.contains(
      s"""root class: "${clsName[ComplexNonEncodable2]}""""))
    assert(errorMsg2.contains(
      s"""field (class: "${clsName[ComplexNonEncodable1]}", name: "name2")"""))
    assert(errorMsg1.contains(
      s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg3 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable3]).getMessage
    assert(errorMsg3.contains(
      s"""root class: "${clsName[ComplexNonEncodable3]}""""))
    assert(errorMsg3.contains(
      s"""field (class: "scala.Option", name: "name3")"""))
    assert(errorMsg3.contains(
      s"""option value class: "${clsName[NonEncodable]}""""))

    val errorMsg4 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable4]).getMessage
    assert(errorMsg4.contains(
      s"""root class: "${clsName[ComplexNonEncodable4]}""""))
    assert(errorMsg4.contains(
      s"""field (class: "scala.Array", name: "name4")"""))
    assert(errorMsg4.contains(
      s"""array element class: "${clsName[NonEncodable]}""""))

    val errorMsg5 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable5]).getMessage
    assert(errorMsg5.contains(
      s"""root class: "${clsName[ComplexNonEncodable5]}""""))
    assert(errorMsg5.contains(
      s"""field (class: "scala.Option", name: "name5")"""))
    assert(errorMsg5.contains(
      s"""option value class: "scala.Array""""))
    assert(errorMsg5.contains(
      s"""array element class: "${clsName[NonEncodable]}""""))
  }

  private def clsName[T : ClassTag]: String = implicitly[ClassTag[T]].runtimeClass.getName
} 
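The suite above verifies two things: Kryo and Java-serialization encoders reject primitive types, and missing-encoder errors name the offending field. A minimal sketch of the workaround those errors usually point to (the LegacyRecord class is made up for illustration): register a Kryo encoder for the class that has no derivable product encoder.

import org.apache.spark.sql.{Encoder, Encoders, SparkSession}

// Not a case class, so no product encoder can be derived for it.
class LegacyRecord(val id: Int)

object KryoEncoderSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("kryo-encoder-sketch").getOrCreate()

    // Kryo stores each object as a single binary column instead of named fields.
    implicit val legacyEncoder: Encoder[LegacyRecord] = Encoders.kryo[LegacyRecord]
    val records = spark.createDataset(Seq(new LegacyRecord(1), new LegacyRecord(2)))

    // Mapping back to a primitive needs an explicit primitive encoder.
    records.map(_.id)(Encoders.scalaInt).show()

    spark.stop()
  }
}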
Example 22
package org.scalaml.spark

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}


private[spark] object DatasetGenerator {

  // Generation of a dataset of type (Double, Double) using an index-based generator function
  final def toDSPairDouble(
    numDataPoints: Int
  )(
    generator: Int => (Double, Double)
  )(implicit sessionLifeCycle: SessionLifeCycle): Dataset[(Double, Double)] =
    toDSPairDouble(Seq.tabulate(numDataPoints)(generator(_)))

  // Generation of a dataset of type (Double, Double) from a sequence of the same type
  def toDSPairDouble(
    data: Seq[(Double, Double)]
  )(implicit sessionLifeCycle: SessionLifeCycle): Dataset[(Double, Double)] = {
    import sessionLifeCycle.sparkSession.implicits._
    data.toDS()
  }

  // Generation of a dataset of type Double
  def toDSDouble(data: Seq[Double])(implicit sessionLifeCycle: SessionLifeCycle): Dataset[Double] = {
    import sessionLifeCycle.sparkSession.implicits._
    data.toDS()
  }

  // Generation of a dataset of type Int
  def toDSInt(data: Seq[Int])(implicit sessionLifeCycle: SessionLifeCycle): Dataset[Int] = {
    import sessionLifeCycle.sparkSession.implicits._
    data.toDS()
  }
}

// --------------------------  EOF ---------------------------------------------- 
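DatasetGenerator above leans on sparkSession.implicits._ to supply the pair encoder. As a hedged alternative sketch (not part of the project above), Encoders.tuple can build the same Encoder[(Double, Double)] explicitly when no implicits are in scope:

import org.apache.spark.sql.{Dataset, Encoders, SparkSession}

object ExplicitPairEncoderSketch {
  // Same shape as toDSPairDouble above, but with the encoder spelled out.
  def toDSPairDouble(spark: SparkSession, data: Seq[(Double, Double)]): Dataset[(Double, Double)] = {
    // Encoders.tuple composes two primitive encoders into an encoder for the pair.
    val pairEncoder = Encoders.tuple(Encoders.scalaDouble, Encoders.scalaDouble)
    spark.createDataset(data)(pairEncoder)
  }
}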
Example 23
Source File: MultiStreamHandler.scala    From structured-streaming-application   with Apache License 2.0 5 votes vote down vote up
package knolx.spark

import knolx.Config._
import knolx.KnolXLogger
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{Encoders, SparkSession}


case class CurrentPowerConsumption(kwh: Double)

case class PowerConsumptionStatus(numOfReadings: Long, total: Double, avg: Double, status: String) {
  def compute(newReadings: List[Double]): PowerConsumptionStatus = {
    val newTotal = newReadings.sum + total
    val newNumOfReadings = numOfReadings + newReadings.size
    val newAvg = newTotal / newNumOfReadings.toDouble

    PowerConsumptionStatus(newNumOfReadings, newTotal, newAvg, "ON")
  }
}

object MultiStreamHandler extends App with KnolXLogger {
  info("Creating Spark Session")
  val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  val updateStateFunc =
    (deviceId: String, newReadings: Iterator[(String, CurrentPowerConsumption)], state: GroupState[PowerConsumptionStatus]) => {
      val data = newReadings.toList.map { case(_, reading) => reading }.map(_.kwh)

      lazy val initialPowerConsumptionStatus = PowerConsumptionStatus(0L, 0D, 0D, "OFF")
      val currentState = state.getOption.fold(initialPowerConsumptionStatus.compute(data))(_.compute(data))

      val currentStatus =
        if(state.hasTimedOut) {
          // If we do not receive any reading from a device before the timeout, we assume it is OFF.
          currentState.copy(status = "OFF")
        } else {
          state.setTimeoutDuration("10 seconds")
          currentState
        }

      state.update(currentStatus)
      (deviceId, currentStatus)
    }

  info("Creating Streaming DF...")
  val dataStream =
    spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServer)
      .option("subscribe", topic)
      .option("failOnDataLoss", false)
      .option("includeTimestamp", true)
      .load()

  info("Writing data to Console...")
  import spark.implicits._

  implicit val currentPowerConsumptionEncoder = Encoders.kryo[CurrentPowerConsumption]
  implicit val powerConsumptionStatusEncoder = Encoders.kryo[PowerConsumptionStatus]

  val query =
    dataStream
      .select(col("key").cast(StringType).as("key"), col("value").cast(StringType).as("value"))
      .as[(String, String)]
      .map { case(deviceId, unit) =>
        (deviceId, CurrentPowerConsumption(Option(unit).fold(0D)(_.toDouble)))
      }
      .groupByKey { case(deviceId, _) => deviceId }
      .mapGroupsWithState[PowerConsumptionStatus, (String, PowerConsumptionStatus)](GroupStateTimeout.ProcessingTimeTimeout())(updateStateFunc)
      .toDF("deviceId", "current_status")
      .writeStream
      .format("console")
      .option("truncate", false)
      .outputMode(OutputMode.Update())
      .option("checkpointLocation", checkPointDir)
      .start()

  info("Waiting for the query to terminate...")
  query.awaitTermination()
  query.stop()
} 
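MultiStreamHandler registers Kryo encoders for its state case classes. A product encoder would also satisfy mapGroupsWithState and keeps each field as a readable column in the console sink; a minimal sketch of that alternative follows (the case classes are re-declared here only to keep the sketch self-contained):

import org.apache.spark.sql.{Encoder, Encoders}

object ProductEncoderAlternative {
  // Mirrors of the case classes defined in the example above.
  case class CurrentPowerConsumption(kwh: Double)
  case class PowerConsumptionStatus(numOfReadings: Long, total: Double, avg: Double, status: String)

  // Product encoders expose every field as a typed column instead of one Kryo binary blob.
  implicit val currentPowerConsumptionEncoder: Encoder[CurrentPowerConsumption] =
    Encoders.product[CurrentPowerConsumption]
  implicit val powerConsumptionStatusEncoder: Encoder[PowerConsumptionStatus] =
    Encoders.product[PowerConsumptionStatus]
}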
Example 24
Source File: TestCsvData.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.sql.{Dataset, Encoders, SparkSession}

private[csv] trait TestCsvData {
  protected def spark: SparkSession

  def sampledTestData: Dataset[String] = {
    spark.range(0, 100, 1).map { index =>
      val predefinedSample = Set[Long](2, 8, 15, 27, 30, 34, 35, 37, 44, 46,
        57, 62, 68, 72)
      if (predefinedSample.contains(index)) {
        index.toString
      } else {
        (index.toDouble + 0.1).toString
      }
    }(Encoders.STRING)
  }
} 
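TestCsvData builds a Dataset[String] by handing Encoders.STRING to map explicitly. For context, a hedged sketch of how such a dataset of raw lines is typically consumed (the sample lines are made up): DataFrameReader.csv accepts a Dataset[String] directly, so no file needs to be written.

import org.apache.spark.sql.{Dataset, Encoders, SparkSession}

object CsvFromDatasetSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("csv-from-dataset-sketch").getOrCreate()

    // Build raw CSV lines with an explicit string encoder, as TestCsvData does.
    val lines: Dataset[String] = spark.createDataset(Seq("1,a", "2,b", "3,c"))(Encoders.STRING)

    // Parse the in-memory lines without touching the filesystem.
    spark.read.csv(lines).show()

    spark.stop()
  }
}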
Example 25
Source File: ReduceAggregatorSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class ReduceAggregatorSuite extends SparkFunSuite {

  test("zero value") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)
    assert(aggregator.zero == (false, null).asInstanceOf[(Boolean, Int)])
  }

  test("reduce, merge and finish") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    val firstReduce = aggregator.reduce(aggregator.zero, 1)
    assert(firstReduce == ((true, 1)))

    val secondReduce = aggregator.reduce(firstReduce, 2)
    assert(secondReduce == ((true, 3)))

    val thirdReduce = aggregator.reduce(secondReduce, 3)
    assert(thirdReduce == ((true, 6)))

    val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce)
    assert(mergeWithZero1 == ((true, 1)))

    val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero)
    assert(mergeWithZero2 == ((true, 3)))

    val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce)
    assert(mergeTwoReduced == ((true, 4)))

    assert(aggregator.finish(firstReduce) == 1)
    assert(aggregator.finish(secondReduce) == 3)
    assert(aggregator.finish(thirdReduce) == 6)
    assert(aggregator.finish(mergeWithZero1) == 1)
    assert(aggregator.finish(mergeWithZero2) == 3)
    assert(aggregator.finish(mergeTwoReduced) == 4)
  }

  test("requires at least one input row") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    intercept[IllegalStateException] {
      aggregator.finish(aggregator.zero)
    }
  }
} 
Example 26
Source File: EncoderErrorMessageSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.encoders

import scala.reflect.ClassTag

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders

class NonEncodable(i: Int)

case class ComplexNonEncodable1(name1: NonEncodable)

case class ComplexNonEncodable2(name2: ComplexNonEncodable1)

case class ComplexNonEncodable3(name3: Option[NonEncodable])

case class ComplexNonEncodable4(name4: Array[NonEncodable])

case class ComplexNonEncodable5(name5: Option[Array[NonEncodable]])

class EncoderErrorMessageSuite extends SparkFunSuite {

  // Note: we also test error messages for encoders for private classes in JavaDatasetSuite.
  // That is done in Java because Scala cannot create truly private classes.

  test("primitive types in encoders using Kryo serialization") {
    intercept[UnsupportedOperationException] { Encoders.kryo[Int] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Long] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Char] }
  }

  test("primitive types in encoders using Java serialization") {
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Int] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Long] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Char] }
  }

  test("nice error message for missing encoder") {
    val errorMsg1 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable1]).getMessage
    assert(errorMsg1.contains(
      s"""root class: "${clsName[ComplexNonEncodable1]}""""))
    assert(errorMsg1.contains(
      s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg2 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable2]).getMessage
    assert(errorMsg2.contains(
      s"""root class: "${clsName[ComplexNonEncodable2]}""""))
    assert(errorMsg2.contains(
      s"""field (class: "${clsName[ComplexNonEncodable1]}", name: "name2")"""))
    assert(errorMsg1.contains(
      s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg3 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable3]).getMessage
    assert(errorMsg3.contains(
      s"""root class: "${clsName[ComplexNonEncodable3]}""""))
    assert(errorMsg3.contains(
      s"""field (class: "scala.Option", name: "name3")"""))
    assert(errorMsg3.contains(
      s"""option value class: "${clsName[NonEncodable]}""""))

    val errorMsg4 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable4]).getMessage
    assert(errorMsg4.contains(
      s"""root class: "${clsName[ComplexNonEncodable4]}""""))
    assert(errorMsg4.contains(
      s"""field (class: "scala.Array", name: "name4")"""))
    assert(errorMsg4.contains(
      s"""array element class: "${clsName[NonEncodable]}""""))

    val errorMsg5 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable5]).getMessage
    assert(errorMsg5.contains(
      s"""root class: "${clsName[ComplexNonEncodable5]}""""))
    assert(errorMsg5.contains(
      s"""field (class: "scala.Option", name: "name5")"""))
    assert(errorMsg5.contains(
      s"""option value class: "scala.Array""""))
    assert(errorMsg5.contains(
      s"""array element class: "${clsName[NonEncodable]}""""))
  }

  private def clsName[T : ClassTag]: String = implicitly[ClassTag[T]].runtimeClass.getName
} 
Example 27
Source File: KuduEventsHandler.scala    From daf   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package it.teamdigitale.storage

import it.teamdigitale.EventModel.{KuduEvent, StorableEvent}
import it.teamdigitale.config.IotIngestionManagerConfig.KuduConfig
import org.apache.kudu.client.{CreateTableOptions, KuduException}
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.kudu.{Schema, Type}
import org.apache.spark.sql.{DataFrame, Encoders}
import org.apache.logging.log4j.LogManager

import scala.collection.convert.decorateAsJava._


object KuduEventsHandler {
  implicit private val alogger = LogManager.getLogger(this.getClass)

  val primaryKeys = List("source", "ts", "metric_id")

  def getOrCreateTable(kuduContext: KuduContext, kuduConfig: KuduConfig): Unit = {
    if (!kuduContext.tableExists(kuduConfig.eventsTableName)) {
      try {
        val schema = Encoders.product[KuduEvent].schema
        val table = kuduContext.createTable(
          kuduConfig.eventsTableName,
          schema,
          primaryKeys,
          new CreateTableOptions()
            .setRangePartitionColumns(List("ts").asJava)
            .addHashPartitions(List("source").asJava, kuduConfig.eventsNumberBuckets)
        )
        alogger.info(s"Created table ${table.getName}")

      } catch {
        case ex: KuduException if ex.getStatus.isAlreadyPresent => alogger.error(s"Cannot create the table ${kuduConfig.eventsTableName} because it already exists: ${ex.getMessage}")
        case ex: Throwable => alogger.error(s"Cannot create the table ${kuduConfig.eventsTableName} due to the error: ${ex.getMessage}")
      }
    }
  }

  def write(df: DataFrame, kuduContext: KuduContext, kuduConfig: KuduConfig): Unit = {
    kuduContext.insertIgnoreRows(df, kuduConfig.eventsTableName)
  }

} 
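KuduEventsHandler derives the Kudu table schema from Encoders.product[KuduEvent].schema rather than declaring it by hand. A minimal sketch of that pattern with a made-up case class (field names are illustrative, not the project's KuduEvent):

import org.apache.spark.sql.Encoders
import org.apache.spark.sql.types.StructType

object SchemaFromEncoderSketch {
  case class Event(source: String, ts: Long, metricId: String, value: Double)

  def main(args: Array[String]): Unit = {
    // Encoders.product builds an encoder for the case class; its schema is a plain StructType.
    val schema: StructType = Encoders.product[Event].schema
    schema.printTreeString()
    // Prints roughly:
    // root
    //  |-- source: string (nullable = true)
    //  |-- ts: long (nullable = false)
    //  |-- metricId: string (nullable = true)
    //  |-- value: double (nullable = false)
  }
}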
Example 28
Source File: ReduceAggregatorSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class ReduceAggregatorSuite extends SparkFunSuite {

  test("zero value") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)
    assert(aggregator.zero == (false, null))
  }

  test("reduce, merge and finish") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    val firstReduce = aggregator.reduce(aggregator.zero, 1)
    assert(firstReduce == (true, 1))

    val secondReduce = aggregator.reduce(firstReduce, 2)
    assert(secondReduce == (true, 3))

    val thirdReduce = aggregator.reduce(secondReduce, 3)
    assert(thirdReduce == (true, 6))

    val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce)
    assert(mergeWithZero1 == (true, 1))

    val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero)
    assert(mergeWithZero2 == (true, 3))

    val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce)
    assert(mergeTwoReduced == (true, 4))

    assert(aggregator.finish(firstReduce) == 1)
    assert(aggregator.finish(secondReduce) == 3)
    assert(aggregator.finish(thirdReduce) == 6)
    assert(aggregator.finish(mergeWithZero1) == 1)
    assert(aggregator.finish(mergeWithZero2) == 3)
    assert(aggregator.finish(mergeTwoReduced) == 4)
  }

  test("requires at least one input row") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    intercept[IllegalStateException] {
      aggregator.finish(aggregator.zero)
    }
  }
} 
Example 29
Source File: EncoderErrorMessageSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.encoders

import scala.reflect.ClassTag

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders

class NonEncodable(i: Int)

case class ComplexNonEncodable1(name1: NonEncodable)

case class ComplexNonEncodable2(name2: ComplexNonEncodable1)

case class ComplexNonEncodable3(name3: Option[NonEncodable])

case class ComplexNonEncodable4(name4: Array[NonEncodable])

case class ComplexNonEncodable5(name5: Option[Array[NonEncodable]])

class EncoderErrorMessageSuite extends SparkFunSuite {

  // Note: we also test error messages for encoders for private classes in JavaDatasetSuite.
  // That is done in Java because Scala cannot create truly private classes.

  test("primitive types in encoders using Kryo serialization") {
    intercept[UnsupportedOperationException] { Encoders.kryo[Int] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Long] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Char] }
  }

  test("primitive types in encoders using Java serialization") {
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Int] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Long] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Char] }
  }

  test("nice error message for missing encoder") {
    val errorMsg1 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable1]).getMessage
    assert(errorMsg1.contains(
      s"""root class: "${clsName[ComplexNonEncodable1]}""""))
    assert(errorMsg1.contains(
      s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg2 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable2]).getMessage
    assert(errorMsg2.contains(
      s"""root class: "${clsName[ComplexNonEncodable2]}""""))
    assert(errorMsg2.contains(
      s"""field (class: "${clsName[ComplexNonEncodable1]}", name: "name2")"""))
    assert(errorMsg1.contains(
      s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg3 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable3]).getMessage
    assert(errorMsg3.contains(
      s"""root class: "${clsName[ComplexNonEncodable3]}""""))
    assert(errorMsg3.contains(
      s"""field (class: "scala.Option", name: "name3")"""))
    assert(errorMsg3.contains(
      s"""option value class: "${clsName[NonEncodable]}""""))

    val errorMsg4 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable4]).getMessage
    assert(errorMsg4.contains(
      s"""root class: "${clsName[ComplexNonEncodable4]}""""))
    assert(errorMsg4.contains(
      s"""field (class: "scala.Array", name: "name4")"""))
    assert(errorMsg4.contains(
      s"""array element class: "${clsName[NonEncodable]}""""))

    val errorMsg5 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable5]).getMessage
    assert(errorMsg5.contains(
      s"""root class: "${clsName[ComplexNonEncodable5]}""""))
    assert(errorMsg5.contains(
      s"""field (class: "scala.Option", name: "name5")"""))
    assert(errorMsg5.contains(
      s"""option value class: "scala.Array""""))
    assert(errorMsg5.contains(
      s"""array element class: "${clsName[NonEncodable]}""""))
  }

  private def clsName[T : ClassTag]: String = implicitly[ClassTag[T]].runtimeClass.getName
} 
Example 30
Source File: StreamingPredictionsSpec.scala    From odsc-east-realish-predictions   with Apache License 2.0 4 votes vote down vote up
package com.twilio.open.odsc.realish

import java.sql.Timestamp
import java.time.Instant
import java.util.{Random, UUID}

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoders, SQLContext, SparkSession}
import org.scalatest.{FunSuite, Matchers}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

import scala.concurrent.duration._

class StreamingPredictionsSpec extends FunSuite with Matchers with SharedSparkSql {

  override def conf: SparkConf = {
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("odsc-spark-utils")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)
      .set("spark.driver.host", "localhost")
      .set("spark.sql.session.timeZone", "UTC")
  }

  final val notRandomRandom = {
    val generator = new Random
    generator.setSeed(100L)
    generator
  }

  test("should stream in some mock data for fun") {
    implicit val spark: SparkSession = sparkSql
    import spark.implicits._
    implicit val sqlContext: SQLContext = spark.sqlContext

    implicit val metricEncoder = Encoders.product[Metric]
    val metricData = MemoryStream[Metric]

    val startingInstant = Instant.now()

    val backingData = (1 to 10000).map(offset => {
      val metric = if (offset % 2 == 0) "loss_percentage" else "connect_duration"
      val nextLoss = notRandomRandom.nextDouble() * notRandomRandom.nextInt(100)
      Metric(
        Timestamp.from(startingInstant.minusSeconds(offset)),
        UUID.randomUUID().toString,
        metric,
        value = if (metric == "loss_percentage") nextLoss else notRandomRandom.nextDouble() * notRandomRandom.nextInt(240),
        countryCode = if (offset % 8 == 0) "US" else "BR",
        callDirection = if (metric == "loss_percentage") "inbound" else "outbound"
      )
    })
    val processingTimeTrigger = Trigger.ProcessingTime(2.seconds)


    val streamingQuery = metricData.toDF()
      .withWatermark("timestamp", "2 hours")
      .groupBy(col("metric"), col("countryCode"), window($"timestamp", "5 minutes"))
      .agg(
        min("value") as "min",
        avg("value") as "mean",
        max("value") as "max",
        count("*") as "total"
      )
      .writeStream
      .format("memory")
      .queryName("datastream")
      .outputMode(OutputMode.Append())
      .trigger(processingTimeTrigger)
      .start()

    metricData.addData(backingData)

    streamingQuery.processAllAvailable()

    spark.sql("select * from datastream").show(20, false)

    val checkChange = spark.sql("select * from datastream")
      .groupBy("metric","countryCode")
      .agg(
        sum("total") as "total",
        avg("mean") as "mean"
      )

    checkChange.show(20, false)

    // now can do interesting things with minor back tracking...

    streamingQuery.stop()

  }

} 
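The Metric case class used above lives elsewhere in that project and is not shown. A hedged sketch of the piece the test depends on, with the field shape inferred from the constructor call (the real definition may differ): Encoders.product supplies the implicit encoder that MemoryStream needs.

import java.sql.Timestamp

import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.{Encoder, Encoders, SQLContext, SparkSession}

object MemoryStreamEncoderSketch {
  // Field names and types inferred from the test above; the project's Metric may differ.
  case class Metric(timestamp: Timestamp, callId: String, metric: String,
                    value: Double, countryCode: String, callDirection: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("memory-stream-sketch").getOrCreate()
    implicit val sqlContext: SQLContext = spark.sqlContext

    // MemoryStream[T] requires an implicit Encoder[T]; Encoders.product derives one for the case class.
    implicit val metricEncoder: Encoder[Metric] = Encoders.product[Metric]
    val metrics = MemoryStream[Metric]

    metrics.addData(Metric(new Timestamp(0L), "call-1", "loss_percentage", 1.5, "US", "inbound"))
    spark.stop()
  }
}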
Example 31
Source File: SparkNarrowTest.scala    From spark-tools   with Apache License 2.0 3 votes vote down vote up
package io.univalence

import java.net.URLClassLoader
import java.sql.Date

import io.univalence.centrifuge.Sparknarrow
import org.apache.spark.SparkConf
import org.apache.spark.sql.types._
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite

case class Person(name: String, age: Int, date: Date)

class SparknarrowTest extends FunSuite {

  val conf: SparkConf = new SparkConf()
  conf.setAppName("yo")
  conf.set("spark.sql.caseSensitive", "true")
  conf.setMaster("local[2]")

  implicit val ss: SparkSession = SparkSession.builder.config(conf).getOrCreate
  import ss.implicits._

  test("testBasicCC") {

    val classDef = Sparknarrow.basicCC(Encoders.product[Person].schema).classDef
    checkDefinition(classDef)

  }

  def checkDefinition(scalaCode: String): Unit = {
    // TODO: do a version for Scala 2.11 and 2.12
  }

  test("play with scala eval") {

    val code =
      """
        case class Tata(str: String)
        case class Toto(age: Int, tata: Tata)
      """

    checkDefinition(code)
    checkDefinition(code)

  }

  ignore("printSchema StructType") {
    val yo = StructType(
      Seq(
        StructField("name", StringType),
        StructField("tel", ArrayType(StringType))
      )
    )

    yo.printTreeString()
  }

}
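For context on what Sparknarrow.basicCC receives, a quick hedged sketch of the schema Encoders.product[Person] produces (printTreeString is standard StructType API; the commented output is approximate):

import java.sql.Date

import org.apache.spark.sql.Encoders

object PersonSchemaSketch {
  // Re-declared to keep the sketch self-contained; same shape as the Person case class above.
  case class Person(name: String, age: Int, date: Date)

  def main(args: Array[String]): Unit = {
    // The StructType handed to Sparknarrow.basicCC in the test above.
    val schema = Encoders.product[Person].schema
    schema.printTreeString()
    // Prints roughly: name string, age integer (non-nullable), date date
  }
}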