org.apache.spark.sql.Encoders Scala Examples
The following examples show how to use org.apache.spark.sql.Encoders. Each example is taken from an open-source project; the source file name, project, and license are noted above each listing.
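Before the individual examples, here is a minimal, self-contained sketch of the Encoders factory methods that recur below: Encoders.STRING and Encoders.scalaInt for primitive types, Encoders.product for case classes, and Encoders.kryo for types Spark cannot encode structurally. The Person case class, the object name, and the local SparkSession are illustrative assumptions only and do not come from any of the projects listed here.

import org.apache.spark.sql.{Encoder, Encoders, SparkSession}

// Hypothetical case class, used only for this illustration.
case class Person(name: String, age: Int)

object EncodersQuickTour {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("EncodersQuickTour").getOrCreate()

    // Primitive encoder: build a Dataset[String] by passing the encoder explicitly.
    val names = spark.createDataset(Seq("ada", "grace"))(Encoders.STRING)

    // Product encoder for a case class; it also exposes the derived Spark SQL schema.
    val personEncoder: Encoder[Person] = Encoders.product[Person]
    println(personEncoder.schema.treeString)
    val people = spark.createDataset(Seq(Person("ada", 36), Person("grace", 45)))(personEncoder)

    // Kryo encoder for types Spark cannot encode structurally; rows are stored as a single binary column.
    val raw = spark.createDataset(Seq(Seq(Array(1.0, 2.0))))(Encoders.kryo[Seq[Array[Double]]])

    // Primitive encoders such as scalaInt are typically passed to typed operators like map.
    val totalAge = people.map(_.age)(Encoders.scalaInt).reduce(_ + _)
    println(s"names = ${names.count()}, kryo rows = ${raw.count()}, total age = $totalAge")

    spark.stop()
  }
}

In most application code the same encoders are supplied implicitly via import spark.implicits._, as several of the examples below do; passing them explicitly, as here, is the pattern used when a specific encoder is required (for example the Encoders.kryo usages in the TopByKeyAggregator and MultiStreamHandler examples below).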
Example 1
Source File: RecoverPartitionsCustomIntegrationTest.scala From m3d-engine with Apache License 2.0
package com.adidas.analytics.integration import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} import org.scalatest.FeatureSpec import org.scalatest.Matchers._ import scala.collection.JavaConverters._ class RecoverPartitionsCustomIntegrationTest extends FeatureSpec with BaseIntegrationTest { feature("Partitions can be updated programmatically using custom logic") { scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) uploadParameters(testResourceDir) // checking pre-conditions spark.read.csv(sourceDirPath.toString).count() shouldBe 7 targetTable.read().count() shouldBe 19 fs.exists(targetPath20180101) shouldBe false fs.exists(headerPath20180101) shouldBe false // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() val producedPartitionsNumber: Dataset[String] = spark .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") .as(Encoders.STRING) // MetaData Specific Tests val expectedPartitions: Dataset[String] = expectedPartitionsDataReader .read(spark, expectedPartitionsLocation) .as(Encoders.STRING) expectedPartitions.collectAsList().asScala.sorted.toSet should equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false spark .sql(s"DESCRIBE extended ${targetDatabase}.${tableName} PARTITION(year=2018,month=1,day=1)") .filter("col_name == 'Partition Statistics'") .head() .getAs[String]("data_type").contains("6 rows") shouldBe true fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } }
Example 2
Source File: LikelihoodRatioTest.scala From glow with Apache License 2.0
package io.projectglow.sql.expressions

import breeze.linalg.{DenseMatrix, DenseVector}
import org.apache.spark.ml.linalg.{DenseMatrix => SparkDenseMatrix}
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.StructType

object LikelihoodRatioTest extends LogitTest {

  override type FitState = LRTFitState
  override def fitStatePerPhenotype: Boolean = true
  override val resultSchema: StructType = Encoders.product[LogitTestResults].schema

  override def init(phenotypes: Array[Double], covariates: SparkDenseMatrix): LRTFitState = {
    val nullX = new DenseMatrix(covariates.numRows, covariates.numCols, covariates.values)
    val y = new DenseVector(phenotypes)
    val nullFitState = new NewtonIterationsState(covariates.numRows, covariates.numCols)
    nullFitState.initFromMatrix(nullX, y)
    val nullFit = LogisticRegressionGwas.newtonIterations(nullX, y, nullX.copy, nullFitState)
    val fullFitState = new NewtonIterationsState(covariates.numRows, covariates.numCols + 1)
    val x = DenseMatrix.horzcat(nullX, DenseMatrix.zeros[Double](covariates.numRows, 1))
    LRTFitState(x, x.copy, nullFit, fullFitState)
  }

  override def runTest(
      genotypes: DenseVector[Double],
      phenotypes: DenseVector[Double],
      fitState: LRTFitState): InternalRow = {
    fitState.x(::, -1) := genotypes
    fitState.newtonState.initFromMatrixAndNullFit(fitState.x, phenotypes, fitState.nullFit.args)

    if (!fitState.nullFit.converged) {
      return LogitTestResults.nanRow
    }

    val fullFit = LogisticRegressionGwas.newtonIterations(
      fitState.x,
      phenotypes,
      fitState.hessian,
      fitState.newtonState)

    if (!fullFit.converged) {
      return LogitTestResults.nanRow
    }

    val beta = fullFit.args.b(-1)
    LogisticRegressionGwas.makeStats(
      beta,
      fullFit.args.fisher,
      fullFit.logLkhd,
      fitState.nullFit.logLkhd)
  }
}

case class LRTFitState(
    x: DenseMatrix[Double],
    hessian: DenseMatrix[Double],
    nullFit: NewtonResult,
    newtonState: NewtonIterationsState
)
Example 3
Source File: LocalIntegrationTest.scala From kafka-examples with Apache License 2.0
package com.cloudera.streaming.refapp import java.sql.Timestamp import org.scalatest.Matchers._ import org.scalatest.concurrent.Eventually._ import org.scalatest.time.{Seconds, Span} import org.apache.spark.sql.Encoders class LocalIntegrationTest extends IntegrationTestBase { test("Integration test with one kafka and one spark instance embedded in the same JVM") { val inputDir = "src/test/resources/samples" val spark = EmbeddedSpark.sparkSession val fileSource = new FileSources(spark, inputDir) val kafkaConfig = EmbeddedKafkaBroker.defaultKafkaConfig val kafkaSource = new KafkaSource(spark, kafkaConfig) val application = new Application( spark, Sources( statesFromCluster = fileSource.jsonFile("states"), customersFromCluster = fileSource.jsonFile("customers"), vendorsFromCluster = fileSource.jsonFile("vendors"), customersFromStream = kafkaSource.jsonStreamWithKafkaTimestamp("customer"), vendorsFromStream = kafkaSource.jsonStreamWithTimestampFromMessage("vendor", "update_timestamp"), transactionsFromStream = kafkaSource.jsonStreamWithTimestampFromMessage("transaction", "event_timestamp") ), Sinks( invalidTransactions = Memory.memorySink("invalidTransactions"), validTransactions = Memory.memorySink("validTransactions"), customerOrphans = Memory.memorySink("customerOrphans"), vendorOrphans = Memory.memorySink("vendorOrphans"), customers = Memory.memorySink("customers"), vendors = Memory.memorySink("vendors"), transactionsOperationalMetadata = Memory.memorySink("transactionsOperationalMetadata") )) application.start() eventually(timeout(Span(20, Seconds)), interval(Span(5, Seconds))) { EmbeddedKafkaBroker.publishStringMessageToKafka( "transaction", """{ "transaction_id": "1", "customer_id": 1, "vendor_id": 1, "event_state": "CREATED", "event_timestamp": "2018-11-12 09:42:00", "price": "100", "card_type": "Credit"}""") EmbeddedKafkaBroker.publishStringMessageToKafka( "transaction", """{ "transaction_id": "21", "customer_id": 100, "vendor_id": 2, "event_state": "SWIPED", "event_timestamp": "2018-11-13 09:45:01", "price": "100", "card_type": "Debit"}""") val validTransactionsQuery = application.streamingQueries.validTransactions validTransactionsQuery.processAllAvailable() val currentContent = spark.table("validTransactions").as[Transaction](Encoders.product).collect() currentContent.shouldBe( Array( Transaction( transaction_id = "1", customer_id = Some(1), vendor_id = Some(1), event_state = Some("CREATED"), event_timestamp = Timestamp.valueOf("2018-11-12 09:42:00"), price = Some("100"), card_type = Some("Credit")), Transaction( transaction_id = "21", customer_id = Some(100), vendor_id = Some(2), event_state = Some("SWIPED"), event_timestamp = Timestamp.valueOf("2018-11-13 09:45:01"), price = Some("100"), card_type = Some("Debit")) )) } } }
Example 4
Source File: K-Centers.scala From Clustering4Ever with Apache License 2.0
package org.clustering4ever.clustering.kcenters.dataset

    @annotation.tailrec
    def go(cpt: Int, haveAllCentersConverged: Boolean, centers: List[(Int, V)]): List[(Int, V)] = {
      val preUpdatedCenters = data.groupByKey( cz => obtainNearestCenterID(cz.v, centers, metric) )(encoderInt)
        .mapGroups(computeCenters)(encoder)
        .collect
        .sortBy(_._1)
        .toList
      val alignedOldCenters = preUpdatedCenters.map{ case (oldClusterID, _) => centers(oldClusterID) }
      val updatedCenters = preUpdatedCenters.zipWithIndex.map{ case ((oldClusterID, center), newClusterID) => (newClusterID, center) }
      val shiftingEnough = areCentersNotMovingEnough(updatedCenters, alignedOldCenters, minShift, metric)
      if(cpt < maxIterations && !shiftingEnough) {
        go(cpt + 1, shiftingEnough, updatedCenters)
      }
      else {
        updatedCenters
      }
    }

    immutable.HashMap(go(0, false, centers):_*)
  }
}
Example 5
Source File: A_9_MyAverageByAggregator.scala From wow-spark with MIT License
package com.sev7e0.wow.sql

import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator

case class Employee(name: String, salary: Long)
case class Average(var sum: Long, var count: Long)

object A_9_MyAverageByAggregator extends Aggregator[Employee, Average, Double] {

  override def zero: Average = Average(0L, 0L)

  override def reduce(b: Average, a: Employee): Average = {
    b.sum += a.salary
    b.count += 1
    b
  }

  override def merge(b1: Average, b2: Average): Average = {
    b1.count += b2.count
    b1.sum += b2.sum
    b1
  }

  override def finish(reduction: Average): Double = reduction.sum.toDouble / reduction.count

  override def bufferEncoder: Encoder[Average] = Encoders.product

  override def outputEncoder: Encoder[Double] = Encoders.scalaDouble

  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().master("local").appName("MyAverageByAggregator")
      .getOrCreate()
    // implicit conversions
    import sparkSession.implicits._

    val dataFrame = sparkSession.read.json("src/main/resources/sparkresource/employees.json").as[Employee]
    dataFrame.show()

    val salary_average = A_9_MyAverageByAggregator.toColumn.name("salary_average")
    val frame = dataFrame.select(salary_average)
    frame.show()
  }
}
Example 6
Source File: BEDRelation.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.datasources.BED

import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoders, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
import org.biodatageeks.sequila.utils.{Columns, DataQualityFuncs}

class BEDRelation(path: String)(@transient val sqlContext: SQLContext)
  extends BaseRelation
    with PrunedFilteredScan
    with Serializable {

  @transient val logger = Logger.getLogger(this.getClass.getCanonicalName)

  override def schema: org.apache.spark.sql.types.StructType =
    Encoders.product[org.biodatageeks.formats.BrowserExtensibleData].schema

  private def getValueFromColumn(colName: String, r: Array[String]): Any = {
    colName match {
      case Columns.CONTIG       => DataQualityFuncs.cleanContig(r(0))
      case Columns.START        => r(1).toInt + 1 // Convert interval to 1-based
      case Columns.END          => r(2).toInt
      case Columns.NAME         => if (r.length > 3) Some(r(3)) else None
      case Columns.SCORE        => if (r.length > 4) Some(r(4).toInt) else None
      case Columns.STRAND       => if (r.length > 5) Some(r(5)) else None
      case Columns.THICK_START  => if (r.length > 6) Some(r(6).toInt) else None
      case Columns.THICK_END    => if (r.length > 7) Some(r(7).toInt) else None
      case Columns.ITEM_RGB     => if (r.length > 8) Some(r(8).split(",").map(_.toInt)) else None
      case Columns.BLOCK_COUNT  => if (r.length > 9) Some(r(9).toInt) else None
      case Columns.BLOCK_SIZES  => if (r.length > 10) Some(r(10).split(",").map(_.toInt)) else None
      case Columns.BLOCK_STARTS => if (r.length > 11) Some(r(11).split(",").map(_.toInt)) else None
      case _                    => throw new Exception(s"Unknown column found: ${colName}")
    }
  }

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    sqlContext
      .sparkContext
      .textFile(path)
      .filter(!_.toLowerCase.startsWith("track"))
      .filter(!_.toLowerCase.startsWith("browser"))
      .map(_.split("\t"))
      .map(r => {
        val record = new Array[Any](requiredColumns.length)
        for (i <- 0 to requiredColumns.length - 1) {
          record(i) = getValueFromColumn(requiredColumns(i), r)
        }
        Row.fromSeq(record)
      })
  }
}
Example 7
Source File: EncoderErrorMessageSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.catalyst.encoders import scala.reflect.ClassTag import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Encoders class NonEncodable(i: Int) case class ComplexNonEncodable1(name1: NonEncodable) case class ComplexNonEncodable2(name2: ComplexNonEncodable1) case class ComplexNonEncodable3(name3: Option[NonEncodable]) case class ComplexNonEncodable4(name4: Array[NonEncodable]) case class ComplexNonEncodable5(name5: Option[Array[NonEncodable]]) class EncoderErrorMessageSuite extends SparkFunSuite { // Note: we also test error messages for encoders for private classes in JavaDatasetSuite. // That is done in Java because Scala cannot create truly private classes. test("primitive types in encoders using Kryo serialization") { intercept[UnsupportedOperationException] { Encoders.kryo[Int] } intercept[UnsupportedOperationException] { Encoders.kryo[Long] } intercept[UnsupportedOperationException] { Encoders.kryo[Char] } } test("primitive types in encoders using Java serialization") { intercept[UnsupportedOperationException] { Encoders.javaSerialization[Int] } intercept[UnsupportedOperationException] { Encoders.javaSerialization[Long] } intercept[UnsupportedOperationException] { Encoders.javaSerialization[Char] } } test("nice error message for missing encoder") { val errorMsg1 = intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable1]).getMessage assert(errorMsg1.contains( s"""root class: "${clsName[ComplexNonEncodable1]}"""")) assert(errorMsg1.contains( s"""field (class: "${clsName[NonEncodable]}", name: "name1")""")) val errorMsg2 = intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable2]).getMessage assert(errorMsg2.contains( s"""root class: "${clsName[ComplexNonEncodable2]}"""")) assert(errorMsg2.contains( s"""field (class: "${clsName[ComplexNonEncodable1]}", name: "name2")""")) assert(errorMsg1.contains( s"""field (class: "${clsName[NonEncodable]}", name: "name1")""")) val errorMsg3 = intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable3]).getMessage assert(errorMsg3.contains( s"""root class: "${clsName[ComplexNonEncodable3]}"""")) assert(errorMsg3.contains( s"""field (class: "scala.Option", name: "name3")""")) assert(errorMsg3.contains( s"""option value class: "${clsName[NonEncodable]}"""")) val errorMsg4 = intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable4]).getMessage assert(errorMsg4.contains( s"""root class: "${clsName[ComplexNonEncodable4]}"""")) assert(errorMsg4.contains( s"""field (class: "scala.Array", name: "name4")""")) assert(errorMsg4.contains( s"""array element class: "${clsName[NonEncodable]}"""")) val errorMsg5 = intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable5]).getMessage assert(errorMsg5.contains( s"""root class: "${clsName[ComplexNonEncodable5]}"""")) assert(errorMsg5.contains( s"""field (class: "scala.Option", name: "name5")""")) assert(errorMsg5.contains( s"""option value class: "scala.Array"""")) assert(errorMsg5.contains( s"""array element class: "${clsName[NonEncodable]}"""")) } private def clsName[T : ClassTag]: String = implicitly[ClassTag[T]].runtimeClass.getName }
Example 8
Source File: ReduceAggregatorSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Encoders import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder class ReduceAggregatorSuite extends SparkFunSuite { test("zero value") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) assert(aggregator.zero == (false, null).asInstanceOf[(Boolean, Int)]) } test("reduce, merge and finish") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) val firstReduce = aggregator.reduce(aggregator.zero, 1) assert(firstReduce == ((true, 1))) val secondReduce = aggregator.reduce(firstReduce, 2) assert(secondReduce == ((true, 3))) val thirdReduce = aggregator.reduce(secondReduce, 3) assert(thirdReduce == ((true, 6))) val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce) assert(mergeWithZero1 == ((true, 1))) val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero) assert(mergeWithZero2 == ((true, 3))) val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce) assert(mergeTwoReduced == ((true, 4))) assert(aggregator.finish(firstReduce)== 1) assert(aggregator.finish(secondReduce) == 3) assert(aggregator.finish(thirdReduce) == 6) assert(aggregator.finish(mergeWithZero1) == 1) assert(aggregator.finish(mergeWithZero2) == 3) assert(aggregator.finish(mergeTwoReduced) == 4) } test("requires at least one input row") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) intercept[IllegalStateException] { aggregator.finish(aggregator.zero) } } }
Example 9
Source File: TopByKeyAggregator.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.recommendation

import scala.language.implicitConversions
import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.util.BoundedPriorityQueue

private[recommendation] class TopByKeyAggregator[K1: TypeTag, K2: TypeTag, V: TypeTag]
  (num: Int, ord: Ordering[(K2, V)])
  extends Aggregator[(K1, K2, V), BoundedPriorityQueue[(K2, V)], Array[(K2, V)]] {

  override def zero: BoundedPriorityQueue[(K2, V)] = new BoundedPriorityQueue[(K2, V)](num)(ord)

  override def reduce(
      q: BoundedPriorityQueue[(K2, V)],
      a: (K1, K2, V)): BoundedPriorityQueue[(K2, V)] = {
    q += {(a._2, a._3)}
  }

  override def merge(
      q1: BoundedPriorityQueue[(K2, V)],
      q2: BoundedPriorityQueue[(K2, V)]): BoundedPriorityQueue[(K2, V)] = {
    q1 ++= q2
  }

  override def finish(r: BoundedPriorityQueue[(K2, V)]): Array[(K2, V)] = {
    r.toArray.sorted(ord.reverse)
  }

  override def bufferEncoder: Encoder[BoundedPriorityQueue[(K2, V)]] = {
    Encoders.kryo[BoundedPriorityQueue[(K2, V)]]
  }

  override def outputEncoder: Encoder[Array[(K2, V)]] = ExpressionEncoder[Array[(K2, V)]]()
}
Example 10
Source File: GeolocationMapVectorizer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature import com.salesforce.op.UID import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.sequence.{SequenceEstimator, SequenceModel} import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.DoubleArrayParam import org.apache.spark.sql.{Dataset, Encoders} class GeolocationMapVectorizer ( operationName: String = "vecGeoMap", uid: String = UID[GeolocationMapVectorizer] ) extends SequenceEstimator[GeolocationMap, OPVector](operationName = operationName, uid = uid) with MapVectorizerFuns[Seq[Double], GeolocationMap] with TrackNullsParam { private implicit val seqArrayEncoder = Encoders.kryo[Seq[Array[Double]]] final val defaultValue = new DoubleArrayParam( parent = this, name = "defaultValue", doc = "value to give missing keys when pivoting" ) setDefault(defaultValue, TransmogrifierDefaults.DefaultGeolocation.toArray) def setDefaultValue(value: Geolocation): this.type = set(defaultValue, value.toArray) override def makeVectorMetadata(allKeys: Seq[Seq[String]]): OpVectorMetadata = { val meta = vectorMetadataFromInputFeatures val cols = for { (keys, col) <- allKeys.zip(meta.columns) key <- keys nm <- Geolocation.Names } yield new OpVectorColumnMetadata( parentFeatureName = col.parentFeatureName, parentFeatureType = col.parentFeatureType, grouping = Option(key), descriptorValue = Option(nm) ) meta.withColumns(cols.toArray) } override def makeVectorMetaWithNullIndicators(allKeys: Seq[Seq[String]]): OpVectorMetadata = { val vectorMeta = makeVectorMetadata(allKeys) val updatedCols = vectorMeta.columns.grouped(3).flatMap { col => { val head = col.head col :+ OpVectorColumnMetadata( parentFeatureName = head.parentFeatureName, parentFeatureType = head.parentFeatureType, grouping = head.grouping, indicatorValue = Some(TransmogrifierDefaults.NullString) ) } }.toArray vectorMeta.withColumns(updatedCols) } def fitFn(dataset: Dataset[Seq[GeolocationMap#Value]]): SequenceModel[GeolocationMap, OPVector] = { val shouldClean = $(cleanKeys) val defValue = $(defaultValue).toSeq val allKeys = getKeyValues(dataset, shouldClean, shouldCleanValues = false) val trackNullsValue = $(trackNulls) val meta = if (trackNullsValue) makeVectorMetaWithNullIndicators(allKeys) else makeVectorMetadata(allKeys) setMetadata(meta.toMetadata) new GeolocationMapVectorizerModel( allKeys = allKeys, defaultValue = defValue, shouldClean = shouldClean, trackNulls = trackNullsValue, operationName = operationName, uid = uid ) } } final class GeolocationMapVectorizerModel private[op] ( val allKeys: Seq[Seq[String]], val defaultValue: Seq[Double], val shouldClean: Boolean, val trackNulls: Boolean, operationName: String, uid: String ) extends SequenceModel[GeolocationMap, OPVector](operationName = operationName, uid = uid) with CleanTextMapFun { def transformFn: Seq[GeolocationMap] => OPVector = row => { val eachPivoted: Array[Array[Double]] = row.map(_.value).zip(allKeys).flatMap { case (map, keys) => val cleanedMap = cleanMap(map, shouldClean, shouldCleanValue = false) keys.map(k => { val vOpt = cleanedMap.get(k) val isEmpty = vOpt.isEmpty val v = vOpt.getOrElse(defaultValue).toArray if (trackNulls) v :+ (if (isEmpty) 1.0 else 0.0) else v }) }.toArray Vectors.dense(eachPivoted.flatten).compressed.toOPVector } }
Example 11
Source File: ReduceAggregatorSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Encoders import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder class ReduceAggregatorSuite extends SparkFunSuite { test("zero value") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) assert(aggregator.zero == (false, null)) } test("reduce, merge and finish") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) val firstReduce = aggregator.reduce(aggregator.zero, 1) assert(firstReduce == (true, 1)) val secondReduce = aggregator.reduce(firstReduce, 2) assert(secondReduce == (true, 3)) val thirdReduce = aggregator.reduce(secondReduce, 3) assert(thirdReduce == (true, 6)) val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce) assert(mergeWithZero1 == (true, 1)) val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero) assert(mergeWithZero2 == (true, 3)) val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce) assert(mergeTwoReduced == (true, 4)) assert(aggregator.finish(firstReduce)== 1) assert(aggregator.finish(secondReduce) == 3) assert(aggregator.finish(thirdReduce) == 6) assert(aggregator.finish(mergeWithZero1) == 1) assert(aggregator.finish(mergeWithZero2) == 3) assert(aggregator.finish(mergeTwoReduced) == 4) } test("requires at least one input row") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) intercept[IllegalStateException] { aggregator.finish(aggregator.zero) } } }
Example 12
Source File: SimpleJsonIngestionJob.scala From comet-data-pipeline with Apache License 2.0
package com.ebiznext.comet.job.ingest

import com.ebiznext.comet.config.Settings
import com.ebiznext.comet.schema.handlers.{SchemaHandler, StorageHandler}
import com.ebiznext.comet.schema.model._
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.{DataFrame, Encoders}

import scala.util.{Failure, Success, Try}

class SimpleJsonIngestionJob(
  domain: Domain,
  schema: Schema,
  types: List[Type],
  path: List[Path],
  storageHandler: StorageHandler,
  schemaHandler: SchemaHandler
)(implicit settings: Settings)
    extends DsvIngestionJob(domain, schema, types, path, storageHandler, schemaHandler) {

  override def loadDataSet(): Try[DataFrame] = {
    try {
      val df =
        if (metadata.isArray()) {
          val jsonRDD =
            session.sparkContext.wholeTextFiles(path.map(_.toString).mkString(",")).map(_._2)
          session.read
            .json(session.createDataset(jsonRDD)(Encoders.STRING))
            .withColumn(
              // Spark cannot detect the input file automatically, so we should add it explicitly
              Settings.cometInputFileNameColumn,
              if (settings.comet.grouped) lit(path.map(_.toString).mkString(","))
              else lit(path.head.toString)
            )
        } else {
          session.read
            .option("encoding", metadata.getEncoding())
            .option("multiline", metadata.getMultiline())
            .json(path.map(_.toString): _*)
            .withColumn(
              // Spark here can detect the input file automatically, so we're just using the input_file_name spark function
              Settings.cometInputFileNameColumn,
              org.apache.spark.sql.functions.input_file_name()
            )
        }

      import session.implicits._
      val resDF =
        if (df.columns.contains("_corrupt_record")) {
          // TODO send rejected records to rejected area
          logger.whenDebugEnabled {
            df.filter($"_corrupt_record".isNotNull).show(1000, false)
          }
          throw new Exception(
            s"""Invalid JSON File: ${path
              .map(_.toString)
              .mkString(",")}. SIMPLE_JSON require a valid json file """
          )
        } else {
          df
        }
      Success(resDF)
    } catch {
      case e: Exception => Failure(e)
    }
  }
}
Example 13
Source File: JsonIngestionJob.scala From comet-data-pipeline with Apache License 2.0
package com.ebiznext.comet.job.ingest

import com.ebiznext.comet.config.{DatasetArea, Settings, StorageArea}
import com.ebiznext.comet.schema.handlers.{SchemaHandler, StorageHandler}
import com.ebiznext.comet.schema.model._
import org.apache.hadoop.fs.Path
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.datasources.json.JsonIngestionUtil
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Encoders, Row}

import scala.util.{Failure, Success, Try}

  @deprecated("We let Spark compute the final schema", "")
  def saveAccepted(acceptedRDD: RDD[Row]): Path = {
    val writeMode = metadata.getWriteMode()
    val acceptedPath = new Path(DatasetArea.accepted(domain.name), schema.name)
    saveRows(
      session.createDataFrame(acceptedRDD, schemaSparkType),
      acceptedPath,
      writeMode,
      StorageArea.accepted,
      schema.merge.isDefined
    )
    acceptedPath
  }

  override def name: String = "JsonJob"
}
Example 14
Source File: VerifyIsolationForest.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.isolationforest import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.core.env.FileUtilities import com.microsoft.ml.spark.core.metrics.MetricConstants import com.microsoft.ml.spark.core.test.benchmarks.Benchmarks import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.{DataFrame, Dataset, Encoders, Row} import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject} import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.scalactic.Tolerance._ import com.microsoft.ml.spark.train.ComputeModelStatistics case class MammographyRecord(feature0: Double, feature1: Double, feature2: Double, feature3: Double, feature4: Double, feature5: Double, label: Double) case class ScoringResult(features: Vector, label: Double, predictedLabel: Double, outlierScore: Double) class VerifyIsolationForest extends Benchmarks with EstimatorFuzzing[IsolationForest] { test ("Verify isolationForestMammographyDataTest") { import session.implicits._ val data = loadMammographyData // Train a new isolation forest model val contamination = 0.02 val isolationForest = new IsolationForest() .setNumEstimators(100) .setBootstrap(false) .setMaxSamples(256) .setMaxFeatures(1.0) .setFeaturesCol("features") .setPredictionCol("predictedLabel") .setScoreCol("outlierScore") .setContamination(0.02) .setContaminationError(contamination * 0.01) .setRandomSeed(1) // Score all training data instances using the new model val isolationForestModel = isolationForest.fit(data) // Calculate area under ROC curve and assert val scores = isolationForestModel.transform(data).as[ScoringResult] val metrics = new ComputeModelStatistics() .setEvaluationMetric(MetricConstants.AucSparkMetric) .setLabelCol("label") .setScoredLabelsCol("predictedLabel") .setScoresCol("outlierScore") .transform(scores) // Expectation from results in the 2008 "Isolation Forest" paper by F. T. Liu, et al. val aurocExpectation = 0.86 val uncert = 0.02 val auroc = metrics.first().getDouble(1) assert(auroc === aurocExpectation +- uncert, "expected area under ROC =" + s" $aurocExpectation +/- $uncert, but observed $auroc") } def loadMammographyData(): DataFrame = { import session.implicits._ val mammographyRecordSchema = Encoders.product[MammographyRecord].schema val fileLocation = FileUtilities.join(BuildInfo.datasetDir,"IsolationForest", "mammography.csv").toString // Open source dataset from http://odds.cs.stonybrook.edu/mammography-dataset/ val rawData = session.read .format("csv") .option("comment", "#") .option("header", "false") .schema(mammographyRecordSchema) .load(fileLocation) val assembler = new VectorAssembler() .setInputCols(Array("feature0", "feature1", "feature2", "feature3", "feature4", "feature5")) .setOutputCol("features") val data = assembler .transform(rawData) .select("features", "label") data } override def reader: MLReadable[_] = IsolationForest override def modelReader: MLReadable[_] = IsolationForestModel override def testObjects(): Seq[TestObject[IsolationForest]] = { val dataset = loadMammographyData.toDF Seq(new TestObject( new IsolationForest(), dataset)) } }
Example 15
Source File: SparkRecoverPartitionsCustomIntegrationTest.scala From m3d-engine with Apache License 2.0
package com.adidas.analytics.integration import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} import org.scalatest.FeatureSpec import org.scalatest.Matchers._ import scala.collection.JavaConverters._ class SparkRecoverPartitionsCustomIntegrationTest extends FeatureSpec with BaseIntegrationTest { feature("Partitions can be updated programmatically using custom logic") { scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) uploadParameters(testResourceDir) // checking pre-conditions spark.read.csv(sourceDirPath.toString).count() shouldBe 7 targetTable.read().count() shouldBe 19 fs.exists(targetPath20180101) shouldBe false fs.exists(headerPath20180101) shouldBe false // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() val producedPartitionsNumber: Dataset[String] = spark .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") .as(Encoders.STRING) // MetaData Specific Tests val expectedPartitions: Dataset[String] = expectedPartitionsDataReader .read(spark, expectedPartitionsLocation) .as(Encoders.STRING) expectedPartitions.collectAsList().asScala.sorted.toSet should equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } }
Example 16
Source File: SparkRecoverPartitionsNativeIntegrationTest.scala From m3d-engine with Apache License 2.0
package com.adidas.analytics.integration import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} import org.scalatest.FeatureSpec import org.scalatest.Matchers._ import scala.collection.JavaConverters._ class SparkRecoverPartitionsNativeIntegrationTest extends FeatureSpec with BaseIntegrationTest { feature("Partitions can be updated with native spark.recoverPartitions()") { scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) uploadParameters(testResourceDir) // checking pre-conditions spark.read.csv(sourceDirPath.toString).count() shouldBe 7 targetTable.read().count() shouldBe 19 fs.exists(targetPath20180101) shouldBe false fs.exists(headerPath20180101) shouldBe false // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() val producedPartitionsNumber: Dataset[String] = spark .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") .as(Encoders.STRING) // MetaData Specific Tests val expectedPartitions: Dataset[String] = expectedPartitionsDataReader .read(spark, expectedPartitionsLocation) .as(Encoders.STRING) expectedPartitions.collectAsList().asScala.sorted.toSet should equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } }
Example 17
Source File: RecoverPartitionsNativeIntegrationTest.scala From m3d-engine with Apache License 2.0
package com.adidas.analytics.integration import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} import org.scalatest.FeatureSpec import org.scalatest.Matchers._ import scala.collection.JavaConverters._ class RecoverPartitionsNativeIntegrationTest extends FeatureSpec with BaseIntegrationTest { feature("Partitions can be updated with native spark.recoverPartitions()") { scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) uploadParameters(testResourceDir) // checking pre-conditions spark.read.csv(sourceDirPath.toString).count() shouldBe 7 targetTable.read().count() shouldBe 19 fs.exists(targetPath20180101) shouldBe false fs.exists(headerPath20180101) shouldBe false // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() val producedPartitionsNumber: Dataset[String] = spark .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") .as(Encoders.STRING) // MetaData Specific Tests val expectedPartitions: Dataset[String] = expectedPartitionsDataReader .read(spark, expectedPartitionsLocation) .as(Encoders.STRING) expectedPartitions.collectAsList().asScala.sorted.toSet should equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false spark .sql(s"DESCRIBE extended ${targetDatabase}.${tableName} PARTITION(year=2018,month=1,day=1)") .filter("col_name == 'Partition Statistics'") .head() .getAs[String]("data_type").contains("6 rows") shouldBe true fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } }
Example 18
Source File: SessionLifecycle.scala From Scala-for-Machine-Learning-Second-Edition with MIT License
package org.scalaml.spark

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}

private[spark] object DatasetGenerator {

  // Generation of a dataset of type {Double, Double} with a by-name initialization function
  final def toDSPairDouble(
    numDataPoints: Int
  )(
    generator: Int => (Double, Double)
  )(implicit sessionLifeCycle: SessionLifeCycle): Dataset[(Double, Double)] =
    toDSPairDouble(Seq.tabulate(numDataPoints)(generator(_)))

  // Generation of a dataset of type {Double, Double} from a sequence of same type
  def toDSPairDouble(
    data: Seq[(Double, Double)]
  )(implicit sessionLifeCycle: SessionLifeCycle): Dataset[(Double, Double)] = {
    import sessionLifeCycle.sparkSession.implicits._
    data.toDS()
  }

  // Generation of a dataset of type Double
  def toDSDouble(data: Seq[Double])(implicit sessionLifeCycle: SessionLifeCycle): Dataset[Double] = {
    import sessionLifeCycle.sparkSession.implicits._
    data.toDS()
  }

  // Generation of a dataset of type Int
  def toDSInt(data: Seq[Int])(implicit sessionLifeCycle: SessionLifeCycle): Dataset[Int] = {
    import sessionLifeCycle.sparkSession.implicits._
    data.toDS()
  }
}

// -------------------------- EOF ----------------------------------------------
Example 19
Source File: MultiStreamHandler.scala From structured-streaming-application with Apache License 2.0
package knolx.spark

import knolx.Config._
import knolx.KnolXLogger
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{Encoders, SparkSession}

case class CurrentPowerConsumption(kwh: Double)

case class PowerConsumptionStatus(numOfReadings: Long, total: Double, avg: Double, status: String) {
  def compute(newReadings: List[Double]) = {
    val newTotal = newReadings.sum + total
    val newNumOfReadings = numOfReadings + newReadings.size
    val newAvg = newTotal / newNumOfReadings.toDouble
    PowerConsumptionStatus(newNumOfReadings, newTotal, newAvg, "ON")
  }
}

object MultiStreamHandler extends App with KnolXLogger {
  info("Creating Spark Session")
  val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  val updateStateFunc =
    (deviceId: String, newReadings: Iterator[(String, CurrentPowerConsumption)], state: GroupState[PowerConsumptionStatus]) => {
      val data = newReadings.toList.map { case(_, reading) => reading }.map(_.kwh)
      lazy val initialPowerConsumptionStatus = PowerConsumptionStatus(0L, 0D, 0D, "OFF")
      val currentState = state.getOption.fold(initialPowerConsumptionStatus.compute(data))(_.compute(data))

      val currentStatus =
        if(state.hasTimedOut) {
          // If we do not receive any reading, for a device, we will assume that it is OFF.
          currentState.copy(status = "OFF")
        } else {
          state.setTimeoutDuration("10 seconds")
          currentState
        }

      state.update(currentStatus)
      (deviceId, currentStatus)
    }

  info("Creating Streaming DF...")
  val dataStream =
    spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServer)
      .option("subscribe", topic)
      .option("failOnDataLoss", false)
      .option("includeTimestamp", true)
      .load()

  info("Writing data to Console...")
  import spark.implicits._

  implicit val currentPowerConsumptionEncoder = Encoders.kryo[CurrentPowerConsumption]
  implicit val powerConsumptionStatusEncoder = Encoders.kryo[PowerConsumptionStatus]

  val query =
    dataStream
      .select(col("key").cast(StringType).as("key"), col("value").cast(StringType).as("value"))
      .as[(String, String)]
      .map { case(deviceId, unit) => (deviceId, CurrentPowerConsumption(Option(unit).fold(0D)(_.toDouble))) }
      .groupByKey { case(deviceId, _) => deviceId }
      .mapGroupsWithState[PowerConsumptionStatus, (String, PowerConsumptionStatus)](GroupStateTimeout.ProcessingTimeTimeout())(updateStateFunc)
      .toDF("deviceId", "current_status")
      .writeStream
      .format("console")
      .option("truncate", false)
      .outputMode(OutputMode.Update())
      .option("checkpointLocation", checkPointDir)
      .start()

  info("Waiting for the query to terminate...")
  query.awaitTermination()
  query.stop()
}
Example 20
Source File: TestCsvData.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.sql.{Dataset, Encoders, SparkSession}

private[csv] trait TestCsvData {
  protected def spark: SparkSession

  def sampledTestData: Dataset[String] = {
    spark.range(0, 100, 1).map { index =>
      val predefinedSample = Set[Long](2, 8, 15, 27, 30, 34, 35, 37, 44, 46, 57, 62, 68, 72)
      if (predefinedSample.contains(index)) {
        index.toString
      } else {
        (index.toDouble + 0.1).toString
      }
    }(Encoders.STRING)
  }
}
Example 25
Source File: ReduceAggregatorSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class ReduceAggregatorSuite extends SparkFunSuite {

  test("zero value") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)
    assert(aggregator.zero == (false, null).asInstanceOf[(Boolean, Int)])
  }

  test("reduce, merge and finish") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    val firstReduce = aggregator.reduce(aggregator.zero, 1)
    assert(firstReduce == ((true, 1)))

    val secondReduce = aggregator.reduce(firstReduce, 2)
    assert(secondReduce == ((true, 3)))

    val thirdReduce = aggregator.reduce(secondReduce, 3)
    assert(thirdReduce == ((true, 6)))

    val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce)
    assert(mergeWithZero1 == ((true, 1)))

    val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero)
    assert(mergeWithZero2 == ((true, 3)))

    val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce)
    assert(mergeTwoReduced == ((true, 4)))

    assert(aggregator.finish(firstReduce) == 1)
    assert(aggregator.finish(secondReduce) == 3)
    assert(aggregator.finish(thirdReduce) == 6)
    assert(aggregator.finish(mergeWithZero1) == 1)
    assert(aggregator.finish(mergeWithZero2) == 3)
    assert(aggregator.finish(mergeTwoReduced) == 4)
  }

  test("requires at least one input row") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    intercept[IllegalStateException] {
      aggregator.finish(aggregator.zero)
    }
  }
}
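ReduceAggregator itself is an internal class; at the user level, Encoders.scalaInt most commonly appears when building typed Datasets of primitives. A minimal sketch, assuming a local SparkSession purely for illustration:

import org.apache.spark.sql.{Dataset, Encoders, SparkSession}

object ScalaIntEncoderSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("scalaInt-sketch").getOrCreate()

  // Explicit primitive encoder instead of importing spark.implicits._.
  val ints: Dataset[Int] = spark.createDataset(Seq(1, 2, 3, 4))(Encoders.scalaInt)
  // Typed reduction over the Dataset; prints 10 here.
  println(ints.reduce(_ + _))

  spark.stop()
}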
Example 26
Source File: EncoderErrorMessageSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.encoders

import scala.reflect.ClassTag

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders

class NonEncodable(i: Int)

case class ComplexNonEncodable1(name1: NonEncodable)

case class ComplexNonEncodable2(name2: ComplexNonEncodable1)

case class ComplexNonEncodable3(name3: Option[NonEncodable])

case class ComplexNonEncodable4(name4: Array[NonEncodable])

case class ComplexNonEncodable5(name5: Option[Array[NonEncodable]])

class EncoderErrorMessageSuite extends SparkFunSuite {

  // Note: we also test error messages for encoders for private classes in JavaDatasetSuite.
  // That is done in Java because Scala cannot create truly private classes.

  test("primitive types in encoders using Kryo serialization") {
    intercept[UnsupportedOperationException] { Encoders.kryo[Int] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Long] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Char] }
  }

  test("primitive types in encoders using Java serialization") {
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Int] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Long] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Char] }
  }

  test("nice error message for missing encoder") {
    val errorMsg1 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable1]).getMessage
    assert(errorMsg1.contains(s"""root class: "${clsName[ComplexNonEncodable1]}""""))
    assert(errorMsg1.contains(s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg2 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable2]).getMessage
    assert(errorMsg2.contains(s"""root class: "${clsName[ComplexNonEncodable2]}""""))
    assert(errorMsg2.contains(s"""field (class: "${clsName[ComplexNonEncodable1]}", name: "name2")"""))
    assert(errorMsg1.contains(s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg3 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable3]).getMessage
    assert(errorMsg3.contains(s"""root class: "${clsName[ComplexNonEncodable3]}""""))
    assert(errorMsg3.contains(s"""field (class: "scala.Option", name: "name3")"""))
    assert(errorMsg3.contains(s"""option value class: "${clsName[NonEncodable]}""""))

    val errorMsg4 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable4]).getMessage
    assert(errorMsg4.contains(s"""root class: "${clsName[ComplexNonEncodable4]}""""))
    assert(errorMsg4.contains(s"""field (class: "scala.Array", name: "name4")"""))
    assert(errorMsg4.contains(s"""array element class: "${clsName[NonEncodable]}""""))

    val errorMsg5 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable5]).getMessage
    assert(errorMsg5.contains(s"""root class: "${clsName[ComplexNonEncodable5]}""""))
    assert(errorMsg5.contains(s"""field (class: "scala.Option", name: "name5")"""))
    assert(errorMsg5.contains(s"""option value class: "scala.Array""""))
    assert(errorMsg5.contains(s"""array element class: "${clsName[NonEncodable]}""""))
  }

  private def clsName[T: ClassTag]: String = implicitly[ClassTag[T]].runtimeClass.getName
}
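The suite above shows that reflection-based encoder derivation fails for classes like NonEncodable. The usual fallback is a Kryo (or Java serialization) encoder, which accepts arbitrary non-primitive classes. A minimal sketch with a hypothetical Opaque class mirroring NonEncodable:

import org.apache.spark.sql.{Encoder, Encoders}

// Hypothetical class with no case-class/product structure.
class Opaque(val i: Int)

object KryoFallbackSketch {
  // Succeeds even though ExpressionEncoder derivation would not;
  // the trade-off is an opaque binary column in the resulting Dataset.
  val opaqueEncoder: Encoder[Opaque] = Encoders.kryo[Opaque]
}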
Example 27
Source File: KuduEventsHandler.scala From daf with BSD 3-Clause "New" or "Revised" License | 5 votes |
package it.teamdigitale.storage

import it.teamdigitale.EventModel.{KuduEvent, StorableEvent}
import it.teamdigitale.config.IotIngestionManagerConfig.KuduConfig
import org.apache.kudu.client.{CreateTableOptions, KuduException}
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.kudu.{Schema, Type}
import org.apache.spark.sql.{DataFrame, Encoders}
import org.apache.logging.log4j.LogManager

import scala.collection.convert.decorateAsJava._

object KuduEventsHandler {
  implicit private val alogger = LogManager.getLogger(this.getClass)

  val primaryKeys = List("source", "ts", "metric_id")

  def getOrCreateTable(kuduContext: KuduContext, kuduConfig: KuduConfig): Unit = {
    if (!kuduContext.tableExists(kuduConfig.eventsTableName)) {
      try {
        // Derive the Kudu table schema from the KuduEvent case class via its encoder.
        val schema = Encoders.product[KuduEvent].schema
        val table = kuduContext.createTable(
          kuduConfig.eventsTableName,
          schema,
          primaryKeys,
          new CreateTableOptions()
            .setRangePartitionColumns(List("ts").asJava)
            .addHashPartitions(List("source").asJava, kuduConfig.eventsNumberBuckets)
        )
        alogger.info(s"Created table ${table.getName}")
      } catch {
        case ex: KuduException if ex.getStatus.isAlreadyPresent =>
          alogger.error(s"Cannot create the table ${kuduConfig.eventsTableName} due to the error: ${ex.getMessage}")
        case ex: Throwable =>
          alogger.error(s"Cannot create the table ${kuduConfig.eventsTableName} due to the error: ${ex.getMessage}")
      }
    }
  }

  def write(df: DataFrame, kuduContext: KuduContext, kuduConfig: KuduConfig): Unit = {
    kuduContext.insertIgnoreRows(df, kuduConfig.eventsTableName)
  }
}
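The key Encoders usage here is deriving a StructType for table creation from a case class. A minimal sketch of that pattern in isolation, with a hypothetical SensorEvent standing in for KuduEvent (which is defined elsewhere in this project):

import org.apache.spark.sql.Encoders
import org.apache.spark.sql.types.StructType

// Hypothetical event shape; field names only echo the primary keys used above.
case class SensorEvent(source: String, ts: Long, metric_id: String, value: Double)

object SchemaFromCaseClassSketch extends App {
  // Encoders.product derives one StructField per case-class field.
  val schema: StructType = Encoders.product[SensorEvent].schema
  schema.printTreeString()
}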
Example 28
Source File: ReduceAggregatorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class ReduceAggregatorSuite extends SparkFunSuite {

  test("zero value") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)
    assert(aggregator.zero == (false, null))
  }

  test("reduce, merge and finish") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    val firstReduce = aggregator.reduce(aggregator.zero, 1)
    assert(firstReduce == (true, 1))

    val secondReduce = aggregator.reduce(firstReduce, 2)
    assert(secondReduce == (true, 3))

    val thirdReduce = aggregator.reduce(secondReduce, 3)
    assert(thirdReduce == (true, 6))

    val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce)
    assert(mergeWithZero1 == (true, 1))

    val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero)
    assert(mergeWithZero2 == (true, 3))

    val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce)
    assert(mergeTwoReduced == (true, 4))

    assert(aggregator.finish(firstReduce) == 1)
    assert(aggregator.finish(secondReduce) == 3)
    assert(aggregator.finish(thirdReduce) == 6)
    assert(aggregator.finish(mergeWithZero1) == 1)
    assert(aggregator.finish(mergeWithZero2) == 3)
    assert(aggregator.finish(mergeTwoReduced) == 4)
  }

  test("requires at least one input row") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    intercept[IllegalStateException] {
      aggregator.finish(aggregator.zero)
    }
  }
}
Example 29
Source File: EncoderErrorMessageSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.encoders

import scala.reflect.ClassTag

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders

class NonEncodable(i: Int)

case class ComplexNonEncodable1(name1: NonEncodable)

case class ComplexNonEncodable2(name2: ComplexNonEncodable1)

case class ComplexNonEncodable3(name3: Option[NonEncodable])

case class ComplexNonEncodable4(name4: Array[NonEncodable])

case class ComplexNonEncodable5(name5: Option[Array[NonEncodable]])

class EncoderErrorMessageSuite extends SparkFunSuite {

  // Note: we also test error messages for encoders for private classes in JavaDatasetSuite.
  // That is done in Java because Scala cannot create truly private classes.

  test("primitive types in encoders using Kryo serialization") {
    intercept[UnsupportedOperationException] { Encoders.kryo[Int] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Long] }
    intercept[UnsupportedOperationException] { Encoders.kryo[Char] }
  }

  test("primitive types in encoders using Java serialization") {
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Int] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Long] }
    intercept[UnsupportedOperationException] { Encoders.javaSerialization[Char] }
  }

  test("nice error message for missing encoder") {
    val errorMsg1 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable1]).getMessage
    assert(errorMsg1.contains(s"""root class: "${clsName[ComplexNonEncodable1]}""""))
    assert(errorMsg1.contains(s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg2 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable2]).getMessage
    assert(errorMsg2.contains(s"""root class: "${clsName[ComplexNonEncodable2]}""""))
    assert(errorMsg2.contains(s"""field (class: "${clsName[ComplexNonEncodable1]}", name: "name2")"""))
    assert(errorMsg1.contains(s"""field (class: "${clsName[NonEncodable]}", name: "name1")"""))

    val errorMsg3 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable3]).getMessage
    assert(errorMsg3.contains(s"""root class: "${clsName[ComplexNonEncodable3]}""""))
    assert(errorMsg3.contains(s"""field (class: "scala.Option", name: "name3")"""))
    assert(errorMsg3.contains(s"""option value class: "${clsName[NonEncodable]}""""))

    val errorMsg4 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable4]).getMessage
    assert(errorMsg4.contains(s"""root class: "${clsName[ComplexNonEncodable4]}""""))
    assert(errorMsg4.contains(s"""field (class: "scala.Array", name: "name4")"""))
    assert(errorMsg4.contains(s"""array element class: "${clsName[NonEncodable]}""""))

    val errorMsg5 =
      intercept[UnsupportedOperationException](ExpressionEncoder[ComplexNonEncodable5]).getMessage
    assert(errorMsg5.contains(s"""root class: "${clsName[ComplexNonEncodable5]}""""))
    assert(errorMsg5.contains(s"""field (class: "scala.Option", name: "name5")"""))
    assert(errorMsg5.contains(s"""option value class: "scala.Array""""))
    assert(errorMsg5.contains(s"""array element class: "${clsName[NonEncodable]}""""))
  }

  private def clsName[T: ClassTag]: String = implicitly[ClassTag[T]].runtimeClass.getName
}
Example 30
Source File: StreamingPredictionsSpec.scala From odsc-east-realish-predictions with Apache License 2.0 | 4 votes |
package com.twilio.open.odsc.realish

import java.sql.Timestamp
import java.time.Instant
import java.util.{Random, UUID}

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoders, SQLContext, SparkSession}
import org.scalatest.{FunSuite, Matchers}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

import scala.concurrent.duration._

class StreamingPredictionsSpec extends FunSuite with Matchers with SharedSparkSql {

  override def conf: SparkConf = {
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("odsc-spark-utils")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)
      .set("spark.driver.host", "localhost")
      .set("spark.sql.session.timeZone", "UTC")
  }

  final val notRandomRandom = {
    val generator = new Random
    generator.setSeed(100L)
    generator
  }

  test("should stream in some mock data for fun") {
    implicit val spark: SparkSession = sparkSql
    import spark.implicits._
    implicit val sqlContext: SQLContext = spark.sqlContext

    implicit val metricEncoder = Encoders.product[Metric]
    val metricData = MemoryStream[Metric]
    val startingInstant = Instant.now()

    val backingData = (1 to 10000).map(offset => {
      val metric = if (offset % 2 == 0) "loss_percentage" else "connect_duration"
      val nextLoss = notRandomRandom.nextDouble() * notRandomRandom.nextInt(100)
      Metric(
        Timestamp.from(startingInstant.minusSeconds(offset)),
        UUID.randomUUID().toString,
        metric,
        value = if (metric == "loss_percentage") nextLoss else notRandomRandom.nextDouble() * notRandomRandom.nextInt(240),
        countryCode = if (offset % 8 == 0) "US" else "BR",
        callDirection = if (metric == "loss_percentage") "inbound" else "outbound"
      )
    })
    val processingTimeTrigger = Trigger.ProcessingTime(2.seconds)

    val streamingQuery = metricData.toDF()
      .withWatermark("timestamp", "2 hours")
      .groupBy(col("metric"), col("countryCode"), window($"timestamp", "5 minutes"))
      .agg(
        min("value") as "min",
        avg("value") as "mean",
        max("value") as "max",
        count("*") as "total"
      )
      .writeStream
      .format("memory")
      .queryName("datastream")
      .outputMode(OutputMode.Append())
      .trigger(processingTimeTrigger)
      .start()

    metricData.addData(backingData)
    streamingQuery.processAllAvailable()

    spark.sql("select * from datastream").show(20, false)

    val checkChange = spark.sql("select * from datastream")
      .groupBy("metric", "countryCode")
      .agg(
        sum("total") as "total",
        avg("mean") as "mean"
      )

    checkChange.show(20, false)

    // now we can do interesting things with minor back tracking...
    streamingQuery.stop()
  }
}
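The Metric case class used above is defined elsewhere in this project. A plausible shape, inferred only from the fields and columns referenced in the test (the second field name is purely hypothetical), would be:

import java.sql.Timestamp

// Assumed shape; only the field names used above (timestamp, metric, value, countryCode,
// callDirection) are grounded in the example, the "id" name is a guess.
case class Metric(
  timestamp: Timestamp,
  id: String,
  metric: String,
  value: Double,
  countryCode: String,
  callDirection: String
)

With such a product type in scope, Encoders.product[Metric] supplies the implicit encoder that MemoryStream[Metric] requires alongside the implicit SQLContext.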
Example 31
Source File: SparkNarrowTest.scala From spark-tools with Apache License 2.0 | 3 votes |
package io.univalence

import java.net.URLClassLoader
import java.sql.Date

import io.univalence.centrifuge.Sparknarrow
import org.apache.spark.SparkConf
import org.apache.spark.sql.types._
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite

case class Person(name: String, age: Int, date: Date)

class SparknarrowTest extends FunSuite {

  val conf: SparkConf = new SparkConf()
  conf.setAppName("yo")
  conf.set("spark.sql.caseSensitive", "true")
  conf.setMaster("local[2]")

  implicit val ss: SparkSession = SparkSession.builder.config(conf).getOrCreate
  import ss.implicits._

  test("testBasicCC") {
    val classDef = Sparknarrow.basicCC(Encoders.product[Person].schema).classDef
    checkDefinition(classDef)
  }

  def checkDefinition(scalaCode: String): Unit = {
    // TODO do a version for 2.11 and 2.12
  }

  test("play with scala eval") {
    val code =
      """
      case class Tata(str: String)
      case class Toto(age: Int, tata: Tata)
      """

    checkDefinition(code)
    checkDefinition(code)
  }

  ignore("printSchema StructType") {
    val yo = StructType(
      Seq(
        StructField("name", StringType),
        StructField("tel", ArrayType(StringType))
      )
    )

    yo.printTreeString()
  }
}
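The input handed to Sparknarrow.basicCC above is simply the encoder-derived StructType of Person. A minimal sketch that inspects that schema on its own, using a mirror case class so it stays self-contained (no SparkSession is needed just to derive the schema):

import java.sql.Date
import org.apache.spark.sql.Encoders

// Mirrors the Person case class of the test above.
case class PersonLike(name: String, age: Int, date: Date)

object PersonSchemaSketch extends App {
  // Prints roughly: name as string, age as integer, date as date.
  Encoders.product[PersonLike].schema.printTreeString()
}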