org.apache.spark.sql.Dataset Scala Examples
The following examples show how to use org.apache.spark.sql.Dataset.
Each example notes the original project and source file it was taken from.
Example 1
Source File: StreamingConsumer.scala From Scala-Programming-Projects with MIT License | 11 votes |
package coinyser

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.functions._

object StreamingConsumer {
  def fromJson(df: DataFrame): Dataset[Transaction] = {
    import df.sparkSession.implicits._
    val schema = Seq.empty[Transaction].toDS().schema
    df.select(from_json(col("value").cast("string"), schema).alias("v"))
      .select("v.*").as[Transaction]
  }

  def transactionStream(implicit spark: SparkSession, config: KafkaConfig): Dataset[Transaction] =
    fromJson(spark.readStream.format("kafka")
      .option("kafka.bootstrap.servers", config.bootStrapServers)
      .option("startingoffsets", "earliest")
      .option("subscribe", config.transactionsTopic)
      .load()
    )
}
Example 2
Source File: MultilayerPerceptronClassifierWrapper.scala From drizzle-spark with Apache License 2.0 | 8 votes |
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path

import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel,
    val labelCount: Long,
    val layers: Array[Int],
    val weights: Array[Double]
  ) extends MLWritable {

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
  }

  // MLWritable requires a writer; it delegates to the wrapper's writer defined below.
  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

// The reader/writer members shown in this excerpt belong to the companion object,
// which implements MLReadable.
private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val labelCount = (rMetadata \ "labelCount").extract[Long]
      val layers = (rMetadata \ "layers").extract[Array[Int]]
      val weights = (rMetadata \ "weights").extract[Array[Double]]

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline, labelCount, layers, weights)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("labelCount" -> instance.labelCount) ~
        ("layers" -> instance.layers.toSeq) ~
        ("weights" -> instance.weights.toArray.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 3
Source File: CogroupTest.scala From spark-tools with Apache License 2.0 | 6 votes |
package io.univalence.plumbus

import io.univalence.plumbus.test.SparkTestLike
import org.apache.spark.sql.Dataset
import org.scalatest.{ FunSuiteLike, Matchers }
import com.github.mrpowers.spark.fast.tests.DatasetComparer

class CogroupTest extends FunSuiteLike with SparkTestLike with Matchers with DatasetComparer {
  import spark.implicits._
  import io.univalence.plumbus.cogroup._

  val person1 = PersonWithId("1", "John", 32)
  val person2 = PersonWithId("2", "Mary", 32)

  val address1 = Address("1", "address1")
  val address2 = Address("2", "address2")
  val address3 = Address("1", "address3")

  val persons: Dataset[PersonWithId] = Seq(person1, person2).toDS()
  val addresses: Dataset[Address]    = Seq(address1, address2, address3).toDS()

  test("apply test") {
    val applyDS = apply(persons, addresses)(_.id, _.idPerson)
    val expectedDS = Seq(
      ("1", Seq(person1), Seq(address1, address3)),
      ("2", Seq(person2), Seq(address2))
    ).toDS()
    assertSmallDatasetEquality(applyDS, expectedDS, orderedComparison = false)
  }
}

case class Address(idPerson: String, name: String)
Example 4
Source File: MNISTBenchmark.scala From spark-knn with Apache License 2.0 | 6 votes |
package com.github.saurfang.spark.ml.knn.examples import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier} import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.param.{IntParam, ParamMap} import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder} import org.apache.spark.ml.util.Identifiable import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import org.apache.log4j import scala.collection.mutable object MNISTBenchmark { val logger = log4j.Logger.getLogger(getClass) def main(args: Array[String]) { val ns = if(args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt) val path = if(args.length >= 2) args(1) else "data/mnist/mnist.bz2" val numPartitions = if(args.length >= 3) args(2).toInt else 10 val models = if(args.length >=4) args(3).split(',') else Array("tree","naive") val spark = SparkSession.builder().getOrCreate() val sc = spark.sparkContext import spark.implicits._ //read in raw label and features val rawDataset = MLUtils.loadLibSVMFile(sc, path) .zipWithIndex() .filter(_._2 < ns.max) .sortBy(_._2, numPartitions = numPartitions) .keys .toDF() // convert "features" from mllib.linalg.Vector to ml.linalg.Vector val dataset = MLUtils.convertVectorColumnsToML(rawDataset) .cache() dataset.count() //force persist val limiter = new Limiter() val knn = new KNNClassifier() .setTopTreeSize(numPartitions * 10) .setFeaturesCol("features") .setPredictionCol("prediction") .setK(1) val naiveKNN = new NaiveKNNClassifier() val pipeline = new Pipeline() .setStages(Array(limiter, knn)) val naivePipeline = new Pipeline() .setStages(Array(limiter, naiveKNN)) val paramGrid = new ParamGridBuilder() .addGrid(limiter.n, ns) .build() val bm = new Benchmarker() .setEvaluator(new MulticlassClassificationEvaluator) .setEstimatorParamMaps(paramGrid) .setNumTimes(3) val metrics = mutable.ArrayBuffer[String]() if(models.contains("tree")) { val bmModel = bm.setEstimator(pipeline).fit(dataset) metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}" } if(models.contains("naive")) { val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset) metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}" } logger.info(metrics.mkString("\n")) } } class Limiter(override val uid: String) extends Transformer { def this() = this(Identifiable.randomUID("limiter")) val n: IntParam = new IntParam(this, "n", "number of rows to limit") def setN(value: Int): this.type = set(n, value) // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN) override def transform(dataset: Dataset[_]): DataFrame = dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF() override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = schema }
Example 5
Source File: StreamingConsumerApp.scala From Scala-Programming-Projects with MIT License | 5 votes |
package coinyser import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.functions._ object StreamingConsumerApp extends App { implicit val spark: SparkSession = SparkSession .builder .master("local[*]") .appName("StreamingConsumerApp") .getOrCreate() implicit val config: KafkaConfig = KafkaConfig( bootStrapServers = "localhost:9092", transactionsTopic = "transactions_draft3" ) val txStream: Dataset[Transaction] = StreamingConsumer.transactionStream import spark.implicits._ // TODO move that to a Query class between batch and streaming val groupedStream = txStream .withWatermark("date", "1 second") .groupBy(window($"date", "1 minutes").as("window")) .agg( count($"tid").as("count"), avg("price").as("avgPrice"), stddev("price").as("stddevPrice"), last("price").as("lastPrice"), sum("amount").as("sumAmount") ) .select("window.start", "count", "avgPrice", "lastPrice", "stddevPrice", "sumAmount") groupedStream .writeStream .format("console") .queryName("groupedTx") .outputMode("append") .start() Thread.sleep(Long.MaxValue) }
Example 6
Source File: cogroup.scala From spark-tools with Apache License 2.0 | 5 votes |
package io.univalence.plumbus import org.apache.spark.Partitioner import org.apache.spark.rdd.{ CoGroupedRDD, RDD } import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{ ArrayType, StructField } import org.apache.spark.sql.{ types, DataFrame, Dataset, Encoder, KeyValueGroupedDataset, Row } import scala.reflect.ClassTag import scala.util.Try object cogroup { implicit class KVGD[K, A](val kvgd: KeyValueGroupedDataset[K, A]) { def cogroup[B](right: KeyValueGroupedDataset[K, B]): Dataset[(K, Seq[A], Seq[B])] = //Use SparkAddOn ? ??? } def apply[A, B, K](left: Dataset[A], right: Dataset[B])(keyLeft: A => K, keyRight: B => K)( implicit encA: Encoder[A], encB: Encoder[B], encC: Encoder[K], enc: Encoder[(K, Seq[A], Seq[B])], ca: ClassTag[A], ck: ClassTag[K], cb: ClassTag[B] ): Dataset[(K, Seq[A], Seq[B])] = left.sparkSession.implicits .rddToDatasetHolder( RDD .rddToPairRDDFunctions(left.rdd.keyBy(keyLeft)) .cogroup(right.rdd.keyBy(keyRight)) .map({ case (k, (ia, ib)) => (k, ia.toSeq, ib.toSeq) }) ) .toDS def cogroupDf(group: DataFrame, namedSubGroup: (String, DataFrame)*)( byKey: String, partitioner: Partitioner = Partitioner.defaultPartitioner(group.rdd, namedSubGroup.map(_._2.rdd): _*) ): Try[DataFrame] = Try { val subGroup: Seq[DataFrame] = namedSubGroup.map(_._2) val allFrames: Seq[DataFrame] = group +: subGroup val allFramesKeyed: Seq[RDD[(String, Row)]] = allFrames.map(df => { val idx = df.columns.indexOf(byKey) df.rdd.keyBy(_.get(idx).toString) }) val cogroupRdd: CoGroupedRDD[String] = new CoGroupedRDD[String](allFramesKeyed, partitioner) val rowRdd: RDD[Row] = cogroupRdd.map(x => { val rows: Array[Seq[Row]] = x._2.asInstanceOf[Array[Iterable[Row]]].map(_.toSeq) val seq = rows.head.head.toSeq ++ rows.tail new GenericRowWithSchema(seq.toArray, null).asInstanceOf[Row] }) val schema = types.StructType( group.schema.fields ++ namedSubGroup.map { case (name, df) => StructField(name, ArrayType(df.schema)) } ) group.sparkSession.createDataFrame(rowRdd, schema) } }
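A usage sketch of the apply method above, mirroring the CogroupTest example earlier on this page: it groups two Datasets by a shared key and yields one row per key with the matching records from each side. The case classes and the SparkSession named spark are assumed here for illustration.

import io.univalence.plumbus.cogroup._
import spark.implicits._

// Hypothetical case classes; the real project defines PersonWithId and Address in its tests.
case class PersonWithId(id: String, name: String, age: Int)
case class Address(idPerson: String, name: String)

val persons   = Seq(PersonWithId("1", "John", 32), PersonWithId("2", "Mary", 32)).toDS()
val addresses = Seq(Address("1", "address1"), Address("2", "address2"), Address("1", "address3")).toDS()

// One row per key: (id, persons with that id, addresses whose idPerson matches).
val grouped: Dataset[(String, Seq[PersonWithId], Seq[Address])] =
  apply(persons, addresses)(_.id, _.idPerson)

grouped.show(false)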
Example 7
Source File: CompressDumpTest.scala From spark-tools with Apache License 2.0 | 5 votes |
package io.univalence.plumbus import io.univalence.plumbus.compress.CompressDump import org.apache.spark.sql.{ DataFrame, Dataset, SparkSession } import org.scalatest.FunSuite class CompressDumpTest extends FunSuite { val ss: SparkSession = SparkSession .builder() .master("local[*]") .appName("test") .config("spark.default.parallelism", "1") .getOrCreate() import ss.implicits._ test("compressUsingDF2") { val stringToRs: Map[String, Seq[R]] = Map( "dump1" -> Seq( R(1, "a", 1), R(2, "b", 22) ), "dump2" -> Seq( R(1, "a", 3), R(2, "b", 22) ) ) val df1: Dataset[(Int, Seq[RCompressed])] = CompressDump .compressUsingDF2(dfs = stringToRs.mapValues(s => ss.createDataset(s).toDF()), groupExpr = "id") .as[(Int, Seq[RCompressed])] val map: Map[Int, Seq[RCompressed]] = df1.collect().toMap assert(map(1).size == 2) } } case class R(id: Int, a: String, b: Int) case class RCompressed(id: Int, a: String, b: Int, compressDumpDts: Seq[String])
Example 8
Source File: package.scala From spark-tools with Apache License 2.0 | 5 votes |
package io.univalence import org.apache.spark.sql.Dataset import org.apache.spark.sql.SparkSession import org.apache.spark.sql.centrifuge_sql._ package object centrifuge { type AnnotationSql = Annotation object AnnotationSql { def apply( msg: String, onField: String, fromFields: Vector[String], isError: Boolean, count: Long ): Annotation = Annotation( message = msg, isError = isError, count = count, onField = Some(onField), fromFields = fromFields ) } object implicits { implicit def QADFOps[T](dataframe: Dataset[T]): QADF = new QADF(dataframe.toDF()) implicit def sparkSessionOps(ss: SparkSession): QATools = new QATools(ss) } }
Example 9
Source File: HashingTF.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
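For reference, a minimal way to drive this transformer (standard Spark ML usage, not specific to the drizzle-spark fork) looks like the sketch below; it assumes a SparkSession named spark and illustrative column names.

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

val sentences = spark.createDataFrame(Seq(
  (0, "spark datasets are typed"),
  (1, "dataframes are untyped rows")
)).toDF("id", "sentence")

// Tokenize, then hash the token array into a fixed-size feature vector.
val tokens = new Tokenizer().setInputCol("sentence").setOutputCol("words").transform(sentences)
val hashed = new HashingTF()
  .setInputCol("words")
  .setOutputCol("features")
  .setNumFeatures(1024)
  .transform(tokens)

hashed.select("words", "features").show(truncate = false)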
Example 10
Source File: SQLTransformer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.Transformer import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.types.StructType @Since("1.6.0") def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val tableName = Identifiable.randomUID(uid) dataset.createOrReplaceTempView(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val result = dataset.sparkSession.sql(realStatement) dataset.sparkSession.catalog.dropTempView(tableName) result } @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { val spark = SparkSession.builder().getOrCreate() val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty)) val dummyDF = spark.createDataFrame(dummyRDD, schema) val tableName = Identifiable.randomUID(uid) val realStatement = $(statement).replace(tableIdentifier, tableName) dummyDF.createOrReplaceTempView(tableName) val outputSchema = spark.sql(realStatement).schema spark.catalog.dropTempView(tableName) outputSchema } @Since("1.6.0") override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } @Since("1.6.0") object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { @Since("1.6.0") override def load(path: String): SQLTransformer = super.load(path) }
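As a quick usage sketch (standard Spark ML API): the statement runs against the input DataFrame, with __THIS__ standing in for the temporary view created from it. A SparkSession named spark is assumed.

import org.apache.spark.ml.feature.SQLTransformer

val df = spark.createDataFrame(Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")

val sqlTrans = new SQLTransformer()
  .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

// Produces the original columns plus the derived v3 and v4 columns.
sqlTrans.transform(df).show()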
Example 11
Source File: BinaryClassificationEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
Example 12
Source File: MulticlassClassificationEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 13
Source File: RegressionEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
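To see how such an evaluator is typically driven, here is a short self-contained sketch using the standard Spark ML API; in practice the predictions DataFrame would come from a fitted regression model rather than being built by hand. A SparkSession named spark is assumed.

import org.apache.spark.ml.evaluation.RegressionEvaluator

// Tiny hand-built predictions table; in practice this would be someModel.transform(testData).
val predictions = spark.createDataFrame(Seq(
  (3.0, 2.5),
  (1.0, 1.4),
  (4.0, 4.1)
)).toDF("label", "prediction")

val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("rmse")   // one of "rmse", "mse", "r2", "mae"

println(s"RMSE = ${evaluator.evaluate(predictions)}")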
Example 14
Source File: RWrapperUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r

import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.Dataset

object RWrapperUtils extends Logging {

  def checkDataColumns(rFormula: RFormula, data: Dataset[_]): Unit = {
    if (data.schema.fieldNames.contains(rFormula.getFeaturesCol)) {
      val newFeaturesName = s"${Identifiable.randomUID(rFormula.getFeaturesCol)}"
      logWarning(s"data containing ${rFormula.getFeaturesCol} column, " +
        s"using new name $newFeaturesName instead")
      rFormula.setFeaturesCol(newFeaturesName)
    }
  }
}
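A small sketch of when checkDataColumns matters: if the input data already has a column named like RFormula's default features column, the helper renames the formula's output column to avoid a clash. The column names are illustrative, a SparkSession named spark is assumed, and the helper is assumed to be visible from the calling code (it lives in Spark's org.apache.spark.ml.r package).

import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.r.RWrapperUtils

// Hypothetical frame that already contains a "features" column.
val df = spark.createDataFrame(Seq((1.0, 2.0, 0.5), (0.0, 1.0, 1.5)))
  .toDF("label", "x", "features")

val rFormula = new RFormula().setFormula("label ~ x")

// The formula's default output column is "features", which clashes with the existing column...
RWrapperUtils.checkDataColumns(rFormula, df)
// ...so the formula now points at a uniquely suffixed column name instead.
println(rFormula.getFeaturesCol)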
Example 15
Source File: Transformer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import scala.annotation.varargs import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ protected def validateInputType(inputType: DataType): Unit = {} override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType validateInputType(inputType) if (schema.fieldNames.contains($(outputCol))) { throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.") } val outputFields = schema.fields :+ StructField($(outputCol), outputDataType, nullable = false) StructType(outputFields) } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val transformUDF = udf(this.createTransformFunc, outputDataType) dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol)))) } override def copy(extra: ParamMap): T = defaultCopy(extra) }
Example 16
Source File: TokenizerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} @BeanInfo case class TokenizerTestData(rawText: String, wantedTokens: Array[String]) class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new Tokenizer) } test("read/write") { val t = new Tokenizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } } class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import org.apache.spark.ml.feature.RegexTokenizerSuite._ import testImplicits._ test("params") { ParamsSuite.checkParams(new RegexTokenizer) } test("RegexTokenizer") { val tokenizer0 = new RegexTokenizer() .setGaps(false) .setPattern("\\w+|\\p{Punct}") .setInputCol("rawText") .setOutputCol("tokens") val dataset0 = Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")), TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct")) ).toDF() testRegexTokenizer(tokenizer0, dataset0) val dataset1 = Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")), TokenizerTestData("Te,st. punct", Array("punct")) ).toDF() tokenizer0.setMinTokenLength(3) testRegexTokenizer(tokenizer0, dataset1) val tokenizer2 = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") val dataset2 = Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")), TokenizerTestData("Te,st. punct", Array("te,st.", "punct")) ).toDF() testRegexTokenizer(tokenizer2, dataset2) } test("RegexTokenizer with toLowercase false") { val tokenizer = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") .setToLowercase(false) val dataset = Seq( TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")), TokenizerTestData("java scala", Array("java", "scala")) ).toDF() testRegexTokenizer(tokenizer, dataset) } test("read/write") { val t = new RegexTokenizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMinTokenLength(2) .setGaps(false) .setPattern("hi") .setToLowercase(false) testDefaultReadWrite(t) } } object RegexTokenizerSuite extends SparkFunSuite { def testRegexTokenizer(t: RegexTokenizer, dataset: Dataset[_]): Unit = { t.transform(dataset) .select("tokens", "wantedTokens") .collect() .foreach { case Row(tokens, wantedTokens) => assert(tokens === wantedTokens) } } }
Example 17
Source File: NGramSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} @BeanInfo case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String]) class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import org.apache.spark.ml.feature.NGramSuite._ import testImplicits._ test("default behavior yields bigram features") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") val dataset = Seq(NGramTestData( Array("Test", "for", "ngram", "."), Array("Test for", "for ngram", "ngram .") )).toDF() testNGram(nGram, dataset) } test("NGramLength=4 yields length 4 n-grams") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(4) val dataset = Seq(NGramTestData( Array("a", "b", "c", "d", "e"), Array("a b c d", "b c d e") )).toDF() testNGram(nGram, dataset) } test("empty input yields empty output") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(4) val dataset = Seq(NGramTestData(Array(), Array())).toDF() testNGram(nGram, dataset) } test("input array < n yields empty output") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(6) val dataset = Seq(NGramTestData( Array("a", "b", "c", "d", "e"), Array() )).toDF() testNGram(nGram, dataset) } test("read/write") { val t = new NGram() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setN(3) testDefaultReadWrite(t) } } object NGramSuite extends SparkFunSuite { def testNGram(t: NGram, dataset: Dataset[_]): Unit = { t.transform(dataset) .select("nGrams", "wantedNGrams") .collect() .foreach { case Row(actualNGrams, wantedNGrams) => assert(actualNGrams === wantedNGrams) } } }
Example 18
Source File: SQLBuilderTest.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst import scala.util.control.NonFatal import org.apache.spark.sql.{DataFrame, Dataset, QueryTest} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.hive.test.TestHiveSingleton abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton { protected def checkSQL(e: Expression, expectedSQL: String): Unit = { val actualSQL = e.sql try { assert(actualSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following expression: | |${e.prettyName} | |$cause """.stripMargin) } } protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = { val generatedSQL = try new SQLBuilder(plan).toSQL catch { case NonFatal(e) => fail( s"""Cannot convert the following logical query plan to SQL: | |${plan.treeString} """.stripMargin) } try { assert(generatedSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following logical query plan: | |${plan.treeString} | |$cause """.stripMargin) } checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan)) } protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = { checkSQL(df.queryExecution.analyzed, expectedSQL) } }
Example 19
Source File: Aggregator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression

// Excerpt: this method belongs to the abstract class Aggregator[-IN, BUF, OUT]; the class
// declaration and its abstract members (zero, reduce, merge, finish, bufferEncoder,
// outputEncoder) are omitted from this listing.

  def toColumn: TypedColumn[IN, OUT] = {
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    val expr = AggregateExpression(
      TypedAggregateExpression(this),
      Complete,
      isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
}
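toColumn is what lets a typed Aggregator be used inside Dataset.select. A minimal concrete aggregator written against the public org.apache.spark.sql.expressions.Aggregator API, a typed average over Dataset[Double], might look like this sketch; the names are illustrative only.

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator

case class AvgBuffer(sum: Double, count: Long)

// Typed average over a Dataset[Double].
object TypedAvg extends Aggregator[Double, AvgBuffer, Double] {
  def zero: AvgBuffer = AvgBuffer(0.0, 0L)
  def reduce(b: AvgBuffer, a: Double): AvgBuffer = AvgBuffer(b.sum + a, b.count + 1)
  def merge(b1: AvgBuffer, b2: AvgBuffer): AvgBuffer = AvgBuffer(b1.sum + b2.sum, b1.count + b2.count)
  def finish(r: AvgBuffer): Double = if (r.count == 0) 0.0 else r.sum / r.count
  def bufferEncoder: Encoder[AvgBuffer] = Encoders.product[AvgBuffer]
  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

// Usage, assuming a SparkSession named spark:
// import spark.implicits._
// val ds = Seq(1.0, 2.0, 4.0).toDS()
// ds.select(TypedAvg.toColumn.name("avg")).show()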
Example 20
Source File: FrequentItems.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ object FrequentItems extends Logging { def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
Example 21
Source File: cache.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan case class CacheTableCommand( tableIdent: TableIdentifier, plan: Option[LogicalPlan], isLazy: Boolean) extends RunnableCommand { require(plan.isEmpty || tableIdent.database.isEmpty, "Database name is not allowed in CACHE TABLE AS SELECT") override protected def innerChildren: Seq[QueryPlan[_]] = { plan.toSeq } override def run(sparkSession: SparkSession): Seq[Row] = { plan.foreach { logicalPlan => Dataset.ofRows(sparkSession, logicalPlan).createTempView(tableIdent.quotedString) } sparkSession.catalog.cacheTable(tableIdent.quotedString) if (!isLazy) { // Performs eager caching sparkSession.table(tableIdent).count() } Seq.empty[Row] } } case class UncacheTableCommand(tableIdent: TableIdentifier) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { sparkSession.catalog.uncacheTable(tableIdent.quotedString) Seq.empty[Row] } } case object ClearCacheCommand extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { sparkSession.catalog.clearCache() Seq.empty[Row] } }
Example 22
Source File: FilterTopFeaturesProcess.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.s2jobs.wal.process import org.apache.s2graph.s2jobs.task.TaskConf import org.apache.s2graph.s2jobs.wal.WalLogAgg import org.apache.s2graph.s2jobs.wal.transformer.{DefaultTransformer, Transformer} import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import play.api.libs.json.{JsObject, Json} object FilterTopFeaturesProcess { private var validFeatureHashKeys: Set[Long] = null def getValidFeatureHashKeys(validFeatureHashKeysBCast: Broadcast[Array[Long]]): Set[Long] = { if (validFeatureHashKeys == null) { validFeatureHashKeys = validFeatureHashKeysBCast.value.toSet } validFeatureHashKeys } def collectDistinctFeatureHashes(ss: SparkSession, filteredDict: DataFrame): Array[Long] = { import ss.implicits._ val featureHashUDF = udf((dim: String, value: String) => WalLogAgg.toFeatureHash(dim, value)) filteredDict.withColumn("featureHash", featureHashUDF(col("dim"), col("value"))) .select("featureHash") .distinct().as[Long].collect() } def filterTopKsPerDim(dict: DataFrame, maxRankPerDim: Broadcast[Map[String, Int]], defaultMaxRank: Int): DataFrame = { val filterUDF = udf((dim: String, rank: Long) => { rank < maxRankPerDim.value.getOrElse(dim, defaultMaxRank) }) dict.filter(filterUDF(col("dim"), col("rank"))) } def filterWalLogAgg(ss: SparkSession, walLogAgg: Dataset[WalLogAgg], transformers: Seq[Transformer], validFeatureHashKeysBCast: Broadcast[Array[Long]]) = { import ss.implicits._ walLogAgg.mapPartitions { iter => val validFeatureHashKeys = getValidFeatureHashKeys(validFeatureHashKeysBCast) iter.map { walLogAgg => WalLogAgg.filterProps(walLogAgg, transformers, validFeatureHashKeys) } } } } class FilterTopFeaturesProcess(taskConf: TaskConf) extends org.apache.s2graph.s2jobs.task.Process(taskConf) { import FilterTopFeaturesProcess._ override def execute(ss: SparkSession, inputMap: Map[String, DataFrame]): DataFrame = { import ss.implicits._ val maxRankPerDim = taskConf.options.get("maxRankPerDim").map { s => Json.parse(s).as[JsObject].fields.map { case (k, jsValue) => k -> jsValue.as[Int] }.toMap } val maxRankPerDimBCast = ss.sparkContext.broadcast(maxRankPerDim.getOrElse(Map.empty)) val defaultMaxRank = taskConf.options.get("defaultMaxRank").map(_.toInt) val featureDict = inputMap(taskConf.options("featureDict")) val walLogAgg = inputMap(taskConf.options("walLogAgg")).as[WalLogAgg] val transformers = TaskConf.parseTransformers(taskConf) val filteredDict = filterTopKsPerDim(featureDict, maxRankPerDimBCast, defaultMaxRank.getOrElse(Int.MaxValue)) val validFeatureHashKeys = collectDistinctFeatureHashes(ss, filteredDict) val validFeatureHashKeysBCast = ss.sparkContext.broadcast(validFeatureHashKeys) filterWalLogAgg(ss, walLogAgg, transformers, validFeatureHashKeysBCast).toDF() } override def mandatoryOptions: Set[String] = Set("featureDict", "walLogAgg") }
Example 23
Source File: Deserializer.scala From almaren-framework with Apache License 2.0 | 5 votes |
package com.github.music.of.the.ainur.almaren.state.core import com.github.music.of.the.ainur.almaren.State import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{DataType, StructType} import scala.language.implicitConversions import com.github.music.of.the.ainur.almaren.Almaren import com.github.music.of.the.ainur.almaren.util.Constants import org.apache.spark.sql.Dataset abstract class Deserializer() extends State { override def executor(df: DataFrame): DataFrame = deserializer(df) def deserializer(df: DataFrame): DataFrame implicit def string2Schema(schema: String): DataType = StructType.fromDDL(schema) } case class AvroDeserializer(columnName: String,schema: String) extends Deserializer { import org.apache.spark.sql.avro._ import org.apache.spark.sql.functions._ override def deserializer(df: DataFrame): DataFrame = { logger.info(s"columnName:{$columnName}, schema:{$schema}") df.withColumn(columnName,from_avro(col(columnName),schema)) .select("*",columnName.concat(".*")).drop(columnName) } } case class JsonDeserializer(columnName: String,schema: Option[String]) extends Deserializer { import org.apache.spark.sql.functions._ override def deserializer(df: DataFrame): DataFrame = { import df.sparkSession.implicits._ logger.info(s"columnName:{$columnName}, schema:{$schema}") df.withColumn(columnName, from_json(col(columnName), schema.getOrElse(getSchemaDDL(df.selectExpr(columnName).as[(String)])))) .select("*",columnName.concat(".*")) .drop(columnName) } private def getSchemaDDL(df: Dataset[String]): String = Almaren.spark.getOrCreate().read.json(df.sample(Constants.sampleDeserializer)).schema.toDDL } case class XMLDeserializer(columnName: String) extends Deserializer { import com.databricks.spark.xml.XmlReader override def deserializer(df: DataFrame): DataFrame = { logger.info(s"columnName:{$columnName}") new XmlReader().xmlRdd(df.sparkSession,df.select(columnName).rdd.map(r => r(0).asInstanceOf[String])).toDF } }
Example 24
Source File: PipelineWrapper.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} import org.apache.spark.sql.{DataFrame, Dataset} class PipelineWrapper() { var pipeline = new Pipeline() var transformers: Array[TransformerWrapper] = Array() def setTransformers(value: Array[TransformerWrapper]): this.type = { transformers = value setStages(PipelineBuilder.build(transformers)) this } def setStages(value: Array[_ <: PipelineStage]): Unit = { pipeline = pipeline.setStages(value) } def fit(dataset: Dataset[_]): PipelineModelWrapper = { new PipelineModelWrapper(pipeline.fit(dataset), transformers) } } class PipelineModelWrapper(val model: PipelineModel, val transformers: Array[TransformerWrapper]) { def transform(dataset: Dataset[_]): DataFrame = { var df = model.transform(dataset) if (transformers.length >= 2) { (0 until transformers.length - 1).foreach { i => val outCols = transformers(i).getOutputCols for (col <- outCols) { df = df.drop(col) } } } df } }
Example 25
Source File: Sampler.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature.preprocess import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.Identifiable import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import scala.util.Random class Sampler(fraction: Double, override val uid: String, seed: Int = Random.nextInt) extends Transformer { def this(fraction: Double) = this(fraction, Identifiable.randomUID("sampler")) final def getOutputCol: String = $(inputCol) override def transform(dataset: Dataset[_]): DataFrame = { dataset.sample(false, fraction, seed).toDF } override def transformSchema(schema: StructType): StructType = { schema } override def copy(extra: ParamMap): Sampler = defaultCopy(extra) } object Sampler { def main(args: Array[String]): Unit = { val ss = SparkSession .builder .master("local") .appName("preprocess") .getOrCreate() val training = ss.read.format("libsvm") .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt") println(training.count) val sampler = new Sampler(0.5) .setInputCol("features") val pipeline = new Pipeline() .setStages(Array(sampler)) val model = pipeline.fit(training) val test = ss.read.format("libsvm") .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt") model.transform(test).select("*") .collect() .foreach { case Row(label: Double, vector: Vector) => println(s"($label, " + s"${vector.toSparse.indices.mkString("[", ",", "]")}, " + s"${vector.toSparse.values.mkString("[", ",", "]")}") } ss.stop() } }
Example 26
Source File: FeatureUtils.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
import org.apache.spark.sql.{Dataset, Row}

import scala.language.postfixOps

object FeatureUtils {

  def maxDim(dataset: Dataset[Row], col: String = "features"): Int = {
    dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] =>
      val dim = rows.map { case Row(v: Vector) =>
        v match {
          case sv: SparseVector => sv.indices.last
          case dv: DenseVector  => dv.size
        }
      }.max
      Iterator(dim)
    }.max + 1
  }

  def countNonZero(dataset: Dataset[Row], col: String = "features"): Array[Int] = {
    dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] =>
      val mergeIndices = rows.map { case Row(v: Vector) =>
        v match {
          case sv: SparseVector => sv.indices.toList
        }
      }.reduce(_ union _ distinct)
      Iterator(mergeIndices)
    }.reduce((a, b) => (a union b).distinct).toArray
  }
}
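A quick sketch of how these helpers might be called, assuming a SparkSession named spark and an illustrative LIBSVM file path (LIBSVM loading yields the sparse vectors that countNonZero expects):

// Path is a placeholder.
val data = spark.read.format("libsvm").load("data/sample_libsvm_data.txt")

// Largest feature index + 1 across all vectors in the "features" column.
val dim: Int = FeatureUtils.maxDim(data)

// Distinct indices that are non-zero in at least one sparse vector.
val nonZeroIdx: Array[Int] = FeatureUtils.countNonZero(data)

println(s"dimension = $dim, non-zero indices = ${nonZeroIdx.length}")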
Example 27
Source File: package.scala From amadou with Apache License 2.0 | 5 votes |
package com.mediative.amadou import com.google.api.services.bigquery.model._ import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem import com.google.cloud.hadoop.io.bigquery._ import org.apache.hadoop.fs.{FileSystem, Path} import net.ceedubs.ficus.readers.ValueReader import net.ceedubs.ficus.FicusInstances import org.apache.spark.sql.{Dataset, SparkSession, Encoder} import java.util.concurrent.ThreadLocalRandom import scala.collection.JavaConversions._ package object bigquery extends FicusInstances { object CreateDisposition extends Enumeration { val CREATE_IF_NEEDED, CREATE_NEVER = Value } object WriteDisposition extends Enumeration { val WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY = Value } val BQ_CSV_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss zzz" object TableNotFound { import com.google.api.client.googleapis.json.GoogleJsonResponseException import com.google.api.client.googleapis.json.GoogleJsonError import scala.collection.JavaConverters._ def unapply(error: Throwable): Option[GoogleJsonError.ErrorInfo] = error match { case error: GoogleJsonResponseException => Some(error.getDetails) .filter(_.getCode == 404) .flatMap(_.getErrors.asScala.find(_.getReason == "notFound")) case _ => None } } def tableHasDataForDate( spark: SparkSession, table: TableReference, date: java.sql.Date, column: String): Boolean = { val bq = BigQueryClient.getInstance(spark.sparkContext.hadoopConfiguration) bq.hasDataForDate(table, date, column) } def saveAsBigQueryTable( tableRef: TableReference, writeDisposition: WriteDisposition.Value, createDisposition: CreateDisposition.Value): Unit = { val bucket = conf.get(BigQueryConfiguration.GCS_BUCKET_KEY) val temp = s"spark-bigquery-${System.currentTimeMillis()}=${ThreadLocalRandom.current.nextInt(Int.MaxValue)}" val gcsPath = s"gs://$bucket/spark-bigquery-tmp/$temp" self.write.json(gcsPath) val schemaFields = self.schema.fields.map { field => import org.apache.spark.sql.types._ val fieldType = field.dataType match { case BooleanType => "BOOLEAN" case LongType => "INTEGER" case IntegerType => "INTEGER" case StringType => "STRING" case DoubleType => "FLOAT" case TimestampType => "TIMESTAMP" case _: DecimalType => "INTEGER" } new TableFieldSchema().setName(field.name).setType(fieldType) }.toList val tableSchema = new TableSchema().setFields(schemaFields) bq.load(gcsPath, tableRef, tableSchema, writeDisposition, createDisposition) delete(new Path(gcsPath)) } private def delete(path: Path): Unit = { val fs = FileSystem.get(path.toUri, conf) fs.delete(path, true) () } } implicit val valueReader: ValueReader[BigQueryTable.PartitionStrategy] = ValueReader[String].map { _ match { case "month" => BigQueryTable.PartitionByMonth case "day" => BigQueryTable.PartitionByDay case other => sys.error(s"Unknown partition strategy") } } }
Example 28
Source File: DatasetExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch05

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.{Dataset, SQLContext}
import org.apache.spark.sql.functions._

private case class Person(id: Int, name: String, age: Int)

object DatasetExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DatasetExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    run(sc, sqlContext)
    sc.stop()
  }

  def run(sc: SparkContext, sqlContext: SQLContext): Unit = {
    import sqlContext.implicits._

    // Creates a Dataset from a `Seq`
    val seq = Seq((1, "Bob", 23), (2, "Tom", 23), (3, "John", 22))
    val ds1: Dataset[(Int, String, Int)] = sqlContext.createDataset(seq)
    val ds2: Dataset[(Int, String, Int)] = seq.toDS()

    // Creates a Dataset from a `RDD`
    val rdd = sc.parallelize(seq)
    val ds3: Dataset[(Int, String, Int)] = sqlContext.createDataset(rdd)
    val ds4: Dataset[(Int, String, Int)] = rdd.toDS()

    // Creates a Dataset from a `DataFrame`
    val df = rdd.toDF("id", "name", "age")
    val ds5: Dataset[Person] = df.as[Person]

    // Selects a column
    ds5.select(expr("name").as[String]).show()

    // Filtering
    ds5.filter(_.name == "Bob").show()
    ds5.filter(person => person.age == 23).show()

    // Groups and counts the number of rows
    ds5.groupBy(_.age).count().show()
  }
}
Example 29
Source File: MovieRecommendation.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.MovieRecommendation

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.RDD

object MovieRecommendation {

  // Compute the RMSE to evaluate the model. The lower the RMSE, the better the model and its predictions.
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean): Double = {
    val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product)))
    val predictionsAndRatings = predictions.map { x => ((x.user, x.product), x.rating) }
      .join(data.map(x => ((x.user, x.product), x.rating))).values
    if (implicitPrefs) {
      println("(Prediction, Rating)")
      println(predictionsAndRatings.take(5).mkString("\n"))
    }
    math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean())
  }

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("JavaLDAExample")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .getOrCreate()

    val ratigsFile = "data/ratings.csv"
    val df1 = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratigsFile)
    val ratingsDF = df1.select(df1.col("userId"), df1.col("movieId"), df1.col("rating"), df1.col("timestamp"))
    ratingsDF.show(false)

    val moviesFile = "data/movies.csv"
    val df2 = spark.read.format("com.databricks.spark.csv").option("header", "true").load(moviesFile)
    val moviesDF = df2.select(df2.col("movieId"), df2.col("title"), df2.col("genres"))
    moviesDF.show(false)

    ratingsDF.createOrReplaceTempView("ratings")
    moviesDF.createOrReplaceTempView("movies")

    // NOTE: the original excerpt omits the training step that defines `model` and `testRDD`.
    // The lines below are an illustrative reconstruction: split the ratings into train/test RDDs
    // and train an ALS model with placeholder hyperparameters (rank = 10, iterations = 10, lambda = 0.01).
    val ratingsRDD: RDD[Rating] = ratingsDF.rdd.map(row =>
      Rating(row.getString(0).toInt, row.getString(1).toInt, row.getString(2).toDouble))
    val Array(trainRDD, testRDD) = ratingsRDD.randomSplit(Array(0.75, 0.25), seed = 12345L)
    val model: MatrixFactorizationModel = ALS.train(trainRDD, 10, 10, 0.01)

    var rmseTest = computeRmse(model, testRDD, true)
    println("Test RMSE: = " + rmseTest) // Lower is better

    // Movie recommendation for a specific user: get the top 6 movie predictions for user 668.
    println("Recommendations: (MovieId => Rating)")
    println("----------------------------------")
    val recommendationsUser = model.recommendProducts(668, 6)
    recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println)
    println("----------------------------------")

    spark.stop()
  }
}
Example 30
Source File: Describe.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel }
import org.apache.spark.sql.functions.max
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset
import org.apache.spark.ml.linalg.{ Matrix, Vectors }
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row

object Describe {

  case class CustomerAccount(state_code: String, account_length: Integer, area_code: String,
    international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
    total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
    total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
    total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
    total_international_mins: Double, total_international_calls: Double, total_international_charge: Double,
    total_international_num_calls: Double, churn: String)

  val schema = StructType(Array(
    StructField("state_code", StringType, true),
    StructField("account_length", IntegerType, true),
    StructField("area_code", StringType, true),
    StructField("international_plan", StringType, true),
    StructField("voice_mail_plan", StringType, true),
    StructField("num_voice_mail", DoubleType, true),
    StructField("total_day_mins", DoubleType, true),
    StructField("total_day_calls", DoubleType, true),
    StructField("total_day_charge", DoubleType, true),
    StructField("total_evening_mins", DoubleType, true),
    StructField("total_evening_calls", DoubleType, true),
    StructField("total_evening_charge", DoubleType, true),
    StructField("total_night_mins", DoubleType, true),
    StructField("total_night_calls", DoubleType, true),
    StructField("total_night_charge", DoubleType, true),
    StructField("total_international_mins", DoubleType, true),
    StructField("total_international_calls", DoubleType, true),
    StructField("total_international_charge", DoubleType, true),
    StructField("total_international_num_calls", DoubleType, true),
    StructField("churn", StringType, true)))

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Describe")
      .getOrCreate()

    spark.conf.set("spark.debug.maxToStringFields", 10000)
    val DEFAULT_MAX_TO_STRING_FIELDS = 2500
    if (SparkEnv.get != null) {
      SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS)
    } else {
      DEFAULT_MAX_TO_STRING_FIELDS
    }

    import spark.implicits._

    val trainSet: Dataset[CustomerAccount] = spark.read
      .option("inferSchema", "false")
      .format("com.databricks.spark.csv")
      .schema(schema)
      .load("data/churn-bigml-80.csv")
      .as[CustomerAccount]

    val statsDF = trainSet.describe()
    statsDF.show()

    trainSet.createOrReplaceTempView("UserAccount")
    spark.catalog.cacheTable("UserAccount")

    spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()

    spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()

    trainSet.groupBy("churn").count.show()

    spark.sqlContext.sql("SELECT churn, SUM(total_international_num_calls) FROM UserAccount GROUP BY churn").show()
  }
}
Example 31
Source File: Preprocessing.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset

object Preprocessing {

  case class CustomerAccount(state_code: String, account_length: Integer, area_code: String,
    international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
    total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
    total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
    total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
    total_international_mins: Double, total_international_calls: Double, total_international_charge: Double,
    total_international_num_calls: Double, churn: String)

  val schema = StructType(Array(
    StructField("state_code", StringType, true),
    StructField("account_length", IntegerType, true),
    StructField("area_code", StringType, true),
    StructField("international_plan", StringType, true),
    StructField("voice_mail_plan", StringType, true),
    StructField("num_voice_mail", DoubleType, true),
    StructField("total_day_mins", DoubleType, true),
    StructField("total_day_calls", DoubleType, true),
    StructField("total_day_charge", DoubleType, true),
    StructField("total_evening_mins", DoubleType, true),
    StructField("total_evening_calls", DoubleType, true),
    StructField("total_evening_charge", DoubleType, true),
    StructField("total_night_mins", DoubleType, true),
    StructField("total_night_calls", DoubleType, true),
    StructField("total_night_charge", DoubleType, true),
    StructField("total_international_mins", DoubleType, true),
    StructField("total_international_calls", DoubleType, true),
    StructField("total_international_charge", DoubleType, true),
    StructField("total_international_num_calls", DoubleType, true),
    StructField("churn", StringType, true)))

  val spark: SparkSession = SparkSessionCreate.createSession("ChurnPredictionRandomForest")
  import spark.implicits._

  val trainSet: Dataset[CustomerAccount] = spark.read
    .option("inferSchema", "false")
    .format("com.databricks.spark.csv")
    .schema(schema)
    .load("data/churn-bigml-80.csv")
    .as[CustomerAccount]

  val statsDF = trainSet.describe()
  statsDF.show()

  trainSet.cache()
  trainSet.groupBy("churn").sum("total_international_num_calls").show()
  trainSet.groupBy("churn").sum("total_international_charge").show()

  val testSet: Dataset[CustomerAccount] = spark.read
    .option("inferSchema", "false")
    .format("com.databricks.spark.csv")
    .schema(schema)
    .load("data/churn-bigml-20.csv")
    .as[CustomerAccount]

  testSet.describe()
  testSet.cache()

  trainSet.printSchema()
  trainSet.show()

  trainSet.createOrReplaceTempView("UserAccount")
  spark.catalog.cacheTable("UserAccount")

  /////////////// Feature engineering
  spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()

  spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()

  trainSet.groupBy("churn").count.show()

  spark.sqlContext.sql("SELECT churn, SUM(total_international_num_calls) as Total_intl_call FROM UserAccount GROUP BY churn").show()

  // Keep all instances of the Churn=True class, but downsample the Churn=False class to a fraction of 388/2278.
  val fractions = Map("False" -> 0.1675, "True" -> 1.0)
  val churnDF = trainSet.stat.sampleBy("churn", fractions, 123456L)
  churnDF.groupBy("churn").count.show()

  val trainDF = churnDF
    .drop("state_code")
    .drop("area_code")
    .drop("voice_mail_plan")
    .drop("total_day_charge")
    .drop("total_evening_charge")

  println(trainDF.count)

  trainDF.select("account_length", "international_plan", "num_voice_mail",
    "total_day_calls", "total_international_num_calls", "churn").show(10)
}
Example 32
Source File: XmlReader.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, SQLContext, SparkSession} import org.apache.spark.sql.types.StructType import com.databricks.spark.xml.util.XmlFile import com.databricks.spark.xml.util.FailFastMode @deprecated("Use xmlFile(SparkSession, ...)", "0.5.0") def xmlFile(sqlContext: SQLContext, path: String): DataFrame = { // We need the `charset` and `rowTag` before creating the relation. val (charset, rowTag) = { val options = XmlOptions(parameters.toMap) (options.charset, options.rowTag) } val relation = XmlRelation( () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), Some(path), parameters.toMap, schema)(sqlContext) sqlContext.baseRelationToDataFrame(relation) } @deprecated("Use xmlRdd(SparkSession, ...)", "0.5.0") def xmlRdd(sqlContext: SQLContext, xmlRDD: RDD[String]): DataFrame = { val relation = XmlRelation( () => xmlRDD, None, parameters.toMap, schema)(sqlContext) sqlContext.baseRelationToDataFrame(relation) } }
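Both methods above are deprecated; newer spark-xml versions read XML through the standard DataFrameReader instead. A hedged sketch of that route follows — the rowTag value and file path are placeholders, not taken from the example.

import org.apache.spark.sql.SparkSession

object XmlReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("xml-read-sketch").getOrCreate()

    // spark-xml registers itself as a regular data source; "rowTag" selects the
    // XML element that becomes one row of the resulting DataFrame.
    val books = spark.read
      .format("com.databricks.spark.xml")
      .option("rowTag", "book")
      .load("data/books.xml") // placeholder path

    books.printSchema()
    books.show(5)
    spark.stop()
  }
}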
Example 33
Source File: GroupSortedDataset.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted.sql import scala.reflect.ClassTag import org.apache.spark.sql.{ Column, Dataset, Encoder } import org.apache.spark.sql.functions.col import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder } import com.tresata.spark.sorted.{ mapStreamIterator, mapStreamIteratorWithContext, newWCreate } object GroupSortedDataset { private[sql] def apply[K: Encoder, V](dataset: Dataset[(K, V)], numPartitions: Option[Int], reverse: Boolean, sortBy: Column => Column): GroupSortedDataset[K, V] = { val key = col(dataset.columns.head) val valueSort = { val sort = sortBy(col(dataset.columns.last)) if (reverse) sort.desc else sort.asc } new GroupSortedDataset(numPartitions.map(dataset.repartition(_, key)).getOrElse(dataset.repartition(key)).sortWithinPartitions(key, valueSort)) } } class GroupSortedDataset[K: Encoder, V] private (dataset: Dataset[(K, V)]) extends Serializable { def toDS: Dataset[(K, V)] = dataset def mapStreamByKey[W: Encoder, C](c: () => C)(f: (C, Iterator[V]) => TraversableOnce[W]): Dataset[(K, W)] = { implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W]) dataset.mapPartitions(mapStreamIteratorWithContext(_)(c, f)) } def mapStreamByKey[W: Encoder](f: Iterator[V] => TraversableOnce[W]): Dataset[(K, W)] = { implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W]) dataset.mapPartitions(mapStreamIterator(_)(f)) } def foldLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = { val wCreate = newWCreate(w) mapStreamByKey(iter => Iterator(iter.foldLeft(wCreate())(f))) } def reduceLeftByKey[W >: V: Encoder](f: (W, V) => W): Dataset[(K, W)] = mapStreamByKey(iter => Iterator(iter.reduceLeft(f))) def scanLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = { val wCreate = newWCreate(w) mapStreamByKey(_.scanLeft(wCreate())(f)) } }
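The companion object above boils down to a repartition-by-key followed by sortWithinPartitions, so each partition holds contiguous, value-sorted groups. A standalone sketch of that underlying pattern using only standard Dataset operations (the data and key/value layout are invented for illustration):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object GroupSortSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("group-sort-sketch").getOrCreate()
    import spark.implicits._

    val events = Seq(("a", 3), ("b", 1), ("a", 1), ("b", 2), ("a", 2)).toDS()

    // Same shape as GroupSortedDataset.apply: partition by the key column,
    // then sort each partition by key and (ascending) value.
    val groupSorted = events
      .repartition(col("_1"))
      .sortWithinPartitions(col("_1"), col("_2").asc)

    // Within a partition each key's values now arrive as one contiguous, sorted
    // run, so a single pass per partition can fold or scan them per key.
    groupSorted.show()
    spark.stop()
  }
}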
Example 34
Source File: SparkSuite.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted import org.scalactic.Equality import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.{ Dataset, SparkSession } object SparkSuite { lazy val spark: SparkSession = { val session = SparkSession.builder .master("local[*]") .appName("test") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.ui.enabled", false) .config("spark.sql.shuffle.partitions", 4) .getOrCreate() session } lazy val sc: SparkContext = spark.sparkContext lazy val jsc = new JavaSparkContext(sc) def javaSparkContext() = jsc } trait SparkSuite { implicit lazy val spark: SparkSession = SparkSuite.spark implicit lazy val sc: SparkContext = SparkSuite.spark.sparkContext implicit def rddEq[X]: Equality[RDD[X]] = new Equality[RDD[X]] { private def toCounts[Y](s: Seq[Y]): Map[Y, Int] = s.groupBy(identity).mapValues(_.size) def areEqual(a: RDD[X], b: Any): Boolean = b match { case s: Seq[_] => toCounts(a.collect) == toCounts(s) case rdd: RDD[_] => toCounts(a.collect) == toCounts(rdd.collect) } } implicit def gsEq[K, V](implicit rddEq: Equality[RDD[(K, V)]]): Equality[GroupSorted[K, V]] = new Equality[GroupSorted[K, V]] { def areEqual(a: GroupSorted[K, V], b: Any): Boolean = rddEq.areEqual(a, b) } implicit def dsEq[X](implicit rddEq: Equality[RDD[X]]): Equality[Dataset[X]] = new Equality[Dataset[X]] { def areEqual(a: Dataset[X], b: Any): Boolean = b match { case ds: Dataset[_] => rddEq.areEqual(a.rdd, ds.rdd) case x => rddEq.areEqual(a.rdd, x) } } }
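A hedged sketch of how a test might mix in the trait above and use the multiset-style RDD equality it provides; the test data is invented.

import org.scalatest.FunSuite
// assumes the SparkSuite trait from the example above is on the test classpath
import com.tresata.spark.sorted.SparkSuite

class WordCountSpec extends FunSuite with SparkSuite {
  test("an RDD compares equal to a Seq with the same elements, ignoring order") {
    val counts = sc.parallelize(Seq(("a", 1), ("b", 2)))
    // rddEq treats both sides as unordered multisets of elements
    assert(rddEq[(String, Int)].areEqual(counts, Seq(("b", 2), ("a", 1))))
  }
}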
Example 35
Source File: VLORRealDataExample.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.example import org.apache.spark.ml.classification.{LogisticRegression, VLogisticRegression} import org.apache.spark.sql.{Dataset, SparkSession} object VLORRealDataExample { // https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#a9a def main(args: Array[String]) = { val spark = SparkSession .builder() .appName("VLogistic Regression real data example") .getOrCreate() val sc = spark.sparkContext val dataset1: Dataset[_] = spark.read.format("libsvm").load("data/a9a") val trainer = new LogisticRegression() .setFitIntercept(false) .setRegParam(0.5) val model = trainer.fit(dataset1) val vtrainer = new VLogisticRegression() .setColsPerBlock(100) .setRowsPerBlock(10) .setColPartitions(3) .setRowPartitions(3) .setRegParam(0.5) val vmodel = vtrainer.fit(dataset1) println(s"VLogistic regression coefficients: ${vmodel.coefficients}") println(s"Logistic regression coefficients: ${model.coefficients}") sc.stop() } }
Example 36
Source File: LORExample2.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.example import org.apache.spark.ml.classification.MyLogisticRegression import org.apache.spark.sql.{Dataset, SparkSession} object LORExample2 { def main(args: Array[String]) = { var maxIter: Int = 100 var dimension: Int = 780 var regParam: Double = 0.5 var fitIntercept: Boolean = true var elasticNetParam = 1.0 var dataPath: String = null try { maxIter = args(0).toInt dimension = args(1).toInt regParam = args(2).toDouble fitIntercept = args(3).toBoolean elasticNetParam = args(4).toDouble dataPath = args(5) } catch { case _: Throwable => println("Param list: " + "maxIter dimension" + " regParam fitIntercept elasticNetParam dataPath") println("parameter description:" + "\nmaxIter max iteration number for VLogisticRegression" + "\ndimension training data dimension number" + "\nregParam regularization parameter" + "\nfitIntercept whether to train intercept, true or false" + "\nelasticNetParam elastic net parameter for regulization" + "\ndataPath training data path on HDFS") System.exit(-1) } val spark = SparkSession .builder() .appName("LOR for testing") .getOrCreate() val sc = spark.sparkContext try { println(s"begin load data from $dataPath") val dataset: Dataset[_] = spark.read.format("libsvm") .option("numFeatures", dimension.toString) .load(dataPath) val trainer = new MyLogisticRegression() .setMaxIter(maxIter) .setRegParam(regParam) .setFitIntercept(fitIntercept) .setElasticNetParam(elasticNetParam) val model = trainer.fit(dataset) println(s"LOR done, coeffs non zeros: ${model.coefficients.numNonzeros}") } catch { case e: Exception => e.printStackTrace() }finally { // println("Press ENTER to exit.") // System.in.read() } sc.stop() } }
Example 37
Source File: VSoftmaxRegressionSuite.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg.{SparseMatrix, Vector, Vectors} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset} import scala.language.existentials class VSoftmaxRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { import testImplicits._ private val seed = 42 @transient var multinomialDataset: Dataset[_] = _ private val eps: Double = 1e-5 override def beforeAll(): Unit = { super.beforeAll() multinomialDataset = { val nPoints = 50 val coefficients = Array( -0.57997, 0.912083, -0.371077, -0.819866, 2.688191, -0.16624, -0.84355, -0.048509, -0.301789, 4.170682) val xMean = Array(5.843, 3.057, 3.758, 1.199) val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) val testData = LogisticRegressionSuite.generateMultinomialLogisticInput( coefficients, xMean, xVariance, addIntercept = true, nPoints, seed) val df = sc.parallelize(testData, 4).toDF().withColumn("weight", rand(seed)) df.cache() println("softmax test data:") df.show(10, false) df } } test("test on multinomialDataset") { def b2s(b: Boolean): String = { if (b) "w/" else "w/o" } for (standardization <- Seq(false, true)) { for ((reg, elasticNet) <- Seq((0.0, 0.0), (2.3, 0.0), (0.3, 0.05), (0.01, 1.0))) { println() println(s"# test ${b2s(standardization)} standardization, reg=${reg}, elasticNet=${elasticNet}") val trainer = new LogisticRegression() .setFamily("multinomial") .setStandardization(standardization) .setWeightCol("weight") .setRegParam(reg) .setFitIntercept(false) .setElasticNetParam(elasticNet) val model = trainer.fit(multinomialDataset) val vtrainer = new VSoftmaxRegression() .setColsPerBlock(2) .setRowsPerBlock(5) .setColPartitions(2) .setRowPartitions(3) .setWeightCol("weight") .setGeneratingFeatureMatrixBuffer(2) .setStandardization(standardization) .setRegParam(reg) .setElasticNetParam(elasticNet) val vmodel = vtrainer.fit(multinomialDataset) println(s"VSoftmaxRegression coefficientMatrix:\n" + s"${vmodel.coefficientMatrix.asInstanceOf[SparseMatrix].toDense},\n" + s"ml.SoftmaxRegression coefficientMatrix:\n" + s"${model.coefficientMatrix}\n") assert(vmodel.coefficientMatrix ~== model.coefficientMatrix relTol eps) } } } }
Example 38
Source File: StreamingIncrementCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql.execution.command

import java.util.Locale

import org.apache.spark.SparkException
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.execution.streaming.StreamingRelationV2
import org.apache.spark.sql.sources.v2.StreamWriteSupport
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.xsql.DataSourceManager._
import org.apache.spark.sql.xsql.StreamingSinkType

case class StreamingIncrementCommand(plan: LogicalPlan) extends RunnableCommand {

  private var outputMode: OutputMode = OutputMode.Append
  // dummy
  override def output: Seq[AttributeReference] = Seq.empty
  // dummy
  override def producedAttributes: AttributeSet = plan.producedAttributes

  override def run(sparkSession: SparkSession): Seq[Row] = {
    import StreamingSinkType._
    val qe = new QueryExecution(sparkSession, new ConstructedStreaming(plan))
    val df = new Dataset(sparkSession, qe, RowEncoder(qe.analyzed.schema))
    plan.collectLeaves.head match {
      case StreamingRelationV2(_, _, extraOptions, _, _) =>
        val source = extraOptions.getOrElse(STREAMING_SINK_TYPE, DEFAULT_STREAMING_SINK)
        val sinkOptions = extraOptions.filter(_._1.startsWith(STREAMING_SINK_PREFIX)).map { kv =>
          val key = kv._1.substring(STREAMING_SINK_PREFIX.length)
          (key, kv._2)
        }
        StreamingSinkType.withName(source.toUpperCase(Locale.ROOT)) match {
          case CONSOLE =>
          case TEXT | PARQUET | ORC | JSON | CSV =>
            if (sinkOptions.get(STREAMING_SINK_PATH) == None) {
              throw new SparkException("Sink type is file, must config path")
            }
          case KAFKA =>
            if (sinkOptions.get(STREAMING_SINK_BOOTSTRAP_SERVERS) == None) {
              throw new SparkException("Sink type is kafka, must config bootstrap servers")
            }
            if (sinkOptions.get(STREAMING_SINK_TOPIC) == None) {
              throw new SparkException("Sink type is kafka, must config kafka topic")
            }
          case _ =>
            throw new SparkException(
              "Sink type is invalid, " + s"select from ${StreamingSinkType.values}")
        }
        val ds = DataSource.lookupDataSource(source, sparkSession.sessionState.conf)
        val disabledSources = sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",")
        val sink = ds.newInstance() match {
          case w: StreamWriteSupport if !disabledSources.contains(w.getClass.getCanonicalName) => w
          case _ =>
            val ds = DataSource(
              sparkSession,
              className = source,
              options = sinkOptions.toMap,
              partitionColumns = Nil)
            ds.createSink(InternalOutputModes.Append)
        }
        val outputMode = InternalOutputModes(
          extraOptions.getOrElse(STREAMING_OUTPUT_MODE, DEFAULT_STREAMING_OUTPUT_MODE))
        val duration =
          extraOptions.getOrElse(STREAMING_TRIGGER_DURATION, DEFAULT_STREAMING_TRIGGER_DURATION)
        val trigger =
          extraOptions.getOrElse(STREAMING_TRIGGER_TYPE, DEFAULT_STREAMING_TRIGGER_TYPE) match {
            case STREAMING_MICRO_BATCH_TRIGGER => Trigger.ProcessingTime(duration)
            case STREAMING_ONCE_TRIGGER => Trigger.Once()
            case STREAMING_CONTINUOUS_TRIGGER => Trigger.Continuous(duration)
          }
        val query = sparkSession.sessionState.streamingQueryManager.startQuery(
          extraOptions.get("queryName"),
          extraOptions.get(STREAMING_CHECKPOINT_LOCATION),
          df,
          sinkOptions.toMap,
          sink,
          outputMode,
          useTempCheckpointLocation = source == DEFAULT_STREAMING_SINK,
          recoverFromCheckpointLocation = true,
          trigger = trigger)
        query.awaitTermination()
    }
    // dummy
    Seq.empty
  }
}

case class ConstructedStreaming(child: LogicalPlan) extends UnaryNode {
  override def output: Seq[Attribute] = child.output
}
Example 39
Source File: Aggregator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression

  // Excerpt from the abstract class Aggregator[IN, BUF, OUT] (class declaration
  // elided in this listing): toColumn turns the aggregator into a TypedColumn
  // that can be passed to Dataset.select or Dataset.agg.
  def toColumn: TypedColumn[IN, OUT] = {
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    val expr = AggregateExpression(
      TypedAggregateExpression(this),
      Complete,
      isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
}
Example 40
Source File: JsonUtils.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.json

import org.apache.spark.input.PortableDataStream
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.catalyst.json.JSONOptions

object JsonUtils {

  def sample(json: RDD[PortableDataStream], options: JSONOptions): RDD[PortableDataStream] = {
    require(options.samplingRatio > 0,
      s"samplingRatio (${options.samplingRatio}) should be greater than 0")
    if (options.samplingRatio > 0.99) {
      json
    } else {
      json.sample(withReplacement = false, options.samplingRatio, 1)
    }
  }
}
Example 41
Source File: SaveIntoDataSourceCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.CreatableRelationProvider case class SaveIntoDataSourceCommand( query: LogicalPlan, dataSource: CreatableRelationProvider, options: Map[String, String], mode: SaveMode) extends RunnableCommand { override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query) override def run(sparkSession: SparkSession): Seq[Row] = { dataSource.createRelation( sparkSession.sqlContext, mode, options, Dataset.ofRows(sparkSession, query)) Seq.empty[Row] } override def simpleString: String = { val redacted = SQLConf.get.redactOptions(options) s"SaveIntoDataSourceCommand ${dataSource}, ${redacted}, ${mode}" } }
Example 42
Source File: CSVUtils.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

object CSVUtils {

  def sample(csv: RDD[Array[String]], options: CSVOptions): RDD[Array[String]] = {
    require(options.samplingRatio > 0,
      s"samplingRatio (${options.samplingRatio}) should be greater than 0")
    if (options.samplingRatio > 0.99) {
      csv
    } else {
      csv.sample(withReplacement = false, options.samplingRatio, 1)
    }
  }
}
Example 43
Source File: FrequentItems.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ object FrequentItems extends Logging { def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.treeAggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
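End users normally reach this code through the public stat functions rather than calling the object directly. A small sketch of that public route; the column names and support threshold are illustrative only.

import org.apache.spark.sql.SparkSession

object FreqItemsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("freq-items-sketch").getOrCreate()
    import spark.implicits._

    val purchases = Seq.tabulate(100)(i => (i % 3, if (i % 7 == 0) "gold" else "basic"))
      .toDF("store", "plan")

    // Values appearing in more than 40% of rows per column; this delegates to
    // FrequentItems.singlePassFreqItems under the hood.
    val frequent = purchases.stat.freqItems(Seq("store", "plan"), 0.4)
    frequent.show(truncate = false)
    spark.stop()
  }
}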
Example 44
Source File: cache.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan case class CacheTableCommand( tableIdent: TableIdentifier, plan: Option[LogicalPlan], isLazy: Boolean) extends RunnableCommand { require(plan.isEmpty || tableIdent.database.isEmpty, "Database name is not allowed in CACHE TABLE AS SELECT") override protected def innerChildren: Seq[QueryPlan[_]] = plan.toSeq override def run(sparkSession: SparkSession): Seq[Row] = { plan.foreach { logicalPlan => Dataset.ofRows(sparkSession, logicalPlan).createTempView(tableIdent.quotedString) } sparkSession.catalog.cacheTable(tableIdent.quotedString) if (!isLazy) { // Performs eager caching sparkSession.table(tableIdent).count() } Seq.empty[Row] } } case class UncacheTableCommand( tableIdent: TableIdentifier, ifExists: Boolean) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { val tableId = tableIdent.quotedString if (!ifExists || sparkSession.catalog.tableExists(tableId)) { sparkSession.catalog.uncacheTable(tableId) } Seq.empty[Row] } } override def makeCopy(newArgs: Array[AnyRef]): ClearCacheCommand = ClearCacheCommand() }
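These commands are what the SQL layer runs for CACHE TABLE and UNCACHE TABLE; the same effect is available from user code through plain SQL or the catalog API. A brief sketch (the table name is illustrative):

import org.apache.spark.sql.SparkSession

object CacheTableSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("cache-table-sketch").getOrCreate()
    import spark.implicits._

    Seq((1, "a"), (2, "b")).toDF("id", "label").createOrReplaceTempView("labels")

    // Parsed into CacheTableCommand; without LAZY the table is materialised eagerly.
    spark.sql("CACHE TABLE labels")
    println(spark.catalog.isCached("labels"))

    // Parsed into UncacheTableCommand.
    spark.sql("UNCACHE TABLE labels")
    spark.stop()
  }
}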
Example 45
Source File: ConsoleWriter.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.internal.Logging import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.writer.{DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.types.StructType class ConsoleWriter(schema: StructType, options: DataSourceOptions) extends StreamWriter with Logging { // Number of rows to display, by default 20 rows protected val numRowsToShow = options.getInt("numRows", 20) // Truncate the displayed data if it is too long, by default it is true protected val isTruncated = options.getBoolean("truncate", true) assert(SparkSession.getActiveSession.isDefined) protected val spark = SparkSession.getActiveSession.get def createWriterFactory(): DataWriterFactory[InternalRow] = PackedRowWriterFactory override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { // We have to print a "Batch" label for the epoch for compatibility with the pre-data source V2 // behavior. printRows(messages, schema, s"Batch: $epochId") } def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} protected def printRows( commitMessages: Array[WriterCommitMessage], schema: StructType, printMessage: String): Unit = { val rows = commitMessages.collect { case PackedRowCommitMessage(rs) => rs }.flatten // scalastyle:off println println("-------------------------------------------") println(printMessage) println("-------------------------------------------") // scalastyle:off println Dataset.ofRows(spark, LocalRelation(schema.toAttributes, rows)) .show(numRowsToShow, isTruncated) } override def toString(): String = { s"ConsoleWriter[numRows=$numRowsToShow, truncate=$isTruncated]" } }
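From the user side this writer is reached through the built-in console sink. A minimal sketch that exercises the numRows/truncate options shown above; the rate source is just a convenient test input.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

object ConsoleSinkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("console-sink-sketch").getOrCreate()

    val ticks = spark.readStream.format("rate").option("rowsPerSecond", "5").load()

    val query = ticks.writeStream
      .format("console")
      .option("numRows", "5")      // read by ConsoleWriter via options.getInt("numRows", 20)
      .option("truncate", "false") // read via options.getBoolean("truncate", true)
      .trigger(Trigger.ProcessingTime("2 seconds"))
      .start()

    query.awaitTermination(10000) // run for roughly 10 seconds, then exit
    spark.stop()
  }
}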
Example 46
Source File: TestCsvData.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.sql.{Dataset, Encoders, SparkSession}

private[csv] trait TestCsvData {
  protected def spark: SparkSession

  def sampledTestData: Dataset[String] = {
    spark.range(0, 100, 1).map { index =>
      val predefinedSample = Set[Long](2, 8, 15, 27, 30, 34, 35, 37, 44, 46,
        57, 62, 68, 72)
      if (predefinedSample.contains(index)) {
        index.toString
      } else {
        (index.toDouble + 0.1).toString
      }
    }(Encoders.STRING)
  }
}
Example 47
Source File: GenericWordSpecSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test import org.scalatest.WordSpec import org.apache.spark.sql.Dataset class GenericWordSpecSuite extends WordSpec with SharedSparkSession { import testImplicits._ private def ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS "A Simple Dataset" when { "looked at as complete rows" should { "have the specified number of elements" in { assert(8 === ds.count) } "have the specified number of unique elements" in { assert(8 === ds.distinct.count) } } "refined to specific columns" should { "have the specified number of elements in each column" in { assert(8 === ds.select("_1").count) assert(8 === ds.select("_2").count) } "have the correct number of distinct elements in each column" in { assert(8 === ds.select("_1").distinct.count) assert(4 === ds.select("_2").distinct.count) } } } }
Example 48
Source File: GenericFlatSpecSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test import org.scalatest.FlatSpec import org.apache.spark.sql.Dataset class GenericFlatSpecSuite extends FlatSpec with SharedSparkSession { import testImplicits._ private def ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS "A Simple Dataset" should "have the specified number of elements" in { assert(8 === ds.count) } it should "have the specified number of unique elements" in { assert(8 === ds.distinct.count) } it should "have the specified number of elements in each column" in { assert(8 === ds.select("_1").count) assert(8 === ds.select("_2").count) } it should "have the correct number of distinct elements in each column" in { assert(8 === ds.select("_1").distinct.count) assert(4 === ds.select("_2").distinct.count) } }
Example 49
Source File: GenericFunSpecSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test import org.scalatest.FunSpec import org.apache.spark.sql.Dataset class GenericFunSpecSuite extends FunSpec with SharedSparkSession { import testImplicits._ private def ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS describe("Simple Dataset") { it("should have the specified number of elements") { assert(8 === ds.count) } it("should have the specified number of unique elements") { assert(8 === ds.distinct.count) } it("should have the specified number of elements in each column") { assert(8 === ds.select("_1").count) assert(8 === ds.select("_2").count) } it("should have the correct number of distinct elements in each column") { assert(8 === ds.select("_1").distinct.count) assert(4 === ds.select("_2").distinct.count) } } }
Example 50
Source File: CassandraStorage.scala From graphsense-transformation with MIT License | 5 votes |
package at.ac.ait.storage import com.datastax.spark.connector.rdd.ValidRDDType import com.datastax.spark.connector.rdd.reader.RowReaderFactory import com.datastax.spark.connector.writer.{RowWriterFactory} import java.time.LocalDateTime import java.time.format.DateTimeFormatter import org.apache.spark.sql.{Dataset, Encoder, SparkSession} import scala.reflect.ClassTag import at.ac.ait.Util._ class CassandraStorage(spark: SparkSession) { import spark.implicits._ import com.datastax.spark.connector._ def load[T <: Product: ClassTag: RowReaderFactory: ValidRDDType: Encoder]( keyspace: String, tableName: String, columns: ColumnRef* ) = { spark.sparkContext.setJobDescription(s"Loading table ${tableName}") val table = spark.sparkContext.cassandraTable[T](keyspace, tableName) if (columns.isEmpty) table.toDS().as[T] else table.select(columns: _*).toDS().as[T] } def store[T <: Product: RowWriterFactory]( keyspace: String, tableName: String, df: Dataset[T] ) = { spark.sparkContext.setJobDescription(s"Writing table ${tableName}") val dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss") val timestamp = LocalDateTime.now().format(dtf) println(s"[$timestamp] Writing table ${tableName}") time { df.rdd.saveToCassandra(keyspace, tableName) } } }
Example 51
Source File: StringMap.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.{HandleInvalid, StringMapModel} import org.apache.hadoop.fs.Path import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types._ private val className = classOf[StringMap].getName override def load(path: String): StringMap = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath).select("labels", "handleInvalid", "defaultValue").head() val labels = data.getAs[Map[String, Double]](0) val handleInvalid = HandleInvalid.fromString(data.getAs[String](1)) val defaultValue = data.getAs[Double](2) val model = new StringMapModel(labels, handleInvalid = handleInvalid, defaultValue = defaultValue) val transformer = new StringMap(metadata.uid, model) metadata.getAndSetParams(transformer) transformer } } }
Example 52
Source File: MathUnary.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation} import org.apache.hadoop.fs.Path import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType} import org.apache.spark.sql.functions.udf private val className = classOf[MathUnary].getName override def load(path: String): MathUnary = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath).select("operation").head() val operation = data.getAs[String](0) val model = MathUnaryModel(UnaryOperation.forName(operation)) val transformer = new MathUnary(metadata.uid, model) metadata.getAndSetParams(transformer) transformer } } }
Example 53
Source File: WordLengthFilter.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.WordLengthFilterModel import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators, Params} import org.apache.spark.ml.util._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} final def getWordLength: Int = $(wordLength) } class WordLengthFilter(override val uid: String) extends Transformer with WordLengthFilterParams with DefaultParamsWritable { val defaultLength = 3 var model: WordLengthFilterModel = new WordLengthFilterModel(defaultLength) //Initialize with default filter length 3 def this(model: WordLengthFilterModel) = this(uid = Identifiable.randomUID("filter_words")) def this() = this(new WordLengthFilterModel) def setInputCol(value: String): this.type = set(inputCol, value) def setOutputCol(value: String): this.type = set(outputCol, value) def setWordLength(value: Int = defaultLength): this.type = set(wordLength, value) override def transform(dataset: Dataset[_]): DataFrame = { if(defaultLength != getWordLength) model = new WordLengthFilterModel(getWordLength) val filterWordsUdf = udf { (words: Seq[String]) => model(words) } dataset.withColumn($(outputCol), filterWordsUdf(dataset($(inputCol)))) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { require(schema($(inputCol)).dataType.isInstanceOf[ArrayType], s"Input column must be of type ArrayType(StringType,true) but got ${schema($(inputCol)).dataType}") val inputFields = schema.fields require(!inputFields.exists(_.name == $(outputCol)), s"Output column ${$(outputCol)} already exists.") StructType(schema.fields :+ StructField($(outputCol), ArrayType(StringType, true))) } } object WordLengthFilter extends DefaultParamsReadable[WordLengthFilter] { override def load(path: String): WordLengthFilter = super.load(path) }
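A short, hedged sketch of applying the transformer above to a column of tokenized text; the data and column names are invented for illustration.

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.mleap.feature.WordLengthFilter

object WordLengthFilterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("word-length-filter-sketch").getOrCreate()
    import spark.implicits._

    val docs = Seq(
      Tuple1(Seq("a", "to", "spark", "dataset")),
      Tuple1(Seq("of", "ml", "pipelines"))
    ).toDF("words")

    // Keep only sufficiently long tokens, per WordLengthFilterModel's length threshold.
    val filter = new WordLengthFilter()
      .setInputCol("words")
      .setOutputCol("filtered")
      .setWordLength(4)

    filter.transform(docs).show(truncate = false)
    spark.stop()
  }
}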
Example 54
Source File: MultinomialLabeler.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.MultinomialLabelerModel import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasFeaturesCol import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types._ import org.apache.spark.sql.functions.{udf, col} import ml.combust.mleap.core.util.VectorConverters._ class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"), val model: MultinomialLabelerModel) extends Transformer with HasFeaturesCol with HasProbabilitiesCol with HasLabelsCol { def setFeaturesCol(value: String): this.type = set(featuresCol, value) def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value) def setLabelsCol(value: String): this.type = set(labelsCol, value) @org.apache.spark.annotation.Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val probabilitiesUdf = udf { (vector: Vector) => model.top(vector).map(_._1).toArray } val labelsUdf = udf { (vector: Vector) => model.topLabels(vector).toArray } dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))). withColumn($(labelsCol), labelsUdf(col($(featuresCol)))) } override def copy(extra: ParamMap): Transformer = copyValues(new MultinomialLabeler(uid, model), extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT], s"Features column must be of type NumericType but got ${schema($(featuresCol)).dataType}") val inputFields = schema.fields require(!inputFields.exists(_.name == $(probabilitiesCol)), s"Output column ${$(probabilitiesCol)} already exists.") require(!inputFields.exists(_.name == $(labelsCol)), s"Output column ${$(labelsCol)} already exists.") StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)), StructField($(labelsCol), ArrayType(StringType)))) } }
Example 55
Source File: ChangingDesign.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_3 import com.tomekl007.{UserData, UserTransaction} import org.apache.spark.sql.{Dataset, SparkSession} import org.scalatest.FunSuite class ChangingDesign extends FunSuite { val spark = SparkSession.builder().master("local[2]").getOrCreate() test("example of operation that is causing shuffle") { import spark.sqlContext.implicits._ val userData = spark.sparkContext.makeRDD(List( UserData("user_1", "1"), UserData("user_2", "2"), UserData("user_4", "200") )).toDS() val repartitionedUserData = userData.repartition(userData("userId")) val transactionData = spark.sparkContext.makeRDD(List( UserTransaction("user_1", 100), UserTransaction("user_2", 300), UserTransaction("user_3", 1300) )).toDS() val repartitionedTransactionData = transactionData.repartition(transactionData("userId")) //when //data is already partitioned using join-column. Don't need to shuffle val res: Dataset[(UserData, UserTransaction)] = repartitionedUserData.joinWith(repartitionedTransactionData, userData("userId") === transactionData("userId"), "inner") //then res.show() assert(res.count() == 2) } }
Example 56
Source File: TestingOperationsThatCausesShuffle.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_3 import com.tomekl007.{UserData, UserTransaction} import org.apache.spark.sql.{Dataset, SparkSession} import org.scalatest.FunSuite class TestingOperationsThatCausesShuffle extends FunSuite { val spark = SparkSession.builder().master("local[2]").getOrCreate() test("example of operation that is causing shuffle") { import spark.sqlContext.implicits._ val userData = spark.sparkContext.makeRDD(List( UserData("user_1", "1"), UserData("user_2", "2"), UserData("user_4", "200") )).toDS() val transactionData = spark.sparkContext.makeRDD(List( UserTransaction("user_1", 100), UserTransaction("user_2", 300), UserTransaction("user_3", 1300) )).toDS() //shuffle: userData can stay on the current executors, but data from //transactionData needs to be send to those executors according to joinColumn //causing shuffle //when val res: Dataset[(UserData, UserTransaction)] = userData.joinWith(transactionData, userData("userId") === transactionData("userId"), "inner") //then res.show() assert(res.count() == 2) } }
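One way to confirm whether a join like the ones in the last two examples actually shuffles is to look at the physical plan for Exchange nodes. A small sketch using only standard Dataset APIs; the data is invented.

import org.apache.spark.sql.SparkSession

object ShuffleInspectionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("shuffle-inspection").getOrCreate()
    import spark.implicits._

    val users = Seq(("user_1", "1"), ("user_2", "2")).toDF("userId", "data")
    val txns = Seq(("user_1", 100), ("user_2", 300)).toDF("userId", "amount")

    val joined = users.join(txns, "userId")

    // Prints the physical plan; Exchange operators mark shuffle boundaries.
    joined.explain()

    // The same information is available programmatically.
    val planText = joined.queryExecution.executedPlan.toString()
    println(s"plan mentions Exchange: ${planText.contains("Exchange")}")
    spark.stop()
  }
}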
Example 57
Source File: SessionLifecycle.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.spark import org.apache.spark.SparkConf import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession} private[spark] object DatasetGenerator { // Generation of a dataset of type {Double, Double} with a by-name initialization function final def toDSPairDouble( numDataPoints: Int )( generator: Int => (Double, Double) )(implicit sessionLifeCycle: SessionLifeCycle): Dataset[(Double, Double)] = toDSPairDouble(Seq.tabulate(numDataPoints)(generator(_))) // Generation of a dataset of type {Double, Double} from a sequence of same type def toDSPairDouble( data: Seq[(Double, Double)] )(implicit sessionLifeCycle: SessionLifeCycle): Dataset[(Double, Double)] = { import sessionLifeCycle.sparkSession.implicits._ data.toDS() } // Generation of a dataset of type Double def toDSDouble(data: Seq[Double])(implicit sessionLifeCycle: SessionLifeCycle): Dataset[Double] = { import sessionLifeCycle.sparkSession.implicits._ data.toDS() } // Generation of a dataset of type Int def toDSInt(data: Seq[Int])(implicit sessionLifeCycle: SessionLifeCycle): Dataset[Int] = { import sessionLifeCycle.sparkSession.implicits._ data.toDS() } } // -------------------------- EOF ----------------------------------------------
Example 58
Source File: OpBoston.scala From transmogrifai-helloworld-sbt with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.hw.boston import com.salesforce.op._ import com.salesforce.op.evaluators.Evaluators import com.salesforce.op.readers.CustomReader import com.salesforce.op.stages.impl.regression.RegressionModelSelector import com.salesforce.op.stages.impl.regression.RegressionModelsToTry._ import com.salesforce.op.stages.impl.tuning.DataSplitter import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, SparkSession} object OpBoston extends OpAppWithRunner with BostonFeatures { //////////////////////////////////////////////////////////////////////////////// // READERS DEFINITION ///////////////////////////////////////////////////////////////////////////////// val randomSeed = 42L def customRead(path: String)(implicit spark: SparkSession): RDD[BostonHouse] = { val myFile = spark.sparkContext.textFile(path) myFile.filter(_.nonEmpty).zipWithIndex.map { case (x, id) => val words = x.replaceAll("\\s+", " ").replaceAll(s"^\\s+(?m)", "").replaceAll(s"(?m)\\s+$$", "").split(" ") BostonHouse(id.toInt, words(0).toDouble, words(1).toDouble, words(2).toDouble, words(3), words(4).toDouble, words(5).toDouble, words(6).toDouble, words(7).toDouble, words(8).toInt, words(9).toDouble, words(10).toDouble, words(11).toDouble, words(12).toDouble, words(13).toDouble) } } val trainingReader = new CustomReader[BostonHouse](key = _.rowId.toString) { def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[BostonHouse], Dataset[BostonHouse]] = Left { val Array(train, _) = customRead(getFinalReadPath(params)).randomSplit(weights = Array(0.9, 0.1), randomSeed) train } } val scoringReader = new CustomReader[BostonHouse](key = _.rowId.toString) { def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[BostonHouse], Dataset[BostonHouse]] = Left { val Array(_, test) = customRead(getFinalReadPath(params)).randomSplit(weights = Array(0.9, 0.1), randomSeed) test } } //////////////////////////////////////////////////////////////////////////////// // WORKFLOW DEFINITION ///////////////////////////////////////////////////////////////////////////////// val houseFeatures = Seq(crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat).transmogrify() val splitter = DataSplitter(seed = randomSeed) val prediction = RegressionModelSelector .withCrossValidation( dataSplitter = Some(splitter), seed = randomSeed, modelTypesToUse = Seq(OpGBTRegressor, OpRandomForestRegressor) ).setInput(medv, houseFeatures).getOutput() val workflow = new OpWorkflow().setResultFeatures(prediction) val evaluator = Evaluators.Regression().setLabelCol(medv).setPredictionCol(prediction) def runner(opParams: OpParams): OpWorkflowRunner = new OpWorkflowRunner( workflow = workflow, trainingReader = trainingReader, scoringReader = scoringReader, evaluationReader = Option(trainingReader), evaluator = Option(evaluator), scoringEvaluator = None, featureToComputeUpTo = Option(houseFeatures) ) }
Example 59
Source File: GaussianProcessCommons.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} import breeze.optimize.LBFGSB import org.apache.spark.ml.commons.kernel.{EyeKernel, Kernel, _} import org.apache.spark.ml.commons.util.DiffFunctionMemoized import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.util.Instrumentation import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions.col import org.apache.spark.sql.{Dataset, Row} private[ml] trait GaussianProcessCommons[F, E <: Predictor[F, E, M], M <: PredictionModel[F, M]] extends ProjectedGaussianProcessHelper { this: Predictor[F, E, M] with GaussianProcessParams => protected val getKernel : () => Kernel = () => $(kernel)() + $(sigma2).const * new EyeKernel protected def getPoints(dataset: Dataset[_]) = { dataset.select(col($(labelCol)), col($(featuresCol))).rdd.map { case Row(label: Double, features: Vector) => LabeledPoint(label, features) } } protected def groupForExperts(points: RDD[LabeledPoint]) = { val numberOfExperts = Math.round(points.count().toDouble / $(datasetSizeForExpert)) points.zipWithIndex.map { case(instance, index) => (index % numberOfExperts, instance) }.groupByKey().map(_._2) } protected def getExpertLabelsAndKernels(points: RDD[LabeledPoint]): RDD[(BDV[Double], Kernel)] = { groupForExperts(points).map { chunk => val (labels, trainingVectors) = chunk.map(lp => (lp.label, lp.features)).toArray.unzip (BDV(labels: _*), getKernel().setTrainingVectors(trainingVectors)) } } protected def projectedProcess(expertLabelsAndKernels: RDD[(BDV[Double], Kernel)], points: RDD[LabeledPoint], optimalHyperparameters: BDV[Double]) = { val activeSet = $(activeSetProvider)($(activeSetSize), expertLabelsAndKernels, points, getKernel, optimalHyperparameters, $(seed)) points.unpersist() val (matrixKmnKnm, vectorKmny) = getMatrixKmnKnmAndVectorKmny(expertLabelsAndKernels, activeSet) expertLabelsAndKernels.unpersist() val optimalKernel = getKernel().setHyperparameters(optimalHyperparameters).setTrainingVectors(activeSet) // inv(sigma^2 K_mm + K_mn * K_nm) * K_mn * y val (magicVector, magicMatrix) = getMagicVector(optimalKernel, matrixKmnKnm, vectorKmny, activeSet, optimalHyperparameters) new GaussianProjectedProcessRawPredictor(magicVector, magicMatrix, optimalKernel) } protected def createModel(uid: String, rawPredictor: GaussianProjectedProcessRawPredictor) : M } class GaussianProjectedProcessRawPredictor private[commons] (val magicVector: BDV[Double], val magicMatrix: BDM[Double], val kernel: Kernel) extends Serializable { def predict(features: Vector): (Double, Double) = { val cross = kernel.crossKernel(features) val selfKernel = kernel.selfKernel(features) (cross * magicVector, selfKernel + cross * magicMatrix * cross.t) } }
Example 60
Source File: GaussianProcessRegression.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression import breeze.linalg.{DenseVector => BDV, _} import org.apache.spark.internal.Logging import org.apache.spark.ml.commons._ import org.apache.spark.ml.commons.kernel.Kernel import org.apache.spark.ml.commons.util._ import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{Identifiable, Instrumentation} import org.apache.spark.rdd.RDD import org.apache.spark.sql.Dataset class GaussianProcessRegression(override val uid: String) extends Regressor[Vector, GaussianProcessRegression, GaussianProcessRegressionModel] with GaussianProcessParams with GaussianProcessCommons[Vector, GaussianProcessRegression, GaussianProcessRegressionModel] with Logging { def this() = this(Identifiable.randomUID("gaussProcessReg")) override protected def train(dataset: Dataset[_]): GaussianProcessRegressionModel = { val instr = Instrumentation.create(this, dataset) val points: RDD[LabeledPoint] = getPoints(dataset).cache() val expertLabelsAndKernels: RDD[(BDV[Double], Kernel)] = getExpertLabelsAndKernels(points).cache() val optimalHyperparameters = optimizeHypers(instr, expertLabelsAndKernels, likelihoodAndGradient) expertLabelsAndKernels.foreach(_._2.setHyperparameters(optimalHyperparameters)) produceModel(instr, points, expertLabelsAndKernels, optimalHyperparameters) } private def likelihoodAndGradient(yAndK : (BDV[Double], Kernel), x : BDV[Double]) = { val (y: BDV[Double], kernel : Kernel) = yAndK kernel.setHyperparameters(x) val (k, derivative) = kernel.trainingKernelAndDerivative() val (_, logdet, kinv) = logDetAndInv(k) val alpha = kinv * y val likelihood = 0.5 * (y.t * alpha) + 0.5 * logdet val alphaAlphaTMinusKinv = alpha * alpha.t alphaAlphaTMinusKinv -= kinv val gradient = derivative.map(derivative => -0.5 * sum(derivative *= alphaAlphaTMinusKinv)) (likelihood, BDV(gradient:_*)) } override def copy(extra: ParamMap): GaussianProcessRegression = defaultCopy(extra) override protected def createModel(uid: String, rawPredictor: GaussianProjectedProcessRawPredictor): GaussianProcessRegressionModel = new GaussianProcessRegressionModel(uid, rawPredictor) } class GaussianProcessRegressionModel private[regression](override val uid: String, private val gaussianProjectedProcessRawPredictor: GaussianProjectedProcessRawPredictor) extends RegressionModel[Vector, GaussianProcessRegressionModel] { override protected def predict(features: Vector): Double = { gaussianProjectedProcessRawPredictor.predict(features)._1 } override def copy(extra: ParamMap): GaussianProcessRegressionModel = { val newModel = copyValues(new GaussianProcessRegressionModel(uid, gaussianProjectedProcessRawPredictor), extra) newModel.setParent(parent) } }
Example 61
Source File: DataFrameConverter.scala From incubator-toree with Apache License 2.0 | 5 votes |
package org.apache.toree.utils import org.apache.spark.sql.{Dataset, Row} import org.apache.toree.plugins.Plugin import play.api.libs.json.{JsObject, Json} import scala.util.Try import org.apache.toree.plugins.annotations.Init import DataFrameConverter._ class DataFrameConverter extends Plugin with LogLike { @Init def init() = { register(this) } def convert(df: Dataset[Row], outputType: String, limit: Int = 10): Try[String] = { Try( outputType.toLowerCase() match { case "html" => convertToHtml(df = df, limit = limit) case "json" => convertToJson(df = df, limit = limit) case "csv" => convertToCsv(df = df, limit = limit) } ) } private def convertToHtml(df: Dataset[Row], limit: Int = 10): String = { val columnFields = df.schema.fieldNames.map(columnName => { s"<th>${columnName}</th>" }).reduce(_ + _) val columns = s"<tr>${columnFields}</tr>" val rows = df.rdd.map(row => { val fieldValues = row.toSeq.map(field => { s"<td>${fieldToString(field)}</td>" }).reduce(_ + _) s"<tr>${fieldValues}</tr>" }).take(limit).reduce(_ + _) s"<table>${columns}${rows}</table>" } private def convertToJson(df: Dataset[Row], limit: Int = 10): String = { val schema = Json.toJson(df.schema.fieldNames) val transformed = df.rdd.map(row => row.toSeq.map(fieldToString).toArray) val rows = transformed.take(limit) JsObject(Seq( "columns" -> schema, "rows" -> Json.toJson(rows) )).toString() } private def convertToCsv(df: Dataset[Row], limit: Int = 10): String = { val headers = df.schema.fieldNames.reduce(_ + "," + _) val rows = df.rdd.map(row => { row.toSeq.map(fieldToString).reduce(_ + "," + _) }).take(limit).reduce(_ + "\n" + _) s"${headers}\n${rows}" } } object DataFrameConverter { def fieldToString(any: Any): String = any match { case null => "null" case seq: Seq[_] => seq.mkString("[", ", ", "]") case _ => any.toString } }
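A hedged sketch of driving the converter directly, outside Toree's plugin machinery, to turn a small DataFrame into CSV and JSON strings; whether direct construction is appropriate in a given deployment is an assumption here.

import org.apache.spark.sql.SparkSession
import org.apache.toree.utils.DataFrameConverter

object ConverterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("converter-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "label")
    val converter = new DataFrameConverter

    // convert(...) returns a Try[String]; unsupported output types surface as a Failure.
    converter.convert(df, "csv", limit = 2).foreach(println)
    converter.convert(df, "json", limit = 2).foreach(println)
    spark.stop()
  }
}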
Example 62
Source File: CustomSinkSuite.scala From spark-structured-streaming-ml with Apache License 2.0 | 5 votes |
package com.highperformancespark.examples.structuredstreaming

import com.holdenkarau.spark.testing.DataFrameSuiteBase

import scala.collection.mutable.ListBuffer

import org.scalatest.FunSuite

import org.apache.spark._
import org.apache.spark.sql.{Dataset, DataFrame, Encoder, SQLContext}
import org.apache.spark.sql.execution.streaming.MemoryStream

class CustomSinkSuite extends FunSuite with DataFrameSuiteBase {

  test("really simple test of the custom sink") {
    import spark.implicits._
    val input = MemoryStream[String]
    val doubled = input.toDS().map(x => x + " " + x)
    val formatName = ("com.highperformancespark.examples." +
      "structuredstreaming.CustomSinkCollectorProvider")
    val query = doubled.writeStream
      .queryName("testCustomSinkBasic")
      .format(formatName)
      .start()
    val inputData = List("hi", "holden", "bye", "pandas")
    input.addData(inputData)
    assert(query.isActive === true)
    query.processAllAvailable()
    assert(query.exception === None)
    assert(Pandas.results(0) === inputData.map(x => x + " " + x))
  }
}

object Pandas {
  val results = new ListBuffer[Seq[String]]()
}

class CustomSinkCollectorProvider extends ForeachDatasetSinkProvider {
  override def func(df: DataFrame) {
    val spark = df.sparkSession
    import spark.implicits._
    Pandas.results += df.as[String].rdd.collect()
  }
}
Example 63
Source File: StreamingKMeansSuite.scala From spark-structured-streaming-ml with Apache License 2.0 | 5 votes |
package com.highperformancespark.examples.structuredstreaming import com.holdenkarau.spark.testing.DataFrameSuiteBase import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.ml.linalg._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.execution.streaming.MemoryStream import org.scalatest.FunSuite import org.apache.log4j.{Level, Logger} case class TestRow(features: Vector) class StreamingKMeansSuite extends FunSuite with DataFrameSuiteBase { override def beforeAll(): Unit = { super.beforeAll() Logger.getLogger("org").setLevel(Level.OFF) } test("streaming model with one center should converge to true center") { import spark.implicits._ val k = 1 val dim = 5 val clusterSpread = 0.1 val seed = 63 // TODO: this test is very flaky. The centers do not converge for some // (most?) random seeds val (batches, trueCenters) = StreamingKMeansSuite.generateBatches(100, 80, k, dim, clusterSpread, seed) val inputStream = MemoryStream[TestRow] val ds = inputStream.toDS() val skm = new StreamingKMeans().setK(k).setRandomCenters(dim, 0.01) val query = skm.evilTrain(ds.toDF()) val streamingModels = batches.map { batch => inputStream.addData(batch) query.processAllAvailable() skm.getModel } // TODO: use spark's testing suite streamingModels.last.centers.zip(trueCenters).foreach { case (center, trueCenter) => val centers = center.toArray.mkString(",") val trueCenters = trueCenter.toArray.mkString(",") println(s"${centers} | ${trueCenters}") assert(center.toArray.zip(trueCenter.toArray).forall( x => math.abs(x._1 - x._2) < 0.1)) } query.stop() } def compareBatchAndStreaming( batchModel: KMeansModel, streamingModel: StreamingKMeansModel, validationData: DataFrame): Unit = { assert(batchModel.clusterCenters === streamingModel.centers) // TODO: implement prediction comparison } } object StreamingKMeansSuite { def generateBatches( numPoints: Int, numBatches: Int, k: Int, d: Int, r: Double, seed: Int, initCenters: Array[Vector] = null): (IndexedSeq[IndexedSeq[TestRow]], Array[Vector]) = { val rand = scala.util.Random rand.setSeed(seed) val centers = initCenters match { case null => Array.fill(k)(Vectors.dense(Array.fill(d)(rand.nextGaussian()))) case _ => initCenters } val data = (0 until numBatches).map { i => (0 until numPoints).map { idx => val center = centers(idx % k) val vec = Vectors.dense( Array.tabulate(d)(x => center(x) + rand.nextGaussian() * r)) TestRow(vec) } } (data, centers) } }
Example 64
Source File: FeatureExtraction.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package com.spark.recommendation import org.apache.spark.{sql, SparkConf} import org.apache.spark.ml.recommendation.ALS import org.apache.spark.sql.{Dataset, SparkSession} def getFeatures(): sql.DataFrame = { import spark.implicits._ //val ratings = spark.read.textFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_05/data/ml-100k 2/u.data").map(parseRating).toDF() val ratings = spark.read.textFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_05/2.0.0/scala-spark-app/src/main/scala/com/spark/recommendation/sample_movielens_ratings.txt").map(parseRating).toDF() println(ratings.first()) // val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2)) // println(training.first()) return ratings } def getSpark(): SparkSession = { return spark } def main(args: Array[String]) { getFeatures() } }
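The ratings DataFrame produced above is typically fed straight into ALS. A hedged sketch of that next step; the column names userId/movieId/rating assume that parseRating (not shown in this excerpt) produces fields with those names.

import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.DataFrame

object AlsFitSketch {
  // `ratings` is assumed to have columns userId, movieId, rating.
  def train(ratings: DataFrame): Unit = {
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))

    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")

    val model = als.fit(training)
    val predictions = model.transform(test)
    predictions.show(5)
  }
}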
Example 65
Source File: Prettify.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset} import org.scalacheck.util.Pretty trait Prettify { val maxNumberOfShownValues = 100 implicit def prettyDataFrame(dataframe: DataFrame): Pretty = Pretty { _ => describeDataframe(dataframe)} implicit def prettyRDD(rdd: RDD[_]): Pretty = Pretty { _ => describeRDD(rdd)} implicit def prettyDataset(dataset: Dataset[_]): Pretty = Pretty { _ => describeDataset(dataset)} private def describeDataframe(dataframe: DataFrame) = s"""<DataFrame: schema = ${dataframe.toString}, size = ${dataframe.count()}, |values = (${dataframe.take(maxNumberOfShownValues).mkString(", ")})>""". stripMargin.replace("\n", " ") private def describeRDD(rdd: RDD[_]) = s"""<RDD: size = ${rdd.count()}, |values = (${rdd.take(maxNumberOfShownValues).mkString(", ")})>""". stripMargin.replace("\n", " ") private def describeDataset(dataset: Dataset[_]) = s"""<Dataset: schema = ${dataset.toString}, size = ${dataset.count()}, |values = (${dataset.take(maxNumberOfShownValues).mkString(", ")})>""". stripMargin.replace("\n", " ") } object Prettify extends Prettify
Example 66
Source File: DatasetGenerator.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, Encoder, SQLContext} import org.scalacheck.{Arbitrary, Gen} import scala.reflect.ClassTag object DatasetGenerator { def arbitrarySizedDataset[T: ClassTag : Encoder] (sqlCtx: SQLContext, minPartitions: Int = 1) (generator: Int => Gen[T]): Arbitrary[Dataset[T]] = { val rddGen: Gen[RDD[T]] = RDDGenerator.genSizedRDD[T](sqlCtx.sparkContext, minPartitions)(generator) val datasetGen: Gen[Dataset[T]] = rddGen.map(rdd => sqlCtx.createDataset(rdd)) Arbitrary { datasetGen } } }
Example 67
Source File: SampleDatasetGeneratorTest.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing import org.apache.spark.sql.{Dataset, SQLContext} import org.scalacheck.{Gen, Arbitrary} import org.scalacheck.Prop.forAll import org.scalatest.FunSuite import org.scalatest.prop.Checkers class SampleDatasetGeneratorTest extends FunSuite with SharedSparkContext with Checkers { test("test generating Datasets[String]") { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val property = forAll( DatasetGenerator.genDataset[String](sqlContext)( Arbitrary.arbitrary[String])) { dataset => dataset.map(_.length).count() == dataset.count() } check(property) } test("test generating sized Datasets[String]") { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val property = forAll { DatasetGenerator.genSizedDataset[(Int, String)](sqlContext) { size => Gen.listOfN(size, Arbitrary.arbitrary[Char]).map(l => (size, l.mkString)) } }{ dataset => val tuples = dataset.collect() val value = dataset.map{ case (_, str) => str.length} tuples.forall{ case (size, str) => size == str.length} && value.count() == dataset.count } check(property) } test("test generating Datasets[Custom Class]") { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val carGen: Gen[Dataset[Car]] = DatasetGenerator.genDataset[Car](sqlContext) { val generator: Gen[Car] = for { name <- Arbitrary.arbitrary[String] speed <- Arbitrary.arbitrary[Int] } yield (Car(name, speed)) generator } val property = forAll(carGen) { dataset => dataset.map(_.speed).count() == dataset.count() } check(property) } } case class Car(name: String, speed: Int)
Example 68
Source File: DatasetGeneratorSizeSpecial.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing import org.apache.spark.sql.{Dataset, SQLContext} import org.scalacheck.{Gen, Arbitrary} import org.scalacheck.Prop.forAll import org.scalatest.FunSuite import org.scalatest.prop.Checkers class DatasetGeneratorSizeSpecial extends FunSuite with SharedSparkContext with Checkers { test("test generating sized Datasets[Custom Class]") { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ // In 2.3 List is fine, however prior to 2.1 the generator returns // a concrete sub type which isn't handled well. // This works in 1.6.1+ but we only test in 2.0+ because that's easier val carGen: Gen[Dataset[Seq[Car]]] = DatasetGenerator.genSizedDataset[Seq[Car]](sqlContext) { size => val slowCarsTopNumber = math.ceil(size * 0.1).toInt def carGenerator(speed: Gen[Int]): Gen[Car] = for { name <- Arbitrary.arbitrary[String] speed <- speed } yield Car(name, speed) val cars: Gen[List[Car]] = for { slowCarsNumber: Int <- Gen.choose(0, slowCarsTopNumber) slowCars: List[Car] <- Gen.listOfN(slowCarsNumber, carGenerator(Gen.choose(0, 20))) normalSpeedCars: List[Car] <- Gen.listOfN( size - slowCarsNumber, carGenerator(Gen.choose(21, 150)) ) } yield { slowCars ++ normalSpeedCars } cars } val property = forAll(carGen.map(_.flatMap(identity))) { dataset => val cars = dataset.collect() val dataSetSize = cars.length val slowCars = cars.filter(_.speed < 21) slowCars.length <= dataSetSize * 0.1 && cars.map(_.speed).length == dataSetSize } check(property) } }
Example 69
Source File: HashingTF.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

// Note: the HashingTF class declaration and its parameter definitions are
// elided in this listing; only selected members are shown.

  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
}
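A minimal usage sketch (not from the original file): tokenize a text column and hash the tokens into a fixed-size feature vector. A local SparkSession and the object name HashingTFDemo are assumptions for the demo.

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

object HashingTFDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("hashing-tf-demo").getOrCreate()
    import spark.implicits._

    val sentences = Seq("spark datasets are typed", "dataframes are untyped").toDF("text")

    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val hashingTF = new HashingTF()
      .setInputCol("words")
      .setOutputCol("features")
      .setNumFeatures(1 << 10)   // small feature space, just for the demo

    val features = hashingTF.transform(tokenizer.transform(sentences))
    features.select("words", "features").show(truncate = false)
    spark.stop()
  }
}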
Example 70
Source File: SQLTransformer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType

// Note: the SQLTransformer class declaration and the `statement` param
// definition are elided in this listing; only selected members are shown.

  @Since("1.6.0")
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val tableName = Identifiable.randomUID(uid)
    dataset.createOrReplaceTempView(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val result = dataset.sparkSession.sql(realStatement)
    dataset.sparkSession.catalog.dropTempView(tableName)
    result
  }

  @Since("1.6.0")
  override def transformSchema(schema: StructType): StructType = {
    val spark = SparkSession.builder().getOrCreate()
    val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty))
    val dummyDF = spark.createDataFrame(dummyRDD, schema)
    val tableName = Identifiable.randomUID(uid)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    dummyDF.createOrReplaceTempView(tableName)
    val outputSchema = spark.sql(realStatement).schema
    spark.catalog.dropTempView(tableName)
    outputSchema
  }

  @Since("1.6.0")
  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
}
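A minimal usage sketch (not from the original file): the __THIS__ placeholder in the statement is replaced with a temp view over the input Dataset, as the transform method above shows. The object name SQLTransformerDemo and the local SparkSession are assumptions.

import org.apache.spark.ml.feature.SQLTransformer
import org.apache.spark.sql.SparkSession

object SQLTransformerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sql-transformer-demo").getOrCreate()
    import spark.implicits._

    val df = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")

    // __THIS__ stands for the input Dataset registered as a temporary view.
    val sqlTrans = new SQLTransformer()
      .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

    sqlTrans.transform(df).show()
    spark.stop()
  }
}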
Example 71
Source File: BinaryClassificationEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
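A minimal usage sketch (not from the original file): the evaluator accepts either a score vector or a raw Double in rawPredictionCol, as the pattern match above shows. The hand-built toy scores, the object name BinaryEvaluatorDemo, and the local SparkSession are assumptions for the demo.

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object BinaryEvaluatorDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("binary-eval-demo").getOrCreate()
    import spark.implicits._

    // (rawPrediction, label) pairs; in practice these come from a fitted classifier.
    val scored = Seq(
      (Vectors.dense(0.1, 0.9), 1.0),
      (Vectors.dense(0.8, 0.2), 0.0),
      (Vectors.dense(0.3, 0.7), 1.0),
      (Vectors.dense(0.6, 0.4), 1.0)
    ).toDF("rawPrediction", "label")

    val auc = new BinaryClassificationEvaluator()
      .setRawPredictionCol("rawPrediction")
      .setLabelCol("label")
      .setMetricName("areaUnderROC")
      .evaluate(scored)

    println(s"areaUnderROC = $auc")
    spark.stop()
  }
}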
Example 72
Source File: MulticlassClassificationEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 73
Source File: RegressionEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
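A minimal usage sketch (not from the original file) looping over the four metrics handled above. The hand-built (prediction, label) pairs, the object name RegressionEvaluatorDemo, and the local SparkSession are assumptions.

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.SparkSession

object RegressionEvaluatorDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("regression-eval-demo").getOrCreate()
    import spark.implicits._

    // (prediction, label) pairs; in practice these come from a fitted model's transform().
    val predictions = Seq((2.5, 3.0), (0.0, -0.5), (2.1, 2.0), (7.8, 8.0))
      .toDF("prediction", "label")

    val evaluator = new RegressionEvaluator()
      .setPredictionCol("prediction")
      .setLabelCol("label")

    Seq("rmse", "mse", "r2", "mae").foreach { metric =>
      val value = evaluator.setMetricName(metric).evaluate(predictions)
      println(s"$metric = $value")
    }
    spark.stop()
  }
}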
Example 74
Source File: RWrapperUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r

import org.apache.spark.internal.Logging
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute}
import org.apache.spark.ml.feature.{RFormula, RFormulaModel}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.Dataset

private[r] object RWrapperUtils extends Logging {

  def getFeaturesAndLabels(
      rFormulaModel: RFormulaModel,
      data: Dataset[_]): (Array[String], Array[String]) = {
    val schema = rFormulaModel.transform(data).schema
    val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol))
      .attributes.get
    val features = featureAttrs.map(_.name.get)
    val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol))
      .asInstanceOf[NominalAttribute]
    val labels = labelAttr.values.get
    (features, labels)
  }
}
Example 75
Source File: MultilayerPerceptronClassifierWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier} import org.apache.spark.ml.feature.{IndexToString, RFormula} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} private[r] class MultilayerPerceptronClassifierWrapper private ( val pipeline: PipelineModel ) extends MLWritable { import MultilayerPerceptronClassifierWrapper._ val mlpModel: MultilayerPerceptronClassificationModel = pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel] val weights: Array[Double] = mlpModel.weights.toArray val layers: Array[Int] = mlpModel.layers def transform(dataset: Dataset[_]): DataFrame = { pipeline.transform(dataset) .drop(mlpModel.getFeaturesCol) .drop(mlpModel.getLabelCol) .drop(PREDICTED_LABEL_INDEX_COL) } override def read: MLReader[MultilayerPerceptronClassifierWrapper] = new MultilayerPerceptronClassifierWrapperReader override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path) class MultilayerPerceptronClassifierWrapperReader extends MLReader[MultilayerPerceptronClassifierWrapper]{ override def load(path: String): MultilayerPerceptronClassifierWrapper = { implicit val format = DefaultFormats val pipelinePath = new Path(path, "pipeline").toString val pipeline = PipelineModel.load(pipelinePath) new MultilayerPerceptronClassifierWrapper(pipeline) } } class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper) extends MLWriter { override protected def saveImpl(path: String): Unit = { val rMetadataPath = new Path(path, "rMetadata").toString val pipelinePath = new Path(path, "pipeline").toString val rMetadata = "class" -> instance.getClass.getName val rMetadataJson: String = compact(render(rMetadata)) sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath) instance.pipeline.save(pipelinePath) } } }
Example 76
Source File: Transformer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// Note: the abstract class declarations are elided in this listing; what
// remains is the body of the file's unary transformer helper.

  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val transformUDF = udf(this.createTransformFunc, outputDataType)
    dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
}
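A short sketch (not from the original file) of a custom UnaryTransformer built on the members shown above: createTransformFunc supplies the per-value function, outputDataType the result column type. The class name UpperCaser is illustrative.

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{DataType, StringType}

// A toy transformer that upper-cases a string column.
class UpperCaser(override val uid: String)
  extends UnaryTransformer[String, String, UpperCaser] {

  def this() = this(Identifiable.randomUID("upperCaser"))

  override protected def createTransformFunc: String => String = _.toUpperCase

  override protected def validateInputType(inputType: DataType): Unit =
    require(inputType == StringType, s"Input type must be StringType but got $inputType.")

  override protected def outputDataType: DataType = StringType
}

// Usage, assuming a SparkSession named `spark`:
//   import spark.implicits._
//   val df = Seq("a", "b").toDF("in")
//   new UpperCaser().setInputCol("in").setOutputCol("out").transform(df).show()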
Example 77
Source File: ChiSqSelectorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
  with DefaultReadWriteTest {

  @transient var dataset: Dataset[_] = _

  override def beforeAll(): Unit = {
    super.beforeAll()

    // Toy dataset, including the top feature for a chi-squared test.
    // These data are chosen such that each feature's test has a distinct p-value.
    // Note: the dataset construction and the remaining test cases are
    // truncated in the original listing.
    val allParamSettings: Map[String, Any] = Map(
      "selectorType" -> "percentile",
      "numTopFeatures" -> 1,
      "percentile" -> 0.12,
      "outputCol" -> "myOutput"
    )
  }
Example 78
Source File: TokenizerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} @BeanInfo case class TokenizerTestData(rawText: String, wantedTokens: Array[String]) class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new Tokenizer) } test("read/write") { val t = new Tokenizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } } class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import org.apache.spark.ml.feature.RegexTokenizerSuite._ import testImplicits._ test("params") { ParamsSuite.checkParams(new RegexTokenizer) } test("RegexTokenizer") { val tokenizer0 = new RegexTokenizer() .setGaps(false) .setPattern("\\w+|\\p{Punct}") .setInputCol("rawText") .setOutputCol("tokens") val dataset0 = Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")), TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct")) ).toDF() testRegexTokenizer(tokenizer0, dataset0) val dataset1 = Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")), TokenizerTestData("Te,st. punct", Array("punct")) ).toDF() tokenizer0.setMinTokenLength(3) testRegexTokenizer(tokenizer0, dataset1) val tokenizer2 = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") val dataset2 = Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")), TokenizerTestData("Te,st. punct", Array("te,st.", "punct")) ).toDF() testRegexTokenizer(tokenizer2, dataset2) } test("RegexTokenizer with toLowercase false") { val tokenizer = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") .setToLowercase(false) val dataset = Seq( TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")), TokenizerTestData("java scala", Array("java", "scala")) ).toDF() testRegexTokenizer(tokenizer, dataset) } test("read/write") { val t = new RegexTokenizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMinTokenLength(2) .setGaps(false) .setPattern("hi") .setToLowercase(false) testDefaultReadWrite(t) } } object RegexTokenizerSuite extends SparkFunSuite { def testRegexTokenizer(t: RegexTokenizer, dataset: Dataset[_]): Unit = { t.transform(dataset) .select("tokens", "wantedTokens") .collect() .foreach { case Row(tokens, wantedTokens) => assert(tokens === wantedTokens) } } }
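A minimal usage sketch (not from the original suite): running RegexTokenizer directly with the same token-matching pattern the suite exercises. The object name RegexTokenizerDemo and the local SparkSession are assumptions.

import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.sql.SparkSession

object RegexTokenizerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("regex-tokenizer-demo").getOrCreate()
    import spark.implicits._

    val df = Seq("Test for tokenization.", "Te,st. punct").toDF("rawText")

    // gaps = false means the pattern matches tokens rather than delimiters,
    // mirroring the configuration used in the suite above.
    val tokenizer = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")

    tokenizer.transform(df).select("rawText", "tokens").show(truncate = false)
    spark.stop()
  }
}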
Example 79
Source File: NGramSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} @BeanInfo case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String]) class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import org.apache.spark.ml.feature.NGramSuite._ import testImplicits._ test("default behavior yields bigram features") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") val dataset = Seq(NGramTestData( Array("Test", "for", "ngram", "."), Array("Test for", "for ngram", "ngram .") )).toDF() testNGram(nGram, dataset) } test("NGramLength=4 yields length 4 n-grams") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(4) val dataset = Seq(NGramTestData( Array("a", "b", "c", "d", "e"), Array("a b c d", "b c d e") )).toDF() testNGram(nGram, dataset) } test("empty input yields empty output") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(4) val dataset = Seq(NGramTestData(Array(), Array())).toDF() testNGram(nGram, dataset) } test("input array < n yields empty output") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(6) val dataset = Seq(NGramTestData( Array("a", "b", "c", "d", "e"), Array() )).toDF() testNGram(nGram, dataset) } test("read/write") { val t = new NGram() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setN(3) testDefaultReadWrite(t) } } object NGramSuite extends SparkFunSuite { def testNGram(t: NGram, dataset: Dataset[_]): Unit = { t.transform(dataset) .select("nGrams", "wantedNGrams") .collect() .foreach { case Row(actualNGrams, wantedNGrams) => assert(actualNGrams === wantedNGrams) } } }
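A minimal usage sketch (not from the original suite): applying NGram to an array-of-strings column, which is what the tests above drive through testNGram. The object name NGramDemo and the local SparkSession are assumptions.

import org.apache.spark.ml.feature.NGram
import org.apache.spark.sql.SparkSession

object NGramDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ngram-demo").getOrCreate()
    import spark.implicits._

    val df = Seq(
      Seq("Test", "for", "ngram", "."),
      Seq("a", "b", "c", "d", "e")
    ).toDF("inputTokens")

    // Default n = 2 yields bigrams; rows with fewer than n tokens yield an empty array.
    val nGram = new NGram().setN(2).setInputCol("inputTokens").setOutputCol("nGrams")
    nGram.transform(df).select("nGrams").show(truncate = false)
    spark.stop()
  }
}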
Example 80
Source File: PredictorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext { import PredictorSuite._ test("should support all NumericType labels and not support other types") { val df = spark.createDataFrame(Seq( (0, Vectors.dense(0, 2, 3)), (1, Vectors.dense(0, 3, 9)), (0, Vectors.dense(0, 2, 6)) )).toDF("label", "features") val types = Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0)) val predictor = new MockPredictor() types.foreach { t => predictor.fit(df.select(col("label").cast(t), col("features"))) } intercept[IllegalArgumentException] { predictor.fit(df.select(col("label").cast(StringType), col("features"))) } } } object PredictorSuite { class MockPredictor(override val uid: String) extends Predictor[Vector, MockPredictor, MockPredictionModel] { def this() = this(Identifiable.randomUID("mockpredictor")) override def train(dataset: Dataset[_]): MockPredictionModel = { require(dataset.schema("label").dataType == DoubleType) new MockPredictionModel(uid) } override def copy(extra: ParamMap): MockPredictor = throw new NotImplementedError() } class MockPredictionModel(override val uid: String) extends PredictionModel[Vector, MockPredictionModel] { def this() = this(Identifiable.randomUID("mockpredictormodel")) override def predict(features: Vector): Double = throw new NotImplementedError() override def copy(extra: ParamMap): MockPredictionModel = throw new NotImplementedError() } }
Example 81
Source File: SQLBuilderTest.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst import scala.util.control.NonFatal import org.apache.spark.sql.{DataFrame, Dataset, QueryTest} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.hive.test.TestHiveSingleton abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton { protected def checkSQL(e: Expression, expectedSQL: String): Unit = { val actualSQL = e.sql try { assert(actualSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following expression: | |${e.prettyName} | |$cause """.stripMargin) } } protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = { val generatedSQL = try new SQLBuilder(plan).toSQL catch { case NonFatal(e) => fail( s"""Cannot convert the following logical query plan to SQL: | |${plan.treeString} """.stripMargin) } try { assert(generatedSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following logical query plan: | |${plan.treeString} | |$cause """.stripMargin) } checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan)) } protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = { checkSQL(df.queryExecution.analyzed, expectedSQL) } }
Example 82
Source File: Aggregator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression

// Note: the abstract Aggregator[IN, BUF, OUT] class declaration and its other
// members (zero, reduce, merge, finish, the encoders) are elided in this
// listing; only toColumn is shown.

  def toColumn: TypedColumn[IN, OUT] = {
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    val expr = AggregateExpression(
      TypedAggregateExpression(this),
      Complete,
      isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
}
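A short sketch (not from the original file) of the public Aggregator API that toColumn belongs to: a typed average over a Dataset[Double]. The names DoubleAverage and DoubleAverageDemo are illustrative.

import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator

// A typed aggregator that averages Double values via a (sum, count) buffer.
object DoubleAverage extends Aggregator[Double, (Double, Long), Double] {
  def zero: (Double, Long) = (0.0, 0L)
  def reduce(buf: (Double, Long), x: Double): (Double, Long) = (buf._1 + x, buf._2 + 1)
  def merge(a: (Double, Long), b: (Double, Long)): (Double, Long) = (a._1 + b._1, a._2 + b._2)
  def finish(buf: (Double, Long)): Double = if (buf._2 == 0) 0.0 else buf._1 / buf._2
  def bufferEncoder: Encoder[(Double, Long)] =
    Encoders.tuple(Encoders.scalaDouble, Encoders.scalaLong)
  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

object DoubleAverageDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("aggregator-demo").getOrCreate()
    import spark.implicits._

    val ds = Seq(1.0, 2.0, 4.0).toDS()
    // toColumn (shown above) turns the aggregator into a TypedColumn usable with select.
    ds.select(DoubleAverage.toColumn.name("avg")).show()
    spark.stop()
  }
}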
Example 83
Source File: FrequentItems.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ object FrequentItems extends Logging { def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
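A minimal usage sketch (not from the original file) through the public entry point, df.stat.freqItems, which is backed by the singlePassFreqItems helper above; support is the minimum fraction of rows an item must appear in. The object name FreqItemsDemo and the local SparkSession are assumptions.

import org.apache.spark.sql.SparkSession

object FreqItemsDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("freq-items-demo").getOrCreate()
    import spark.implicits._

    val df = Seq(1, 1, 1, 2, 2, 3, 1, 2, 1, 4).map(i => (i, s"item$i")).toDF("id", "name")

    // Items appearing in at least 30% of rows; results land in id_freqItems / name_freqItems.
    df.stat.freqItems(Seq("id", "name"), 0.3).show(truncate = false)
    spark.stop()
  }
}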
Example 84
Source File: cache.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command

import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

case class CacheTableCommand(
    tableIdent: TableIdentifier,
    plan: Option[LogicalPlan],
    isLazy: Boolean) extends RunnableCommand {
  require(plan.isEmpty || tableIdent.database.isEmpty,
    "Database name is not allowed in CACHE TABLE AS SELECT")

  override protected def innerChildren: Seq[QueryPlan[_]] = {
    plan.toSeq
  }

  override def run(sparkSession: SparkSession): Seq[Row] = {
    plan.foreach { logicalPlan =>
      Dataset.ofRows(sparkSession, logicalPlan).createTempView(tableIdent.quotedString)
    }
    sparkSession.catalog.cacheTable(tableIdent.quotedString)

    if (!isLazy) {
      // Performs eager caching
      sparkSession.table(tableIdent).count()
    }

    Seq.empty[Row]
  }
}

case class UncacheTableCommand(
    tableIdent: TableIdentifier,
    ifExists: Boolean) extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val tableId = tableIdent.quotedString
    try {
      sparkSession.catalog.uncacheTable(tableId)
    } catch {
      case _: NoSuchTableException if ifExists => // don't throw
    }
    Seq.empty[Row]
  }
}

case object ClearCacheCommand extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    sparkSession.catalog.clearCache()
    Seq.empty[Row]
  }
}
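A minimal usage sketch (not from the original file): the SQL statements below are what get parsed into the commands above in this Spark version. The object name CacheCommandsDemo and the local SparkSession are assumptions.

import org.apache.spark.sql.SparkSession

object CacheCommandsDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("cache-commands-demo").getOrCreate()
    import spark.implicits._

    Seq((1, "a"), (2, "b")).toDF("id", "value").createOrReplaceTempView("source")

    // CACHE TABLE ... AS SELECT carries a plan and, since isLazy is false, caches eagerly.
    spark.sql("CACHE TABLE cached_source AS SELECT * FROM source")
    println(spark.catalog.isCached("cached_source"))   // true

    // These map to UncacheTableCommand and ClearCacheCommand respectively.
    spark.sql("UNCACHE TABLE cached_source")
    spark.sql("CLEAR CACHE")
    spark.stop()
  }
}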
Example 85
Source File: XGBoost.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import eleflow.uberdata.models.UberXGBOOSTModel import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.scala.DMatrix import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{ArrayType, DoubleType, StructField, StructType} import scala.reflect.ClassTag class XGBoost[I](override val uid: String, val models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))])( implicit kt: ClassTag[I], ord: Ordering[I] = null) extends ForecastBaseModel[XGBoostSmallModel[I]] with HasInputCol with HasOutputCol with DefaultParamsWritable with HasFeaturesCol with HasNFutures with HasGroupByCol { def this( models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))] )(implicit kt: ClassTag[I], ord: Ordering[I] ) = this(Identifiable.randomUID("xgboost"), models) override def transform(dataSet: Dataset[_]): DataFrame = { val schema = dataSet.schema val predSchema = transformSchema(schema) val joined = models.join(dataSet.rdd.map{case (r: Row) => (r.getAs[I]($(groupByCol).get), r)}) val predictions = joined.map { case (id, ((bestModel, metrics), row)) => val features = row.getAs[Array[org.apache.spark.ml.linalg.Vector]]( IUberdataForecastUtil.FEATURES_COL_NAME ) val label = DataTransformer.toFloat(row.getAs($(featuresCol))) val labelPoint = features.map { vec => val array = vec.toArray.map(_.toFloat) LabeledPoint(label, null, array) } val matrix = new DMatrix(labelPoint.toIterator) val (ownFeaturesPrediction, forecast) = bestModel.boosterInstance .predict(matrix) .flatMap(_.map(_.toDouble)) .splitAt(features.length) Row( row.toSeq :+ Vectors .dense(forecast) :+ SupportedAlgorithm.XGBoostAlgorithm.toString :+ bestModel.params .map(f => f._1 -> f._2.toString) :+ Vectors.dense(ownFeaturesPrediction): _* ) } dataSet.sqlContext.createDataFrame(predictions, predSchema) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { schema.add(StructField($(outputCol), ArrayType(DoubleType))) } override def copy(extra: ParamMap): XGBoostSmallModel[I] = defaultCopy(extra) }
Example 86
Source File: TimeSeriesGenerator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasGroupByCol import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, StructType} import scala.reflect.ClassTag def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataSet: Dataset[_]): DataFrame = { val rdd = dataSet.rdd val sparkContext = dataSet.sqlContext.sparkContext val index = sparkContext.broadcast(dataSet.schema.fieldIndex($(timeCol).get)) val labelColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(groupByCol).get)) val featuresColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(featuresCol))) val grouped = rdd.map { case (row: Row) => val timeColRow = IUberdataForecastUtil.convertColumnToLong(row, index.value) convertColumnToDouble(timeColRow, featuresColIndex) }.groupBy { row => row.getAs[L](labelColIndex.value) }.map { case (key, values) => val toBeUsed = values.toArray.sortBy(row => row.getAs[Long](index.value)) (key, toBeUsed) } val toBeTrained = grouped.map { case (key, values) => org.apache.spark.sql.Row( key, Vectors.dense(values.map(_.getAs[Double](featuresColIndex.value))) ) } val trainSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(toBeTrained, trainSchema) } override def transformSchema(schema: StructType): StructType = { val labelIndex = schema.fieldIndex($(groupByCol).get) StructType( Seq( schema.fields(labelIndex), StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT) ) ) } override def copy(extra: ParamMap): TimeSeriesGenerator[L] = defaultCopy(extra) } object TimeSeriesGenerator extends DefaultParamsReadable[TimeSeriesGenerator[_]] { override def load(path: String): TimeSeriesGenerator[_] = super.load(path) }
Example 87
Source File: XGBoostBigModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.UberXGBoostModel import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.spark.XGBoostModel import ml.dmlc.xgboost4j.LabeledPoint import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.feature.{LabeledPoint => SparkLabeledPoint} import org.apache.spark.ml.param.shared.{HasIdCol, HasLabelCol} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, _} class XGBoostBigModel[I](val uid: String, val models: Seq[(ParamMap, XGBoostModel)]) extends ForecastBaseModel[XGBoostBigModel[I]] with HasLabelCol with HasIdCol { def setLabelcol(label: String): this.type = set(labelCol, label) def setIdcol(id: String): this.type = set(idCol, id) override def copy(extra: ParamMap): XGBoostBigModel[I] = new XGBoostBigModel[I](uid, models) override def transform(dataSet: Dataset[_]): DataFrame = { val prediction = predict(dataSet) val rows = dataSet.rdd .map { case (row: Row) => (DataTransformer.toFloat(row.getAs($(idCol))), row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME) ) } .join(prediction) .map { case (id, (features, predictValue)) => Row(id, features, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue) } dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema)) } protected def predict(dataSet: Dataset[_]) = { val features = dataSet.rdd.map { case (row: Row) => val features = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME) val id = row.getAs[I]($(idCol)) SparkLabeledPoint(DataTransformer.toFloat(id), features) }.cache val (_, model) = models.head UberXGBoostModel.labelPredict(features.map(_.features.toDense), booster = model) } @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(getPredictionSchema) protected def getPredictionSchema: Array[StructField] = { Array( StructField($(idCol), FloatType), StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT), StructField(IUberdataForecastUtil.ALGORITHM, StringType), StructField("prediction", FloatType) ) } }
Example 88
Source File: ArimaBestModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import com.cloudera.sparkts.models.TimeSeriesModel
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType

class ArimaBestModel[L, M <: TimeSeriesModel](
  override val uid: String,
  val bestPrediction: RDD[(L, M)],
  val validationMetrics: RDD[(L, Seq[ModelParamEvaluation[L]])]
) extends Model[ArimaBestModel[L, M]]
  with TimeSeriesBestModelFinderParam[L] {

  // TODO: evaluate whether this is still needed
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    dataset.toDF()
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): ArimaBestModel[L, M] = {
    val copied = new ArimaBestModel[L, M](uid, bestPrediction, validationMetrics)
    copyValues(copied, extra)
  }
}
Example 89
Source File: MovingAverage.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.ml.param.{IntParam, ParamMap} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types._ def setOutputCol(value: String): this.type = set(outputCol, value) setDefault(windowSize -> 3) override def transform(dataSet: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataSet.schema) val sparkContext = dataSet.sqlContext.sparkContext val inputType = outputSchema($(inputCol)).dataType val inputTypeBr = sparkContext.broadcast(inputType) val dataSetRdd = dataSet.rdd val inputColName = sparkContext.broadcast($(inputCol)) val inputColIndex = dataSet.columns.indexOf($(inputCol)) val inputColIndexBr = sparkContext.broadcast(inputColIndex) val windowSizeBr = sparkContext.broadcast($(windowSize)) val maRdd = dataSetRdd.map { case (row: Row) => val (array, rawValue) = if (inputTypeBr.value.isInstanceOf[VectorUDT]) { val vector = row.getAs[org.apache.spark.ml.linalg.Vector](inputColName.value) (vector.toArray, Vectors.dense(vector.toArray.drop(windowSizeBr.value - 1))) } else { val iterable = row.getAs[Iterable[Double]](inputColName.value) (iterable.toArray, Vectors.dense(iterable.toArray.drop(windowSizeBr.value - 1))) } val (before, after) = row.toSeq.splitAt(inputColIndexBr.value) Row( (before :+ rawValue) ++ after.tail :+ MovingAverageCalc .simpleMovingAverageArray(array, windowSizeBr.value): _* ) } dataSet.sqlContext.createDataFrame(maRdd, outputSchema) } override def transformSchema(schema: StructType): StructType = { schema.add(StructField($(outputCol), ArrayType(DoubleType))) } override def copy(extra: ParamMap): MovingAverage[T] = defaultCopy(extra) } object MovingAverageCalc { private[ml] def simpleMovingAverageArray(values: Array[Double], period: Int): Array[Double] = { (for (i <- 1 to values.length) yield //TODO rollback this comment with the right size of features to make the meanaverage return // the features values for the first values of the calc if (i < period) 0d //values(i) else values.slice(i - period, i).sum / period).toArray.dropWhile(_ == 0d) } } object MovingAverage extends DefaultParamsReadable[MovingAverage[_]] { override def load(path: String): MovingAverage[_] = super.load(path) }
Example 90
Source File: VectorizeEncoder.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.core.data.DataTransformer import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, StructType} class VectorizeEncoder(override val uid: String) extends Transformer with HasIdCol with HasTimeCol with HasInputCols with HasLabelCol with HasGroupByCol with HasOutputCol with DefaultParamsWritable { def this() = this(Identifiable.randomUID("vectorizer")) def setIdCol(input: String) = set(idCol, input) def setLabelCol(input: String) = set(labelCol, input) def setGroupByCol(toGroupBy: String) = set(groupByCol, Some(toGroupBy)) def setInputCol(input: Array[String]) = set(inputCols, input) def setTimeCol(time: String) = set(timeCol, Some(time)) def setOutputCol(output: String) = set(outputCol, output) override def transform(dataSet: Dataset[_]): DataFrame = { val context = dataSet.sqlContext.sparkContext val input = context.broadcast($(inputCols)) val allColumnNames = dataSet.schema.map(_.name) val nonInputColumnIndexes = context.broadcast( allColumnNames.zipWithIndex.filter( f => !$(inputCols).contains(f._1) || f._1 == $(groupByCol).get || f._1 == $(idCol) || f._1 == $(timeCol).getOrElse(""))) val result = dataSet.rdd.map { case (row: Row) => val rowSeq = row.toSeq val nonInputColumns = nonInputColumnIndexes.value.map { case (_, index) => rowSeq(index) } val size = input.value.length val (values, indices) = input.value .filter(col => row.getAs(col) != null) .map { column => DataTransformer.toDouble(row.getAs(column)) } .zipWithIndex .filter(f => f._1 != 0d) .unzip Row( nonInputColumns :+ org.apache.spark.ml.linalg.Vectors .sparse(size, indices.toArray, values.toArray): _* ) } val newSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(result, newSchema) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType( schema.filter( col => !$(inputCols).contains(col.name) || col.name == $(groupByCol).getOrElse("") || col.name == $(idCol) || col.name == $(labelCol) || col.name == $(timeCol).getOrElse("") ) ).add(StructField($(outputCol), new VectorUDT)) }
Example 91
Source File: AllColumnsTimeSeriesGenerator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import scala.reflect.ClassTag def setOutputCol(value: String): this.type = set(outputCol, value) // override def transform(dataSet: DataFrame): DataFrame = { override def transform(dataSet: Dataset[_] ): DataFrame = { val rdd = dataSet.rdd val sparkContext = dataSet.sqlContext.sparkContext val labelColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(labelCol))) val keyValueDataSet = rdd.map { case (row: Row) => Row( row.getAs[T](labelColIndex.value), row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol)) ) } val trainSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(keyValueDataSet, trainSchema) } override def transformSchema(schema: StructType): StructType = { StructType( schema.filter(_.name == $(labelCol)).head +: Seq( StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT) ) ) } override def copy(extra: ParamMap): AllColumnsTimeSeriesGenerator[T, U] = defaultCopy(extra) } object AllColumnsTimeSeriesGenerator extends DefaultParamsReadable[AllColumnsTimeSeriesGenerator[_, _]] { override def load(path: String): AllColumnsTimeSeriesGenerator[_, _] = super.load(path) }
Example 92
Source File: HoltWintersEstimator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import com.cloudera.sparkts.models.TimeSeriesModel
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.Dataset

class HoltWintersBestModel[T, M <: TimeSeriesModel](
  override val uid: String,
  val bestPrediction: RDD[(T, M)],
  val validationMetrics: RDD[(T, ModelParamEvaluation[T])]
) extends Model[HoltWintersBestModel[T, M]]
  with TimeSeriesBestModelFinderParam[T] {

  // TODO look for this method usage to see if it can be removed
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    dataset.toDF()
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): HoltWintersBestModel[T, M] = {
    val copied = new HoltWintersBestModel[T, M](uid, bestPrediction, validationMetrics)
    copyValues(copied, extra)
  }
}
Example 93
Source File: XGBoostBigModelTimeSeries.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import java.sql.Timestamp import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.spark.XGBoostModel import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasTimeCol import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, _} class XGBoostBigModelTimeSeries[I](override val uid: String, override val models: Seq[(ParamMap, XGBoostModel)]) extends XGBoostBigModel[I](uid, models) with HasTimeCol{ def setTimecol(time: String): this.type = set(timeCol, Some(time)) override def transform(dataSet: Dataset[_]): DataFrame = { val prediction = predict(dataSet) val rows = dataSet.rdd .map { case (row: Row) => (DataTransformer.toFloat(row.getAs($(idCol))), (row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME), row.getAs[java.sql.Timestamp]($(timeCol).get))) } .join(prediction) .map { case (id, ((features, time), predictValue)) => Row(id, features, time, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue) } dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema)) } @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(Array( StructField($(idCol), FloatType), StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT), StructField($(timeCol).get, TimestampType), StructField(IUberdataForecastUtil.ALGORITHM, StringType), StructField("prediction", FloatType) ) ) }
Example 94
Source File: HoltWintersBestModelFinder.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.UberHoltWintersModel import org.apache.spark.ml.evaluation.TimeSeriesEvaluator import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasGroupByCol import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import scala.reflect.ClassTag class HoltWintersBestModelFinder[G]( override val uid: String )(implicit kt: ClassTag[G]) extends HoltWintersBestModelEvaluation[G, HoltWintersModel[G]] with DefaultParamsWritable with HasGroupByCol with TimeSeriesBestModelFinder { def setTimeSeriesEvaluator(eval: TimeSeriesEvaluator[G]): this.type = set(timeSeriesEvaluator, eval) def setEstimatorParamMaps(value: Array[ParamMap]): this.type = set(estimatorParamMaps, value) def setNFutures(value: Int): this.type = set(nFutures, value) override def setValidationCol(value: String): this.type = set(validationCol, value) def setLabelCol(label: String): this.type = set(labelCol, label) def setGroupByCol(groupBy: String): this.type = set(groupByCol, Some(groupBy)) def this()(implicit kt: ClassTag[G]) = this(Identifiable.randomUID("arima")) def modelEvaluation( idModels: RDD[(G, Row, Option[UberHoltWintersModel])] ): RDD[(G, (UberHoltWintersModel, ModelParamEvaluation[G]))] = { val eval = $(timeSeriesEvaluator) val broadcastEvaluator = idModels.context.broadcast(eval) idModels.filter(_._3.isDefined).map { case (id, row, models) => val evaluatedModels = models.map { model => holtWintersEvaluation(row, model, broadcastEvaluator, id) }.head log.warn(s"best model reach ${evaluatedModels._2.metricResult}") (id, evaluatedModels) } } override protected def train(dataSet: Dataset[_]): HoltWintersModel[G] = { val splitDs = split(dataSet, $(nFutures)) val idModels = splitDs.rdd.map(train) new HoltWintersModel[G](uid, modelEvaluation(idModels)) .setValidationCol($(validationCol)) .asInstanceOf[HoltWintersModel[G]] } def train(row: Row): (G, Row, Option[UberHoltWintersModel]) = { val id = row.getAs[G]($(groupByCol).get) val result = try { val dense = row.getAs[org.apache.spark.ml.linalg.DenseVector]($(featuresCol)) val ts:org.apache.spark.mllib.linalg.Vector = org.apache.spark.mllib.linalg.Vectors.dense(dense.toArray); Some( UberHoltWintersModel.fitModelWithBOBYQA(ts, $(nFutures)) ) } catch { case e: Exception => log.error( s"Got the following Exception ${e.getLocalizedMessage} in id $id" ) None } (id, row, result) } } object HoltWintersBestModelFinder extends DefaultParamsReadable[HoltWintersBestModelFinder[_]] { override def load(path: String): HoltWintersBestModelFinder[_] = super.load(path) }
Example 95
Source File: SparkTest.scala From spark-records with Apache License 2.0 | 5 votes |
package examples.fancy_numbers import com.swoop.spark.records._ import com.swoop.spark.test.SparkSqlSpec import org.apache.spark.sql.Dataset import org.apache.spark.storage.StorageLevel class SparkTest extends ExampleSpec with SparkSqlSpec with TestNegative5To100 { lazy val dc = SimpleDriverContext(sc) lazy val jc = dc.jobContext(SimpleJobContext) lazy val ds = recordsDataset(-5 to 100, jc) lazy val records = ds.collect "in an integration test" - { implicit val env = FlatRecordEnvironment() val sqlContext = sqlc import sqlContext.implicits._ behave like fancyRecordBuilder(records, jc) "should build records with Spark" in { ds.count should be(105) } "should filter error records" in { ds.errorRecords.count should be(6) } "should extract data from records" in { ds.recordData.count should be(99) } "should extract issues" in { ds.allIssues.count should be(8) ds.errorIssues.count should be(6) } "should demonstrate issueCounts() output" in { ds.issueCounts.show(false) } "should demonstrate errorIssueCounts() output" in { ds.errorIssueCounts.show(false) } "should demonstrate messageCounts() output" in { ds.messageCounts.show(false) } "should demonstrate errorMessageCounts() output" in { ds.errorMessageCounts.show(false) } "should demonstrate errorDetailCounts() output" in { ds.errorIssues.errorDetailCounts().show } "should demonstrate unknownErrorDetailCounts() output" in { ds.errorIssues.unknownErrorDetailCounts("examples.fancy_numbers").show } "should demonstrate errorDetails() output" in { ds.errorIssues.errorDetails().show } "should demonstrate unknownErrorDetails() output" in { ds.errorIssues.unknownErrorDetails("examples.fancy_numbers").show } } def recordsDataset(numbers: Seq[Int], jc: JobContext): Dataset[FancyNumberRecord] = { val sqlContext = sqlc import sqlContext.implicits._ sqlc.createDataset(numbers) .mapPartitions(inputs => Example.buildRecords(inputs, jc)) .persist(StorageLevel.MEMORY_ONLY) } }
Example 96
Source File: MergeProjection.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.mutation.merge import java.sql.{Date, Timestamp} import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, GenericRowWithSchema, InterpretedMutableProjection, Projection} import org.apache.spark.sql.catalyst.util.DateTimeUtils case class MergeProjection( @transient tableCols: Seq[String], @transient statusCol : String, @transient ds: Dataset[Row], @transient rltn: CarbonDatasourceHadoopRelation, @transient sparkSession: SparkSession, @transient mergeAction: MergeAction) { private val cutOffDate = Integer.MAX_VALUE >> 1 val isUpdate = mergeAction.isInstanceOf[UpdateAction] val isDelete = mergeAction.isInstanceOf[DeleteAction] def apply(row: GenericRowWithSchema): InternalRow = { // TODO we can avoid these multiple conversions if this is added as a SparkPlan node. val values = row.values.map { case s: String => org.apache.spark.unsafe.types.UTF8String.fromString(s) case d: java.math.BigDecimal => org.apache.spark.sql.types.Decimal.apply(d) case b: Array[Byte] => org.apache.spark.unsafe.types.UTF8String.fromBytes(b) case d: Date => DateTimeUtils.fromJavaDate(d) case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t) case value => value } projection(new GenericInternalRow(values)).asInstanceOf[GenericInternalRow] } val (projection, output) = generateProjection private def generateProjection: (Projection, Array[Expression]) = { val existingDsOutput = rltn.carbonRelation.schema.toAttributes val colsMap = mergeAction match { case UpdateAction(updateMap) => updateMap case InsertAction(insertMap) => insertMap case _ => null } if (colsMap != null) { val output = new Array[Expression](tableCols.length) val expecOutput = new Array[Expression](tableCols.length) colsMap.foreach { case (k, v) => val tableIndex = tableCols.indexOf(k.toString().toLowerCase) if (tableIndex < 0) { throw new CarbonMergeDataSetException(s"Mapping is wrong $colsMap") } output(tableIndex) = v.expr.transform { case a: Attribute if !a.resolved => ds.queryExecution.analyzed.resolveQuoted(a.name, sparkSession.sessionState.analyzer.resolver).get } expecOutput(tableIndex) = existingDsOutput.find(_.name.equalsIgnoreCase(tableCols(tableIndex))).get } if (output.contains(null)) { throw new CarbonMergeDataSetException(s"Not all columns are mapped") } (new InterpretedMutableProjection(output++Seq( ds.queryExecution.analyzed.resolveQuoted(statusCol, sparkSession.sessionState.analyzer.resolver).get), ds.queryExecution.analyzed.output), expecOutput) } else { (null, null) } } }
Example 97
Source File: DeltaLoad.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.algo import com.adidas.analytics.algo.DeltaLoad._ import com.adidas.analytics.algo.core.Algorithm import com.adidas.analytics.algo.shared.DateComponentDerivation import com.adidas.analytics.config.DeltaLoadConfiguration.PartitionedDeltaLoadConfiguration import com.adidas.analytics.util.DataFrameUtils._ import com.adidas.analytics.util._ import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.storage.StorageLevel import org.slf4j.{Logger, LoggerFactory} private def getUpsertRecords(deltaRecords: Dataset[Row], resultColumns: Seq[String]): Dataset[Row] = { // Create partition window - Partitioning by delta records logical key (i.e. technical key of active records) val partitionWindow = Window .partitionBy(businessKey.map(col): _*) .orderBy(technicalKey.map(component => col(component).desc): _*) // Ranking & projection val rankedDeltaRecords = deltaRecords .withColumn(rankingColumnName, row_number().over(partitionWindow)) .filter(upsertRecordsModesFilterFunction) rankedDeltaRecords .filter(rankedDeltaRecords(rankingColumnName) === 1) .selectExpr(resultColumns: _*) } protected def withDatePartitions(spark: SparkSession, dfs: DFSWrapper, dataFrames: Vector[DataFrame]): Vector[DataFrame] = { logger.info("Adding partitioning information if needed") try { dataFrames.map { df => if (df.columns.toSeq.intersect(targetPartitions) != targetPartitions){ df.transform(withDateComponents(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions)) } else df } } catch { case e: Throwable => logger.error("Cannot add partitioning information for data frames.", e) //TODO: Handle failure case properly throw new RuntimeException("Unable to transform data frames.", e) } } } object DeltaLoad { private val logger: Logger = LoggerFactory.getLogger(getClass) def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): DeltaLoad = { new DeltaLoad(spark, dfs, configLocation) } }
Example 98
Source File: PartitionHelpers.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.algo.core

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}

trait PartitionHelpers {

  protected def getDistinctPartitions(outputDataFrame: DataFrame,
                                      targetPartitions: Seq[String]): Dataset[Row] = {
    val targetPartitionsColumns: Seq[Column] =
      targetPartitions.map(partitionString => col(partitionString))
    outputDataFrame.select(targetPartitionsColumns: _*).distinct
  }

  protected def getParameterValue(row: Row, partitionString: String): String =
    createParameterValue(row.get(row.fieldIndex(partitionString)))

  protected def createParameterValue(partitionRawValue: Any): String =
    partitionRawValue match {
      case value: java.lang.Short => value.toString
      case value: java.lang.Integer => value.toString
      case value: scala.Predef.String => "'" + value + "'"
      case null =>
        throw new Exception("Partition Value is null. No support for null partitions!")
      case value =>
        throw new Exception("Unsupported partition DataType: " + value.getClass)
    }
}
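A hypothetical consumer sketch (not from the original project): an object mixes in the trait, collects the distinct partition rows, and renders partition specs with the quoting rules from createParameterValue. The object name PartitionHelpersDemo and the toy data are assumptions.

import org.apache.spark.sql.{DataFrame, SparkSession}

object PartitionHelpersDemo extends PartitionHelpers {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("partition-helpers-demo").getOrCreate()
    import spark.implicits._

    val df: DataFrame = Seq(
      (2018, 1, 1, "a"),
      (2018, 1, 1, "b"),
      (2018, 1, 2, "c")
    ).toDF("year", "month", "day", "payload")

    // Two distinct (year, month, day) combinations survive the distinct.
    val partitions = getDistinctPartitions(df, Seq("year", "month", "day"))
    partitions.collect().foreach { row =>
      val spec = Seq("year", "month", "day")
        .map(c => s"$c=${getParameterValue(row, c)}")
        .mkString("/")
      println(spec)   // e.g. year=2018/month=1/day=1
    }
    spark.stop()
  }
}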
Example 99
Source File: RecoverPartitionsNativeIntegrationTest.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.integration import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} import org.scalatest.FeatureSpec import org.scalatest.Matchers._ import scala.collection.JavaConverters._ class RecoverPartitionsNativeIntegrationTest extends FeatureSpec with BaseIntegrationTest { feature("Partitions can be updated with native spark.recoverPartitions()") { scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) uploadParameters(testResourceDir) // checking pre-conditions spark.read.csv(sourceDirPath.toString).count() shouldBe 7 targetTable.read().count() shouldBe 19 fs.exists(targetPath20180101) shouldBe false fs.exists(headerPath20180101) shouldBe false // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() val producedPartitionsNumber: Dataset[String] = spark .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") .as(Encoders.STRING) // MetaData Specific Tests val expectedPartitions: Dataset[String] = expectedPartitionsDataReader .read(spark, expectedPartitionsLocation) .as(Encoders.STRING) expectedPartitions.collectAsList().asScala.sorted.toSet should equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false spark .sql(s"DESCRIBE extended ${targetDatabase}.${tableName} PARTITION(year=2018,month=1,day=1)") .filter("col_name == 'Partition Statistics'") .head() .getAs[String]("data_type").contains("6 rows") shouldBe true fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } }
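The partition assertion in this test (and in the near-identical tests that follow) boils down to reading SHOW PARTITIONS into a typed Dataset[String] and comparing it with an expected set. A rough sketch of just that step, with placeholder database and table names:

import org.apache.spark.sql.{Dataset, Encoders, SparkSession}

object PartitionCheckSketch {
  def listPartitions(spark: SparkSession, database: String, table: String): Set[String] = {
    val partitions: Dataset[String] = spark
      .sql(s"SHOW PARTITIONS $database.$table")
      .as(Encoders.STRING)
    partitions.collect().toSet // e.g. Set("year=2018/month=1/day=1", ...)
  }
}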
Example 100
Source File: SparkRecoverPartitionsNativeIntegrationTest.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.integration import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} import org.scalatest.FeatureSpec import org.scalatest.Matchers._ import scala.collection.JavaConverters._ class SparkRecoverPartitionsNativeIntegrationTest extends FeatureSpec with BaseIntegrationTest { feature("Partitions can be updated with native spark.recoverPartitions()") { scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) uploadParameters(testResourceDir) // checking pre-conditions spark.read.csv(sourceDirPath.toString).count() shouldBe 7 targetTable.read().count() shouldBe 19 fs.exists(targetPath20180101) shouldBe false fs.exists(headerPath20180101) shouldBe false // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() val producedPartitionsNumber: Dataset[String] = spark .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") .as(Encoders.STRING) // MetaData Specific Tests val expectedPartitions: Dataset[String] = expectedPartitionsDataReader .read(spark, expectedPartitionsLocation) .as(Encoders.STRING) expectedPartitions.collectAsList().asScala.sorted.toSet should equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } }
Example 101
Source File: SparkRecoverPartitionsCustomIntegrationTest.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.integration import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} import org.scalatest.FeatureSpec import org.scalatest.Matchers._ import scala.collection.JavaConverters._ class SparkRecoverPartitionsCustomIntegrationTest extends FeatureSpec with BaseIntegrationTest { feature("Partitions can be updated programmatically using custom logic") { scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) uploadParameters(testResourceDir) // checking pre-conditions spark.read.csv(sourceDirPath.toString).count() shouldBe 7 targetTable.read().count() shouldBe 19 fs.exists(targetPath20180101) shouldBe false fs.exists(headerPath20180101) shouldBe false // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() val producedPartitionsNumber: Dataset[String] = spark .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") .as(Encoders.STRING) // MetaData Specific Tests val expectedPartitions: Dataset[String] = expectedPartitionsDataReader .read(spark, expectedPartitionsLocation) .as(Encoders.STRING) expectedPartitions.collectAsList().asScala.sorted.toSet should equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } }
Example 102
Source File: RecoverPartitionsCustomIntegrationTest.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.integration import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} import org.scalatest.FeatureSpec import org.scalatest.Matchers._ import scala.collection.JavaConverters._ class RecoverPartitionsCustomIntegrationTest extends FeatureSpec with BaseIntegrationTest { feature("Partitions can be updated programmatically using custom logic") { scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) uploadParameters(testResourceDir) // checking pre-conditions spark.read.csv(sourceDirPath.toString).count() shouldBe 7 targetTable.read().count() shouldBe 19 fs.exists(targetPath20180101) shouldBe false fs.exists(headerPath20180101) shouldBe false // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() val producedPartitionsNumber: Dataset[String] = spark .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") .as(Encoders.STRING) // MetaData Specific Tests val expectedPartitions: Dataset[String] = expectedPartitionsDataReader .read(spark, expectedPartitionsLocation) .as(Encoders.STRING) expectedPartitions.collectAsList().asScala.sorted.toSet should equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false spark .sql(s"DESCRIBE extended ${targetDatabase}.${tableName} PARTITION(year=2018,month=1,day=1)") .filter("col_name == 'Partition Statistics'") .head() .getAs[String]("data_type").contains("6 rows") shouldBe true fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } }
Example 103
Source File: SparkRecoverPartitionsCustomTest.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.unit import com.adidas.analytics.util.SparkRecoverPartitionsCustom import com.adidas.utils.SparkSessionWrapper import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.apache.spark.sql.{Dataset, Row} import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers, PrivateMethodTester} import scala.collection.JavaConverters._ class SparkRecoverPartitionsCustomTest extends FunSuite with SparkSessionWrapper with PrivateMethodTester with Matchers with BeforeAndAfterAll{ test("test conversion of String Value to HiveQL Partition Parameter") { val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) val result = customSparkRecoverPartitions invokePrivate createParameterValue("theValue") result should be("'theValue'") } test("test conversion of Short Value to HiveQL Partition Parameter") { val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Short.valueOf("2")) result should be("2") } test("test conversion of Integer Value to HiveQL Partition Parameter") { val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Integer.valueOf("4")) result should be("4") } test("test conversion of null Value to HiveQL Partition Parameter") { val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) an [Exception] should be thrownBy { customSparkRecoverPartitions invokePrivate createParameterValue(null) } } test("test conversion of not supported Value to HiveQL Partition Parameter") { val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) an [Exception] should be thrownBy { customSparkRecoverPartitions invokePrivate createParameterValue(false) } } test("test HiveQL statements Generation") { val customSparkRecoverPartitions = SparkRecoverPartitionsCustom( tableName="test", targetPartitions = Seq("country","district") ) val rowsInput = Seq( Row(1, "portugal", "porto"), Row(2, "germany", "herzogenaurach"), Row(3, "portugal", "coimbra") ) val inputSchema = StructType( List( StructField("number", IntegerType, nullable = true), StructField("country", StringType, nullable = true), StructField("district", StringType, nullable = true) ) ) val expectedStatements: Seq[String] = Seq( "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='porto')", "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='germany',district='herzogenaurach')", "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='coimbra')" ) val testDataset: Dataset[Row] = spark.createDataset(rowsInput)(RowEncoder(inputSchema)) val createParameterValue = PrivateMethod[Dataset[String]]('generateAddPartitionStatements) val producedStatements: Seq[String] = (customSparkRecoverPartitions invokePrivate createParameterValue(testDataset)) 
.collectAsList() .asScala expectedStatements.sorted.toSet should equal(producedStatements.sorted.toSet) } override def afterAll(): Unit = { spark.stop() } }
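The private generateAddPartitionStatements method exercised by this test is not shown in the snippet, but the expected output makes the idea clear: emit one ALTER TABLE ... ADD IF NOT EXISTS PARTITION statement per distinct combination of partition values. A simplified, hedged reconstruction (unlike the original, this version quotes every value as a string rather than only string-typed values):

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.col

object AddPartitionStatementsSketch {
  def statements(tableName: String, targetPartitions: Seq[String], df: DataFrame): Dataset[String] = {
    import df.sparkSession.implicits._
    df.select(targetPartitions.map(col): _*)
      .distinct()
      .map { row =>
        val spec = targetPartitions
          .map(p => s"$p='${row.get(row.fieldIndex(p))}'")
          .mkString(",")
        s"ALTER TABLE $tableName ADD IF NOT EXISTS PARTITION($spec)"
      }
  }
}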
Example 104
Source File: RecoverPartitionsCustomTest.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.unit import com.adidas.analytics.util.RecoverPartitionsCustom import com.adidas.utils.SparkSessionWrapper import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.apache.spark.sql.{Dataset, Row} import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers, PrivateMethodTester} import scala.collection.JavaConverters._ class RecoverPartitionsCustomTest extends FunSuite with SparkSessionWrapper with PrivateMethodTester with Matchers with BeforeAndAfterAll{ test("test conversion of String Value to HiveQL Partition Parameter") { val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) val result = customSparkRecoverPartitions invokePrivate createParameterValue("theValue") result should be("'theValue'") } test("test conversion of Short Value to HiveQL Partition Parameter") { val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Short.valueOf("2")) result should be("2") } test("test conversion of Integer Value to HiveQL Partition Parameter") { val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Integer.valueOf("4")) result should be("4") } test("test conversion of null Value to HiveQL Partition Parameter") { val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) an [Exception] should be thrownBy { customSparkRecoverPartitions invokePrivate createParameterValue(null) } } test("test conversion of not supported Value to HiveQL Partition Parameter") { val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) an [Exception] should be thrownBy { customSparkRecoverPartitions invokePrivate createParameterValue(false) } } test("test HiveQL statements Generation") { val customSparkRecoverPartitions = RecoverPartitionsCustom( tableName="test", targetPartitions = Seq("country","district") ) val rowsInput = Seq( Row(1, "portugal", "porto"), Row(2, "germany", "herzogenaurach"), Row(3, "portugal", "coimbra") ) val inputSchema = StructType( List( StructField("number", IntegerType, nullable = true), StructField("country", StringType, nullable = true), StructField("district", StringType, nullable = true) ) ) val expectedStatements: Seq[String] = Seq( "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='porto')", "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='germany',district='herzogenaurach')", "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='coimbra')" ) val testDataset: Dataset[Row] = spark.createDataset(rowsInput)(RowEncoder(inputSchema)) val createParameterValue = PrivateMethod[Dataset[String]]('generateAddPartitionStatements) val producedStatements: Seq[String] = (customSparkRecoverPartitions invokePrivate createParameterValue(testDataset)) .collectAsList() .asScala 
expectedStatements.sorted.toSet should equal(producedStatements.sorted.toSet) } override def afterAll(): Unit = { spark.stop() } }
Example 105
Source File: SqsSource.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.sqs import java.net.URI import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import org.apache.spark.sql.execution.datasources.{DataSource, LogicalRelation} import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.FileStreamSource._ import org.apache.spark.sql.types.StructType class SqsSource(sparkSession: SparkSession, metadataPath: String, options: Map[String, String], override val schema: StructType) extends Source with Logging { private val sourceOptions = new SqsSourceOptions(options) private val hadoopConf = sparkSession.sessionState.newHadoopConf() private val metadataLog = new FileStreamSourceLog(FileStreamSourceLog.VERSION, sparkSession, metadataPath) private var metadataLogCurrentOffset = metadataLog.getLatest().map(_._1).getOrElse(-1L) private val maxFilesPerTrigger = sourceOptions.maxFilesPerTrigger private val maxFileAgeMs: Long = sourceOptions.maxFileAgeMs private val fileFormatClassName = sourceOptions.fileFormatClassName private val shouldSortFiles = sourceOptions.shouldSortFiles private val sqsClient = new SqsClient(sourceOptions, hadoopConf) metadataLog.allFiles().foreach { entry => sqsClient.sqsFileCache.add(entry.path, MessageDescription(entry.timestamp, true, "")) } sqsClient.sqsFileCache.purge() logInfo(s"maxFilesPerBatch = $maxFilesPerTrigger, maxFileAgeMs = $maxFileAgeMs") val batchFiles = sqsClient.sqsFileCache.getUncommittedFiles(maxFilesPerTrigger, shouldSortFiles) if (batchFiles.nonEmpty) { metadataLogCurrentOffset += 1 metadataLog.add(metadataLogCurrentOffset, batchFiles.map { case (path, timestamp, receiptHandle) => FileEntry(path = path, timestamp = timestamp, batchId = metadataLogCurrentOffset) }.toArray) logInfo(s"Log offset set to $metadataLogCurrentOffset with ${batchFiles.size} new files") val messageReceiptHandles = batchFiles.map { case (path, timestamp, receiptHandle) => sqsClient.sqsFileCache.markCommitted(path) logDebug(s"New file: $path") receiptHandle }.toList sqsClient.addToDeleteMessageQueue(messageReceiptHandles) } val numPurged = sqsClient.sqsFileCache.purge() if (!sqsClient.deleteMessageQueue.isEmpty) { sqsClient.deleteMessagesFromQueue() } logTrace( s""" |Number of files selected for batch = ${batchFiles.size} |Number of files purged from tracking map = $numPurged """.stripMargin) FileStreamSourceOffset(metadataLogCurrentOffset) } override def getOffset: Option[Offset] = Some(fetchMaxOffset()).filterNot(_.logOffset == -1) override def commit(end: Offset): Unit = { // No-op for now; SqsSource currently garbage-collects files based on timestamp // and the value of the maxFileAge parameter. } override def stop(): Unit = { if (!sqsClient.sqsScheduler.isTerminated) { sqsClient.sqsScheduler.shutdownNow() } } override def toString: String = s"SqsSource[${sqsClient.sqsUrl}]" }
Example 106
Source File: Cleaner.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions.clean import com.hankcs.hanlp.HanLP import config.paramconf.{HasOutputCol, HasInputCol} import functions.MySchemaUtils import functions.clean.chinese.BCConvert import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, Dataset} setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1) override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema, logging = true) val cleanFunc = udf {line: String => var cleaned = "" getFanJian match { case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line) case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line) case _ => cleaned = line } getQuanBan match { case "q2b" => cleaned = BCConvert.qj2bj(cleaned) case "b2q" => cleaned = BCConvert.bj2qj(cleaned) case _ => cleaned = cleaned } cleaned } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record => val outputIndex = record.fieldIndex($(outputCol)) record.getString(outputIndex).length >= getMinLineLen } } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.typeName.equals(StringType.typeName), s"Input type must be StringType but got $inputType.") MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable) } } object Cleaner extends DefaultParamsReadable[Cleaner] { override def load(path: String): Cleaner = super.load(path) }
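Cleaner wraps its normalisation logic (HanLP simplified/traditional conversion, full-width/half-width conversion, minimum line length) in a udf and appends the result as a new column. The general shape of that pattern in plain Spark SQL, without the HanLP/BCConvert dependencies and without metadata propagation, is sketched below; the cleaning function here is deliberately trivial.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, length, udf}

object CleanColumnSketch {
  def withCleanedColumn(df: DataFrame, inputCol: String, outputCol: String, minLineLen: Int): DataFrame = {
    // Placeholder cleaning logic; the real Cleaner delegates to HanLP and BCConvert.
    val cleanFunc = udf { line: String => Option(line).map(_.trim.toLowerCase).getOrElse("") }

    df.select(col("*"), cleanFunc(col(inputCol)).as(outputCol))
      .filter(length(col(outputCol)) >= minLineLen)
  }
}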
Example 107
Source File: CUDAUtils.scala From GPUEnabler with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.gpuenabler import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.StructType import org.apache.spark.internal.Logging object CUDAUtils { def DS = Dataset type _ds[T] = Dataset[T] def getLogicalPlan[T](ds: Dataset[T]) = { ds.logicalPlan } def getAttributes(st: StructType) = { st.toAttributes } type _Logging = Logging def md5HashObj(obj: AnyRef) : String = { val text = obj.toString() java.security.MessageDigest.getInstance("MD5").digest(text.getBytes) .map(0xFF & _).map { "%02x".format(_) }.foldLeft("") { _ + _ } + "_" + obj.hashCode() } }
Example 108
Source File: LightPipeline.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import org.apache.spark.ml.{PipelineModel, Transformer} import org.apache.spark.sql.{DataFrame, Dataset} import scala.collection.JavaConverters._ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddingsVectors: Boolean = false) { private var ignoreUnsupported = false def setIgnoreUnsupported(v: Boolean): Unit = ignoreUnsupported = v def getIgnoreUnsupported: Boolean = ignoreUnsupported def getStages: Array[Transformer] = pipelineModel.stages def transform(dataFrame: Dataset[_]): DataFrame = pipelineModel.transform(dataFrame) def fullAnnotate(target: String, startWith: Map[String, Seq[Annotation]] = Map.empty[String, Seq[Annotation]]): Map[String, Seq[Annotation]] = { getStages.foldLeft(startWith)((annotations, transformer) => { transformer match { case documentAssembler: DocumentAssembler => annotations.updated(documentAssembler.getOutputCol, documentAssembler.assemble(target, Map.empty[String, String])) case lazyAnnotator: AnnotatorModel[_] if lazyAnnotator.getLazyAnnotator => annotations case recursiveAnnotator: HasRecursiveTransform[_] with AnnotatorModel[_] => val combinedAnnotations = recursiveAnnotator.getInputCols.foldLeft(Seq.empty[Annotation])((inputs, name) => inputs ++ annotations.getOrElse(name, Nil)) annotations.updated(recursiveAnnotator.getOutputCol, recursiveAnnotator.annotate(combinedAnnotations, pipelineModel)) case annotator: AnnotatorModel[_] => val combinedAnnotations = annotator.getInputCols.foldLeft(Seq.empty[Annotation])((inputs, name) => inputs ++ annotations.getOrElse(name, Nil)) annotations.updated(annotator.getOutputCol, annotator.annotate(combinedAnnotations)) case finisher: Finisher => annotations.filterKeys(finisher.getInputCols.contains) case rawModel: RawAnnotator[_] => if (ignoreUnsupported) annotations else throw new IllegalArgumentException(s"model ${rawModel.uid} does not support LightPipeline." + s" Call setIgnoreUnsupported(boolean) on LightPipeline to ignore") case pipeline: PipelineModel => new LightPipeline(pipeline, parseEmbeddingsVectors).fullAnnotate(target, annotations) case _ => annotations } }) } def fullAnnotate(targets: Array[String]): Array[Map[String, Seq[Annotation]]] = { targets.par.map(target => { fullAnnotate(target) }).toArray } def fullAnnotateJava(target: String): java.util.Map[String, java.util.List[JavaAnnotation]] = { fullAnnotate(target).mapValues(_.map(aa => JavaAnnotation(aa.annotatorType, aa.begin, aa.end, aa.result, aa.metadata.asJava)).asJava).asJava } def fullAnnotateJava(targets: java.util.ArrayList[String]): java.util.List[java.util.Map[String, java.util.List[JavaAnnotation]]] = { targets.asScala.par.map(target => { fullAnnotateJava(target) }).toList.asJava } def annotate(target: String): Map[String, Seq[String]] = { fullAnnotate(target).mapValues(_.map(a => { a.annotatorType match { case (AnnotatorType.WORD_EMBEDDINGS | AnnotatorType.SENTENCE_EMBEDDINGS) if (parseEmbeddingsVectors) => a.embeddings.mkString(" ") case _ => a.result } })) } def annotate(targets: Array[String]): Array[Map[String, Seq[String]]] = { targets.par.map(target => { annotate(target) }).toArray } def annotateJava(target: String): java.util.Map[String, java.util.List[String]] = { annotate(target).mapValues(_.asJava).asJava } def annotateJava(targets: java.util.ArrayList[String]): java.util.List[java.util.Map[String, java.util.List[String]]] = { targets.asScala.par.map(target => { annotateJava(target) }).toList.asJava } }
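Constructing a LightPipeline only needs a fitted PipelineModel, as the constructor above shows. A minimal usage sketch, assuming a spark-nlp pipeline has already been fitted and saved elsewhere (the path is hypothetical):

import com.johnsnowlabs.nlp.LightPipeline
import org.apache.spark.ml.PipelineModel

// Assumed to be a fitted spark-nlp pipeline (DocumentAssembler, Tokenizer, ...).
val pipelineModel: PipelineModel = PipelineModel.load("/path/to/saved/pipeline")

val light = new LightPipeline(pipelineModel)

// Annotate a single string without going through a DataFrame.
val single: Map[String, Seq[String]] = light.annotate("John Snow lives in London.")

// Annotate a batch of strings in parallel.
val batch: Array[Map[String, Seq[String]]] = light.annotate(Array("First sentence.", "Second sentence."))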
Example 109
Source File: AnnotatorApproach.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import com.johnsnowlabs.storage.HasStorage import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.{Estimator, Model, PipelineModel, Transformer} import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.types.{ArrayType, MetadataBuilder, StructField, StructType} import org.apache.spark.ml.util.DefaultParamsWritable override final def transformSchema(schema: StructType): StructType = { require(validate(schema), s"Wrong or missing inputCols annotators in $uid.\n" + msgHelper(schema) + s"\nMake sure such annotators exist in your pipeline, " + s"with the right output names and that they have following annotator types: " + s"${inputAnnotatorTypes.mkString(", ")}") val metadataBuilder: MetadataBuilder = new MetadataBuilder() metadataBuilder.putString("annotatorType", outputAnnotatorType) val outputFields = schema.fields :+ StructField(getOutputCol, ArrayType(Annotation.dataType), nullable = false, metadataBuilder.build) StructType(outputFields) } }
Example 110
Source File: RecursivePipeline.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import org.apache.spark.internal.Logging import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter} import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset} import scala.collection.mutable.ListBuffer class RecursivePipeline(override val uid: String, baseStages: Array[PipelineStage]) extends Pipeline { def this() = this(Identifiable.randomUID("RECURSIVE_PIPELINE"), Array.empty) def this(uid: String) = this(uid, Array.empty) def this(pipeline: Pipeline) = this(pipeline.uid, pipeline.getStages) this.setStages(baseStages) override def fit(dataset: Dataset[_]): PipelineModel = { transformSchema(dataset.schema, logging = true) val theStages = $(stages) var indexOfLastEstimator = -1 theStages.view.zipWithIndex.foreach { case (stage, index) => stage match { case _: Estimator[_] => indexOfLastEstimator = index case _ => } } var curDataset = dataset val transformers = ListBuffer.empty[Transformer] theStages.view.zipWithIndex.foreach { case (stage, index) => if (index <= indexOfLastEstimator) { val transformer = stage match { case estimator: HasRecursiveFit[_] => estimator.recursiveFit(curDataset, new Pipeline(uid).setStages(transformers.toArray).fit(dataset)) case estimator: Estimator[_] => estimator.fit(curDataset) case t: Transformer => t case _ => throw new IllegalArgumentException( s"Does not support stage $stage of type ${stage.getClass}") } if (index < indexOfLastEstimator) { curDataset = transformer.transform(curDataset) } transformers += transformer } else { transformers += stage.asInstanceOf[Transformer] } } createPipeline(dataset, transformers.toArray) } } class RecursivePipelineModel(override val uid: String, innerPipeline: PipelineModel) extends Model[RecursivePipelineModel] with MLWritable with Logging { def this(pipeline: PipelineModel) = this(pipeline.uid, pipeline) // drops right at most because is itself included private def createRecursiveAnnotators(dataset: Dataset[_]): PipelineModel = new Pipeline(uid).setStages(innerPipeline.stages.dropRight(1)).fit(dataset) override def copy(extra: ParamMap): RecursivePipelineModel = { new RecursivePipelineModel(uid, innerPipeline.copy(extra)) } override def write: MLWriter = { innerPipeline.write } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) innerPipeline.stages.foldLeft(dataset.toDF)((cur, transformer) => transformer match { case t: HasRecursiveTransform[_] => t.recursiveTransform(cur, createRecursiveAnnotators(dataset)) case t: AnnotatorModel[_] if t.getLazyAnnotator => cur case t: Transformer => t.transform(cur) }) } override def transformSchema(schema: StructType): StructType = { innerPipeline.transformSchema(schema) } }
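RecursivePipeline is used like a regular Spark ML Pipeline; the difference is that stages implementing HasRecursiveFit receive the partially-built pipeline during fit. A hedged usage sketch, assuming spark-nlp's DocumentAssembler and Tokenizer with their usual setters, and a trainingData DataFrame with a "text" column defined elsewhere:

import com.johnsnowlabs.nlp.{DocumentAssembler, RecursivePipeline}
import com.johnsnowlabs.nlp.annotators.Tokenizer
import org.apache.spark.ml.PipelineModel

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val tokenizer = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")

val pipeline = new RecursivePipeline()
  .setStages(Array(documentAssembler, tokenizer))

// trainingData is assumed to be a DataFrame with a "text" column.
val model: PipelineModel = pipeline.fit(trainingData)
val annotated = model.transform(trainingData)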
Example 111
Source File: BigTextMatcher.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.btm import com.johnsnowlabs.collections.StorageSearchTrie import com.johnsnowlabs.nlp.AnnotatorType.{TOKEN, DOCUMENT, CHUNK} import com.johnsnowlabs.nlp.annotators.TokenizerModel import com.johnsnowlabs.nlp.serialization.StructFeature import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper} import com.johnsnowlabs.nlp.AnnotatorApproach import com.johnsnowlabs.storage.Database.Name import com.johnsnowlabs.storage.{Database, HasStorage, RocksDBConnection, StorageWriter} import org.apache.spark.ml.PipelineModel import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.sql.Dataset class BigTextMatcher(override val uid: String) extends AnnotatorApproach[BigTextMatcherModel] with HasStorage { def this() = this(Identifiable.randomUID("ENTITY_EXTRACTOR")) override val inputAnnotatorTypes = Array(DOCUMENT, TOKEN) override val outputAnnotatorType: AnnotatorType = CHUNK override val description: String = "Extracts entities from target dataset given in a text file" val mergeOverlapping = new BooleanParam(this, "mergeOverlapping", "whether to merge overlapping matched chunks. Defaults false") val tokenizer = new StructFeature[TokenizerModel](this, "tokenizer") setDefault(inputCols,Array(TOKEN)) setDefault(caseSensitive, true) setDefault(mergeOverlapping, false) def setTokenizer(tokenizer: TokenizerModel): this.type = set(this.tokenizer, tokenizer) def getTokenizer: TokenizerModel = $$(tokenizer) def setMergeOverlapping(v: Boolean): this.type = set(mergeOverlapping, v) def getMergeOverlapping: Boolean = $(mergeOverlapping) private def loadEntities(path: String, writers: Map[Database.Name, StorageWriter[_]]): Unit = { val inputFiles: Seq[Iterator[String]] = ResourceHelper.parseLinesIterator(ExternalResource(path, ReadAs.TEXT, Map())) inputFiles.foreach { inputFile => { StorageSearchTrie.load(inputFile, writers, get(tokenizer)) }} } override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): BigTextMatcherModel = { new BigTextMatcherModel() .setInputCols($(inputCols)) .setOutputCol($(outputCol)) .setCaseSensitive($(caseSensitive)) .setStorageRef($(storageRef)) .setMergeOverlapping($(mergeOverlapping)) } override protected def createWriter(database: Name, connection: RocksDBConnection): StorageWriter[_] = { database match { case Database.TMVOCAB => new TMVocabReadWriter(connection, $(caseSensitive)) case Database.TMEDGES => new TMEdgesReadWriter(connection, $(caseSensitive)) case Database.TMNODES => new TMNodesWriter(connection) } } override protected def index( fitDataset: Dataset[_], storageSourcePath: Option[String], readAs: Option[ReadAs.Value], writers: Map[Database.Name, StorageWriter[_]], readOptions: Option[Map[String, String]] ): Unit = { require(readAs.get == ReadAs.TEXT, "BigTextMatcher only supports TEXT input formats at the moment.") loadEntities(storageSourcePath.get, writers) } override protected val databases: Array[Name] = BigTextMatcherModel.databases } object BigTextMatcher extends DefaultParamsReadable[BigTextMatcher]
Example 112
Source File: ChunkTokenizer.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, TOKEN} import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.ml.PipelineModel import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.sql.Dataset override val outputAnnotatorType: AnnotatorType = TOKEN override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): TokenizerModel = { val ruleFactory = buildRuleFactory val processedExceptions = get(exceptionsPath) .map(er => ResourceHelper.parseLines(er)) .getOrElse(Array.empty[String]) ++ get(exceptions).getOrElse(Array.empty[String]) val raw = new ChunkTokenizerModel() .setCaseSensitiveExceptions($(caseSensitiveExceptions)) .setTargetPattern($(targetPattern)) .setRules(ruleFactory) if (processedExceptions.nonEmpty) raw.setExceptions(processedExceptions) else raw } } object ChunkTokenizer extends DefaultParamsReadable[ChunkTokenizer]
Example 113
Source File: CoNLLGenerator.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.util import org.apache.spark.ml.PipelineModel import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import scala.collection.mutable.ArrayBuffer import scala.util.Try object CoNLLGenerator { def exportConllFiles(spark: SparkSession, filesPath: String, pipelineModel: PipelineModel, outputPath: String): Unit = { import spark.implicits._ //for toDS and toDF val data = spark.sparkContext.wholeTextFiles(filesPath).toDS.toDF("filename", "text") exportConllFiles(data, pipelineModel, outputPath) } def exportConllFiles(spark: SparkSession, filesPath: String, pipelinePath: String, outputPath: String): Unit = { val model = PipelineModel.load(pipelinePath) exportConllFiles(spark, filesPath, model, outputPath) } def exportConllFiles(data: DataFrame, pipelineModel: PipelineModel, outputPath: String): Unit = { val POSdataset = pipelineModel.transform(data) exportConllFiles(POSdataset, outputPath) } def exportConllFiles(data: DataFrame, pipelinePath: String, outputPath: String): Unit = { val model = PipelineModel.load(pipelinePath) exportConllFiles(data, model, outputPath) } def exportConllFiles(data: DataFrame, outputPath: String): Unit = { import data.sparkSession.implicits._ //for udf var dfWithNER = data //if data does not contain ner column, add "O" as default if (Try(data("finished_ner")).isFailure){ def OArray = (len : Int) => { //create array of $len "O"s var z = new Array[String](len) for (i <- 0 until z.length) { z(i)="O" } z } val makeOArray = data.sparkSession.udf.register("finished_pos", OArray) dfWithNER=data.withColumn("finished_ner", makeOArray(size(col("finished_pos")))) } val newPOSDataset = dfWithNER.select("finished_token", "finished_pos", "finished_token_metadata", "finished_ner"). as[(Array[String], Array[String], Array[(String, String)], Array[String])] val CoNLLDataset = makeConLLFormat(newPOSDataset) CoNLLDataset.coalesce(1).write.format("com.databricks.spark.csv"). options(scala.collection.Map("delimiter" -> " ", "emptyValue" -> "")). save(outputPath) } def makeConLLFormat(newPOSDataset : Dataset[(Array[String], Array[String], Array[(String, String)], Array[String])]) ={ import newPOSDataset.sparkSession.implicits._ //for row casting newPOSDataset.flatMap(row => { val newColumns: ArrayBuffer[(String, String, String, String)] = ArrayBuffer() val columns = ((row._1 zip row._2), row._3.map(_._2.toInt), row._4).zipped.map{case (a,b, c) => (a._1, a._2, b, c)} var sentenceId = 1 newColumns.append(("", "", "", "")) newColumns.append(("-DOCSTART-", "-X-", "-X-", "O")) newColumns.append(("", "", "", "")) columns.foreach(a => { if (a._3 != sentenceId){ newColumns.append(("", "", "", "")) sentenceId = a._3 } newColumns.append((a._1, a._2, a._2, a._4)) }) newColumns }) } }
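All of the exportConllFiles overloads above converge on the same flow: run the pipeline, pick up the finished_token / finished_pos / finished_token_metadata (and optionally finished_ner) columns, and write space-delimited CoNLL files. A minimal invocation with placeholder paths, assuming a fitted pipeline that produces those columns:

import com.johnsnowlabs.util.CoNLLGenerator
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("conll-export").master("local[*]").getOrCreate()

// Hypothetical paths; the pipeline must emit the finished_* columns listed above.
CoNLLGenerator.exportConllFiles(
  spark,
  "/data/raw_text_files/*.txt",
  PipelineModel.load("/models/pos_pipeline"),
  "/data/conll_output")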
Example 114
Source File: HasStorageRef.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.storage import com.johnsnowlabs.nlp.ParamsAndFeaturesWritable import org.apache.spark.ml.param.Param import org.apache.spark.sql.Dataset trait HasStorageRef extends ParamsAndFeaturesWritable { val storageRef = new Param[String](this, "storageRef", "storage unique identifier") setDefault(storageRef, this.uid) def createDatabaseConnection(database: Database.Name): RocksDBConnection = RocksDBConnection.getOrCreate(database, $(storageRef)) def setStorageRef(value: String): this.type = { if (get(storageRef).nonEmpty) throw new UnsupportedOperationException(s"Cannot override storage ref on $this. " + s"Please re-use current ref: $getStorageRef") set(this.storageRef, value) } def getStorageRef: String = $(storageRef) def validateStorageRef(dataset: Dataset[_], inputCols: Array[String], annotatorType: String): Unit = { require(isDefined(storageRef), "This Annotator does not have a storage reference defined. This could be an outdated " + "model or an incorrectly created one. Make sure storageRef param is defined and set.") require(HasStorageRef.getStorageRefFromInput(dataset, inputCols, annotatorType) == $(storageRef), s"Found input column with storage metadata. But such ref does not match to the ref this annotator requires. " + s"Make sure you are loading the annotator with ref: ${$(storageRef)}") } } object HasStorageRef { def getStorageRefFromInput(dataset: Dataset[_], inputCols: Array[String], annotatorType: String): String = { val storageCol = dataset.schema.fields .find(f => inputCols.contains(f.name) && f.metadata.getString("annotatorType") == annotatorType) .getOrElse(throw new Exception(s"Could not find a column of type $annotatorType. Make sure your pipeline is correct.")) .name val storage_meta = dataset.select(storageCol).schema.fields.head.metadata require(storage_meta.contains("ref"), s"Could not find a ref name in column $storageCol. " + s"Make sure $storageCol was created appropriately with a valid storageRef") storage_meta.getString("ref") } }
Example 115
Source File: DataBuilder.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.training.CoNLL import org.apache.spark.sql.{Dataset, Row} import org.scalatest._ object DataBuilder extends FlatSpec with BeforeAndAfterAll { this: Suite => import SparkAccessor.spark.implicits._ def basicDataBuild(content: String*)(implicit cleanupMode: String = "disabled"): Dataset[Row] = { val data = SparkAccessor.spark.sparkContext.parallelize(content).toDS().toDF("text") AnnotatorBuilder.withDocumentAssembler(data, cleanupMode) } def multipleDataBuild(content: Seq[String]): Dataset[Row] = { val data = SparkAccessor.spark.sparkContext.parallelize(content).toDS().toDF("text") AnnotatorBuilder.withDocumentAssembler(data) } def buildNerDataset(datasetContent: String): Dataset[Row] = { val lines = datasetContent.split("\n") val data = CoNLL(conllLabelIndex = 1) .readDatasetFromLines(lines, SparkAccessor.spark).toDF AnnotatorBuilder.withDocumentAssembler(data) } def loadParquetDataset(path: String) = SparkAccessor.spark.read.parquet(path) }
Example 116
Source File: BigTextMatcherBehaviors.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.btm import com.johnsnowlabs.nlp.{Annotation, AnnotatorBuilder} import org.apache.spark.sql.{Dataset, Row} import org.scalatest._ trait BigTextMatcherBehaviors { this: FlatSpec => def fullBigTextMatcher(dataset: => Dataset[Row]) { "An BigTextMatcher Annotator" should "successfully transform data" in { AnnotatorBuilder.withFullBigTextMatcher(dataset) .collect().foreach { row => row.getSeq[Row](3) .map(Annotation(_)) .foreach { case entity: Annotation if entity.annotatorType == "entity" => println(entity, entity.end) case _ => () } } } } }
Example 117
Source File: DependencyParserBehaviors.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.parser.dep import com.johnsnowlabs.nlp._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.scalatest.FlatSpec import com.johnsnowlabs.util.PipelineModels import org.apache.spark.ml.Pipeline trait DependencyParserBehaviors { this: FlatSpec => def initialAnnotations(testDataSet: Dataset[Row]): Unit = { val fixture = createFixture(testDataSet) it should "add annotations" in { assert(fixture.dependencies.count > 0, "Annotations count should be greater than 0") } it should "add annotations with the correct annotationType" in { fixture.depAnnotations.foreach { a => assert(a.annotatorType == AnnotatorType.DEPENDENCY, s"Annotation type should ${AnnotatorType.DEPENDENCY}") } } it should "annotate each token" in { assert(fixture.tokenAnnotations.size == fixture.depAnnotations.size, s"Every token should be annotated") } it should "annotate each word with a head" in { fixture.depAnnotations.foreach { a => assert(a.result.nonEmpty, s"Result should have a head") } } it should "annotate each word with the correct indexes" in { fixture.depAnnotations .zip(fixture.tokenAnnotations) .foreach { case (dep, token) => assert(dep.begin == token.begin && dep.end == token.end, s"Token and word should have equal indixes") } } } private def createFixture(testDataSet: Dataset[Row]) = new { val dependencies: DataFrame = testDataSet.select("dependency") val depAnnotations: Seq[Annotation] = dependencies .collect .flatMap { r => r.getSeq[Row](0) } .map { r => Annotation(r.getString(0), r.getInt(1), r.getInt(2), r.getString(3), r.getMap[String, String](4)) } val tokens: DataFrame = testDataSet.select("token") val tokenAnnotations: Seq[Annotation] = tokens .collect .flatMap { r => r.getSeq[Row](0) } .map { r => Annotation(r.getString(0), r.getInt(1), r.getInt(2), r.getString(3), r.getMap[String, String](4)) } } def relationshipsBetweenWordsPredictor(testDataSet: Dataset[Row], pipeline: Pipeline): Unit = { val emptyDataSet = PipelineModels.dummyDataset val dependencyParserModel = pipeline.fit(emptyDataSet) it should "train a model" in { val model = dependencyParserModel.stages.last.asInstanceOf[DependencyParserModel] assert(model.isInstanceOf[DependencyParserModel]) } val dependencyParserDataFrame = dependencyParserModel.transform(testDataSet) //dependencyParserDataFrame.collect() dependencyParserDataFrame.select("dependency").show(false) it should "predict relationships between words" in { assert(dependencyParserDataFrame.isInstanceOf[DataFrame]) } } }
Example 118
Source File: TokenizerBehaviors.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.{Annotation, AnnotatorBuilder, AnnotatorType} import org.apache.spark.sql.{Dataset, Row} import org.scalatest._ import scala.language.reflectiveCalls trait TokenizerBehaviors { this: FlatSpec => def fixture(dataset: => Dataset[Row]) = new { val df = AnnotatorBuilder.withTokenizer(AnnotatorBuilder.withTokenizer(dataset)) val documents = df.select("document") val sentences = df.select("sentence") val tokens = df.select("token") val sentencesAnnotations = sentences .collect .flatMap { r => r.getSeq[Row](0) } .map { a => Annotation(a.getString(0), a.getInt(1), a.getInt(2), a.getString(3), a.getMap[String, String](4)) } val tokensAnnotations = tokens .collect .flatMap { r => r.getSeq[Row](0)} .map { a => Annotation(a.getString(0), a.getInt(1), a.getInt(2), a.getString(3), a.getMap[String, String](4)) } val docAnnotations = documents .collect .flatMap { r => r.getSeq[Row](0)} .map { a => Annotation(a.getString(0), a.getInt(1), a.getInt(2), a.getString(3), a.getMap[String, String](4)) } val corpus = docAnnotations .map(d => d.result) .mkString("") } def fullTokenizerPipeline(dataset: => Dataset[Row]) { "A Tokenizer Annotator" should "successfully transform data" in { val f = fixture(dataset) assert(f.tokensAnnotations.nonEmpty, "Tokenizer should add annotators") } it should "annotate using the annotatorType of token" in { val f = fixture(dataset) assert(f.tokensAnnotations.nonEmpty, "Tokenizer should add annotators") f.tokensAnnotations.foreach { a => assert(a.annotatorType == AnnotatorType.TOKEN, "Tokenizer annotations type should be equal to 'token'") } } it should "annotate with the correct word indexes" in { val f = fixture(dataset) f.tokensAnnotations.foreach { a => val token = a.result val sentenceToken = f.corpus.slice(a.begin, a.end + 1) assert(sentenceToken == token, s"Word ($sentenceToken) from sentence at (${a.begin},${a.end}) should be equal to token ($token) inside the corpus ${f.corpus}") } } } }
Example 119
Source File: RankingMetricFormatter.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} import ws.vinta.albedo.closures.UDFs._ import ws.vinta.albedo.evaluators.RankingEvaluator._ class RankingMetricFormatter(override val uid: String, val sourceType: String) extends Transformer with DefaultParamsWritable { def this(sourceType: String) = { this(Identifiable.randomUID("rankingMetricFormatter"), sourceType) } val userCol = new Param[String](this, "userCol", "User column name") def getUserCol: String = $(userCol) def setUserCol(value: String): this.type = set(userCol, value) setDefault(userCol -> "user") val itemCol = new Param[String](this, "itemCol", "Item column name") def getItemCol: String = $(itemCol) def setItemCol(value: String): this.type = set(itemCol, value) setDefault(itemCol -> "item") val predictionCol = new Param[String](this, "predictionCol", "Prediction column name") def getPredictionCol: String = $(predictionCol) def setPredictionCol(value: String): this.type = set(predictionCol, value) setDefault(predictionCol -> "prediction") val topK = new IntParam(this, "topK", "Recommend top-k items for every user") def getTopK: Int = $(topK) def setTopK(value: Int): this.type = set(topK, value) setDefault(topK -> 15) override def transformSchema(schema: StructType): StructType = { Map($(userCol) -> IntegerType, $(itemCol) -> IntegerType) .foreach{ case(columnName: String, expectedDataType: DataType) => { val actualDataType = schema(columnName).dataType require(actualDataType.equals(expectedDataType), s"Column $columnName must be of type $expectedDataType but was actually $actualDataType.") } } schema } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema) sourceType match { case "als" => dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), col($(predictionCol)).desc, $(topK))) case "lr" => dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), toArrayUDF(col($(predictionCol))).getItem(1).desc, $(topK))) } } override def copy(extra: ParamMap): RankingMetricFormatter = { val copied = new RankingMetricFormatter(uid, sourceType) copyValues(copied, extra) } } object RankingMetricFormatter extends DefaultParamsReadable[RankingMetricFormatter]
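RankingMetricFormatter is a plain Transformer, so usage is just construction plus the setters defined above. A sketch for the ALS case, assuming alsPredictionDF is the output of ALSModel.transform with integer user/item columns:

import ws.vinta.albedo.transformers.RankingMetricFormatter

val formatter = new RankingMetricFormatter("als")
  .setUserCol("user")
  .setItemCol("item")
  .setPredictionCol("prediction")
  .setTopK(30)

val userRecommendations = formatter.transform(alsPredictionDF)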
Example 120
Source File: SQLTransformer.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature import com.tencent.angel.sona.ml.Transformer import com.tencent.angel.sona.ml.param.{Param, ParamMap} import com.tencent.angel.sona.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.types.StructType /** * Implements the transformations which are defined by SQL statement. * Currently we only support SQL syntax like 'SELECT ... FROM __THIS__ ...' * where '__THIS__' represents the underlying table of the input dataset. * The select clause specifies the fields, constants, and expressions to display in * the output, it can be any select clause that Spark SQL supports. Users can also * use Spark SQL built-in function and UDFs to operate on these selected columns. * For example, [[SQLTransformer]] supports statements like: * {{{ * SELECT a, a + b AS a_b FROM __THIS__ * SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5 * SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b * }}} */ class SQLTransformer(override val uid: String) extends Transformer with DefaultParamsWritable { def this() = this(Identifiable.randomUID("sql")) /** * SQL statement parameter. The statement is provided in string form. * * @group param */ final val statement: Param[String] = new Param[String](this, "statement", "SQL statement") def setStatement(value: String): this.type = set(statement, value) def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val tableName = Identifiable.randomUID(uid) dataset.createOrReplaceTempView(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val result = dataset.sparkSession.sql(realStatement) // Call SessionCatalog.dropTempView to avoid unpersisting the possibly cached dataset. dataset.sparkSession.catalog.dropTempView(tableName) // Compatible.sessionstate.catalog.dropTempView(tableName) result } override def transformSchema(schema: StructType): StructType = { val spark = SparkSession.builder().getOrCreate() val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty)) val dummyDF = spark.createDataFrame(dummyRDD, schema) val tableName = Identifiable.randomUID(uid) val realStatement = $(statement).replace(tableIdentifier, tableName) dummyDF.createOrReplaceTempView(tableName) val outputSchema = spark.sql(realStatement).schema spark.catalog.dropTempView(tableName) outputSchema } override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { override def load(path: String): SQLTransformer = super.load(path) }
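Since the statement is resolved against a temporary view named __THIS__, using the transformer is a one-liner once the statement is set. A small sketch:

import com.tencent.angel.sona.ml.feature.SQLTransformer
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("sql-transformer").master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")

val sqlTrans = new SQLTransformer()
  .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

sqlTrans.transform(df).show()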
Example 121
Source File: DatasetUtil.scala From sona with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata} import org.apache.spark.sql.{Column, DataFrame, Dataset} object DatasetUtil { def withColumns[T](ds: Dataset[T], colNames: Seq[String], cols: Seq[Column], metadata: Seq[Metadata]): DataFrame = { require(colNames.size == cols.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of columns: ${cols.size}") require(colNames.size == metadata.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of metadata elements: ${metadata.size}") val sparkSession = ds.sparkSession val queryExecution = ds.queryExecution val resolver = sparkSession.sessionState.analyzer.resolver val output = queryExecution.analyzed.output checkColumnNameDuplication(colNames, "in given column names", sparkSession.sessionState.conf.caseSensitiveAnalysis) val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) => colName -> col.as(colName, metadata) }.toMap val replacedAndExistingColumns = output.map { field => columnMap.find { case (colName, _) => resolver(field.name, colName) } match { case Some((colName: String, col: Column)) => col.as(colName) case _ => new Column(field) } } val newColumns = columnMap.filter { case (colName, col) => !output.exists(f => resolver(f.name, colName)) }.map { case (colName, col) => col.as(colName) } ds.select(replacedAndExistingColumns ++ newColumns: _*) } def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = { withColumns(ds, Seq(colName), Seq(col), Seq(metadata)) } private def checkColumnNameDuplication(columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = { val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase) if (names.distinct.length != names.length) { val duplicateColumns = names.groupBy(identity).collect { case (x, ys) if ys.length > 1 => s"`$x`" } throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}") } } /** * Cast a column in a Dataset to Vector type. * * The supported data types of the input column are * - Vector * - float/double type Array. * * Note: The returned column does not have Metadata. * * @param dataset input DataFrame * @param colName column name. * @return Vector column */ def columnToVector(dataset: Dataset[_], colName: String): Column = { val columnDataType = dataset.schema(colName).dataType columnDataType match { case _: VectorUDT => col(colName) case fdt: ArrayType => val transferUDF = fdt.elementType match { case _: FloatType => udf(f = (vector: Seq[Float]) => { val inputArray = Array.fill[Double](vector.size)(0.0) vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble) Vectors.dense(inputArray) }) case _: DoubleType => udf((vector: Seq[Double]) => { Vectors.dense(vector.toArray) }) case other => throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector") } transferUDF(col(colName)) case other => throw new IllegalArgumentException(s"$other column cannot be cast to Vector") } } }
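columnToVector is handy when features arrive as float/double arrays rather than ML vectors. A short hedged sketch of casting such a column and attaching it to the frame (column names are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.util.DatasetUtil

val spark = SparkSession.builder().appName("dataset-util").master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq((1, Array(0.1, 0.2, 0.3)), (2, Array(0.4, 0.5, 0.6))).toDF("id", "features_array")

// Cast the double-array column to the library's Vector type and add it as a new column.
val withVector = df.withColumn("features", DatasetUtil.columnToVector(df, "features_array"))
withVector.printSchema()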
Example 122
Source File: StructuredStreamingKafkaSample.scala From kafka-scala-api with Apache License 2.0 | 5 votes |
package com.example.structured_streaming

import org.apache.spark.sql.{Dataset, SparkSession}

object StructuredStreamingKafkaSample extends App {

  val sparkSession = SparkSession
    .builder
    .master("local")
    .appName("kafka")
    .getOrCreate()

  sparkSession.sparkContext.setLogLevel("ERROR")

  import sparkSession.implicits._

  val kafkaDF = sparkSession
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "127.0.0.1:9092")
    .option("subscribe", "structured_topic")
    .load()

  val data: Dataset[(String, String)] =
    kafkaDF.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]

  kafkaDF.printSchema()

  data.writeStream
    .outputMode("append")
    .format("console")
    .start()
    .awaitTermination()
}
Example 123
Source File: DataPreprocess.scala From xgbspark-text-classification with Apache License 2.0 | 5 votes |
package com.lenovo.ml

import org.apache.spark.sql.{SparkSession, DataFrame, Dataset}
import scala.collection.mutable
import scala.util.matching.Regex
import org.ansj.library.DicLibrary
import org.ansj.recognition.impl.StopRecognition
import org.ansj.splitWord.analysis.DicAnalysis

object DataPreprocess {

  def textCleaner(sparkSession: SparkSession, rawText: DataFrame): Dataset[String] = {
    // Filter out timestamps, URLs and email addresses from the text
    val regex1 = new Regex("""[-—0-9a-z]+[:]+[0-9a-z]+[:]?""")
    val regex2 = new Regex("""[0-9]+年|[0-9]+月|[0-9]+[日]|[0-9]+[天]|[0-9]+[号]|[0-9]+[次]""")
    val regex3 = new Regex("""http[s]?://[a-z0-9./?=_-]+""")
    val regex4 = new Regex("""[0-9_a-z]+([-+.][0-9_a-z]+)*@[0-9_a-z]+([-.][0-9_a-z]+)*\.[0-9_a-z]+([-.][0-9_a-z]+)*""")

    import sparkSession.implicits._
    rawText.map(x => x.toString).map(x => x.substring(1, x.length - 1).toLowerCase)
      .map(x => regex1.replaceAllIn(x, "")).map(x => regex2.replaceAllIn(x, ""))
      .map(x => regex3.replaceAllIn(x, "")).map(x => regex4.replaceAllIn(x, ""))
  }

  def segWords(sparkSession: SparkSession, stopWordsPath: String, dictionaryPath: String,
               synonymWordsPath: String, singleWordsPath: String, rawText: DataFrame): DataFrame = {
    val filter = new StopRecognition()
    // Parts of speech to treat as stop words
    filter.insertStopNatures("w", "ns", "nr", "t", "r", "u", "e", "y", "o")
    // Load the stop-word list
    val stopWords = sparkSession.sparkContext.textFile(stopWordsPath).cache()
    stopWords.collect().foreach(line => filter.insertStopWords(line))
    // Load the user-defined dictionary
    val dictionary = sparkSession.sparkContext.textFile(dictionaryPath).cache()
    dictionary.collect().foreach(line => DicLibrary.insert(DicLibrary.DEFAULT, line))
    stopWords.collect().foreach(line => DicLibrary.insert(DicLibrary.DEFAULT, line))
    // Build the synonym table
    val synonymWords = sparkSession.sparkContext.textFile(synonymWordsPath).cache()
    var synonymMap: Map[String, String] = Map()
    synonymWords.collect().foreach { line =>
      val data = line.split(" ", 2)
      synonymMap = synonymMap + (data(0) -> data(1))
    }
    // Build the single-character whitelist
    val singleWords = sparkSession.sparkContext.textFile(singleWordsPath).cache()
    val singleWhiteList: mutable.Set[String] = mutable.Set()
    singleWords.collect().foreach(line => singleWhiteList.add(line))
    // Broadcast the dictionaries to the executors
    val stop = sparkSession.sparkContext.broadcast(filter)
    val dic = sparkSession.sparkContext.broadcast(DicLibrary.get(DicLibrary.DEFAULT))
    val synonym = sparkSession.sparkContext.broadcast(synonymMap)
    val single = sparkSession.sparkContext.broadcast(singleWhiteList)
    // Read the text data, clean it, then segment it into words
    import sparkSession.implicits._
    textCleaner(sparkSession, rawText).map { x =>
      val parse = DicAnalysis.parse(x, dic.value).recognition(stop.value)
      // Extract the segmented words, without part-of-speech tags
      val words = for (i <- Range(0, parse.size())) yield parse.get(i).getName
      val filterWords = words.map(_.trim).filter(x => x.length > 1 || single.value.contains(x))
      filterWords.map(x => if (synonym.value.contains(x)) synonym.value(x) else x).mkString(" ")
    }.toDF("words")
  }
}
Example 124
Source File: SimpleVectorAssembler.scala From albedo with MIT License | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkException
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row}

import scala.collection.mutable.ArrayBuilder

// NOTE: the original listing omitted the class header; the declaration and the two members
// below it are reconstructed from the members that are used ($(inputCols), $(outputCol),
// Identifiable) and should be treated as a best-effort completion.
class SimpleVectorAssembler(override val uid: String)
  extends Transformer with HasInputCols with HasOutputCol with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("simpleVectorAssembler"))

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)

    val schema = dataset.schema
    val assembleFunc = udf { r: Row =>
      SimpleVectorAssembler.assemble(r.toSeq: _*)
    }
    val args = $(inputCols).map { c =>
      schema(c).dataType match {
        case DoubleType => dataset(c)
        case _: VectorUDT => dataset(c)
        case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid")
      }
    }

    dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputColNames = $(inputCols)
    val outputColName = $(outputCol)
    val inputDataTypes = inputColNames.map(name => schema(name).dataType)
    inputDataTypes.foreach {
      case _: NumericType | BooleanType =>
      case t if t.isInstanceOf[VectorUDT] =>
      case other =>
        throw new IllegalArgumentException(s"Data type $other is not supported.")
    }
    if (schema.fieldNames.contains(outputColName)) {
      throw new IllegalArgumentException(s"Output column $outputColName already exists.")
    }
    StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true))
  }

  override def copy(extra: ParamMap): SimpleVectorAssembler = defaultCopy(extra)
}

object SimpleVectorAssembler extends DefaultParamsReadable[SimpleVectorAssembler] {

  override def load(path: String): SimpleVectorAssembler = super.load(path)

  def assemble(vv: Any*): Vector = {
    val indices = ArrayBuilder.make[Int]
    val values = ArrayBuilder.make[Double]
    var cur = 0
    vv.foreach {
      case v: Double =>
        if (v != 0.0) {
          indices += cur
          values += v
        }
        cur += 1
      case vec: Vector =>
        vec.foreachActive { case (i, v) =>
          if (v != 0.0) {
            indices += cur + i
            values += v
          }
        }
        cur += vec.size
      case null =>
        // TODO: output Double.NaN?
        throw new SparkException("Values to assemble cannot be null.")
      case o =>
        throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.")
    }
    Vectors.sparse(cur, indices.result(), values.result()).compressed
  }
}
Example 125
Source File: ALSRecommender.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.recommenders

import com.github.fommil.netlib.F2jBLAS
import org.apache.spark.ml.recommendation.ALSModel
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.settings

// Note: this listing uses BoundedPriorityQueue without showing its import; it is assumed to be
// an accessible copy of Spark's org.apache.spark.util.BoundedPriorityQueue or an equivalent class.
class ALSRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("alsRecommender"))
  }

  private def alsModel: ALSModel = {
    val alsModelPath = s"${settings.dataDir}/${settings.today}/alsModel.parquet"
    ALSModel.load(alsModelPath)
  }

  def blockify(factors: Dataset[(Int, Array[Float])], blockSize: Int = 4096): Dataset[Seq[(Int, Array[Float])]] = {
    import factors.sparkSession.implicits._
    factors.mapPartitions(_.grouped(blockSize))
  }

  override def source = "als"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    import userDF.sparkSession.implicits._

    val activeUsers = userDF.select(col($(userCol)).alias("id"))
    val userFactors = alsModel.userFactors.join(activeUsers, Seq("id"))
    val itemFactors = alsModel.itemFactors
    val rank = alsModel.rank
    val num = $(topK)

    val userFactorsBlocked = blockify(userFactors.as[(Int, Array[Float])])
    val itemFactorsBlocked = blockify(itemFactors.as[(Int, Array[Float])])
    val ratings = userFactorsBlocked.crossJoin(itemFactorsBlocked)
      .as[(Seq[(Int, Array[Float])], Seq[(Int, Array[Float])])]
      .flatMap { case (srcIter, dstIter) =>
        val m = srcIter.size
        val n = math.min(dstIter.size, num)
        val output = new Array[(Int, Int, Float)](m * n)
        var i = 0
        val pq = new BoundedPriorityQueue[(Int, Float)](num)(Ordering.by(_._2))
        srcIter.foreach { case (srcId, srcFactor) =>
          dstIter.foreach { case (dstId, dstFactor) =>
            val score = new F2jBLAS().sdot(rank, srcFactor, 1, dstFactor, 1)
            pq += dstId -> score
          }
          pq.foreach { case (dstId, score) =>
            output(i) = (srcId, dstId, score)
            i += 1
          }
          pq.clear()
        }
        output.toSeq
      }

    ratings
      .toDF($(userCol), $(itemCol), $(scoreCol))
      .withColumn($(sourceCol), lit(source))
  }
}
Example 126
Source File: CurationRecommender.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.recommenders import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import ws.vinta.albedo.utils.DatasetUtils._ class CurationRecommender(override val uid: String) extends Recommender { def this() = { this(Identifiable.randomUID("curationRecommender")) } override def source = "curation" override def recommendForUsers(userDF: Dataset[_]): DataFrame = { transformSchema(userDF.schema) implicit val spark: SparkSession = userDF.sparkSession import spark.implicits._ val rawStarringDS = loadRawStarringDS().cache() val curatorIds = Array(652070, 1912583, 59990, 646843, 28702) // vinta, saiday, tzangms, fukuball, wancw val curatedRepoDF = rawStarringDS .select($"repo_id", $"starred_at") .where($"user_id".isin(curatorIds: _*)) .groupBy($"repo_id") .agg(max($"starred_at").alias("starred_at")) .orderBy($"starred_at".desc) .limit($(topK)) .cache() def calculateScoreUDF = udf((starred_at: java.sql.Timestamp) => { starred_at.getTime / 1000.0 }) userDF .select($(userCol)) .crossJoin(curatedRepoDF) .select(col($(userCol)), $"repo_id".alias($(itemCol)), calculateScoreUDF($"starred_at").alias($(scoreCol))) .withColumn($(sourceCol), lit(source)) } }
Example 127
Source File: PopularityRecommender.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.recommenders import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import ws.vinta.albedo.utils.DatasetUtils._ class PopularityRecommender(override val uid: String) extends Recommender { def this() = { this(Identifiable.randomUID("popularityRecommender")) } override def source = "popularity" override def recommendForUsers(userDF: Dataset[_]): DataFrame = { transformSchema(userDF.schema) implicit val spark: SparkSession = userDF.sparkSession import spark.implicits._ val popularRepoDF = loadPopularRepoDF() .limit($(topK)) .cache() def calculateScoreUDF = udf((stargazers_count: Int, created_at: java.sql.Timestamp) => { val valueScore = math.round(math.log10(stargazers_count) * 1000.0) / 1000.0 val timeScore = (created_at.getTime / 1000.0) / (60 * 60 * 24 * 30 * 12) / 5.0 valueScore + timeScore }) userDF .select($(userCol)) .crossJoin(popularRepoDF) .select(col($(userCol)), $"repo_id".alias($(itemCol)), calculateScoreUDF($"repo_stargazers_count", $"repo_created_at").alias($(scoreCol))) .withColumn($(sourceCol), lit(source)) } }
Example 128
Source File: ContentRecommender.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.recommenders

import org.apache.http.HttpHost
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.elasticsearch.action.search.SearchRequest
import org.elasticsearch.client.{RestClient, RestHighLevelClient}
import org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item
import org.elasticsearch.index.query.QueryBuilders._
import org.elasticsearch.search.SearchHit
import org.elasticsearch.search.builder.SearchSourceBuilder
import ws.vinta.albedo.closures.DBFunctions._

class ContentRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("contentRecommender"))
  }

  val enableEvaluationMode = new Param[Boolean](this, "enableEvaluationMode", "Should be enabled for evaluation only")

  def getEnableEvaluationMode: Boolean = $(enableEvaluationMode)

  def setEnableEvaluationMode(value: Boolean): this.type = set(enableEvaluationMode, value)
  setDefault(enableEvaluationMode -> false)

  override def source = "content"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    import userDF.sparkSession.implicits._

    val userRecommendedItemDF = userDF
      .as[Int]
      .flatMap {
        case (userId) => {
          // A More Like This query built from document ids filters those ids out of the results,
          // which is not what we want during evaluation, so in evaluation mode we use the next
          // topK starred repos (offset = topK) as the query condition instead.
          val limit = $(topK)
          val offset = if ($(enableEvaluationMode)) $(topK) else 0
          val repoIds = selectUserStarredRepos(userId, limit, offset)

          val lowClient = RestClient.builder(new HttpHost("127.0.0.1", 9200, "http")).build()
          val highClient = new RestHighLevelClient(lowClient)

          val fields = Array("description", "full_name", "language", "topics")
          val texts = Array("")
          val items = repoIds.map((itemId: Int) => new Item("repo", "repo_info_doc", itemId.toString))
          val queryBuilder = moreLikeThisQuery(fields, texts, items)
            .minTermFreq(2)
            .maxQueryTerms(50)

          val searchSourceBuilder = new SearchSourceBuilder()
          searchSourceBuilder.query(queryBuilder)
          searchSourceBuilder.size($(topK))
          searchSourceBuilder.from(0)

          val searchRequest = new SearchRequest()
          searchRequest.indices("repo")
          searchRequest.types("repo_info_doc")
          searchRequest.source(searchSourceBuilder)

          val searchResponse = highClient.search(searchRequest)
          val hits = searchResponse.getHits
          val searchHits = hits.getHits

          val userItemScoreTuples = searchHits.map((searchHit: SearchHit) => {
            val itemId = searchHit.getId.toInt
            val score = searchHit.getScore
            (userId, itemId, score)
          })

          lowClient.close()

          userItemScoreTuples
        }
      }
      .toDF($(userCol), $(itemCol), $(scoreCol))
      .withColumn($(sourceCol), lit(source))

    userRecommendedItemDF
  }
}
Example 129
Source File: UserRepoTransformer.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{ParamMap, StringArrayParam} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} import ws.vinta.albedo.closures.UDFs._ class UserRepoTransformer(override val uid: String) extends Transformer with DefaultParamsWritable { def this() = { this(Identifiable.randomUID("userRepoTransformer")) } val inputCols: StringArrayParam = new StringArrayParam(this, "inputCols", "Input column names") def getInputCols: Array[String] = $(inputCols) def setInputCols(value: Array[String]): this.type = set(inputCols, value) override def transformSchema(schema: StructType): StructType = { $(inputCols).foreach((inputColName: String) => { require(schema.fieldNames.contains(inputColName), s"Input column $inputColName must exist.") }) val newFields: Array[StructField] = Array( StructField("repo_language_index_in_user_recent_repo_languages", IntegerType, nullable = false), StructField("repo_language_count_in_user_recent_repo_languages", IntegerType, nullable = false) ) StructType(schema.fields ++ newFields) } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema) import dataset.sparkSession.implicits._ dataset .withColumn("repo_language_index_in_user_recent_repo_languages", repoLanguageIndexInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages")) .withColumn("repo_language_count_in_user_recent_repo_languages", repoLanguageCountInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages")) } override def copy(extra: ParamMap): UserRepoTransformer = { defaultCopy(extra) } } object UserRepoTransformer extends DefaultParamsReadable[UserRepoTransformer]
Example 130
Source File: Evaluator.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.evaluation import com.tencent.angel.sona.ml.param.{ParamMap, Params} import org.apache.spark.sql.Dataset /** * :: DeveloperApi :: * Abstract class for evaluators that compute metrics from predictions. */ abstract class Evaluator extends Params { /** * Evaluates model output and returns a scalar metric. * The value of [[isLargerBetter]] specifies whether larger values are better. * * @param dataset a dataset that contains labels/observations and predictions. * @param paramMap parameter map that specifies the input columns and output metrics * @return metric */ def evaluate(dataset: Dataset[_], paramMap: ParamMap): Double = { this.copy(paramMap).evaluate(dataset) } /** * Evaluates model output and returns a scalar metric. * The value of [[isLargerBetter]] specifies whether larger values are better. * * @param dataset a dataset that contains labels/observations and predictions. * @return metric */ def evaluate(dataset: Dataset[_]): Double /** * Indicates whether the metric returned by `evaluate` should be maximized (true, default) * or minimized (false). * A given evaluator may support multiple metrics which may be maximized or minimized. */ def isLargerBetter: Boolean = true override def copy(extra: ParamMap): Evaluator }
Example 131
Source File: IntermediateCacher.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{ParamMap, StringArrayParam} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} class IntermediateCacher(override val uid: String) extends Transformer with DefaultParamsWritable { def this() = { this(Identifiable.randomUID("intermediateCacher")) } val inputCols = new StringArrayParam(this, "inputCols", "Input column names") def getInputCols: Array[String] = $(inputCols) def setInputCols(value: Array[String]): this.type = set(inputCols, value) setDefault(inputCols -> Array.empty[String]) override def transformSchema(schema: StructType): StructType = { schema } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema) val intermediateDF = if ($(inputCols).isEmpty) dataset.toDF() else dataset.select($(inputCols).map(col(_)): _*) intermediateDF.cache() } override def copy(extra: ParamMap): IntermediateCacher = { defaultCopy(extra) } } object IntermediateCacher extends DefaultParamsReadable[IntermediateCacher]
Example 132
Source File: DataFrameConverter.scala From incubator-toree with Apache License 2.0 | 5 votes |
package org.apache.toree.utils

import org.apache.spark.sql.{Dataset, Row}
import org.apache.toree.plugins.Plugin
import play.api.libs.json.{JsObject, Json}

import scala.util.Try
import org.apache.toree.plugins.annotations.Init

import DataFrameConverter._

class DataFrameConverter extends Plugin with LogLike {
  @Init def init() = {
    register(this)
  }

  def convert(df: Dataset[Row], outputType: String, limit: Int = 10): Try[String] = {
    Try(
      outputType.toLowerCase() match {
        case "html" => convertToHtml(df = df, limit = limit)
        case "json" => convertToJson(df = df, limit = limit)
        case "csv" => convertToCsv(df = df, limit = limit)
      }
    )
  }

  private def convertToHtml(df: Dataset[Row], limit: Int = 10): String = {
    val columnFields = df.schema.fieldNames.map(columnName => {
      s"<th>${columnName}</th>"
    }).reduce(_ + _)
    val columns = s"<tr>${columnFields}</tr>"
    val rows = df.rdd.map(row => {
      val fieldValues = row.toSeq.map(field => {
        s"<td>${fieldToString(field)}</td>"
      }).reduce(_ + _)
      s"<tr>${fieldValues}</tr>"
    }).take(limit).reduce(_ + _)
    s"<table>${columns}${rows}</table>"
  }

  private def convertToJson(df: Dataset[Row], limit: Int = 10): String = {
    val schema = Json.toJson(df.schema.fieldNames)
    val transformed = df.rdd.map(row => row.toSeq.map(fieldToString).toArray)
    val rows = transformed.take(limit)
    JsObject(Seq(
      "columns" -> schema,
      "rows" -> Json.toJson(rows)
    )).toString()
  }

  private def convertToCsv(df: Dataset[Row], limit: Int = 10): String = {
    val headers = df.schema.fieldNames.reduce(_ + "," + _)
    val rows = df.rdd.map(row => {
      row.toSeq.map(fieldToString).reduce(_ + "," + _)
    }).take(limit).reduce(_ + "\n" + _)
    s"${headers}\n${rows}"
  }
}

object DataFrameConverter {
  def fieldToString(any: Any): String = any match {
    case null => "null"
    case seq: Seq[_] => seq.mkString("[", ", ", "]")
    case _ => any.toString
  }
}
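A short usage sketch for the converter above. The DataFrame df is an assumption for illustration; convert returns a scala.util.Try[String].

// Hypothetical input DataFrame `df`; "json" may also be "html" or "csv".
val converter = new DataFrameConverter()
converter.convert(df, "json", limit = 5) match {
  case scala.util.Success(json) => println(json)
  case scala.util.Failure(e)    => e.printStackTrace() // e.g. an unsupported outputType
}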
Example 133
Source File: K-Centers.scala From Clustering4Ever with Apache License 2.0 | 5 votes |
package org.clustering4ever.clustering.kcenters.dataset

// Note: this listing is an excerpt; the enclosing K-Centers class, its fields (data, metric,
// minShift, maxIterations, centers, encoders) and the start of the fit method were omitted by
// the source page. Only the tail-recursive center-update loop is shown.

    @annotation.tailrec
    def go(cpt: Int, haveAllCentersConverged: Boolean, centers: List[(Int, V)]): List[(Int, V)] = {
      val preUpdatedCenters = data.groupByKey( cz => obtainNearestCenterID(cz.v, centers, metric) )(encoderInt)
        .mapGroups(computeCenters)(encoder)
        .collect
        .sortBy(_._1)
        .toList
      val alignedOldCenters = preUpdatedCenters.map{ case (oldClusterID, _) => centers(oldClusterID) }
      val updatedCenters = preUpdatedCenters.zipWithIndex.map{ case ((oldClusterID, center), newClusterID) => (newClusterID, center) }
      val shiftingEnough = areCentersNotMovingEnough(updatedCenters, alignedOldCenters, minShift, metric)
      if(cpt < maxIterations && !shiftingEnough) {
        go(cpt + 1, shiftingEnough, updatedCenters)
      }
      else {
        updatedCenters
      }
    }

    immutable.HashMap(go(0, false, centers):_*)
  }
}
Example 134
Source File: K-Means.scala From Clustering4Ever with Apache License 2.0 | 5 votes |
package org.clustering4ever.clustering.kcenters.rdd

// Note: this listing is an excerpt; the enclosing KMeans companion object, its imports
// (RDD, StorageLevel, ContinuousDistance, KMeansModel) and the other fit overloads were
// omitted by the source page.

  final def fit[D <: ContinuousDistance](
    data: RDD[Array[Double]],
    k: Int,
    metric: D,
    minShift: Double,
    maxIterations: Int,
    persistanceLVL: StorageLevel
  ): KMeansModel[D] = {
    KMeans(k, metric, minShift, maxIterations, persistanceLVL).fit(scalarDataWithIDToClusterizable(data.zipWithIndex))
  }
}
Example 135
Source File: Utils.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.profilers

import au.com.bytecode.opencsv.CSVParser
import com.bizo.mighty.csv.{CSVReader, CSVReaderSettings}
import org.apache.spark.sql.Dataset

object Utils {

  case class Field(idx: Int, value: String)

  def readAs[T](filename: String)(implicit settings: CSVReaderSettings, mf: Manifest[T]): Iterator[T] = {
    val is = getClass.getResourceAsStream(filename)
    CSVReader(is)(settings) {
      CSVReader.convertRow[T]
    }
  }

  def split(ds: Dataset[String], delimiter: String = ","): Dataset[Array[String]] = {
    import ds.sparkSession.implicits._
    ds.mapPartitions({ lines =>
      val parser = new CSVParser(delimiter.charAt(0))
      lines map parser.parseLine
    })
  }

  def buildColumns(ds: Dataset[Array[String]]): Dataset[Field] = {
    import ds.sparkSession.implicits._
    ds.flatMap({ values =>
      values.zipWithIndex.map({ case (value, col) =>
        Field(col, value)
      })
    })
  }
}
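A small usage sketch for Utils.split and Utils.buildColumns above. The Dataset lines of raw comma-separated records is an assumption for illustration.

// Hypothetical input: `lines` is a Dataset[String] of comma-separated records.
val fields: Dataset[Array[String]] = Utils.split(lines)        // default delimiter ","
val columns: Dataset[Utils.Field] = Utils.buildColumns(fields) // one Field(idx, value) per cell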
Example 136
Source File: AsciiProfiler.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.profilers.raw import io.gzet.profilers.Utils._ import org.apache.spark.sql.{Dataset, Row} case class AsciiProfiler(asciiMap: Map[Int, Ascii]) { def profile(df: Dataset[String]): Dataset[AsciiReport] = { val charset = df.sparkSession.sparkContext.broadcast(asciiMap) import df.sparkSession.implicits._ val charCount = df.flatMap(_.toCharArray.map(_.asInstanceOf[Int])) .groupByKey(t => t) .count() .withColumnRenamed("value", "tmp") .withColumnRenamed("count(1)", "count") charCount.map({ case Row(octet: Int, count: Long) => val ascii = charset.value.getOrElse(octet, Ascii("NA", "NA", "NA", "NA", "NA")) AsciiReport( ascii.binary, ascii.description, count ) }) } } object AsciiProfiler { def apply(): AsciiProfiler = { val ascii = readAs[Ascii]("/ascii.csv").toList AsciiProfiler(ascii.map(a => (a.octet.toInt, a)).toMap) } } case class Ascii( symbol: String, octet: String, hex: String, binary: String, description: String ) case class AsciiReport( binary: String, ascii: String, metricValue: Double )
Example 137
Source File: RowProfiler.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.profilers.raw

import org.apache.spark.sql.Dataset

case class RowProfiler() {

  def profile(df: Dataset[String]): Dataset[RowReport] = {
    import df.sparkSession.implicits._
    val report = RowReport(df.count().toDouble)
    df.sparkSession.createDataset[RowReport](
      Seq(report)
    )
  }
}

case class RowReport(
  metricValue: Double
)
Example 138
Source File: StructuralProfiler.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.profilers.raw import au.com.bytecode.opencsv.CSVParser import org.apache.spark.sql.functions.col import org.apache.spark.sql.{Dataset, Row} case class StructuralProfiler(delimiter: String = ",") { def profile(df: Dataset[String]): Dataset[StructuralReport] = { import df.sparkSession.implicits._ val rows = df.mapPartitions({ lines => val parser = new CSVParser(delimiter.charAt(0)) lines.map(line => (parser.parseLine(line).length, line)) }) val fieldCount = rows.groupByKey({ case (fields, line) => fields }).count() .withColumnRenamed("value", "fields") .withColumnRenamed("count(1)", "count") val fieldLine = rows.groupByKey({ case (fields, line) => fields }).reduceGroups({ (v1, v2) => v1 }).map({ case (fields, (_, line)) => (fields, line) }) .withColumnRenamed("_1", "_fieldLine_") .withColumnRenamed("_2", "line") fieldCount.join(fieldLine, col("fields") === col("_fieldLine_")) .drop("_fieldLine_") .map({ case Row(columns: Int, count: Long, line: String) => StructuralReport( columns, count, line ) }) } } case class StructuralReport( fields: Int, metricValue: Double, description: String )
Example 139
Source File: EmptinessProfiler.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.profilers.field import io.gzet.profilers.Utils import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.Dataset import scalaz.Scalaz._ case class EmptinessProfiler() { def profile(df: Dataset[Array[String]]): Dataset[EmptinessReport] = { import df.sparkSession.implicits._ val features = Utils.buildColumns(df) features.map(f => (f.idx, StringUtils.isNotEmpty(f.value))).groupByKey({ case (column, isNotEmpty) => (column, isNotEmpty) }).count().map({ case ((column, isNotEmpty), count) => (column, Map(isNotEmpty -> count)) }).groupByKey({ case (column, map) => column }).reduceGroups({ (v1, v2) => (v1._1, v1._2 |+| v2._2) }).map({ case (col, (_, map)) => val emptiness = map.getOrElse(false, 0L) / (map.getOrElse(true, 0L) + map.getOrElse(false, 0L)).toDouble EmptinessReport( col, emptiness ) }) } } case class EmptinessReport( field: Int, metricValue: Double )
Example 140
Source File: DataFrameFunctions.scala From spark-flow with Apache License 2.0 | 5 votes |
package com.bloomberg.sparkflow.dc

import org.apache.spark.sql.{Column, Dataset, Row}

class DataFrameFunctions(self: DC[Row]) {

  def join(right: DC[Row]): DC[Row] = {
    val f = (left: Dataset[_], right: Dataset[_]) => {
      left.join(right)
    }
    val hashTarget = Seq("join")
    new MultiDatasetTransformDC(self, right, f, hashTarget)
  }

  def join(right: DC[Row], usingColumn: String): DC[Row] = {
    val f = (left: Dataset[_], right: Dataset[_]) => {
      left.join(right, usingColumn)
    }
    val hashTarget = Seq("join", usingColumn)
    new MultiDatasetTransformDC(self, right, f, hashTarget)
  }

  def join(right: DC[Row], joinExprs: Column): DC[Row] = join(right, joinExprs, "inner")

  def join(right: DC[Row], joinExprs: Column, joinType: String): DC[Row] = {
    // Pass joinType through to Dataset.join; the original listing dropped it and always
    // performed an inner join.
    val f = (left: Dataset[_], right: Dataset[_]) => {
      left.join(right, joinExprs, joinType)
    }
    val hashTarget = Seq("join", joinType, joinExprs.toString())
    new MultiDatasetTransformDC(self, right, f, hashTarget)
  }
}
Example 141
Source File: WordCount.scala From Scalaprof with GNU General Public License v2.0 | 5 votes |
package edu.neu.csye._7200 import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.{SparkConf, SparkContext} object WordCount extends App { def wordCount(lines: RDD[String],separator: String) = { lines.flatMap(_.split(separator)) .map((_,1)) .reduceByKey(_ + _) } def wordCount2(lines: RDD[String], separator: String) = { lines.flatMap(_.split(separator)) .filter(!_.contains("He")) .map(_.replace(",", "")) .map((_,1)) .reduceByKey(_ + _) } def wordCount3(lines: RDD[String], separator: String) = { lines.flatMap(_.split(separator)) .filter(myFilter(_, "He")) .map(myReplacer _) .map((_,1)) .reduceByKey(_ + _) } def myFilter(input: String, keyword: String) = !input.contains(keyword) def myReplacer(input: String) = input.replace(",","") case class Word(word: String, count: Int) def createWordDS(ds: Dataset[String], separator: String)(implicit spark:SparkSession) = { import spark.implicits._ ds.flatMap(_.split(separator)) .map((_,1)) .map(Word.tupled) .as[Word] } //For Spark 1.0-1.9 val sc = new SparkContext(new SparkConf().setAppName("WordCount").setMaster("local[*]")) wordCount(sc.textFile("input//WordCount.txt")," ").collect().foreach(println(_)) sc.stop() //For Spark 2.0+ implicit val spark = SparkSession .builder() .appName("WordCount") .master("local[*]") .getOrCreate() wordCount(spark.read.textFile("input//WordCount.txt").rdd," ").collect().foreach(println(_)) //Spark SQL example val wordDS = createWordDS(spark.read.textFile("input//WordCount.txt")," ") wordDS.createTempView("words") wordDS.cache() spark.sql("select word, count(*) from words group by word").show(10) spark.stop() }
Example 142
Source File: SuicidalMonkeyProcessor.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example import org.apache.spark.sql.Dataset import org.apache.spark.sql.streaming.OutputMode import pipelines.streamlets.StreamletShape import pipelines.streamlets.avro._ import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet } import pipelines.spark.sql.SQLImplicits._ class SuicidalMonkeyProcessor extends SparkStreamlet { val in = AvroInlet[Data]("in") val out = AvroOutlet[Data]("out", _.key.toString) val shape = StreamletShape(in, out) val rng = scala.util.Random override def createLogic() = new SparkStreamletLogic { override def buildStreamingQueries = { val outStream = process(readStream(in)) writeStream(outStream, out, OutputMode.Append).toQueryExecution } private def process(inDataset: Dataset[Data]): Dataset[Data] = { inDataset.mapPartitions { iter ⇒ // monkey business // The logic in this processor causes the current executor to crash with a certain probability. // comment out to see the process working if (rng.nextDouble() < SequenceSettings.FailureProbability) { sys.exit(-1) } iter } } } }
Example 143
Source File: SparkSequenceGeneratorIngress.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.LongType import org.apache.spark.sql.streaming.OutputMode import pipelines.streamlets._ import pipelines.streamlets.StreamletShape import pipelines.streamlets.avro._ import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet } import pipelines.spark.sql.SQLImplicits._ class SparkSequenceGeneratorIngress extends SparkStreamlet { val out = AvroOutlet[Data]("out", d ⇒ d.key.toString) val shape = StreamletShape(out) val RecordsPerSecond = IntegerConfigParameter( "records-per-second", "Records per second to process.", Some(50)) override def configParameters = Vector(RecordsPerSecond) override def createLogic() = new SparkStreamletLogic { val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key) override def buildStreamingQueries = { writeStream(process, out, OutputMode.Append).toQueryExecution } private def process: Dataset[Data] = { session.readStream .format("rate") .option("rowsPerSecond", recordsPerSecond) .load() .withColumn("key", ($"value" / SequenceSettings.GroupSize).cast(LongType)) .as[Data] } } }
Example 144
Source File: MovingAverageSparklet.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example import pipelines.streamlets.StreamletShape import pipelines.streamlets.avro._ import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet } import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.TimestampType import pipelines.spark.sql.SQLImplicits._ import org.apache.spark.sql.streaming.OutputMode class MovingAverageSparklet extends SparkStreamlet { val in = AvroInlet[Data]("in") val out = AvroOutlet[Agg]("out", _.src) val shape = StreamletShape(in, out) override def createLogic() = new SparkStreamletLogic { override def buildStreamingQueries = { val dataset = readStream(in) val outStream = process(dataset) writeStream(outStream, out, OutputMode.Append).toQueryExecution } private def process(inDataset: Dataset[Data]): Dataset[Agg] = { val query = inDataset .withColumn("ts", $"timestamp".cast(TimestampType)) .withWatermark("ts", "1 minutes") .groupBy(window($"ts", "1 minute", "30 seconds"), $"src", $"gauge").agg(avg($"value") as "avg") query.select($"src", $"gauge", $"avg" as "value").as[Agg] } } }
Example 145
Source File: SparkRandomGenDataIngress.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example import java.sql.Timestamp import scala.util.Random import pipelines.streamlets.{ IntegerConfigParameter, StreamletShape } import pipelines.streamlets.avro._ import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic } import org.apache.spark.sql.Dataset import org.apache.spark.sql.streaming.OutputMode import pipelines.spark.sql.SQLImplicits._ case class Rate(timestamp: Timestamp, value: Long) class SparkRandomGenDataIngress extends SparkStreamlet { val out = AvroOutlet[Data]("out", d ⇒ d.src) val shape = StreamletShape(out) val RecordsPerSecond = IntegerConfigParameter( "records-per-second", "Records per second to produce.", Some(50)) override def configParameters = Vector(RecordsPerSecond) override def createLogic() = new SparkStreamletLogic { override def buildStreamingQueries = { writeStream(process, out, OutputMode.Append).toQueryExecution } private def process: Dataset[Data] = { val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key) val gaugeGen: () ⇒ String = () ⇒ if (Random.nextDouble() < 0.5) "oil" else "gas" val rateStream = session.readStream .format("rate") .option("rowsPerSecond", recordsPerSecond) .load() .as[Rate] rateStream.map { case Rate(timestamp, value) ⇒ Data(s"src-${value % 100}", timestamp.getTime, gaugeGen(), Random.nextDouble() * value) } } } }
Example 146
Source File: CallStatsAggregator.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.examples.carly.aggregator import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import pipelines.streamlets._ import pipelines.streamlets.avro._ import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic } import org.apache.spark.sql.streaming.OutputMode import pipelines.spark.sql.SQLImplicits._ import org.apache.log4j.{ Level, Logger } import pipelines.examples.carly.data._ class CallStatsAggregator extends SparkStreamlet { val rootLogger = Logger.getRootLogger() rootLogger.setLevel(Level.ERROR) //tag::docs-schemaAware-example[] val in = AvroInlet[CallRecord]("in") val out = AvroOutlet[AggregatedCallStats]("out", _.startTime.toString) val shape = StreamletShape(in, out) //end::docs-schemaAware-example[] val GroupByWindow = DurationConfigParameter( "group-by-window", "Window duration for the moving average computation", Some("1 minute")) val Watermark = DurationConfigParameter( "watermark", "Late events watermark duration: how long to wait for late events", Some("1 minute")) override def configParameters = Vector(GroupByWindow, Watermark) override def createLogic = new SparkStreamletLogic { val watermark = context.streamletConfig.getDuration(Watermark.key) val groupByWindow = context.streamletConfig.getDuration(GroupByWindow.key) //tag::docs-aggregationQuery-example[] override def buildStreamingQueries = { val dataset = readStream(in) val outStream = process(dataset) writeStream(outStream, out, OutputMode.Update).toQueryExecution } private def process(inDataset: Dataset[CallRecord]): Dataset[AggregatedCallStats] = { val query = inDataset .withColumn("ts", $"timestamp".cast(TimestampType)) .withWatermark("ts", s"${watermark.toMillis()} milliseconds") .groupBy(window($"ts", s"${groupByWindow.toMillis()} milliseconds")) .agg(avg($"duration") as "avgCallDuration", sum($"duration") as "totalCallDuration") .withColumn("windowDuration", $"window.end".cast(LongType) - $"window.start".cast(LongType)) query .select($"window.start".cast(LongType) as "startTime", $"windowDuration", $"avgCallDuration", $"totalCallDuration") .as[AggregatedCallStats] } //end::docs-aggregationQuery-example[] } }
Example 147
Source File: CallRecordGeneratorIngress.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.examples.carly.aggregator import java.sql.Timestamp import scala.util.Random import scala.concurrent.duration._ import org.apache.spark.sql.{ Dataset, SparkSession } import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.LongType import pipelines.streamlets._ import pipelines.streamlets.avro._ import pipelines.spark.sql.SQLImplicits._ import pipelines.examples.carly.data.CallRecord import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic } import org.apache.log4j.{ Level, Logger } case class Rate(timestamp: Timestamp, value: Long) class CallRecordGeneratorIngress extends SparkStreamlet { val rootLogger = Logger.getRootLogger() rootLogger.setLevel(Level.ERROR) val RecordsPerSecond = IntegerConfigParameter( "records-per-second", "Records per second to process.", Some(50)) override def configParameters = Vector(RecordsPerSecond) val out = AvroOutlet[CallRecord]("out", _.user) val shape = StreamletShape(out) override def createLogic() = new SparkStreamletLogic { val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key) override def buildStreamingQueries = { val outStream = DataGenerator.mkData(super.session, recordsPerSecond) writeStream(outStream, out, OutputMode.Append).toQueryExecution } } } object DataGenerator { def mkData(session: SparkSession, recordsPerSecond: Int): Dataset[CallRecord] = { // do we need to expose this through configuration? val MaxTime = 2.hours.toMillis val MaxUsers = 100000 val TS0 = new java.sql.Timestamp(0) val ZeroTimestampProb = 0.05 // error rate // Random Data Generator val usersUdf = udf(() ⇒ "user-" + Random.nextInt(MaxUsers)) val directionUdf = udf(() ⇒ if (Random.nextDouble() < 0.5) "incoming" else "outgoing") // Time-biased randomized filter - 1/2 hour cycles val sinTime: Long ⇒ Double = t ⇒ Math.sin((t / 1000 % 1800) * 1.0 / 1800 * Math.PI) val timeBoundFilter: Long ⇒ Double ⇒ Boolean = t ⇒ prob ⇒ (sinTime(t) + 0.5) > prob val timeFilterUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ timeBoundFilter(ts.getTime)(rng)) val zeroTimestampUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ { if (rng < ZeroTimestampProb) { TS0 } else { ts } }) val rateStream = session.readStream .format("rate") .option("rowsPerSecond", recordsPerSecond) .load() .as[Rate] val randomDataset = rateStream.withColumn("rng", rand()).withColumn("tsRng", rand()) val sampledData = randomDataset.where(timeFilterUdf($"timestamp", $"rng")) .withColumn("user", usersUdf()) .withColumn("other", usersUdf()) .withColumn("direction", directionUdf()) .withColumn("duration", (round(abs(rand()) * MaxTime)).cast(LongType)) .withColumn("updatedTimestamp", zeroTimestampUdf($"timestamp", $"tsRng")) .select($"user", $"other", $"direction", $"duration", $"updatedTimestamp" as "timestamp") .as[CallRecord] sampledData } }
Example 148
Source File: IdentitySparkProcessor1.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example

import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import pipelines.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

class IdentitySparkProcessor1 extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.src)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      writeStream(readStream(in).map(d ⇒ d.copy(t1 = TimeOps.nowAsOption)), out, OutputMode.Append).toQueryExecution
    }
  }
}
Example 149
Source File: SparkRandomGenDataIngress.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example import java.sql.Timestamp import scala.util.Random import pipelines.streamlets.{ DurationConfigParameter, IntegerConfigParameter, StreamletShape } import pipelines.streamlets.avro._ import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic } import org.apache.spark.sql.Dataset import org.apache.spark.sql.streaming.{ OutputMode, Trigger } import pipelines.spark.sql.SQLImplicits._ case class Rate(timestamp: Timestamp, value: Long) class SparkRandomGenDataIngress extends SparkStreamlet { val out = AvroOutlet[Data]("out", d ⇒ d.src) val shape = StreamletShape(out) val RecordsPerSecond = IntegerConfigParameter( "records-per-second", "Records per second to produce.", Some(50)) val RampUpTime = DurationConfigParameter( "ramp-up-time", "Time to reach max records per second.", Some("0 seconds")) override def configParameters = Vector(RecordsPerSecond, RampUpTime) override def createLogic() = new SparkStreamletLogic { override def buildStreamingQueries = { writeStream(process, out, OutputMode.Append).toQueryExecution } private def process: Dataset[Data] = { val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key) val rampUpTime = context.streamletConfig.getDuration(RampUpTime.key, java.util.concurrent.TimeUnit.SECONDS) println(s"Using rampup time of $rampUpTime seconds") val gaugeGen: () ⇒ String = () ⇒ if (Random.nextDouble() < 0.5) "oil" else "gas" val rateStream = session.readStream .format("rate") .option("rowsPerSecond", recordsPerSecond) .option("rampUpTime", s"${rampUpTime}s") .load() .as[Rate] rateStream.map { case Rate(timestamp, value) ⇒ Data(s"src-${value % 1000}", timestamp.getTime, None, None, gaugeGen(), value) } } } }
Example 150
Source File: MyDatasetFunc.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter3 import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Dataset, SparkSession} //import spark.ml.cookbook.chapter3.{Car, MyDatasetData} //import scala.collection.mutable import scala.collection.mutable.ListBuffer object MyDatasetFunc { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) // setup SparkSession to use for interactions with Spark val spark = SparkSession .builder .master("local[*]") .appName("mydatasetfunc") .config("spark.sql.warehouse.dir", ".") .getOrCreate() import spark.implicits._ val cars = spark.createDataset(MyDatasetData.carData) cars.show(false) val modelData = cars.groupByKey(_.make).mapGroups({ case (make, car) => { val carModel = new ListBuffer[String]() car.map(_.model).foreach({ c => carModel += c }) (make, carModel) } }) modelData.show(false) spark.stop() } }
Example 151
Source File: CardinalityProfiler.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.profilers.field import io.gzet.profilers.Utils import org.apache.spark.sql.functions.col import org.apache.spark.sql.{Dataset, Row} import scala.collection.mutable import scalaz.Scalaz._ case class CardinalityProfiler(topN: Int = 5) { def profile(df: Dataset[Array[String]]): Dataset[CardinalityReport] = { val total = df.sparkSession.sparkContext.broadcast(df.count()) import df.sparkSession.implicits._ val features = Utils.buildColumns(df) val topNValues = features.groupByKey({ field => field }).count().map({ case (field, count) => (field.idx, Map(field.value -> count)) }).groupByKey({ case (column, map) => column }).reduceGroups({ (v1, v2) => val m1 = v1._2 val m2 = v2._2 val m = (m1 |+| m2).toSeq.sortBy(_._2).reverse (v1._1, m.take(math.min(m.size, topN)).toMap) }).map({ case (column, (_, map)) => val top = map.keySet.toArray (column, top) }) .withColumnRenamed("_1", "_topNValues_") .withColumnRenamed("_2", "description") val cardinalities = features.distinct().groupByKey(_.idx).count().map({ case (column, distinctValues) => val cardinality = distinctValues / total.value.toDouble (column, cardinality) }) .withColumnRenamed("_1", "column") .withColumnRenamed("_2", "cardinality") cardinalities.join(topNValues, col("column") === col("_topNValues_")) .drop("_topNValues_") .map({ case Row(column: Int, cardinality: Double, description: mutable.WrappedArray[String]) => CardinalityReport( column, cardinality, description.toArray ) }) } } case class CardinalityReport( field: Int, metricValue: Double, description: Array[String] )
Example 152
Source File: MultiGroupedTransformDC.scala From spark-flow with Apache License 2.0 | 5 votes |
package com.bloomberg.sparkflow.dc import com.bloomberg.sparkflow.serialization.Hashing import org.apache.spark.sql.{Dataset, Encoder, KeyValueGroupedDataset, SparkSession} import scala.concurrent.duration.Duration import scala.concurrent.{Await, Future} import scala.reflect.ClassTag import scala.concurrent.ExecutionContext.Implicits.global class MultiGroupedTransformDC[K, V, U, T: ClassTag] (left: KeyValueGroupedDC[K, V], right: KeyValueGroupedDC[K, U], f: (KeyValueGroupedDataset[K, V], KeyValueGroupedDataset[K, U]) => Dataset[T]) (implicit tEncoder: Encoder[T]) extends DC[T](tEncoder, Seq(left, right)) { override def computeDataset(spark: SparkSession) = { val leftFuture = Future{left.get(spark)} val rightFuture = Future{right.get(spark)} val ld = Await.result(leftFuture, Duration.Inf) val rd = Await.result(rightFuture, Duration.Inf) val dataset = f(ld, rd) dataset } override def computeSignature() = { Hashing.hashString(left.getSignature + right.getSignature + Hashing.hashClass(f)) } }
Example 153
Source File: DatasetTransformDC.scala From spark-flow with Apache License 2.0 | 5 votes |
package com.bloomberg.sparkflow.dc import com.bloomberg.sparkflow.serialization.Hashing._ import org.apache.spark.sql.{Dataset, Encoder, SparkSession} private[sparkflow] class DatasetTransformDC[U, T] (encoder: Encoder[U], val prev: DC[T], f: (Dataset[T]) => Dataset[U], hashTargets: Seq[String]) extends DC[U](encoder, Seq(prev)) { // // def this(prev: DC[T], f: Dataset[T] => Dataset[U], hashTarget: AnyRef)(implicit tEncoder: Encoder[T], uEncoder: Encoder[U]) = { // this(prev, uEncoder, f, Seq(hashClass(hashTarget))) // } // // def this(prev: DC[T], f: Dataset[T] => Dataset[U], hashTarget: AnyRef, hashTargets: Seq[String])(implicit tEncoder: Encoder[T], uEncoder: Encoder[U]) = { // this(prev,uEncoder, f, hashClass(hashTarget) +: hashTargets) // } def computeDataset(spark: SparkSession) = { val dataset = f(prev.getDataset(spark)) dataset } override def computeSignature() = { hashString(prev.getSignature + hashSeq(hashTargets)) } }
Example 154
Source File: DataFrameWordCountTest.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.dataframe import com.github.dnvriend.TestSpec import org.apache.spark.sql.{ DataFrame, Dataset } class DataFrameWordCountTest extends TestSpec { it should "wordcount alice in wonderland" in withSparkSession { spark => import org.apache.spark.sql.functions._ import spark.implicits._ val lines: Dataset[String] = spark.read.text(TestSpec.AliceInWonderlandText).as[String] lines.count shouldBe 3599 // alice in wonderland contains 3599 lines val words: DataFrame = lines.flatMap((line: String) => line.split(" ")).map(_.trim).filter(_.nonEmpty).toDF("word") words.count() shouldBe 26467 // there are 26,467 words in the book, excluding spaces val wordCount: Dataset[(String, Long)] = words.groupBy('word).agg(count('word).as("count")).orderBy('count.desc).as[(String, Long)].cache wordCount.take(1).head shouldBe ("the", 1505) // the word 'the' is used 1505 times wordCount.filter(lower('word) === "alice").take(1).head shouldBe ("Alice", 221) wordCount.filter(lower('word) === "queen").take(1).head shouldBe ("Queen", 34) wordCount.filter(lower('word) === "rabbit").take(1).head shouldBe ("Rabbit", 29) wordCount.filter(lower('word) === "cheshire").take(1).head shouldBe ("Cheshire", 6) } }
Example 155
Source File: DatasetTest.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.dataset import com.github.dnvriend.TestSpec import com.github.dnvriend.spark._ import com.github.dnvriend.spark.datasources.person.Person import org.apache.spark.sql.Dataset class DatasetTest extends TestSpec { lazy val xs = Seq( Person(1, "foo", 30), Person(2, "bar", 21), Person(3, "baz", 25), Person(4, "jaz", 40), Person(5, "bab", 50) ) it should "typed Dataset operations: count" in withSparkSession { spark => import spark.implicits._ val ds: Dataset[Person] = xs.toDS() ds.count() shouldBe 5 } it should "untyped Dataset operations: (aka DataFrame, everything is a Row)" in withSparkSession { spark => import spark.implicits._ val ds = xs.toDS() ds.createOrReplaceTempView("people") ds.sqlContext.sql("SELECT COUNT(*) FROM people") // Array[Row] .head.getLong(0) shouldBe 5 } it should "count SQL, convert back to typed with .as[Long]" in withSparkSession { spark => import spark.implicits._ val ds = xs.toDS() ds.createOrReplaceTempView("people") ds.sqlContext.sql("SELECT COUNT(*) FROM people").as[Long].head() shouldBe 5 } it should "count using dataset operations" in withSparkSession { spark => import spark.implicits._ val ds = xs.toDS() ds.count() shouldBe 5 } it should "filter a ds" in withSparkSession { spark => import spark.implicits._ val ds = xs.toDS() ds.filter(_.age < 30).count shouldBe 2 ds.filter(_.age > 30).count shouldBe 2 ds.filter(_.age >= 30).count shouldBe 3 } it should "load people parquet" in withSparkSession { spark => val people = spark.read.parquet(TestSpec.PeopleParquet) people.count shouldBe 5 } it should "load purchase_items parquet" in withSparkSession { spark => val people = spark.read.parquet(TestSpec.PurchaseItems) people.count shouldBe 25 } it should "load transactions parquet" in withSparkSession { spark => import spark.implicits._ val tx = spark.read.parquet(TestSpec.Transactions).as[Transaction] tx.count shouldBe 1000 } }
Example 156
Source File: WithUtils.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.common

import java.io.Closeable
import java.util.concurrent.locks.Lock

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset
import org.apache.spark.storage.StorageLevel

import scala.util.control.NonFatal

object WithUtils {

  def withCloseable[T <: Closeable, R](closeable: T)(func: T => R): R = {
    var triedToClose = false
    try {
      func(closeable)
    } catch {
      case NonFatal(e) =>
        try {
          closeable.close()
        } catch {
          case NonFatal(t) => e.addSuppressed(t)
        }
        triedToClose = true
        throw e
    } finally {
      // if we haven't tried to close it in the exception handler, try here.
      if (!triedToClose) {
        closeable.close()
      }
    }
  }

  def withLock[T](lock: Lock)(f: => T): T = {
    lock.lock()
    try {
      f
    } finally {
      lock.unlock()
    }
  }

  def withCachedRDD[T, U](rdd: RDD[T])(f: RDD[T] => U): U = {
    // Caching in MEMORY_ONLY (or even MEMORY_AND_DISK) can result in OOMs
    rdd.persist(StorageLevel.DISK_ONLY)
    try {
      f(rdd)
    } finally {
      rdd.unpersist()
    }
  }

  def withCachedDataset[T, U](ds: Dataset[T])(f: Dataset[T] => U): U = {
    // Caching in MEMORY_ONLY (or even MEMORY_AND_DISK) can result in OOMs
    ds.persist(StorageLevel.DISK_ONLY)
    try {
      f(ds)
    } finally {
      ds.unpersist()
    }
  }
}
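A quick usage sketch for WithUtils.withCachedDataset above. The Dataset ds and the count inside are assumptions for illustration.

// Hypothetical: persist `ds` on disk only for the duration of the computation.
val rowCount: Long = WithUtils.withCachedDataset(ds) { cached =>
  cached.count() // the Dataset is unpersisted in the finally block
}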
Example 157
Source File: KCore.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.graph.kcore import com.tencent.angel.sona.context.PSContext import org.apache.spark.SparkContext import com.tencent.angel.sona.graph.params._ import com.tencent.angel.sona.ml.Transformer import com.tencent.angel.sona.ml.param.ParamMap import com.tencent.angel.sona.ml.util.Identifiable import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.storage.StorageLevel class KCore(override val uid: String) extends Transformer with HasSrcNodeIdCol with HasDstNodeIdCol with HasOutputNodeIdCol with HasOutputCoreIdCol with HasStorageLevel with HasPartitionNum with HasPSPartitionNum with HasUseBalancePartition { def this() = this(Identifiable.randomUID("KCore")) override def transform(dataset: Dataset[_]): DataFrame = { val edges = dataset.select($(srcNodeIdCol), $(dstNodeIdCol)).rdd .map(row => (row.getLong(0), row.getLong(1))) .filter(e => e._1 != e._2) edges.persist(StorageLevel.DISK_ONLY) val maxId = edges.map(e => math.max(e._1, e._2)).max() + 1 val minId = edges.map(e => math.min(e._1, e._2)).min() val nodes = edges.flatMap(e => Iterator(e._1, e._2)) val numEdges = edges.count() println(s"minId=$minId maxId=$maxId numEdges=$numEdges level=${$(storageLevel)}") // Start PS and init the model println("start to run ps") PSContext.getOrCreate(SparkContext.getOrCreate()) val model = KCorePSModel.fromMinMax(minId, maxId, nodes, $(psPartitionNum), $(useBalancePartition)) var graph = edges.flatMap(e => Iterator((e._1, e._2), (e._2, e._1))) .groupByKey($(partitionNum)) .mapPartitionsWithIndex((index, edgeIter) => Iterator(KCoreGraphPartition.apply(index, edgeIter))) graph.persist($(storageLevel)) graph.foreachPartition(_ => Unit) graph.foreach(_.initMsgs(model)) var curIteration = 0 var numMsgs = model.numMsgs() var prev = graph println(s"numMsgs=$numMsgs") do { curIteration += 1 graph = prev.map(_.process(model, numMsgs, curIteration == 1)) graph.persist($(storageLevel)) graph.count() prev.unpersist(true) prev = graph model.resetMsgs() numMsgs = model.numMsgs() println(s"curIteration=$curIteration numMsgs=$numMsgs") } while (numMsgs > 0) val retRDD = graph.map(_.save()).flatMap{case (nodes,cores) => nodes.zip(cores)} .map(r => Row.fromSeq(Seq[Any](r._1, r._2))) dataset.sparkSession.createDataFrame(retRDD, transformSchema(dataset.schema)) } override def transformSchema(schema: StructType): StructType = { StructType(Seq( StructField(s"${$(outputNodeIdCol)}", LongType, nullable = false), StructField(s"${$(outputCoreIdCol)}", IntegerType, nullable = false) )) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) }
Example 158
Source File: Correlation.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.stat import org.apache.spark.linalg.{SQLDataTypes, Vector} import scala.collection.JavaConverters._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.types.{StructField, StructType} /** * API for correlation functions in MLlib, compatible with DataFrames and Datasets. * * The functions in this package generalize the functions in [[org.apache.spark.sql.Dataset#stat]] * to spark.ml's Vector types. */ object Correlation { /** * :: Experimental :: * Compute the correlation matrix for the input Dataset of Vectors using the specified method. * Methods currently supported: `pearson` (default), `spearman`. * * @param dataset A dataset or a dataframe * @param column The name of the column of vectors for which the correlation coefficient needs * to be computed. This must be a column of the dataset, and it must contain * Vector objects. * @param method String specifying the method to use for computing correlation. * Supported: `pearson` (default), `spearman` * @return A dataframe that contains the correlation matrix of the column of vectors. This * dataframe contains a single row and a single column of name * '$METHODNAME($COLUMN)'. * @throws IllegalArgumentException if the column is not a valid column in the dataset, or if * the content of this column is not of type Vector. * * Here is how to access the correlation coefficient: * {{{ * val data: Dataset[Vector] = ... * val Row(coeff: Matrix) = Correlation.corr(data, "value").head * // coeff now contains the Pearson correlation matrix. * }}} * * @note For Spearman, a rank correlation, we need to create an RDD[Double] for each column * and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector], * which is fairly costly. Cache the input Dataset before calling corr with `method = "spearman"` * to avoid recomputing the common lineage. */ def corr(dataset: Dataset[_], column: String, method: String): DataFrame = { val rdd = dataset.select(column).rdd.map { case Row(v: Vector) => v } val oldM = Statistics.corr(rdd, method) val name = s"$method($column)" val schema = StructType(Array(StructField(name, SQLDataTypes.MatrixType, nullable = false))) dataset.sparkSession.createDataFrame(Seq(Row(oldM)).asJava, schema) } /** * Compute the Pearson correlation matrix for the input Dataset of Vectors. */ def corr(dataset: Dataset[_], column: String): DataFrame = { corr(dataset, column, "pearson") } }
Example 159
Source File: Estimator.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml import com.tencent.angel.sona.ml.param.{ParamMap, ParamPair} import scala.annotation.varargs import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.sql.Dataset /** * :: DeveloperApi :: * Abstract class for estimators that fit models to data. */ @DeveloperApi abstract class Estimator[M <: Model[M]] extends PipelineStage { /** * Fits a single model to the input data with optional parameters. * * @param dataset input dataset * @param firstParamPair the first param pair, overrides embedded params * @param otherParamPairs other param pairs. These values override any specified in this * Estimator's embedded ParamMap. * @return fitted model */ @varargs def fit(dataset: Dataset[_], firstParamPair: ParamPair[_], otherParamPairs: ParamPair[_]*): M = { val map = new ParamMap() .put(firstParamPair) .put(otherParamPairs: _*) fit(dataset, map) } /** * Fits a single model to the input data with provided parameter map. * * @param dataset input dataset * @param paramMap Parameter map. * These values override any specified in this Estimator's embedded ParamMap. * @return fitted model */ def fit(dataset: Dataset[_], paramMap: ParamMap): M = { copy(paramMap).fit(dataset) } /** * Fits a model to the input data. */ def fit(dataset: Dataset[_]): M /** * Fits multiple models to the input data with multiple sets of parameters. * The default implementation uses a for loop on each parameter map. * Subclasses could override this to optimize multi-model training. * * @param dataset input dataset * @param paramMaps An array of parameter maps. * These values override any specified in this Estimator's embedded ParamMap. * @return fitted models, matching the input parameter maps */ def fit(dataset: Dataset[_], paramMaps: Array[ParamMap]): Seq[M] = { paramMaps.map(fit(dataset, _)) } override def copy(extra: ParamMap): Estimator[M] }
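To show how the `fit` overloads above compose, here is a hedged sketch; `myEstimator`, its params `maxIter` and `stepSize`, and `trainDF` are hypothetical placeholders, and `ParamPair` is assumed to be the usual case class from the imported param package.

// Sketch only: `myEstimator` is a hypothetical concrete Estimator, `trainDF` a Dataset
import com.tencent.angel.sona.ml.param.{ParamMap, ParamPair}

// Single fit with overrides: copies the estimator with the map applied, then calls fit(dataset)
val overrides = new ParamMap()
  .put(ParamPair(myEstimator.maxIter, 20))
  .put(ParamPair(myEstimator.stepSize, 0.05))
val model = myEstimator.fit(trainDF, overrides)

// Fitting several configurations returns one model per ParamMap, in order
val models = myEstimator.fit(trainDF, Array(overrides, new ParamMap()))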
Example 160
Source File: RegressionEvaluator.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.evaluation import com.tencent.angel.sona.ml.evaluation.evaluating.RegressionSummaryImpl import com.tencent.angel.sona.ml.param.{Param, ParamMap, ParamValidators} import com.tencent.angel.sona.ml.param.shared.{HasLabelCol, HasPredictionCol} import com.tencent.angel.sona.ml.util._ import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{DoubleType, FloatType} import org.apache.spark.sql.util.SONASchemaUtils /** * :: Experimental :: * Evaluator for regression, which expects two input columns: prediction and label. */ final class RegressionEvaluator(override val uid: String) extends Evaluator with HasPredictionCol with HasLabelCol with DefaultParamsWritable { def this() = this(Identifiable.randomUID("regEval")) /** * Param for metric name in evaluation. Supports: * - `"rmse"` (default): root mean squared error * - `"mse"`: mean squared error * - `"r2"`: R^2^ metric * - `"mae"`: mean absolute error * * @group param */ val metricName: Param[String] = { val allowedParams = ParamValidators.inArray(Array("mse", "rmse", "r2", "mae")) new Param(this, "metricName", "metric name in evaluation (mse|rmse|r2|mae)", allowedParams) } def getMetricName: String = $(metricName) def setMetricName(value: String): this.type = set(metricName, value) def setPredictionCol(value: String): this.type = set(predictionCol, value) def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SONASchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SONASchemaUtils.checkNumericType(schema, $(labelCol)) val summary = new RegressionSummaryImpl(dataset.toDF(), $(predictionCol), $(labelCol)) val metrics = summary.regMetrics val metric = $(metricName) match { case "rmse" => summary.rmse case "mse" => summary.mse case "r2" => summary.r2 case "mae" => summary.absDiff } metric } override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { override def load(path: String): RegressionEvaluator = super.load(path) }
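A short, hedged usage sketch of the evaluator above; the `predictions` DataFrame and its column names are assumptions.

// Sketch only: `predictions` is assumed to have a Double/Float "prediction" column and a numeric "label" column
val evaluator = new RegressionEvaluator()
  .setMetricName("mae")            // one of mse | rmse | r2 | mae (default rmse)
  .setLabelCol("label")
  .setPredictionCol("prediction")

val mae = evaluator.evaluate(predictions)
// isLargerBetter is false for mae, so lower values indicate a better fit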
Example 161
Source File: GenericFunSpecSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test import org.scalatest.FunSpec import org.apache.spark.sql.Dataset class GenericFunSpecSuite extends FunSpec with SharedSparkSession { import testImplicits._ private def ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS describe("Simple Dataset") { it("should have the specified number of elements") { assert(8 === ds.count) } it("should have the specified number of unique elements") { assert(8 === ds.distinct.count) } it("should have the specified number of elements in each column") { assert(8 === ds.select("_1").count) assert(8 === ds.select("_2").count) } it("should have the correct number of distinct elements in each column") { assert(8 === ds.select("_1").distinct.count) assert(4 === ds.select("_2").distinct.count) } } }
Example 162
Source File: A_1_BasicOperation.scala From wow-spark with MIT License | 5 votes |
package com.sev7e0.wow.structured_streaming

import java.sql.Timestamp

import org.apache.spark.sql.types.{DoubleType, StringType, StructType, TimestampType}
import org.apache.spark.sql.{Dataset, SparkSession}

object A_1_BasicOperation {

  // Date-time fields in a case class must use java.sql.Timestamp; Catalyst maps it to TimestampType
  case class DeviceData(device: String, deviceType: String, signal: Double, time: Timestamp)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName(A_1_BasicOperation.getClass.getName)
      .master("local")
      .getOrCreate()

    val deviceSchema = new StructType()
      .add("device", StringType)
      .add("deviceType", StringType)
      .add("signal", DoubleType)
      .add("time", TimestampType)

    val dataFrame = spark.read.schema(deviceSchema).json("src/main/resources/sparkresource/device.json")
    import spark.implicits._
    val ds: Dataset[DeviceData] = dataFrame.as[DeviceData]

    // Untyped (SQL-like) query
    dataFrame.select("device").where("signal>10").show()
    // Typed query
    ds.filter(_.signal > 10).map(_.device).show()

    // Untyped groupBy with a count per device type
    dataFrame.groupBy("deviceType").count().show()

    import org.apache.spark.sql.expressions.scalalang.typed
    // Typed aggregation: average signal value per device type
    ds.groupByKey(_.deviceType).agg(typed.avg(_.signal)).show()

    // Alternatively, register a temporary view and query it with SQL
    dataFrame.createOrReplaceTempView("device")
    spark.sql("select * from device").show()

    // isStreaming tells whether the DataFrame is backed by a streaming source
    println(dataFrame.isStreaming)
  }
}
Example 163
Source File: Writer.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.tests.pileup import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} object Writer { val mapToString = (map: Map[Byte, Short]) => { if (map == null) "null" else map.map({ case (k, v) => k.toChar -> v }).toSeq.sortBy(_._1).mkString.replace(" -> ", ":") } def saveToFile(spark: SparkSession, res: Dataset[Row], path: String) = { spark.udf.register("mapToString", mapToString) res .selectExpr("contig", "pos_start", "pos_end", "ref", "cast(coverage as int)", "mapToString(alts)") .coalesce(1) .write .mode(SaveMode.Overwrite) .csv(path) } }
Example 164
Source File: PileupTestBase.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.tests.pileup import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext} import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} import org.apache.spark.sql.types.{IntegerType, ShortType, StringType, StructField, StructType} import org.scalatest.{BeforeAndAfter, FunSuite} class PileupTestBase extends FunSuite with DataFrameSuiteBase with BeforeAndAfter with SharedSparkContext{ val sampleId = "NA12878.multichrom.md" val samResPath: String = getClass.getResource("/multichrom/mdbam/samtools.pileup").getPath val referencePath: String = getClass.getResource("/reference/Homo_sapiens_assembly18_chr1_chrM.small.fasta").getPath val bamPath: String = getClass.getResource(s"/multichrom/mdbam/${sampleId}.bam").getPath val cramPath : String = getClass.getResource(s"/multichrom/mdcram/${sampleId}.cram").getPath val tableName = "reads_bam" val tableNameCRAM = "reads_cram" val schema: StructType = StructType( List( StructField("contig", StringType, nullable = true), StructField("position", IntegerType, nullable = true), StructField("reference", StringType, nullable = true), StructField("coverage", ShortType, nullable = true), StructField("pileup", StringType, nullable = true), StructField("quality", StringType, nullable = true) ) ) before { System.setProperty("spark.kryo.registrator", "org.biodatageeks.sequila.pileup.serializers.CustomKryoRegistrator") spark .conf.set("spark.sql.shuffle.partitions",1) //FIXME: In order to get orderBy in Samtools tests working - related to exchange partitions stage spark.sql(s"DROP TABLE IF EXISTS $tableName") spark.sql( s""" |CREATE TABLE $tableName |USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource |OPTIONS(path "$bamPath") | """.stripMargin) spark.sql(s"DROP TABLE IF EXISTS $tableNameCRAM") spark.sql( s""" |CREATE TABLE $tableNameCRAM |USING org.biodatageeks.sequila.datasources.BAM.CRAMDataSource |OPTIONS(path "$cramPath", refPath "$referencePath" ) | """.stripMargin) val mapToString = (map: Map[Byte, Short]) => { if (map == null) "null" else map.map({ case (k, v) => k.toChar -> v}).mkString.replace(" -> ", ":") } val byteToString = ((byte: Byte) => byte.toString) spark.udf.register("mapToString", mapToString) spark.udf.register("byteToString", byteToString) } }
Example 165
Source File: KafkaSource.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
package kafka

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.types.{StringType, _}
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper

def read(startingOption: String = "startingOffsets", partitionsAndOffsets: String = "earliest") : Dataset[SimpleSongAggregationKafka] = {
    log.warn("Reading from Kafka")

    spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", KafkaService.topicName)
      .option("enable.auto.commit", false) // Cannot be set to true in Spark Structured Streaming https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#kafka-specific-configurations
      .option("group.id", "Structured-Streaming-Examples")
      .option("failOnDataLoss", false) // When starting against a fresh Kafka (its default data dir is temporary, /tmp, while Cassandra's is not, /var/lib), the offsets saved in Cassandra may differ from the real offsets in Kafka (which contains nothing yet)
      .option(startingOption, partitionsAndOffsets) // This only applies when a new query is started; resuming always picks up from where the query left off
      .load()
      .withColumn(KafkaService.radioStructureName, // nested structure holding our JSON payload
        from_json($"value".cast(StringType), KafkaService.schemaOutput) // from binary to JSON object
      ).as[SimpleSongAggregationKafka]
      .filter(_.radioCount != null) // TODO find a better way to filter out bad JSON
  }
}
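The `read` helper above leaves the starting position configurable; as a hedged illustration, a caller could pin explicit per-partition offsets with the JSON form the Kafka source accepts. The enclosing object name `KafkaSource` is inferred from the file name, and the topic and offsets are made up.

// Sketch only: start from explicit offsets instead of "earliest"
val fromOffsets = KafkaSource.read(
  startingOption = "startingOffsets",
  partitionsAndOffsets = """{"test":{"0":23,"1":-2}}""" // -2 means "earliest", -1 means "latest" for a partition
)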
Example 166
Source File: KafkaSink.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
package kafka import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{struct, to_json, _} import _root_.log.LazyLogger import org.apache.spark.sql.streaming.StreamingQuery import org.apache.spark.sql.types.{StringType, _} import radio.{SimpleSongAggregation, SimpleSongAggregationKafka} import spark.SparkHelper object KafkaSink extends LazyLogger { private val spark = SparkHelper.getSparkSession() import spark.implicits._ def writeStream(staticInputDS: Dataset[SimpleSongAggregation]) : StreamingQuery = { log.warn("Writing to Kafka") staticInputDS .select(to_json(struct($"*")).cast(StringType).alias("value")) .writeStream .outputMode("update") .format("kafka") .option("kafka.bootstrap.servers", KafkaService.bootstrapServers) .queryName("Kafka - Count number of broadcasts for a title/artist by radio") .option("topic", "test") .start() } def debugStream(staticKafkaInputDS: Dataset[SimpleSongAggregationKafka]) = { staticKafkaInputDS .writeStream .queryName("Debug Stream Kafka") .format("console") .start() } }
Example 167
Source File: CassandraSink.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
package cassandra.StreamSinkProvider

import cassandra.{CassandraDriver, CassandraKafkaMetadata}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.functions.max
import spark.SparkHelper
import com.datastax.spark.connector._
import kafka.KafkaMetadata
import log.LazyLogger
import org.apache.spark.sql.types.LongType
import radio.SimpleSongAggregation

private def saveKafkaMetaData(df: DataFrame) = {
    val kafkaMetadata = df
      .groupBy($"partition")
      .agg(max($"offset").cast(LongType).as("offset"))
      .as[KafkaMetadata]

    log.warn("Saving Kafka metadata (partition and offset per topic; only one topic in our example)")
    kafkaMetadata.show()

    kafkaMetadata.rdd.saveToCassandra(CassandraDriver.namespace,
      CassandraDriver.kafkaMetadata,
      SomeColumns("partition", "offset")
    )

    // Another way to save the offsets in Cassandra
    //kafkaMetadata.collect().foreach(CassandraKafkaMetadata.save)
  }
}
Example 168
Source File: ElasticSink.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
package elastic

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
import radio.{SimpleSongAggregation, Song}
import org.elasticsearch.spark.sql.streaming._
import org.elasticsearch.spark.sql._
import org.elasticsearch.spark.sql.streaming.EsSparkSqlStreamingSink

object ElasticSink {
  def writeStream(ds: Dataset[Song]): StreamingQuery = {
    ds // Append output mode is not supported when there are streaming aggregations on streaming DataFrames/Datasets without a watermark
      .writeStream
      .outputMode(OutputMode.Append) // the only output mode supported by the ES sink
      .format("org.elasticsearch.spark.sql") // Elasticsearch
      .queryName("ElasticSink")
      .start("test/broadcast") // ES index
  }
}
Example 169
Source File: MapGroupsWithState.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
package mapGroupsWithState

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.types.StringType
import spark.SparkHelper
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}
import radio.{ArtistAggregationState, SimpleSongAggregation, SimpleSongAggregationKafka}

object MapGroupsWithState extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  def updateArtistStateWithEvent(state: ArtistAggregationState, artistCount: SimpleSongAggregation) = {
    log.warn("MapGroupsWithState - updateArtistStateWithEvent")
    if (state.artist == artistCount.artist) {
      ArtistAggregationState(state.artist, state.count + artistCount.count)
    } else {
      state
    }
  }

  def updateAcrossEvents(artist: String,
                         inputs: Iterator[SimpleSongAggregation],
                         oldState: GroupState[ArtistAggregationState]): ArtistAggregationState = {

    var state: ArtistAggregationState = if (oldState.exists)
      oldState.get
    else
      ArtistAggregationState(artist, 1L)

    // For every row, count the number of broadcasts per artist, instead of per artist, title and radio
    for (input <- inputs) {
      state = updateArtistStateWithEvent(state, input)
      oldState.update(state)
    }

    state
  }

  def write(ds: Dataset[SimpleSongAggregationKafka]) = {
    ds.select($"radioCount.title", $"radioCount.artist", $"radioCount.radio", $"radioCount.count")
      .as[SimpleSongAggregation]
      .groupByKey(_.artist)
      .mapGroupsWithState(GroupStateTimeout.NoTimeout)(updateAcrossEvents) // GroupStateTimeout lets us control what happens to the state when no update is received within a timeout (NoTimeout here)
      .writeStream
      .outputMode(OutputMode.Update())
      .format("console")
      .queryName("mapGroupsWithState - counting artist broadcast")
      .start()
  }
}
Example 170
Source File: ParquetService.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
package parquetHelper import log.LazyLogger import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types._ import radio.{SimpleSongAggregation, Song} import spark.SparkHelper object ParquetService extends LazyLogger { val pathRadioStationSongs = "data/allRadioPartitionByRadioAndDate.parquet" val pathRadioES = "data/broadcast.parquet" private val spark = SparkHelper.getSparkSession() import spark.implicits._ val schema = new StructType() .add("timestamp", TimestampType) .add("title", StringType) .add("artist", StringType) .add("radio", StringType) .add("humanDate", LongType) .add("hour", IntegerType) .add("minute", IntegerType) .add("allArtists", StringType) .add("year", IntegerType) .add("month", IntegerType) .add("day", IntegerType) def batchWay() = { //Classic Batch way val batchWay = spark .read .schema(ParquetService.schema) .parquet(pathRadioStationSongs) .where($"artist" === "Drake") .groupBy($"radio", $"artist", $"title") .count() .orderBy("count") .as[Song] batchWay.show() batchWay } def streamingWay() : Dataset[SimpleSongAggregation] = { log.warn("Starting to stream events from Parquet files....") spark .readStream .schema(ParquetService.schema) .option("maxFilesPerTrigger", 1000) // Treat a sequence of files as a stream by picking one file at a time .parquet(pathRadioStationSongs) .as[Song] .where($"artist" === "Drake") .groupBy($"radio", $"artist", $"title") .count() .as[SimpleSongAggregation] } def streamEachEvent : Dataset[Song] = { spark .readStream .schema(ParquetService.schema) .option("maxFilesPerTrigger", 1000) // Treat a sequence of files as a stream by picking one file at a time .parquet(pathRadioES) .as[Song] .where($"artist" === "Drake") .withWatermark("timestamp", "10 minutes") .as[Song] } //Process stream on console to debug only def debugStream(staticInputDF: DataFrame) = { staticInputDF.writeStream .format("console") .outputMode("complete") .queryName("Console - Count number of broadcasts for a title/artist by radio") .start() } }
Example 171
Source File: Aggregator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete} import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression import org.apache.spark.sql.{DataFrame, Dataset, Encoder, TypedColumn} def toColumn( implicit bEncoder: Encoder[B], cEncoder: Encoder[O]): TypedColumn[I, O] = { val expr = new AggregateExpression( TypedAggregateExpression(this), Complete, false) new TypedColumn[I, O](expr, encoderFor[O]) } }
Example 172
Source File: RichSparkFunctions.scala From lighthouse with Apache License 2.0 | 5 votes |
package be.dataminded.lighthouse.pipeline import com.typesafe.scalalogging.LazyLogging import org.apache.spark.sql.{Dataset, Encoder} import org.apache.spark.storage.StorageLevel import scala.reflect.ClassTag object RichSparkFunctions extends LazyLogging { class DatasetSparkFunction[A <: Dataset[_]: ClassTag](function: SparkFunction[A]) { def printSchema(): SparkFunction[A] = function.map { dataSet => dataSet.printSchema() dataSet } def as[T: Encoder]: SparkFunction[Dataset[T]] = function.map(_.as[T]) def cache(storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY): SparkFunction[A] = function.map { _.persist(storageLevel) } def dropCache(): SparkFunction[A] = function.map { _.unpersist() } def write(sink: Sink, sinks: Sink*): SparkFunction[A] = { if (sinks.isEmpty) function.map { data => sink.write(data); data } else (sink +: sinks).foldLeft(function.cache())((f, sink) => f.write(sink)) } def count(): SparkFunction[Long] = { function.map { dataSet => val n = dataSet.count() logger.debug(s"The data set produced $n rows") n } } } }
Example 173
Source File: GenericFlatSpecSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test import org.scalatest.FlatSpec import org.apache.spark.sql.Dataset class GenericFlatSpecSuite extends FlatSpec with SharedSparkSession { import testImplicits._ private def ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS "A Simple Dataset" should "have the specified number of elements" in { assert(8 === ds.count) } it should "have the specified number of unique elements" in { assert(8 === ds.distinct.count) } it should "have the specified number of elements in each column" in { assert(8 === ds.select("_1").count) assert(8 === ds.select("_2").count) } it should "have the correct number of distinct elements in each column" in { assert(8 === ds.select("_1").distinct.count) assert(4 === ds.select("_2").distinct.count) } }
Example 174
Source File: GenericWordSpecSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test import org.scalatest.WordSpec import org.apache.spark.sql.Dataset class GenericWordSpecSuite extends WordSpec with SharedSparkSession { import testImplicits._ private def ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS "A Simple Dataset" when { "looked at as complete rows" should { "have the specified number of elements" in { assert(8 === ds.count) } "have the specified number of unique elements" in { assert(8 === ds.distinct.count) } } "refined to specific columns" should { "have the specified number of elements in each column" in { assert(8 === ds.select("_1").count) assert(8 === ds.select("_2").count) } "have the correct number of distinct elements in each column" in { assert(8 === ds.select("_1").distinct.count) assert(4 === ds.select("_2").distinct.count) } } } }
Example 175
Source File: cache.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan case class CacheTableCommand( tableIdent: TableIdentifier, plan: Option[LogicalPlan], isLazy: Boolean) extends RunnableCommand { require(plan.isEmpty || tableIdent.database.isEmpty, "Database name is not allowed in CACHE TABLE AS SELECT") override protected def innerChildren: Seq[QueryPlan[_]] = plan.toSeq override def run(sparkSession: SparkSession): Seq[Row] = { plan.foreach { logicalPlan => Dataset.ofRows(sparkSession, logicalPlan).createTempView(tableIdent.quotedString) } sparkSession.catalog.cacheTable(tableIdent.quotedString) if (!isLazy) { // Performs eager caching sparkSession.table(tableIdent).count() } Seq.empty[Row] } } case class UncacheTableCommand( tableIdent: TableIdentifier, ifExists: Boolean) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { val tableId = tableIdent.quotedString if (!ifExists || sparkSession.catalog.tableExists(tableId)) { sparkSession.catalog.uncacheTable(tableId) } Seq.empty[Row] } } override def makeCopy(newArgs: Array[AnyRef]): ClearCacheCommand = ClearCacheCommand() }
Example 176
Source File: FrequentItems.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ object FrequentItems extends Logging { def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.treeAggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
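`FrequentItems.singlePassFreqItems` is internal; it is normally reached through the public `DataFrame.stat.freqItems` API. Here is a hedged sketch of that call path; the sample data and the 0.4 support value are illustrative, and `spark.implicits._` is assumed to be in scope.

// Sketch only: the public stat API that delegates to FrequentItems.singlePassFreqItems
val df = Seq((1, "a"), (1, "b"), (2, "a"), (1, "a")).toDF("id", "tag") // needs import spark.implicits._
val freq = df.stat.freqItems(Seq("id", "tag"), 0.4)
freq.show() // result columns are named id_freqItems and tag_freqItems, each holding an array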
Example 177
Source File: SaveIntoDataSourceCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.CreatableRelationProvider case class SaveIntoDataSourceCommand( query: LogicalPlan, dataSource: CreatableRelationProvider, options: Map[String, String], mode: SaveMode) extends RunnableCommand { override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query) override def run(sparkSession: SparkSession): Seq[Row] = { dataSource.createRelation( sparkSession.sqlContext, mode, options, Dataset.ofRows(sparkSession, query)) Seq.empty[Row] } override def simpleString: String = { val redacted = SQLConf.get.redactOptions(options) s"SaveIntoDataSourceCommand ${dataSource}, ${redacted}, ${mode}" } }
Example 178
Source File: JsonUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.json import org.apache.spark.input.PortableDataStream import org.apache.spark.rdd.RDD import org.apache.spark.sql.Dataset import org.apache.spark.sql.catalyst.json.JSONOptions object JsonUtils { def sample(json: RDD[PortableDataStream], options: JSONOptions): RDD[PortableDataStream] = { require(options.samplingRatio > 0, s"samplingRatio (${options.samplingRatio}) should be greater than 0") if (options.samplingRatio > 0.99) { json } else { json.sample(withReplacement = false, options.samplingRatio, 1) } } }
Example 179
Source File: Aggregator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.{Dataset, Encoder, TypedColumn} import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete} import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression def toColumn: TypedColumn[IN, OUT] = { implicit val bEncoder = bufferEncoder implicit val cEncoder = outputEncoder val expr = AggregateExpression( TypedAggregateExpression(this), Complete, isDistinct = false) new TypedColumn[IN, OUT](expr, encoderFor[OUT]) } }
Example 180
Source File: KafkaContinuousSourceSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010 import java.util.Properties import java.util.concurrent.atomic.AtomicInteger import org.scalatest.time.SpanSugar._ import scala.collection.mutable import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution import org.apache.spark.sql.streaming.{StreamTest, Trigger} import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} // Run tests in KafkaSourceSuiteBase in continuous execution mode. class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest { import testImplicits._ override val brokerProps = Map("auto.create.topics.enable" -> "false") test("subscribing topic by pattern with topic deletions") { val topicPrefix = newTopic() val topic = topicPrefix + "-seems" val topic2 = topicPrefix + "-bad" testUtils.createTopic(topic, partitions = 5) testUtils.sendMessages(topic, Array("-1")) require(testUtils.getLatestOffsets(Set(topic)).size === 5) val reader = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") .option("subscribePattern", s"$topicPrefix-.*") .option("failOnDataLoss", "false") val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] val mapped = kafka.map(kv => kv._2.toInt + 1) testStream(mapped)( makeSureGetOffsetCalled, AddKafkaData(Set(topic), 1, 2, 3), CheckAnswer(2, 3, 4), Execute { query => testUtils.deleteTopic(topic) testUtils.createTopic(topic2, partitions = 5) eventually(timeout(streamingTimeout)) { assert( query.lastExecution.logical.collectFirst { case DataSourceV2Relation(_, r: KafkaContinuousReader) => r }.exists { r => // Ensure the new topic is present and the old topic is gone. r.knownPartitions.exists(_.topic == topic2) }, s"query never reconfigured to new topic $topic2") } }, AddKafkaData(Set(topic2), 4, 5, 6), CheckAnswer(2, 3, 4, 5, 6, 7) ) } } class KafkaContinuousSourceStressForDontFailOnDataLossSuite extends KafkaSourceStressForDontFailOnDataLossSuite { override protected def startStream(ds: Dataset[Int]) = { ds.writeStream .format("memory") .queryName("memory") .trigger(Trigger.Continuous("1 second")) .start() } }
Example 181
Source File: PredictorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasWeightCol import org.apache.spark.ml.util._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext { import PredictorSuite._ test("should support all NumericType labels and weights, and not support other types") { val df = spark.createDataFrame(Seq( (0, 1, Vectors.dense(0, 2, 3)), (1, 2, Vectors.dense(0, 3, 9)), (0, 3, Vectors.dense(0, 2, 6)) )).toDF("label", "weight", "features") val types = Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0)) val predictor = new MockPredictor().setWeightCol("weight") types.foreach { t => predictor.fit(df.select(col("label").cast(t), col("weight").cast(t), col("features"))) } intercept[IllegalArgumentException] { predictor.fit(df.select(col("label").cast(StringType), col("weight"), col("features"))) } intercept[IllegalArgumentException] { predictor.fit(df.select(col("label"), col("weight").cast(StringType), col("features"))) } } } object PredictorSuite { class MockPredictor(override val uid: String) extends Predictor[Vector, MockPredictor, MockPredictionModel] with HasWeightCol { def this() = this(Identifiable.randomUID("mockpredictor")) def setWeightCol(value: String): this.type = set(weightCol, value) override def train(dataset: Dataset[_]): MockPredictionModel = { require(dataset.schema("label").dataType == DoubleType) require(dataset.schema("weight").dataType == DoubleType) new MockPredictionModel(uid) } override def copy(extra: ParamMap): MockPredictor = throw new NotImplementedError() } class MockPredictionModel(override val uid: String) extends PredictionModel[Vector, MockPredictionModel] { def this() = this(Identifiable.randomUID("mockpredictormodel")) override def predict(features: Vector): Double = throw new NotImplementedError() override def copy(extra: ParamMap): MockPredictionModel = throw new NotImplementedError() } }
Example 182
Source File: CallStatsAggregator.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.callrecordaggregator import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import cloudflow.streamlets._ import cloudflow.streamlets.avro._ import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic } import org.apache.spark.sql.streaming.OutputMode import cloudflow.spark.sql.SQLImplicits._ import org.apache.log4j.{ Level, Logger } import carly.data._ class CallStatsAggregator extends SparkStreamlet { val rootLogger = Logger.getRootLogger() rootLogger.setLevel(Level.ERROR) //tag::docs-schemaAware-example[] val in = AvroInlet[CallRecord]("in") val out = AvroOutlet[AggregatedCallStats]("out", _.startTime.toString) val shape = StreamletShape(in, out) //end::docs-schemaAware-example[] val GroupByWindow = DurationConfigParameter("group-by-window", "Window duration for the moving average computation", Some("1 minute")) val Watermark = DurationConfigParameter("watermark", "Late events watermark duration: how long to wait for late events", Some("1 minute")) override def configParameters = Vector(GroupByWindow, Watermark) override def createLogic = new SparkStreamletLogic { val watermark = Watermark.value val groupByWindow = GroupByWindow.value //tag::docs-aggregationQuery-example[] override def buildStreamingQueries = { val dataset = readStream(in) val outStream = process(dataset) writeStream(outStream, out, OutputMode.Update).toQueryExecution } //end::docs-aggregationQuery-example[] private def process(inDataset: Dataset[CallRecord]): Dataset[AggregatedCallStats] = { val query = inDataset .withColumn("ts", $"timestamp".cast(TimestampType)) .withWatermark("ts", s"${watermark.toMillis()} milliseconds") .groupBy(window($"ts", s"${groupByWindow.toMillis()} milliseconds")) .agg(avg($"duration").as("avgCallDuration"), sum($"duration").as("totalCallDuration")) .withColumn("windowDuration", $"window.end".cast(LongType) - $"window.start".cast(LongType)) query .select($"window.start".cast(LongType).as("startTime"), $"windowDuration", $"avgCallDuration", $"totalCallDuration") .as[AggregatedCallStats] } } }
Example 183
Source File: CSVProfiler.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.profilers import io.gzet.profilers.field.{CardinalityProfiler, EmptinessProfiler, MaskBasedProfiler, PredefinedMasks} import io.gzet.profilers.raw.{AsciiProfiler, RowProfiler, StructuralProfiler} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Dataset, SparkSession} import org.elasticsearch.spark.sql._ object CSVProfiler { Logger.getLogger("akka").setLevel(Level.WARN) Logger.getLogger("org").setLevel(Level.WARN) val HEADER = Array( "rowId", "firstName", "lastName", "email", "gender", "ipAddress", "shaPass" ) def main(args: Array[String]) { val spark = SparkSession.builder().appName("Profiler").getOrCreate() import spark.implicits._ val rawDf: Dataset[String] = spark.read.text(args.head).map(_.getAs[String](0)) rawDf.cache() rawDf.count() val tabDf: Dataset[Array[String]] = Utils.split(rawDf, delimiter = ",") val sources = spark.sparkContext.broadcast(rawDf.inputFiles) val ingestTime = spark.sparkContext.broadcast(new java.util.Date().getTime) val headers = spark.sparkContext.broadcast(HEADER.zipWithIndex.map(_.swap).toMap) RowProfiler.apply().profile(rawDf).map({ report => ("row.count", report.metricValue, Map[String, String]()) }).union(AsciiProfiler.apply().profile(rawDf).map({ report => ("row.ascii", report.metricValue, Map(Tags.ASCII_NAME -> report.ascii, Tags.ASCII_BINARY -> report.binary)) })).union(StructuralProfiler.apply(delimiter = ",").profile(rawDf).map({ report => ("field.count", report.metricValue, Map(Tags.EXTRA -> report.description, Tags.FIELD_COUNT -> report.fields.toString)) })).union(EmptinessProfiler.apply().profile(tabDf).map({ report => ("field.emptiness", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString)) })).union(CardinalityProfiler.apply(topN = 5).profile(tabDf).map({ report => ("field.cardinality", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(","))) })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.ASCIICLASS_LOWGRAIN).profile(tabDf).map({ report => ("field.ascii.low", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(","))) })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.ASCIICLASS_HIGHGRAIN).profile(tabDf).map({ report => ("field.ascii.high", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(","))) })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.POP_CHECKS).profile(tabDf).map({ report => ("field.pop.check", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(","))) })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.CLASS_FREQS).profile(tabDf).map({ report => ("field.class.freq", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(","))) })).map({ case (metricName, metricValue, tags) => val newTags = { if (tags.contains(Tags.FIELD_IDX)) { val fieldIdx = tags.get(Tags.FIELD_IDX).get.toInt val fieldName = headers.value.getOrElse(fieldIdx, "NA") tags ++ Map(Tags.FIELD_NAME -> fieldName) } else { tags } } ReportBuilder.create .withName(metricName) .withMetric(metricValue) .withSources(sources.value) .withTime(ingestTime.value) .withTags(newTags) .build }).toDF().saveToEs("profiler/mock") } }
Example 184
Source File: JsonDynamicDeserializer.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.deserializers.generic import org.apache.spark.sql.{DataFrame, Dataset} import com.paypal.gimel.deserializers.generic.conf.{GenericDeserializerConfigs, GenericDeserializerConfiguration, GenericDeserializerConstants} import com.paypal.gimel.serde.common.Deserializer class JsonDynamicDeserializer extends Deserializer { override def deserialize(dataframe: DataFrame, props: Map[String, Any] = Map.empty): DataFrame = { val conf = new GenericDeserializerConfiguration(props) if (!dataframe.columns.contains(conf.columnToDeserialize)) { throw new IllegalArgumentException( s""" | Column to Deserialize does not exist in dataframe --> ${conf.columnToDeserialize} | Please set the property ${GenericDeserializerConfigs.columnToDeserializeKey} | Note: Default value is "${GenericDeserializerConstants.columnToDeserialize}" """.stripMargin ) } else { val sparkSession = dataframe.sparkSession import sparkSession.implicits._ val deserializedDS: Dataset[String] = dataframe.map { eachRow => eachRow.getAs(conf.columnToDeserialize).asInstanceOf[Array[Byte]].map(_.toChar).mkString } sparkSession.read.json(deserializedDS) } } }
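A hedged sketch of invoking the deserializer above on a Kafka-style DataFrame whose value column holds JSON bytes; `kafkaDF` is an assumption, and the config key is referenced through the constant used in the code rather than a literal string.

// Sketch only: `kafkaDF` is assumed to have a binary column named "value" containing JSON documents
import com.paypal.gimel.deserializers.generic.conf.GenericDeserializerConfigs

val deserializer = new JsonDynamicDeserializer()
val jsonDF = deserializer.deserialize(
  kafkaDF,
  Map(GenericDeserializerConfigs.columnToDeserializeKey -> "value")
)
jsonDF.printSchema() // schema is inferred from the JSON payloads by spark.read.json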
Example 185
Source File: JsonStaticDeserializer.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.deserializers.generic import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions._ import com.paypal.gimel.deserializers.generic.conf.{GenericDeserializerConfigs, GenericDeserializerConfiguration, GenericDeserializerConstants} import com.paypal.gimel.serde.common.Deserializer import com.paypal.gimel.serde.common.utils.SQLDataTypesUtils class JsonStaticDeserializer extends Deserializer { override def deserialize(dataframe: DataFrame, props: Map[String, Any] = Map.empty): DataFrame = { val conf = new GenericDeserializerConfiguration(props) if (!dataframe.columns.contains(conf.columnToDeserialize)) { throw new IllegalArgumentException( s""" | Column to Deserialize does not exist in dataframe --> ${conf.columnToDeserialize} | Please set the property ${GenericDeserializerConfigs.columnToDeserializeKey} | Note: Default value is "${GenericDeserializerConstants.columnToDeserialize}" """.stripMargin ) } else { if (conf.fieldsBindToJson.isEmpty) { throw new Exception ("You need to provide fields in json by setting " + GenericDeserializerConfigs.fieldsBindToJson + " property.") } else { val schema = SQLDataTypesUtils.getSchemaFromBindToFieldsJson(conf.fieldsBindToJson) dataframe.selectExpr("cast (" + conf.columnToDeserialize + " as string) as json") .select(from_json(col("json"), schema).as("data")).select("data.*") } } } }
Example 186
Source File: TestSparkStreamletContext.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.spark package testkit import java.nio.file.attribute.FileAttribute import com.typesafe.config._ import scala.reflect.runtime.universe._ import scala.concurrent.duration._ import org.apache.spark.sql.{ Dataset, Encoder, SparkSession } import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.streaming.{ OutputMode, StreamingQuery, Trigger } import cloudflow.streamlets._ import org.apache.spark.sql.catalyst.InternalRow class TestSparkStreamletContext(override val streamletRef: String, session: SparkSession, inletTaps: Seq[SparkInletTap[_]], outletTaps: Seq[SparkOutletTap[_]], override val config: Config = ConfigFactory.empty) extends SparkStreamletContext(StreamletDefinition("appId", "appVersion", streamletRef, "streamletClass", List(), List(), config), session) { val ProcessingTimeInterval = 1500.milliseconds override def readStream[In](inPort: CodecInlet[In])(implicit encoder: Encoder[In], typeTag: TypeTag[In]): Dataset[In] = inletTaps .find(_.portName == inPort.name) .map(_.instream.asInstanceOf[MemoryStream[In]].toDF.as[In]) .getOrElse(throw TestContextException(inPort.name, s"Bad test context, could not find source for inlet ${inPort.name}")) override def writeStream[Out](stream: Dataset[Out], outPort: CodecOutlet[Out], outputMode: OutputMode)(implicit encoder: Encoder[Out], typeTag: TypeTag[Out]): StreamingQuery = { // RateSource can only work with a microBatch query because it contains no data at time zero. // Trigger.Once requires data at start to work. val trigger = if (isRateSource(stream)) { Trigger.ProcessingTime(ProcessingTimeInterval) } else { Trigger.Once() } val streamingQuery = outletTaps .find(_.portName == outPort.name) .map { outletTap ⇒ stream.writeStream .outputMode(outputMode) .format("memory") .trigger(trigger) .queryName(outletTap.queryName) .start() } .getOrElse(throw TestContextException(outPort.name, s"Bad test context, could not find destination for outlet ${outPort.name}")) streamingQuery } override def checkpointDir(dirName: String): String = { val fileAttibutes: Array[FileAttribute[_]] = Array() val tmpDir = java.nio.file.Files.createTempDirectory("spark-test", fileAttibutes: _*) tmpDir.toFile.getAbsolutePath } private def isRateSource(stream: Dataset[_]): Boolean = { import org.apache.spark.sql.execution.command.ExplainCommand val explain = ExplainCommand(stream.queryExecution.logical, true) val res = session.sessionState.executePlan(explain).executedPlan.executeCollect() res.exists((row: InternalRow) => row.getString(0).contains("org.apache.spark.sql.execution.streaming.sources.RateStreamProvider")) } } case class TestContextException(portName: String, msg: String) extends RuntimeException(msg)
Example 187
Source File: SparkAvroDecoder.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.spark.avro import org.apache.log4j.Logger import java.io.ByteArrayOutputStream import scala.reflect.runtime.universe._ import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord } import org.apache.avro.io.{ DecoderFactory, EncoderFactory } import org.apache.spark.sql.{ Dataset, Encoder, Row } import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder } import org.apache.spark.sql.catalyst.expressions.GenericRow import org.apache.spark.sql.types.StructType import org.apache.avro.Schema import cloudflow.spark.sql.SQLImplicits._ case class EncodedKV(key: String, value: Array[Byte]) case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) { val encoder: Encoder[T] = implicitly[Encoder[T]] val sqlSchema: StructType = encoder.schema val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema) @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema) @transient lazy val rowConverter = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema) @transient lazy val datumReader = new GenericDatumReader[GenericRecord](_avroSchema) @transient lazy val decoder = DecoderFactory.get def decode(bytes: Array[Byte]): Row = { val binaryDecoder = decoder.binaryDecoder(bytes, null) val record = datumReader.read(null, binaryDecoder) rowConverter(record).asInstanceOf[GenericRow] } } case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) { @transient lazy val log = Logger.getLogger(getClass.getName) val BufferSize = 5 * 1024 // 5 Kb val encoder = implicitly[Encoder[T]] val sqlSchema = encoder.schema @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema) val recordName = "topLevelRecord" // ??? val recordNamespace = "recordNamespace" // ??? @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace) // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage def rowToBytes(row: Row): Array[Byte] = { val genRecord = converter(row).asInstanceOf[GenericRecord] if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord") val datumWriter = new GenericDatumWriter[GenericRecord](_avroSchema) val avroEncoder = EncoderFactory.get val byteArrOS = new ByteArrayOutputStream(BufferSize) val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null) datumWriter.write(genRecord, binaryEncoder) binaryEncoder.flush() byteArrOS.toByteArray } def encode(dataset: Dataset[T]): Dataset[Array[Byte]] = dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]] // Note to self: I'm not sure how heavy this chain of transformations is def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = { val encoder = encoderFor[T] implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind() dataset.map { value ⇒ val key = keyFun(value) val internalRow = encoder.toRow(value) val row = rowEncoder.fromRow(internalRow) val bytes = rowToBytes(row) EncodedKV(key, bytes) } } }
Example 188
Source File: SparkEgressSpec.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.spark import org.apache.spark.sql.{ Dataset, Encoder, SparkSession } import org.apache.spark.sql.streaming.{ OutputMode, Trigger } import cloudflow.streamlets.StreamletShape import cloudflow.streamlets.avro._ import cloudflow.spark.avro._ import cloudflow.spark.testkit._ import cloudflow.spark.sql.SQLImplicits._ class SparkEgressSpec extends SparkScalaTestSupport { "SparkEgress" should { "materialize streaming data to sink" in { val testKit = SparkStreamletTestkit(session) def asCollection[T: Encoder](session: SparkSession, queryName: String): List[T] = session.sql(s"select * from $queryName").as[T].collect().toList val instance = new MySparkEgress() // setup inlet tap on inlet port val in: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in) // build data and send to inlet tap val data = (1 to 10).map(i ⇒ Data(i, s"name$i")) in.addData(data) val run = testKit.run(instance, Seq(in), Seq.empty) run.failures mustBe ('empty) run.totalRows mustBe (20) val r1 = asCollection[String](session, "allNames") val r2 = asCollection[String](session, "allNamesUpper") // assert r1 must contain("name1") r2 must contain("NAME1") } } } class MySparkEgress extends SparkStreamlet { val in = AvroInlet[Data]("in") val shape = StreamletShape(in) override def createLogic() = new SparkStreamletLogic { override def buildStreamingQueries = process(readStream(in)) private def process(inDataset: Dataset[Data]): StreamletQueryExecution = { val q1 = inDataset .map { d ⇒ d.name } .writeStream .format("memory") .option("truncate", false) .queryName("allNames") .outputMode(OutputMode.Append()) .trigger(Trigger.Once) .start() val q2 = inDataset .map { d ⇒ d.name.toUpperCase } .writeStream .format("memory") .option("truncate", false) .queryName("allNamesUpper") .outputMode(OutputMode.Append()) .trigger(Trigger.Once) .start() StreamletQueryExecution(q1, q2) } } }
Example 189
Source File: SparkJoin3Spec.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.spark import org.apache.spark.sql.Dataset import org.apache.spark.sql.streaming.OutputMode import cloudflow.streamlets.StreamletShape import cloudflow.streamlets.avro._ import cloudflow.spark.avro._ import cloudflow.spark.testkit._ import cloudflow.spark.sql.SQLImplicits._ class SparkJoin3Spec extends SparkScalaTestSupport { "SparkJoin3" should { "process streaming data" in { val testKit = SparkStreamletTestkit(session) val instance = new MySparkJoin3() // setup inlet tap on inlet port val in0: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in0) val in1: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in1) val in2: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in2) // setup outlet tap on outlet port val out: SparkOutletTap[Simple] = testKit.outletAsTap[Simple](instance.out) // build data and send to inlet tap val List(d1, d2, d3) = (1 to 30).map(i ⇒ Data(i, s"name$i")).sliding(10, 10).toList in0.addData(d1) in1.addData(d2) in2.addData(d3) val run = testKit.run(instance, Seq(in0, in1, in2), Seq(out)) run.totalRows must be(30) // get data from outlet tap val results = out.asCollection(session) // assert results must contain(Simple("name1")) results must contain(Simple("name11")) results must contain(Simple("name21")) (results must have).length(30) } } } // create sparkStreamlet class MySparkJoin3 extends SparkStreamlet { // comment: all inlets could be in different formats, one proto, one avro, one csv.. val in0 = AvroInlet[Data]("in0") val in1 = AvroInlet[Data]("in1") val in2 = AvroInlet[Data]("in2") val out = AvroOutlet[Simple]("out", _.name) val shape = StreamletShape(out).withInlets(in0, in1, in2) override def createLogic() = new SparkStreamletLogic { override def buildStreamingQueries = { val dataset0 = readStream(in0) val dataset1 = readStream(in1) val dataset2 = readStream(in2) val outStream: Dataset[Simple] = process(dataset0, dataset1, dataset2) val query = writeStream(outStream, out, OutputMode.Append) StreamletQueryExecution(query) } private def process(in0: Dataset[Data], in1: Dataset[Data], in2: Dataset[Data]): Dataset[Simple] = in0.union(in1.union(in2)).select($"name").as[Simple] } }
Example 190
Source File: SparkIngressSpec.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.spark import scala.collection.immutable.Seq import org.apache.spark.sql.Dataset import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.execution.streaming.MemoryStream import cloudflow.streamlets.StreamletShape import cloudflow.streamlets.avro._ import cloudflow.spark.avro._ import cloudflow.spark.testkit._ import cloudflow.spark.sql.SQLImplicits._ class SparkIngressSpec extends SparkScalaTestSupport { "SparkIngress" should { "produce elements to its outlet" in { val testKit = SparkStreamletTestkit(session) val instance = new MySparkIngress() // setup outlet tap on outlet port val out: SparkOutletTap[Data] = testKit.outletAsTap[Data](instance.out) val run = testKit.run(instance, Seq.empty, Seq(out)) // get processed rows from the run run.totalRows must be(10) // get data from outlet tap val results = out.asCollection(session) // assert results must contain(Data(1, "name1")) } } } // create sparkStreamlet class MySparkIngress extends SparkStreamlet { val out = AvroOutlet[Data]("out", d ⇒ d.id.toString) val shape = StreamletShape(out) override def createLogic() = new SparkStreamletLogic { private def process: Dataset[Data] = { implicit val sqlCtx = session.sqlContext val data = (1 to 10).map(i ⇒ Data(i, s"name$i")) val m = MemoryStream[Data] m.addData(data) m.toDF.as[Data] } override def buildStreamingQueries = { val outStream: Dataset[Data] = process require(outStream.isStreaming, "The Dataset created by an Ingress must be a Streaming Dataset") val query = writeStream(outStream, out, OutputMode.Append) StreamletQueryExecution(query) } } }
Example 191
Source File: SparkRandomGenIngress.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.sparkdoc import scala.util.Random import cloudflow.spark._ import cloudflow.streamlets._ import cloudflow.streamlets.avro._ import cloudflow.spark.sql.SQLImplicits._ import org.apache.spark.sql.Dataset import org.apache.spark.sql.streaming.OutputMode import java.sql.Timestamp class SparkRandomGenDataIngress extends SparkStreamlet { val out = AvroOutlet[Data]("out", d ⇒ d.key) val shape = StreamletShape(out) case class Rate(timestamp: Timestamp, value: Long) override def createLogic() = new SparkStreamletLogic { override def buildStreamingQueries = writeStream(process, out, OutputMode.Append).toQueryExecution private def process: Dataset[Data] = { val recordsPerSecond = 10 val keyGen: () ⇒ String = () ⇒ if (Random.nextDouble() < 0.5) "keyOne" else "keyTwo" val rateStream = session.readStream .format("rate") .option("rowsPerSecond", recordsPerSecond) .load() .as[Rate] rateStream.map { case Rate(_, value) ⇒ Data(keyGen(), value.toInt) } } } }
Example 192
Source File: ClusteringEvaluatorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Dataset class ClusteringEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ @transient var irisDataset: Dataset[_] = _ override def beforeAll(): Unit = { super.beforeAll() irisDataset = spark.read.format("libsvm").load("../data/mllib/iris_libsvm.txt") } test("params") { ParamsSuite.checkParams(new ClusteringEvaluator) } test("read/write") { val evaluator = new ClusteringEvaluator() .setPredictionCol("myPrediction") .setFeaturesCol("myLabel") testDefaultReadWrite(evaluator) } test("squared euclidean Silhouette") { val evaluator = new ClusteringEvaluator() .setFeaturesCol("features") .setPredictionCol("label") assert(evaluator.evaluate(irisDataset) ~== 0.6564679231 relTol 1e-5) } test("number of clusters must be greater than one") { val singleClusterDataset = irisDataset.where($"label" === 0.0) val evaluator = new ClusteringEvaluator() .setFeaturesCol("features") .setPredictionCol("label") val e = intercept[AssertionError]{ evaluator.evaluate(singleClusterDataset) } assert(e.getMessage.contains("Number of clusters must be greater than one")) } }
Example 193
Source File: MovingAverageSparklet.scala From cloudflow with Apache License 2.0 | 5 votes |
package sensors import cloudflow.streamlets.StreamletShape import cloudflow.streamlets.avro._ import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic } import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.TimestampType import cloudflow.spark.sql.SQLImplicits._ import org.apache.spark.sql.streaming.OutputMode class MovingAverageSparklet extends SparkStreamlet { val in = AvroInlet[Data]("in") val out = AvroOutlet[Agg]("out", _.src) val shape = StreamletShape(in, out) override def createLogic() = new SparkStreamletLogic { override def buildStreamingQueries = { val dataset = readStream(in) val outStream = process(dataset) writeStream(outStream, out, OutputMode.Append).toQueryExecution } private def process(inDataset: Dataset[Data]): Dataset[Agg] = { val query = inDataset .withColumn("ts", $"timestamp".cast(TimestampType)) .withWatermark("ts", "1 minutes") .groupBy(window($"ts", "1 minute", "30 seconds"), $"src", $"gauge") .agg(avg($"value").as("avg")) query.select($"src", $"gauge", $"avg".as("value")).as[Agg] } } }
Example 194
Source File: CallStatsAggregator.scala From cloudflow with Apache License 2.0 | 5 votes |
package carly.aggregator

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.streaming.OutputMode
import cloudflow.spark.sql.SQLImplicits._
import org.apache.log4j.{ Level, Logger }

import carly.data._

class CallStatsAggregator extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  //tag::docs-schemaAware-example[]
  val in    = AvroInlet[CallRecord]("in")
  val out   = AvroOutlet[AggregatedCallStats]("out", _.startTime.toString)
  val shape = StreamletShape(in, out)
  //end::docs-schemaAware-example[]

  val GroupByWindow = DurationConfigParameter(
    "group-by-window",
    "Window duration for the moving average computation",
    Some("1 minute"))

  val Watermark = DurationConfigParameter(
    "watermark",
    "Late events watermark duration: how long to wait for late events",
    Some("1 minute"))

  override def configParameters = Vector(GroupByWindow, Watermark)

  override def createLogic = new SparkStreamletLogic {
    val watermark     = Watermark.value
    val groupByWindow = GroupByWindow.value

    // val t0 = System.currentTimeMillis() // serialization error!

    //tag::docs-aggregationQuery-example[]
    override def buildStreamingQueries = {
      val dataset   = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Update).toQueryExecution
    }

    private def process(inDataset: Dataset[CallRecord]): Dataset[AggregatedCallStats] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withWatermark("ts", s"${watermark.toMillis()} milliseconds")
        .groupBy(window($"ts", s"${groupByWindow.toMillis()} milliseconds"))
        .agg(avg($"duration").as("avgCallDuration"), sum($"duration").as("totalCallDuration"))
        .withColumn("windowDuration", $"window.end".cast(LongType) - $"window.start".cast(LongType))

      query
        .select($"window.start".cast(LongType).as("startTime"), $"windowDuration", $"avgCallDuration", $"totalCallDuration")
        .as[AggregatedCallStats]
    }
    //end::docs-aggregationQuery-example[]
  }
}
Example 195
Source File: CallRecordGeneratorIngress.scala From cloudflow with Apache License 2.0 | 5 votes |
package carly.aggregator

import java.sql.Timestamp

import scala.util.Random
import scala.concurrent.duration._

import org.apache.spark.sql.{ Dataset, SparkSession }
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.LongType

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.sql.SQLImplicits._
import carly.data.CallRecord
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.log4j.{ Level, Logger }

case class Rate(timestamp: Timestamp, value: Long)

class CallRecordGeneratorIngress extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  val RecordsPerSecond = IntegerConfigParameter(
    "records-per-second",
    "Records per second to process.",
    Some(50))

  override def configParameters = Vector(RecordsPerSecond)

  val out   = AvroOutlet[CallRecord]("out", _.user)
  val shape = StreamletShape(out)

  override def createLogic() = new SparkStreamletLogic {
    val recordsPerSecond = RecordsPerSecond.value
    override def buildStreamingQueries = {
      val outStream = DataGenerator.mkData(super.session, recordsPerSecond)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }
  }
}

object DataGenerator {
  def mkData(session: SparkSession, recordsPerSecond: Int): Dataset[CallRecord] = {
    // do we need to expose this through configuration?
    val MaxTime           = 2.hours.toMillis
    val MaxUsers          = 100000
    val TS0               = new java.sql.Timestamp(0)
    val ZeroTimestampProb = 0.05 // error rate

    // Random Data Generator
    val usersUdf     = udf(() ⇒ "user-" + Random.nextInt(MaxUsers))
    val directionUdf = udf(() ⇒ if (Random.nextDouble() < 0.5) "incoming" else "outgoing")

    // Time-biased randomized filter - 1/2 hour cycles
    val sinTime: Long ⇒ Double = t ⇒ Math.sin((t / 1000 % 1800) * 1.0 / 1800 * Math.PI)
    val timeBoundFilter: Long ⇒ Double ⇒ Boolean = t ⇒ prob ⇒ (sinTime(t) + 0.5) > prob
    val timeFilterUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ timeBoundFilter(ts.getTime)(rng))
    val zeroTimestampUdf = udf { (ts: java.sql.Timestamp, rng: Double) ⇒
      if (rng < ZeroTimestampProb) {
        TS0
      } else {
        ts
      }
    }

    val rateStream = session.readStream
      .format("rate")
      .option("rowsPerSecond", recordsPerSecond)
      .load()
      .as[Rate]

    val randomDataset = rateStream.withColumn("rng", rand()).withColumn("tsRng", rand())
    val sampledData = randomDataset
      .where(timeFilterUdf($"timestamp", $"rng"))
      .withColumn("user", usersUdf())
      .withColumn("other", usersUdf())
      .withColumn("direction", directionUdf())
      .withColumn("duration", (round(abs(rand()) * MaxTime)).cast(LongType))
      .withColumn("updatedTimestamp", zeroTimestampUdf($"timestamp", $"tsRng"))
      .select($"user", $"other", $"direction", $"duration", $"updatedTimestamp".as("timestamp"))
      .as[CallRecord]
    sampledData
  }
}
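The generator's main trick is combining zero-argument UDFs (for synthetic string columns) with `rand()` columns (for probabilistic behaviour). A reduced sketch of just that technique is below, assuming `rows` is an existing DataFrame with a "timestamp" column; the column names and bounds are placeholders.

// Sketch: adding randomized columns with udf() and rand(), as the generator above does.
// Assumes `rows` is an existing (batch or streaming) DataFrame with a "timestamp" column.
import scala.util.Random
import org.apache.spark.sql.functions._

val userUdf      = udf(() => "user-" + Random.nextInt(1000))
val directionUdf = udf(() => if (Random.nextDouble() < 0.5) "incoming" else "outgoing")

val synthetic = rows
  .withColumn("user", userUdf())
  .withColumn("direction", directionUdf())
  .withColumn("duration", round(rand() * 3600).cast("long")) // 0..3600 seconds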
Example 196
Source File: SparkOutput.scala From cloudflow with Apache License 2.0 | 5 votes |
package swissknife.spark

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode
import swissknife.data.Data

class SparkOutput extends SparkStreamlet {

  val in    = AvroInlet[Data]("in")
  val shape = StreamletShape(in)

  override def createLogic() = new SparkStreamletLogic {
    val sparkLocality = context.session.conf.getOption("spark.locality.wait").getOrElse("")
    val feedbackMsg   = s"locality=[$sparkLocality]"

    override def buildStreamingQueries = {
      val query = readStream(in)
        // we add this to the output to make it observable from the outside
        .withColumn("payload", lit(feedbackMsg))
        .writeStream
        .format("console")
        .option("truncate", "false")
        .start
      query.toQueryExecution
    }
  }
}
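The payload the streamlet injects comes from a runtime configuration lookup on the session. A one-line sketch of that lookup in plain Spark, assuming `spark` is a SparkSession in scope and the "&lt;unset&gt;" fallback is a placeholder:

// Sketch: reading an arbitrary runtime conf value, as the streamlet above does
// for "spark.locality.wait". Assumes `spark` is a SparkSession.
val locality: String = spark.conf.getOption("spark.locality.wait").getOrElse("<unset>")
println(s"locality=[$locality]")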
Example 197
Source File: SparkCounter.scala From cloudflow with Apache License 2.0 | 5 votes |
package swissknife.spark

import cloudflow.streamlets.{ StreamletShape, StringConfigParameter }
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode
import swissknife.data.Data

class SparkCounter extends SparkStreamlet {

  val in    = AvroInlet[Data]("in")
  val out   = AvroOutlet[Data]("out", _.src)
  val shape = StreamletShape(in, out)

  val configurableMessage = StringConfigParameter("configurable-message", "Configurable message.", Some("spark-original"))

  override def configParameters = Vector(configurableMessage)

  override def createLogic() = new SparkStreamletLogic {
    val msg = configurableMessage.value

    override def buildStreamingQueries = {
      val dataset   = readStream(in)
      val outStream = process(dataset, msg)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }

    private def process(inDataset: Dataset[Data], message: String): Dataset[Data] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withColumn("updated_src", concat($"src", lit("-spark")))
        .withWatermark("ts", "0 seconds")
        .groupBy(window($"ts", "5 seconds"), $"updated_src")
        .agg(max($"count").as("count"))
      query.select($"updated_src".as("src"), $"window.start".as("timestamp"), lit(message).as("payload"), $"count").as[Data]
    }
  }
}
Example 198
Source File: SparkDataGenerator.scala From cloudflow with Apache License 2.0 | 5 votes |
package swissknife.spark

import java.sql.Timestamp

import cloudflow.streamlets.{ IntegerConfigParameter, StreamletShape }
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.functions._
import cloudflow.spark.sql.SQLImplicits._
import swissknife.data.Data

case class Rate(timestamp: Timestamp, value: Long)

class SparkDataGenerator extends SparkStreamlet {

  val out   = AvroOutlet[Data]("out", d ⇒ d.src)
  val shape = StreamletShape(out)

  val RecordsPerSecond = IntegerConfigParameter("records-per-second", "Records per second to produce.", Some(1))

  override def configParameters = Vector(RecordsPerSecond)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries =
      writeStream(process, out, OutputMode.Append).toQueryExecution

    private def process: Dataset[Data] = {
      val recordsPerSecond = RecordsPerSecond.value
      session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .load()
        .select(lit("origin").as("src"), $"timestamp", lit("").as("payload"), $"value".as("count"))
        .as[Data]
    }
  }
}
Example 199
Source File: RichSparkFunctionsSpec.scala From lighthouse with Apache License 2.0 | 5 votes |
package be.dataminded.lighthouse.pipeline

import java.io.ByteArrayOutputStream

import be.dataminded.lighthouse.testing.SharedSparkSession
import better.files._
import org.apache.spark.sql.Dataset
import org.apache.spark.storage.StorageLevel
import org.scalatest.BeforeAndAfter
import org.scalatest.funspec.AnyFunSpec
import org.scalatest.matchers.should.Matchers

class RichSparkFunctionsSpec extends AnyFunSpec with Matchers with SharedSparkSession with BeforeAndAfter {

  import spark.implicits._

  describe("SparkFunctions with a DataSet inside should have extra functionality") {

    val function = SparkFunction.of(Seq(1, 2, 3, 4, 5).toDS())

    it("can cache") {
      function.cache().run(spark).storageLevel should equal(StorageLevel.MEMORY_ONLY)
    }

    it("can drop the cache") {
      function.cache().dropCache().run(spark).storageLevel should equal(StorageLevel.NONE)
    }

    it("can be written to a sink") {
      function.write(OrcSink("target/output/orc")).run(spark)
      file"target/output/orc".exists should be(true)
    }

    it("can be written to multiple sinks") {
      function.write(OrcSink("target/output/orc"), OrcSink("target/output/orc2")).run(spark)
      file"target/output/orc".exists should be(true)
      file"target/output/orc2".exists should be(true)
    }

    it("is being cached when writing to multiple sinks for performance") {
      val result = function.write(OrcSink("target/output/orc"), OrcSink("target/output/orc2")).run(spark)
      result.storageLevel should equal(StorageLevel.MEMORY_ONLY)
    }

    it("can easily be counted") {
      function.count().run(spark) should equal(5)
    }

    it("can print the schema") {
      val stream = new ByteArrayOutputStream()
      Console.withOut(stream) {
        function.printSchema().run(spark)
      }
      stream.toString() should include("value: integer (nullable = false)")
    }

    it("can be used as a Dataset") {
      function.as[Int].run(spark) shouldBe a[Dataset[_]]
    }
  }

  after {
    file"target/output/orc".delete(true)
    file"target/output/orc2".delete(true)
  }
}
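The caching assertions ultimately rely on Spark's own `storageLevel` accessor on `Dataset`. A minimal sketch of the same behaviour in plain Spark, without the lighthouse `SparkFunction` wrapper, assuming a SparkSession named `spark` is in scope:

// Sketch: checking persistence state on a plain Dataset (Spark 2.1+ exposes storageLevel).
// Assumes `spark` is a SparkSession in scope.
import org.apache.spark.storage.StorageLevel
import spark.implicits._

val ds = Seq(1, 2, 3, 4, 5).toDS()

ds.persist(StorageLevel.MEMORY_ONLY)
assert(ds.storageLevel == StorageLevel.MEMORY_ONLY)

ds.unpersist()
assert(ds.storageLevel == StorageLevel.NONE)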
Example 200
Source File: AvroDataLink.scala From lighthouse with Apache License 2.0 | 5 votes |
package be.dataminded.lighthouse.datalake

import org.apache.spark.sql.{ DataFrame, Dataset, SaveMode }

class AvroDataLink(
    val path: LazyConfig[String],
    saveMode: SaveMode = SaveMode.Overwrite,
    partitionedBy: List[String] = List.empty,
    options: Map[String, String] = Map.empty
) extends PathBasedDataLink {

  override def doRead(path: String): DataFrame = {
    spark.read
      .format("com.databricks.spark.avro")
      .options(options)
      .load(path)
  }

  override def doWrite[T](dataset: Dataset[T], path: String): Unit = {
    dataset.write
      .format("com.databricks.spark.avro")
      .partitionBy(partitionedBy: _*)
      .options(options)
      .mode(saveMode)
      .save(path)
  }
}
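This data link targets the external "com.databricks.spark.avro" package; since Spark 2.4 the Avro source ships with Spark itself under the short name "avro". The sketch below shows the same read/write with the built-in format, assuming `spark` is a SparkSession and `ds` an existing Dataset; the helper names are illustrative only.

// Sketch: the equivalent read/write using the built-in Avro source ("avro", Spark 2.4+)
// instead of com.databricks.spark.avro. Assumes `spark` and `ds` already exist.
import org.apache.spark.sql.{ DataFrame, Dataset, SaveMode, SparkSession }

def readAvro(spark: SparkSession, path: String): DataFrame =
  spark.read.format("avro").load(path)

def writeAvro[T](ds: Dataset[T], path: String, partitions: Seq[String] = Nil): Unit =
  ds.write
    .format("avro")
    .partitionBy(partitions: _*)
    .mode(SaveMode.Overwrite)
    .save(path)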