org.apache.spark.sql.Dataset Scala Examples

The following examples show how to use org.apache.spark.sql.Dataset. Each example is taken from an open-source project; the source file, project, and license are noted in the heading above each listing.
Example 1
Source File: StreamingConsumer.scala    From Scala-Programming-Projects   with MIT License
package coinyser

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.functions._

object StreamingConsumer {
  def fromJson(df: DataFrame): Dataset[Transaction] = {
    import df.sparkSession.implicits._
    val schema = Seq.empty[Transaction].toDS().schema
    df.select(from_json(col("value").cast("string"), schema).alias("v"))
      .select("v.*").as[Transaction]
  }

  def transactionStream(implicit spark: SparkSession, config: KafkaConfig): Dataset[Transaction] =
    fromJson(spark.readStream.format("kafka")
      .option("kafka.bootstrap.servers", config.bootStrapServers)
      .option("startingoffsets", "earliest")
      .option("subscribe", config.transactionsTopic)
      .load()
    )

} 
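The schema passed to from_json above is derived from the Transaction case class by building an empty typed Dataset. The same trick works for any case class; a minimal sketch (using a hypothetical Person class, not part of the project) is:

import org.apache.spark.sql.{Encoders, SparkSession}

object SchemaFromCaseClass {
  case class Person(name: String, age: Int)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("schema-demo").getOrCreate()
    import spark.implicits._

    // Two equivalent ways to obtain the StructType of a case class:
    val viaEmptyDataset = Seq.empty[Person].toDS().schema
    val viaEncoder = Encoders.product[Person].schema
    println(viaEmptyDataset == viaEncoder) // true

    spark.stop()
  }
}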
Example 2
Source File: MultilayerPerceptronClassifierWrapper.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel,
    val labelCount: Long,
    val layers: Array[Int],
    val weights: Array[Double]
  ) extends MLWritable {

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
  }

  
  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val labelCount = (rMetadata \ "labelCount").extract[Long]
      val layers = (rMetadata \ "layers").extract[Array[Int]]
      val weights = (rMetadata \ "weights").extract[Array[Double]]

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline, labelCount, layers, weights)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("labelCount" -> instance.labelCount) ~
        ("layers" -> instance.layers.toSeq) ~
        ("weights" -> instance.weights.toArray.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
} 
Example 3
Source File: CogroupTest.scala    From spark-tools   with Apache License 2.0
package io.univalence.plumbus
import io.univalence.plumbus.test.SparkTestLike
import org.apache.spark.sql.Dataset
import org.scalatest.{ FunSuiteLike, Matchers }
import com.github.mrpowers.spark.fast.tests.DatasetComparer

class CogroupTest extends FunSuiteLike with SparkTestLike with Matchers with DatasetComparer {
  import spark.implicits._
  import io.univalence.plumbus.cogroup._

  val person1 = PersonWithId("1", "John", 32)
  val person2 = PersonWithId("2", "Mary", 32)

  val address1 = Address("1", "address1")
  val address2 = Address("2", "address2")
  val address3 = Address("1", "address3")

  val persons: Dataset[PersonWithId] = Seq(person1, person2).toDS()
  val addresses: Dataset[Address]    = Seq(address1, address2, address3).toDS()

  test("apply test") {
    val applyDS = apply(persons, addresses)(_.id, _.idPerson)
    val expectedDS = Seq(
      ("1", Seq(person1), Seq(address1, address3)),
      ("2", Seq(person2), Seq(address2))
    ).toDS()
    assertSmallDatasetEquality(applyDS, expectedDS, orderedComparison = false)
  }
}

case class Address(idPerson: String, name: String) 
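PersonWithId is defined elsewhere in the plumbus test sources. Judging from its use above (a string id joined against Address.idPerson), it is presumably something along the lines of:

case class PersonWithId(id: String, name: String, age: Int)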
Example 4
Source File: MNISTBenchmark.scala    From spark-knn   with Apache License 2.0
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable


object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val ns = if(args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if(args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if(args.length >= 3) args(2).toInt else 10
    val models = if(args.length >=4) args(3).split(',') else Array("tree","naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    //read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset =  MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() //force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if(models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if(models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }
    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame = dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
} 
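Limiter is a small custom Transformer; a sketch of using it on its own, assuming an existing DataFrame df, would be:

val firstThousand = new Limiter()
  .setN(1000)
  .transform(df) // limit(1000), then repartition back to df's original partition count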
Example 5
Source File: StreamingConsumerApp.scala    From Scala-Programming-Projects   with MIT License
package coinyser

import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.functions._

object StreamingConsumerApp extends App {

  implicit val spark: SparkSession = SparkSession
    .builder
    .master("local[*]")
    .appName("StreamingConsumerApp")
    .getOrCreate()

  implicit val config: KafkaConfig = KafkaConfig(
    bootStrapServers = "localhost:9092",
    transactionsTopic = "transactions_draft3"
  )

  val txStream: Dataset[Transaction] = StreamingConsumer.transactionStream

  import spark.implicits._

  // TODO move that to a Query class between batch and streaming
  val groupedStream = txStream
    .withWatermark("date", "1 second")
    .groupBy(window($"date", "1 minutes").as("window"))
    .agg(
      count($"tid").as("count"),
      avg("price").as("avgPrice"),
      stddev("price").as("stddevPrice"),
      last("price").as("lastPrice"),
      sum("amount").as("sumAmount")
    )
    .select("window.start", "count", "avgPrice", "lastPrice", "stddevPrice", "sumAmount")

  groupedStream
    .writeStream
    .format("console")
    .queryName("groupedTx")
    .outputMode("append")
    .start()


  Thread.sleep(Long.MaxValue)

} 
Example 6
Source File: cogroup.scala    From spark-tools   with Apache License 2.0
package io.univalence.plumbus

import org.apache.spark.Partitioner
import org.apache.spark.rdd.{ CoGroupedRDD, RDD }
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ ArrayType, StructField }
import org.apache.spark.sql.{ types, DataFrame, Dataset, Encoder, KeyValueGroupedDataset, Row }

import scala.reflect.ClassTag
import scala.util.Try

object cogroup {

  
  implicit class KVGD[K, A](val kvgd: KeyValueGroupedDataset[K, A]) {
    def cogroup[B](right: KeyValueGroupedDataset[K, B]): Dataset[(K, Seq[A], Seq[B])] =
      //Use SparkAddOn ?
      ???
  }

  def apply[A, B, K](left: Dataset[A], right: Dataset[B])(keyLeft: A => K, keyRight: B => K)(
    implicit encA: Encoder[A],
    encB: Encoder[B],
    encC: Encoder[K],
    enc: Encoder[(K, Seq[A], Seq[B])],
    ca: ClassTag[A],
    ck: ClassTag[K],
    cb: ClassTag[B]
  ): Dataset[(K, Seq[A], Seq[B])] =
    left.sparkSession.implicits
      .rddToDatasetHolder(
        RDD
          .rddToPairRDDFunctions(left.rdd.keyBy(keyLeft))
          .cogroup(right.rdd.keyBy(keyRight))
          .map({ case (k, (ia, ib)) => (k, ia.toSeq, ib.toSeq) })
      )
      .toDS

  def cogroupDf(group: DataFrame, namedSubGroup: (String, DataFrame)*)(
    byKey: String,
    partitioner: Partitioner = Partitioner.defaultPartitioner(group.rdd, namedSubGroup.map(_._2.rdd): _*)
  ): Try[DataFrame] =
    Try {
      val subGroup: Seq[DataFrame]  = namedSubGroup.map(_._2)
      val allFrames: Seq[DataFrame] = group +: subGroup
      val allFramesKeyed: Seq[RDD[(String, Row)]] =
        allFrames.map(df => {
          val idx = df.columns.indexOf(byKey)
          df.rdd.keyBy(_.get(idx).toString)
        })

      val cogroupRdd: CoGroupedRDD[String] = new CoGroupedRDD[String](allFramesKeyed, partitioner)

      val rowRdd: RDD[Row] =
        cogroupRdd.map(x => {
          val rows: Array[Seq[Row]] = x._2.asInstanceOf[Array[Iterable[Row]]].map(_.toSeq)
          val seq                   = rows.head.head.toSeq ++ rows.tail

          new GenericRowWithSchema(seq.toArray, null).asInstanceOf[Row]
        })

      val schema =
        types.StructType(
          group.schema.fields
            ++ namedSubGroup.map { case (name, df) => StructField(name, ArrayType(df.schema)) }
        )

      group.sparkSession.createDataFrame(rowRdd, schema)
    }

} 
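A sketch of calling cogroupDf, assuming three hypothetical DataFrames customers, orders and payments that each contain a customer_id column:

import io.univalence.plumbus.cogroup
import org.apache.spark.sql.DataFrame
import scala.util.Try

val grouped: Try[DataFrame] =
  cogroup.cogroupDf(customers, "orders" -> orders, "payments" -> payments)("customer_id")
// On success, the result keeps the customers columns and adds one array-of-struct column per named sub-group.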
Example 7
Source File: CompressDumpTest.scala    From spark-tools   with Apache License 2.0
package io.univalence.plumbus

import io.univalence.plumbus.compress.CompressDump
import org.apache.spark.sql.{ DataFrame, Dataset, SparkSession }
import org.scalatest.FunSuite

class CompressDumpTest extends FunSuite {

  val ss: SparkSession =
    SparkSession
      .builder()
      .master("local[*]")
      .appName("test")
      .config("spark.default.parallelism", "1")
      .getOrCreate()

  import ss.implicits._

  test("compressUsingDF2") {
    val stringToRs: Map[String, Seq[R]] =
      Map(
        "dump1" -> Seq(
          R(1, "a", 1),
          R(2, "b", 22)
        ),
        "dump2" -> Seq(
          R(1, "a", 3),
          R(2, "b", 22)
        )
      )

    val df1: Dataset[(Int, Seq[RCompressed])] =
      CompressDump
        .compressUsingDF2(dfs = stringToRs.mapValues(s => ss.createDataset(s).toDF()), groupExpr = "id")
        .as[(Int, Seq[RCompressed])]

    

    val map: Map[Int, Seq[RCompressed]] = df1.collect().toMap

    assert(map(1).size == 2)
  }

}

case class R(id: Int, a: String, b: Int)

case class RCompressed(id: Int, a: String, b: Int, compressDumpDts: Seq[String]) 
Example 8
Source File: package.scala    From spark-tools   with Apache License 2.0
package io.univalence

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.centrifuge_sql._

package object centrifuge {

  type AnnotationSql = Annotation

  object AnnotationSql {

    def apply(
      msg: String,
      onField: String,
      fromFields: Vector[String],
      isError: Boolean,
      count: Long
    ): Annotation = Annotation(
      message    = msg,
      isError    = isError,
      count      = count,
      onField    = Some(onField),
      fromFields = fromFields
    )

  }

  object implicits {
    implicit def QADFOps[T](dataframe: Dataset[T]): QADF =
      new QADF(dataframe.toDF())
    implicit def sparkSessionOps(ss: SparkSession): QATools = new QATools(ss)
  }

} 
Example 9
Source File: HashingTF.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
} 
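A minimal sketch of applying HashingTF, assuming a SparkSession named spark:

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

val sentences = spark.createDataFrame(Seq(
  (0, "spark datasets are typed"),
  (1, "dataframes are untyped rows")
)).toDF("id", "text")

val words = new Tokenizer().setInputCol("text").setOutputCol("words").transform(sentences)

val hashed = new HashingTF()
  .setInputCol("words")
  .setOutputCol("features")
  .setNumFeatures(1 << 10)
  .transform(words)

hashed.select("words", "features").show(truncate = false)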
Example 10
Source File: SQLTransformer.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType


@Since("1.6.0")
class SQLTransformer @Since("1.6.0") (@Since("1.6.0") override val uid: String) extends Transformer
  with DefaultParamsWritable {

  @Since("1.6.0")
  def this() = this(Identifiable.randomUID("sql"))

  @Since("1.6.0")
  final val statement: Param[String] = new Param[String](this, "statement", "SQL statement")

  @Since("1.6.0")
  def setStatement(value: String): this.type = set(statement, value)
  @Since("1.6.0")
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val tableName = Identifiable.randomUID(uid)
    dataset.createOrReplaceTempView(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val result = dataset.sparkSession.sql(realStatement)
    dataset.sparkSession.catalog.dropTempView(tableName)
    result
  }

  @Since("1.6.0")
  override def transformSchema(schema: StructType): StructType = {
    val spark = SparkSession.builder().getOrCreate()
    val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty))
    val dummyDF = spark.createDataFrame(dummyRDD, schema)
    val tableName = Identifiable.randomUID(uid)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    dummyDF.createOrReplaceTempView(tableName)
    val outputSchema = spark.sql(realStatement).schema
    spark.catalog.dropTempView(tableName)
    outputSchema
  }

  @Since("1.6.0")
  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
} 
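SQLTransformer substitutes the incoming DataFrame for the __THIS__ placeholder; a short sketch, assuming a SparkSession named spark:

import org.apache.spark.ml.feature.SQLTransformer

val df = spark.createDataFrame(Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")

val sqlTrans = new SQLTransformer()
  .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

sqlTrans.transform(df).show()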
Example 11
Source File: BinaryClassificationEvaluator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType


  @Since("1.2.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "areaUnderROC")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    val scoreAndLabels =
      dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label)
        case Row(rawPrediction: Double, label: Double) => (rawPrediction, label)
      }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val metric = $(metricName) match {
      case "areaUnderROC" => metrics.areaUnderROC()
      case "areaUnderPR" => metrics.areaUnderPR()
    }
    metrics.unpersist()
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "areaUnderROC" => true
    case "areaUnderPR" => true
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): BinaryClassificationEvaluator = super.load(path)
} 
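A sketch of using the evaluator against predictions from any binary classifier; the tiny dataset below is illustrative only, and spark is an existing SparkSession:

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors

val data = spark.createDataFrame(Seq(
  (1.0, Vectors.dense(0.0, 1.1)),
  (0.0, Vectors.dense(2.0, 1.0)),
  (1.0, Vectors.dense(0.1, 1.2)),
  (0.0, Vectors.dense(2.1, 0.9))
)).toDF("label", "features")

val predictions = new LogisticRegression().fit(data).transform(data)

val auc = new BinaryClassificationEvaluator()
  .setRawPredictionCol("rawPrediction")
  .setMetricName("areaUnderROC")
  .evaluate(predictions)
println(s"areaUnderROC = $auc")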
Example 12
Source File: MulticlassClassificationEvaluator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType


  @Since("1.5.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "f1")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels =
      dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(prediction: Double, label: Double) => (prediction, label)
      }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "f1" => metrics.weightedFMeasure
      case "weightedPrecision" => metrics.weightedPrecision
      case "weightedRecall" => metrics.weightedRecall
      case "accuracy" => metrics.accuracy
    }
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = true

  @Since("1.5.0")
  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object MulticlassClassificationEvaluator
  extends DefaultParamsReadable[MulticlassClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
} 
Example 13
Source File: RegressionEvaluator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}


  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
} 
Example 14
Source File: RWrapperUtils.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.Dataset

object RWrapperUtils extends Logging {

  
  def checkDataColumns(rFormula: RFormula, data: Dataset[_]): Unit = {
    if (data.schema.fieldNames.contains(rFormula.getFeaturesCol)) {
      val newFeaturesName = s"${Identifiable.randomUID(rFormula.getFeaturesCol)}"
      logWarning(s"data containing ${rFormula.getFeaturesCol} column, " +
        s"using new name $newFeaturesName instead")
      rFormula.setFeaturesCol(newFeaturesName)
    }
  }
} 
Example 15
Source File: Transformer.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._


abstract class UnaryTransformer[IN, OUT, T <: UnaryTransformer[IN, OUT, T]]
  extends Transformer with HasInputCol with HasOutputCol with Logging {

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  protected def createTransformFunc: IN => OUT

  protected def outputDataType: DataType
  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val transformUDF = udf(this.createTransformFunc, outputDataType)
    dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
} 
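Subclasses of this UnaryTransformer pattern only need to supply createTransformFunc and outputDataType. A sketch of a concrete subclass (a hypothetical transformer that upper-cases a string column):

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{DataType, StringType}

class UpperCaser(override val uid: String)
  extends UnaryTransformer[String, String, UpperCaser] {

  def this() = this(Identifiable.randomUID("upperCaser"))

  override protected def createTransformFunc: String => String = _.toUpperCase

  override protected def outputDataType: DataType = StringType
}

// new UpperCaser().setInputCol("text").setOutputCol("textUpper").transform(df)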
Example 16
Source File: TokenizerSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new Tokenizer)
  }

  test("read/write") {
    val t = new Tokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }
}

class RegexTokenizerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.RegexTokenizerSuite._
  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new RegexTokenizer)
  }

  test("RegexTokenizer") {
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset0 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer0, dataset0)

    val dataset1 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ).toDF()
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")),
      TokenizerTestData("Te,st.  punct", Array("te,st.", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer2, dataset2)
  }

  test("RegexTokenizer with toLowercase false") {
    val tokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setToLowercase(false)
    val dataset = Seq(
      TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")),
      TokenizerTestData("java scala", Array("java", "scala"))
    ).toDF()
    testRegexTokenizer(tokenizer, dataset)
  }

  test("read/write") {
    val t = new RegexTokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMinTokenLength(2)
      .setGaps(false)
      .setPattern("hi")
      .setToLowercase(false)
    testDefaultReadWrite(t)
  }
}

object RegexTokenizerSuite extends SparkFunSuite {

  def testRegexTokenizer(t: RegexTokenizer, dataset: Dataset[_]): Unit = {
    t.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
} 
Example 17
Source File: NGramSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

@BeanInfo
case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])

class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.NGramSuite._
  import testImplicits._

  test("default behavior yields bigram features") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
    val dataset = Seq(NGramTestData(
      Array("Test", "for", "ngram", "."),
      Array("Test for", "for ngram", "ngram .")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("NGramLength=4 yields length 4 n-grams") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array("a b c d", "b c d e")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("empty input yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(Array(), Array())).toDF()
    testNGram(nGram, dataset)
  }

  test("input array < n yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(6)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array()
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("read/write") {
    val t = new NGram()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setN(3)
    testDefaultReadWrite(t)
  }
}

object NGramSuite extends SparkFunSuite {

  def testNGram(t: NGram, dataset: Dataset[_]): Unit = {
    t.transform(dataset)
      .select("nGrams", "wantedNGrams")
      .collect()
      .foreach { case Row(actualNGrams, wantedNGrams) =>
        assert(actualNGrams === wantedNGrams)
      }
  }
} 
Example 18
Source File: SQLBuilderTest.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst

import scala.util.control.NonFatal

import org.apache.spark.sql.{DataFrame, Dataset, QueryTest}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.hive.test.TestHiveSingleton


abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton {
  protected def checkSQL(e: Expression, expectedSQL: String): Unit = {
    val actualSQL = e.sql
    try {
      assert(actualSQL === expectedSQL)
    } catch {
      case cause: Throwable =>
        fail(
          s"""Wrong SQL generated for the following expression:
             |
             |${e.prettyName}
             |
             |$cause
           """.stripMargin)
    }
  }

  protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = {
    val generatedSQL = try new SQLBuilder(plan).toSQL catch { case NonFatal(e) =>
      fail(
        s"""Cannot convert the following logical query plan to SQL:
           |
           |${plan.treeString}
         """.stripMargin)
    }

    try {
      assert(generatedSQL === expectedSQL)
    } catch {
      case cause: Throwable =>
        fail(
          s"""Wrong SQL generated for the following logical query plan:
             |
             |${plan.treeString}
             |
             |$cause
           """.stripMargin)
    }

    checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan))
  }

  protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = {
    checkSQL(df.queryExecution.analyzed, expectedSQL)
  }
} 
Example 19
Source File: Aggregator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression


  def toColumn: TypedColumn[IN, OUT] = {
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    val expr =
      AggregateExpression(
        TypedAggregateExpression(this),
        Complete,
        isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
} 
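This excerpt shows only toColumn; a complete typed Aggregator also supplies zero/reduce/merge/finish plus the two encoders. A sketch (a simple mean over a Dataset[Double]):

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator

object MeanAgg extends Aggregator[Double, (Double, Long), Double] {
  def zero: (Double, Long) = (0.0, 0L)
  def reduce(b: (Double, Long), a: Double): (Double, Long) = (b._1 + a, b._2 + 1)
  def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) =
    (b1._1 + b2._1, b1._2 + b2._2)
  def finish(r: (Double, Long)): Double = if (r._2 == 0) 0.0 else r._1 / r._2
  def bufferEncoder: Encoder[(Double, Long)] =
    Encoders.tuple(Encoders.scalaDouble, Encoders.scalaLong)
  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

// usage, for some ds: Dataset[Double]:  ds.select(MeanAgg.toColumn).show()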
Example 20
Source File: FrequentItems.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.stat

import scala.collection.mutable.{Map => MutableMap}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._

object FrequentItems extends Logging {

  
  def singlePassFreqItems(
      df: DataFrame,
      cols: Seq[String],
      support: Double): DataFrame = {
    require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.")
    val numCols = cols.length
    // number of max items to keep counts for
    val sizeOfMap = (1 / support).toInt
    val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
    val originalSchema = df.schema
    val colInfo: Array[(String, DataType)] = cols.map { name =>
      val index = originalSchema.fieldIndex(name)
      (name, originalSchema.fields(index).dataType)
    }.toArray

    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)(
      seqOp = (counts, row) => {
        var i = 0
        while (i < numCols) {
          val thisMap = counts(i)
          val key = row.get(i)
          thisMap.add(key, 1L)
          i += 1
        }
        counts
      },
      combOp = (baseCounts, counts) => {
        var i = 0
        while (i < numCols) {
          baseCounts(i).merge(counts(i))
          i += 1
        }
        baseCounts
      }
    )
    val justItems = freqItems.map(m => m.baseMap.keys.toArray)
    val resultRow = Row(justItems : _*)
    // append frequent Items to the column name for easy debugging
    val outputCols = colInfo.map { v =>
      StructField(v._1 + "_freqItems", ArrayType(v._2, false))
    }
    val schema = StructType(outputCols).toAttributes
    Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow)))
  }
} 
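FrequentItems backs the public DataFrameStatFunctions API, which is the usual entry point; a sketch, assuming a SparkSession named spark:

val df = spark.createDataFrame(Seq((1, "a"), (1, "b"), (1, "a"), (2, "a"))).toDF("num", "letter")
df.stat.freqItems(Seq("num", "letter"), 0.4).show()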
Example 21
Source File: cache.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.command

import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

case class CacheTableCommand(
    tableIdent: TableIdentifier,
    plan: Option[LogicalPlan],
    isLazy: Boolean) extends RunnableCommand {
  require(plan.isEmpty || tableIdent.database.isEmpty,
    "Database name is not allowed in CACHE TABLE AS SELECT")

  override protected def innerChildren: Seq[QueryPlan[_]] = {
    plan.toSeq
  }

  override def run(sparkSession: SparkSession): Seq[Row] = {
    plan.foreach { logicalPlan =>
      Dataset.ofRows(sparkSession, logicalPlan).createTempView(tableIdent.quotedString)
    }
    sparkSession.catalog.cacheTable(tableIdent.quotedString)

    if (!isLazy) {
      // Performs eager caching
      sparkSession.table(tableIdent).count()
    }

    Seq.empty[Row]
  }
}


case class UncacheTableCommand(tableIdent: TableIdentifier) extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    sparkSession.catalog.uncacheTable(tableIdent.quotedString)
    Seq.empty[Row]
  }
}


case object ClearCacheCommand extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    sparkSession.catalog.clearCache()
    Seq.empty[Row]
  }
} 
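These RunnableCommands are what the corresponding SQL statements resolve to; a sketch of issuing them through a SparkSession, using a hypothetical table named logs:

spark.sql("CACHE TABLE error_logs AS SELECT * FROM logs WHERE level = 'ERROR'")
spark.sql("UNCACHE TABLE error_logs")
spark.sql("CLEAR CACHE")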
Example 22
Source File: FilterTopFeaturesProcess.scala    From incubator-s2graph   with Apache License 2.0
package org.apache.s2graph.s2jobs.wal.process

import org.apache.s2graph.s2jobs.task.TaskConf
import org.apache.s2graph.s2jobs.wal.WalLogAgg
import org.apache.s2graph.s2jobs.wal.transformer.{DefaultTransformer, Transformer}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import play.api.libs.json.{JsObject, Json}

object FilterTopFeaturesProcess {
  private var validFeatureHashKeys: Set[Long] = null
  def getValidFeatureHashKeys(validFeatureHashKeysBCast: Broadcast[Array[Long]]): Set[Long] = {
    if (validFeatureHashKeys == null) {
      validFeatureHashKeys = validFeatureHashKeysBCast.value.toSet
    }

    validFeatureHashKeys
  }

  def collectDistinctFeatureHashes(ss: SparkSession,
                                   filteredDict: DataFrame): Array[Long] = {
    import ss.implicits._

    val featureHashUDF = udf((dim: String, value: String) => WalLogAgg.toFeatureHash(dim, value))

    filteredDict.withColumn("featureHash", featureHashUDF(col("dim"), col("value")))
      .select("featureHash")
      .distinct().as[Long].collect()
  }

  def filterTopKsPerDim(dict: DataFrame,
                        maxRankPerDim: Broadcast[Map[String, Int]],
                        defaultMaxRank: Int): DataFrame = {
    val filterUDF = udf((dim: String, rank: Long) => {
      rank < maxRankPerDim.value.getOrElse(dim, defaultMaxRank)
    })

    dict.filter(filterUDF(col("dim"), col("rank")))
  }

  def filterWalLogAgg(ss: SparkSession,
                      walLogAgg: Dataset[WalLogAgg],
                      transformers: Seq[Transformer],
                      validFeatureHashKeysBCast: Broadcast[Array[Long]]) = {
    import ss.implicits._
    walLogAgg.mapPartitions { iter =>
      val validFeatureHashKeys = getValidFeatureHashKeys(validFeatureHashKeysBCast)

      iter.map { walLogAgg =>
        WalLogAgg.filterProps(walLogAgg, transformers, validFeatureHashKeys)
      }
    }
  }
}

class FilterTopFeaturesProcess(taskConf: TaskConf) extends org.apache.s2graph.s2jobs.task.Process(taskConf) {

  import FilterTopFeaturesProcess._

  
  override def execute(ss: SparkSession, inputMap: Map[String, DataFrame]): DataFrame = {
    import ss.implicits._

    val maxRankPerDim = taskConf.options.get("maxRankPerDim").map { s =>
      Json.parse(s).as[JsObject].fields.map { case (k, jsValue) =>
        k -> jsValue.as[Int]
      }.toMap
    }
    val maxRankPerDimBCast = ss.sparkContext.broadcast(maxRankPerDim.getOrElse(Map.empty))

    val defaultMaxRank = taskConf.options.get("defaultMaxRank").map(_.toInt)

    val featureDict = inputMap(taskConf.options("featureDict"))
    val walLogAgg = inputMap(taskConf.options("walLogAgg")).as[WalLogAgg]

    val transformers = TaskConf.parseTransformers(taskConf)

    val filteredDict = filterTopKsPerDim(featureDict, maxRankPerDimBCast, defaultMaxRank.getOrElse(Int.MaxValue))
    val validFeatureHashKeys = collectDistinctFeatureHashes(ss, filteredDict)
    val validFeatureHashKeysBCast = ss.sparkContext.broadcast(validFeatureHashKeys)

    filterWalLogAgg(ss, walLogAgg, transformers, validFeatureHashKeysBCast).toDF()
  }

  override def mandatoryOptions: Set[String] = Set("featureDict", "walLogAgg")
} 
Example 23
Source File: Deserializer.scala    From almaren-framework   with Apache License 2.0
package com.github.music.of.the.ainur.almaren.state.core

import com.github.music.of.the.ainur.almaren.State
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{DataType, StructType}

import scala.language.implicitConversions
import com.github.music.of.the.ainur.almaren.Almaren
import com.github.music.of.the.ainur.almaren.util.Constants
import org.apache.spark.sql.Dataset

abstract class Deserializer() extends State {
  override def executor(df: DataFrame): DataFrame = deserializer(df)
  def deserializer(df: DataFrame): DataFrame
  implicit def string2Schema(schema: String): DataType =
    StructType.fromDDL(schema)
  
}

case class AvroDeserializer(columnName: String,schema: String) extends Deserializer {
  import org.apache.spark.sql.avro._
  import org.apache.spark.sql.functions._
  override def deserializer(df: DataFrame): DataFrame = {
    logger.info(s"columnName:{$columnName}, schema:{$schema}")
    df.withColumn(columnName,from_avro(col(columnName),schema))
      .select("*",columnName.concat(".*")).drop(columnName)
  }
}

case class JsonDeserializer(columnName: String,schema: Option[String]) extends Deserializer {
  import org.apache.spark.sql.functions._
  override def deserializer(df: DataFrame): DataFrame = {
    import df.sparkSession.implicits._
    logger.info(s"columnName:{$columnName}, schema:{$schema}")
    df.withColumn(columnName,
      from_json(col(columnName),
        schema.getOrElse(getSchemaDDL(df.selectExpr(columnName).as[(String)]))))
      .select("*",columnName.concat(".*"))
      .drop(columnName)
  }
  private def getSchemaDDL(df: Dataset[String]): String =
    Almaren.spark.getOrCreate().read.json(df.sample(Constants.sampleDeserializer)).schema.toDDL
}

case class XMLDeserializer(columnName: String) extends Deserializer {
  import com.databricks.spark.xml.XmlReader
  override def deserializer(df: DataFrame): DataFrame = {
    logger.info(s"columnName:{$columnName}")
    new XmlReader().xmlRdd(df.sparkSession,df.select(columnName).rdd.map(r => r(0).asInstanceOf[String])).toDF
  }
} 
Example 24
Source File: PipelineWrapper.scala    From automl   with Apache License 2.0
package com.tencent.angel.spark.automl.feature

import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.sql.{DataFrame, Dataset}

class PipelineWrapper() {

  var pipeline = new Pipeline()

  var transformers: Array[TransformerWrapper] = Array()

  def setTransformers(value: Array[TransformerWrapper]): this.type = {
    transformers = value
    setStages(PipelineBuilder.build(transformers))
    this
  }

  def setStages(value: Array[_ <: PipelineStage]): Unit = {
    pipeline = pipeline.setStages(value)
  }

  def fit(dataset: Dataset[_]): PipelineModelWrapper = {
    new PipelineModelWrapper(pipeline.fit(dataset), transformers)
  }

}

class PipelineModelWrapper(val model: PipelineModel,
                           val transformers: Array[TransformerWrapper]) {

  def transform(dataset: Dataset[_]): DataFrame = {
    var df = model.transform(dataset)
    if (transformers.length >= 2) {
      (0 until transformers.length - 1).foreach { i =>
        val outCols = transformers(i).getOutputCols
        for (col <- outCols) {
          df = df.drop(col)
        }
      }
    }
    df
  }
} 
Example 25
Source File: Sampler.scala    From automl   with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

import scala.util.Random


class Sampler(fraction: Double,
              override val uid: String,
              seed: Int = Random.nextInt)
  extends Transformer {

  def this(fraction: Double) = this(fraction, Identifiable.randomUID("sampler"))

  final val inputCol = new Param[String](this, "inputCol", "input column name")

  final def setInputCol(value: String): this.type = set(inputCol, value)

  final def getOutputCol: String = $(inputCol)

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.sample(false, fraction, seed).toDF
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): Sampler = defaultCopy(extra)
}

object Sampler {

  def main(args: Array[String]): Unit = {
    val ss = SparkSession
      .builder
      .master("local")
      .appName("preprocess")
      .getOrCreate()

    val training = ss.read.format("libsvm")
      .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")

    println(training.count)

    val sampler = new Sampler(0.5)
      .setInputCol("features")

    val pipeline = new Pipeline()
      .setStages(Array(sampler))

    val model = pipeline.fit(training)

    val test = ss.read.format("libsvm")
      .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")

    model.transform(test).select("*")
      .collect()
      .foreach { case Row(label: Double, vector: Vector) =>
        println(s"($label, " +
          s"${vector.toSparse.indices.mkString("[", ",", "]")}, " +
          s"${vector.toSparse.values.mkString("[", ",", "]")}")
      }

    ss.stop()
  }
} 
Example 26
Source File: FeatureUtils.scala    From automl   with Apache License 2.0
package com.tencent.angel.spark.automl.feature

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
import org.apache.spark.sql.{Dataset, Row}

import scala.language.postfixOps

object FeatureUtils {

  def maxDim(dataset: Dataset[Row], col: String = "features"): Int = {
    dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] =>
      val dim = rows.map { case Row(v: Vector) =>
        v match {
          case sv: SparseVector => sv.indices.last
          case dv: DenseVector => dv.size
        }
      }.max
      Iterator(dim)
    }.max + 1
  }

  def countNonZero(dataset: Dataset[Row], col: String = "features"): Array[Int] = {
    dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] =>
      val mergeIndices = rows.map { case Row(v: Vector) =>
        v match {
          case sv: SparseVector =>
            sv.indices.toList
        }
      }.reduce(_ union _ distinct)
      Iterator(mergeIndices)
    }.reduce((a, b) => (a union b).distinct).toArray
  }

} 
Example 27
Source File: package.scala    From amadou   with Apache License 2.0
package com.mediative.amadou

import com.google.api.services.bigquery.model._
import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem
import com.google.cloud.hadoop.io.bigquery._
import org.apache.hadoop.fs.{FileSystem, Path}
import net.ceedubs.ficus.readers.ValueReader
import net.ceedubs.ficus.FicusInstances

import org.apache.spark.sql.{Dataset, SparkSession, Encoder}
import java.util.concurrent.ThreadLocalRandom
import scala.collection.JavaConversions._

package object bigquery extends FicusInstances {

  object CreateDisposition extends Enumeration {
    val CREATE_IF_NEEDED, CREATE_NEVER = Value
  }

  object WriteDisposition extends Enumeration {
    val WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY = Value
  }

  val BQ_CSV_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss zzz"

  object TableNotFound {
    import com.google.api.client.googleapis.json.GoogleJsonResponseException
    import com.google.api.client.googleapis.json.GoogleJsonError
    import scala.collection.JavaConverters._

    def unapply(error: Throwable): Option[GoogleJsonError.ErrorInfo] = error match {
      case error: GoogleJsonResponseException =>
        Some(error.getDetails)
          .filter(_.getCode == 404)
          .flatMap(_.getErrors.asScala.find(_.getReason == "notFound"))
      case _ => None
    }
  }

  def tableHasDataForDate(
      spark: SparkSession,
      table: TableReference,
      date: java.sql.Date,
      column: String): Boolean = {
    val bq = BigQueryClient.getInstance(spark.sparkContext.hadoopConfiguration)
    bq.hasDataForDate(table, date, column)
  }

  
    def saveAsBigQueryTable(
        tableRef: TableReference,
        writeDisposition: WriteDisposition.Value,
        createDisposition: CreateDisposition.Value): Unit = {
      val bucket = conf.get(BigQueryConfiguration.GCS_BUCKET_KEY)
      val temp =
        s"spark-bigquery-${System.currentTimeMillis()}=${ThreadLocalRandom.current.nextInt(Int.MaxValue)}"
      val gcsPath = s"gs://$bucket/spark-bigquery-tmp/$temp"
      self.write.json(gcsPath)

      val schemaFields = self.schema.fields.map { field =>
        import org.apache.spark.sql.types._

        val fieldType = field.dataType match {
          case BooleanType    => "BOOLEAN"
          case LongType       => "INTEGER"
          case IntegerType    => "INTEGER"
          case StringType     => "STRING"
          case DoubleType     => "FLOAT"
          case TimestampType  => "TIMESTAMP"
          case _: DecimalType => "INTEGER"
        }
        new TableFieldSchema().setName(field.name).setType(fieldType)
      }.toList

      val tableSchema = new TableSchema().setFields(schemaFields)

      bq.load(gcsPath, tableRef, tableSchema, writeDisposition, createDisposition)
      delete(new Path(gcsPath))
    }

    private def delete(path: Path): Unit = {
      val fs = FileSystem.get(path.toUri, conf)
      fs.delete(path, true)
      ()
    }

  }

  implicit val valueReader: ValueReader[BigQueryTable.PartitionStrategy] =
    ValueReader[String].map {
      _ match {
        case "month" => BigQueryTable.PartitionByMonth
        case "day"   => BigQueryTable.PartitionByDay
        case other   => sys.error(s"Unknown partition strategy")
      }
    }
} 
Example 28
Source File: DatasetExample.scala    From gihyo-spark-book-example   with Apache License 2.0
package jp.gihyo.spark.ch05

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.{Dataset, SQLContext}
import org.apache.spark.sql.functions._

private case class Person(id: Int, name: String, age: Int)

object DatasetExample {

  
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DatasetExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    run(sc, sqlContext)
    sc.stop()
  }

  def run(sc: SparkContext, sqlContext: SQLContext): Unit = {
    import sqlContext.implicits._

    // Creates a Dataset from a `Seq`
    val seq = Seq((1, "Bob", 23), (2, "Tom", 23), (3, "John", 22))
    val ds1: Dataset[(Int, String, Int)] = sqlContext.createDataset(seq)
    val ds2: Dataset[(Int, String, Int)] = seq.toDS()

    // Creates a Dataset from a `RDD`
    val rdd = sc.parallelize(seq)
    val ds3: Dataset[(Int, String, Int)] = sqlContext.createDataset(rdd)
    val ds4: Dataset[(Int, String, Int)] = rdd.toDS()

    // Creates a Dataset from a `DataFrame`
    val df = rdd.toDF("id", "name", "age")
    val ds5: Dataset[Person] = df.as[Person]

    // Selects a column
    ds5.select(expr("name").as[String]).show()

    // Filtering
    ds5.filter(_.name == "Bob").show()
    ds5.filter(person => person.age == 23).show()

    // Groups and counts the number of rows
    ds5.groupBy(_.age).count().show()
  }
} 
Example 29
Source File: MovieRecommendation.scala    From Scala-Machine-Learning-Projects   with MIT License
package com.packt.ScalaML.MovieRecommendation

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SQLImplicits
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import scala.Tuple2
import org.apache.spark.rdd.RDD

object MovieRecommendation {  
  //Compute the RMSE to evaluate the model. The lower the RMSE, the better the model and its prediction capability.
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean): Double = {
    val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product)))
    val predictionsAndRatings = predictions.map { x => ((x.user, x.product), x.rating)
    }.join(data.map(x => ((x.user, x.product), x.rating))).values
    if (implicitPrefs) {
      println("(Prediction, Rating)")
      println(predictionsAndRatings.take(5).mkString("\n"))
    }
    math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean())
  }

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("JavaLDAExample")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/").
      getOrCreate()

    val ratingsFile = "data/ratings.csv"
    val df1 = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratingsFile)

    val ratingsDF = df1.select(df1.col("userId"), df1.col("movieId"), df1.col("rating"), df1.col("timestamp"))
    ratingsDF.show(false)

    val moviesFile = "data/movies.csv"
    val df2 = spark.read.format("com.databricks.spark.csv").option("header", "true").load(moviesFile)

    val moviesDF = df2.select(df2.col("movieId"), df2.col("title"), df2.col("genres"))
    moviesDF.show(false)

    ratingsDF.createOrReplaceTempView("ratings")
    moviesDF.createOrReplaceTempView("movies")
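
    // The ALS training step is omitted from this excerpt; `model` and `testRDD` below come from
    // something along these lines (a sketch, with illustrative parameter values):
    //   val ratingsRDD = ratingsDF.rdd.map(r =>
    //     Rating(r.getString(0).toInt, r.getString(1).toInt, r.getString(2).toDouble))
    //   val Array(trainRDD, testRDD) = ratingsRDD.randomSplit(Array(0.75, 0.25), seed = 12345L)
    //   val model = new ALS().setRank(20).setIterations(15).setLambda(0.10).run(trainRDD)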

    

    var rmseTest = computeRmse(model, testRDD, true)
    println("Test RMSE: = " + rmseTest) //Less is better

    //Movie recommendation for a specific user. Get the top 6 movie predictions for user 668
    println("Recommendations: (MovieId => Rating)")
    println("----------------------------------")
    val recommendationsUser = model.recommendProducts(668, 6)
    recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println)
    println("----------------------------------")

    spark.stop()
  }
} 
Example 30
Source File: Describe.scala    From Scala-Machine-Learning-Projects   with MIT License
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel }
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.max
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

import org.apache.spark._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset

import org.apache.spark.ml.linalg.{ Matrix, Vectors }
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row

object Describe {
  case class CustomerAccount(state_code: String, account_length: Integer, area_code: String,
    international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
    total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
    total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
    total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
    total_international_mins: Double, total_international_calls: Double, total_international_charge: Double,
    total_international_num_calls: Double, churn: String)

  val schema = StructType(Array(
    StructField("state_code", StringType, true),
    StructField("account_length", IntegerType, true),
    StructField("area_code", StringType, true),
    StructField("international_plan", StringType, true),
    StructField("voice_mail_plan", StringType, true),
    StructField("num_voice_mail", DoubleType, true),
    StructField("total_day_mins", DoubleType, true),
    StructField("total_day_calls", DoubleType, true),
    StructField("total_day_charge", DoubleType, true),
    StructField("total_evening_mins", DoubleType, true),
    StructField("total_evening_calls", DoubleType, true),
    StructField("total_evening_charge", DoubleType, true),
    StructField("total_night_mins", DoubleType, true),
    StructField("total_night_calls", DoubleType, true),
    StructField("total_night_charge", DoubleType, true),
    StructField("total_international_mins", DoubleType, true),
    StructField("total_international_calls", DoubleType, true),
    StructField("total_international_charge", DoubleType, true),
    StructField("total_international_num_calls", DoubleType, true),
    StructField("churn", StringType, true)))

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Desribe")
      .getOrCreate()

    spark.conf.set("spark.debug.maxToStringFields", 10000)
    val DEFAULT_MAX_TO_STRING_FIELDS = 2500
    // Read back the effective setting from SparkEnv if it is available, otherwise fall back to the default.
    val maxToStringFields = if (SparkEnv.get != null) {
      SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS)
    } else {
      DEFAULT_MAX_TO_STRING_FIELDS
    }
    import spark.implicits._

    val trainSet: Dataset[CustomerAccount] = spark.read.
      option("inferSchema", "false")
      .format("com.databricks.spark.csv")
      .schema(schema)
      .load("data/churn-bigml-80.csv")
      .as[CustomerAccount]

    val statsDF = trainSet.describe()   
    statsDF.show()

    trainSet.createOrReplaceTempView("UserAccount")
    spark.catalog.cacheTable("UserAccount")
    
    spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()
    spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()
    trainSet.groupBy("churn").count.show()
    spark.sqlContext.sql("SELECT churn,SUM(total_international_num_calls) FROM UserAccount GROUP BY churn")
    
  }
} 
Example 31
Source File: Preprocessing.scala    From Scala-Machine-Learning-Projects   with MIT License 5 votes vote down vote up
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset



object Preprocessing {
  case class CustomerAccount(state_code: String, account_length: Integer, area_code: String,
    international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
    total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
    total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
    total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
    total_international_mins: Double, total_international_calls: Double, total_international_charge: Double,
    total_international_num_calls: Double, churn: String)

  val schema = StructType(Array(
    StructField("state_code", StringType, true),
    StructField("account_length", IntegerType, true),
    StructField("area_code", StringType, true),
    StructField("international_plan", StringType, true),
    StructField("voice_mail_plan", StringType, true),
    StructField("num_voice_mail", DoubleType, true),
    StructField("total_day_mins", DoubleType, true),
    StructField("total_day_calls", DoubleType, true),
    StructField("total_day_charge", DoubleType, true),
    StructField("total_evening_mins", DoubleType, true),
    StructField("total_evening_calls", DoubleType, true),
    StructField("total_evening_charge", DoubleType, true),
    StructField("total_night_mins", DoubleType, true),
    StructField("total_night_calls", DoubleType, true),
    StructField("total_night_charge", DoubleType, true),
    StructField("total_international_mins", DoubleType, true),
    StructField("total_international_calls", DoubleType, true),
    StructField("total_international_charge", DoubleType, true),
    StructField("total_international_num_calls", DoubleType, true),
    StructField("churn", StringType, true)))

  val spark: SparkSession = SparkSessionCreate.createSession("ChurnPredictionRandomForest")
  import spark.implicits._

  val trainSet: Dataset[CustomerAccount] = spark.read.
    option("inferSchema", "false")
    .format("com.databricks.spark.csv")
    .schema(schema)
    .load("data/churn-bigml-80.csv")
    .as[CustomerAccount]

  val statsDF = trainSet.describe()
  statsDF.show()
  trainSet.cache()

  trainSet.groupBy("churn").sum("total_international_num_calls").show()
  trainSet.groupBy("churn").sum("total_international_charge").show()

  val testSet: Dataset[CustomerAccount] = spark.read.
    option("inferSchema", "false")
    .format("com.databricks.spark.csv")
    .schema(schema)
    .load("data/churn-bigml-20.csv")
    .as[CustomerAccount]

  testSet.describe().show()
  testSet.cache()

  trainSet.printSchema()
  trainSet.show()

  trainSet.createOrReplaceTempView("UserAccount")
  spark.catalog.cacheTable("UserAccount")

  /////////////// Feature engineering
  spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()
  spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()
  trainSet.groupBy("churn").count.show()
  spark.sqlContext.sql("SELECT churn,SUM(total_international_num_calls) as Total_intl_call FROM UserAccount GROUP BY churn").show()

  val fractions = Map("False" -> 0.1675, "True" -> 1.0)

  //Here we're keeping all instances of the Churn=True class, but downsampling the Churn=False class to a fraction of 388/2278.
  val churnDF = trainSet.stat.sampleBy("churn", fractions, 123456L)

  churnDF.groupBy("churn").count.show()

  val trainDF = churnDF
    .drop("state_code")
    .drop("area_code")
    .drop("voice_mail_plan")
    .drop("total_day_charge")
    .drop("total_evening_charge")

  println(trainDF.count)
  trainDF.select("account_length", "international_plan", "num_voice_mail", "total_day_calls", "total_international_num_calls", "churn").show(10)
} 
Example 32
Source File: XmlReader.scala    From spark-xml   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.xml

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SQLContext, SparkSession}
import org.apache.spark.sql.types.StructType
import com.databricks.spark.xml.util.XmlFile
import com.databricks.spark.xml.util.FailFastMode


  @deprecated("Use xmlFile(SparkSession, ...)", "0.5.0")
  def xmlFile(sqlContext: SQLContext, path: String): DataFrame = {
    // We need the `charset` and `rowTag` before creating the relation.
    val (charset, rowTag) = {
      val options = XmlOptions(parameters.toMap)
      (options.charset, options.rowTag)
    }
    val relation = XmlRelation(
      () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag),
      Some(path),
      parameters.toMap,
      schema)(sqlContext)
    sqlContext.baseRelationToDataFrame(relation)
  }

  @deprecated("Use xmlRdd(SparkSession, ...)", "0.5.0")
  def xmlRdd(sqlContext: SQLContext, xmlRDD: RDD[String]): DataFrame = {
    val relation = XmlRelation(
      () => xmlRDD,
      None,
      parameters.toMap,
      schema)(sqlContext)
    sqlContext.baseRelationToDataFrame(relation)
  }

} 
Example 33
Source File: GroupSortedDataset.scala    From spark-sorted   with Apache License 2.0 5 votes vote down vote up
package com.tresata.spark.sorted.sql

import scala.reflect.ClassTag

import org.apache.spark.sql.{ Column, Dataset, Encoder }
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder }

import com.tresata.spark.sorted.{ mapStreamIterator, mapStreamIteratorWithContext, newWCreate }

object GroupSortedDataset {
  private[sql] def apply[K: Encoder, V](dataset: Dataset[(K, V)], numPartitions: Option[Int], reverse: Boolean, sortBy: Column => Column): GroupSortedDataset[K, V] = {
    val key = col(dataset.columns.head)
    val valueSort = {
      val sort = sortBy(col(dataset.columns.last))
      if (reverse) sort.desc else sort.asc
    }
    new GroupSortedDataset(numPartitions.map(dataset.repartition(_, key)).getOrElse(dataset.repartition(key)).sortWithinPartitions(key, valueSort))
  }
}

class GroupSortedDataset[K: Encoder, V] private (dataset: Dataset[(K, V)]) extends Serializable {
  def toDS: Dataset[(K, V)] = dataset

  def mapStreamByKey[W: Encoder, C](c: () => C)(f: (C, Iterator[V]) => TraversableOnce[W]): Dataset[(K, W)] = {
    implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W])
    dataset.mapPartitions(mapStreamIteratorWithContext(_)(c, f))
  }

  def mapStreamByKey[W: Encoder](f: Iterator[V] => TraversableOnce[W]): Dataset[(K, W)] = {
    implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W])
    dataset.mapPartitions(mapStreamIterator(_)(f))
  }

  def foldLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = {
    val wCreate = newWCreate(w)
    mapStreamByKey(iter => Iterator(iter.foldLeft(wCreate())(f)))
  }

  def reduceLeftByKey[W >: V: Encoder](f: (W, V) => W): Dataset[(K, W)] =
    mapStreamByKey(iter => Iterator(iter.reduceLeft(f)))

  def scanLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = {
    val wCreate = newWCreate(w)
    mapStreamByKey(_.scanLeft(wCreate())(f))
  }
} 
Example 34
Source File: SparkSuite.scala    From spark-sorted   with Apache License 2.0 5 votes vote down vote up
package com.tresata.spark.sorted

import org.scalactic.Equality
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{ Dataset, SparkSession }

object SparkSuite {
  lazy val spark: SparkSession = {
    val session = SparkSession.builder
      .master("local[*]")
      .appName("test")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.ui.enabled", false)
      .config("spark.sql.shuffle.partitions", 4)
      .getOrCreate()
    session
  }
  lazy val sc: SparkContext = spark.sparkContext

  lazy val jsc = new JavaSparkContext(sc)
  def javaSparkContext() = jsc
}

trait SparkSuite {
  implicit lazy val spark: SparkSession = SparkSuite.spark
  implicit lazy val sc: SparkContext = SparkSuite.spark.sparkContext

  implicit def rddEq[X]: Equality[RDD[X]] = new Equality[RDD[X]] {
    private def toCounts[Y](s: Seq[Y]): Map[Y, Int] = s.groupBy(identity).mapValues(_.size)

    def areEqual(a: RDD[X], b: Any): Boolean = b match {
      case s: Seq[_] => toCounts(a.collect) == toCounts(s)
      case rdd: RDD[_] => toCounts(a.collect) == toCounts(rdd.collect)
    }
  }

  implicit def gsEq[K, V](implicit rddEq: Equality[RDD[(K, V)]]): Equality[GroupSorted[K, V]] = new Equality[GroupSorted[K, V]] {
    def areEqual(a: GroupSorted[K, V], b: Any): Boolean = rddEq.areEqual(a, b)
  }
  
  implicit def dsEq[X](implicit rddEq: Equality[RDD[X]]): Equality[Dataset[X]] = new Equality[Dataset[X]] {
    def areEqual(a: Dataset[X], b: Any): Boolean = b match {
      case ds: Dataset[_] => rddEq.areEqual(a.rdd, ds.rdd)
      case x => rddEq.areEqual(a.rdd, x)
    }
  }
} 
Example 35
Source File: VLORRealDataExample.scala    From spark-vlbfgs   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.example

import org.apache.spark.ml.classification.{LogisticRegression, VLogisticRegression}
import org.apache.spark.sql.{Dataset, SparkSession}

object VLORRealDataExample {

  // https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#a9a
  def main(args: Array[String]) = {
    val spark = SparkSession
      .builder()
      .appName("VLogistic Regression real data example")
      .getOrCreate()

    val sc = spark.sparkContext

    val dataset1: Dataset[_] = spark.read.format("libsvm").load("data/a9a")

    val trainer = new LogisticRegression()
      .setFitIntercept(false)
      .setRegParam(0.5)
    val model = trainer.fit(dataset1)

    val vtrainer = new VLogisticRegression()
      .setColsPerBlock(100)
      .setRowsPerBlock(10)
      .setColPartitions(3)
      .setRowPartitions(3)
      .setRegParam(0.5)
    val vmodel = vtrainer.fit(dataset1)

    println(s"VLogistic regression coefficients: ${vmodel.coefficients}")
    println(s"Logistic regression coefficients: ${model.coefficients}")

    sc.stop()
  }
} 
Example 36
Source File: LORExample2.scala    From spark-vlbfgs   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.example

import org.apache.spark.ml.classification.MyLogisticRegression
import org.apache.spark.sql.{Dataset, SparkSession}

object LORExample2 {

  def main(args: Array[String]) = {

    var maxIter: Int = 100

    var dimension: Int = 780
    var regParam: Double = 0.5
    var fitIntercept: Boolean = true
    var elasticNetParam = 1.0

    var dataPath: String = null

    try {
      maxIter = args(0).toInt

      dimension = args(1).toInt

      regParam = args(2).toDouble
      fitIntercept = args(3).toBoolean
      elasticNetParam = args(4).toDouble

      dataPath = args(5)
    } catch {
      case _: Throwable =>
        println("Param list: "
          + "maxIter dimension"
          + " regParam fitIntercept elasticNetParam dataPath")
        println("parameter description:" +
          "\nmaxIter          max iteration number for VLogisticRegression" +
          "\ndimension        training data dimension number" +
          "\nregParam         regularization parameter" +
          "\nfitIntercept     whether to train intercept, true or false" +
          "\nelasticNetParam  elastic net parameter for regulization" +
          "\ndataPath         training data path on HDFS")

        System.exit(-1)
    }

    val spark = SparkSession
      .builder()
      .appName("LOR for testing")
      .getOrCreate()

    val sc = spark.sparkContext

    try {
      println(s"begin load data from $dataPath")
      val dataset: Dataset[_] = spark.read.format("libsvm")
        .option("numFeatures", dimension.toString)
        .load(dataPath)

      val trainer = new MyLogisticRegression()
        .setMaxIter(maxIter)
        .setRegParam(regParam)
        .setFitIntercept(fitIntercept)
        .setElasticNetParam(elasticNetParam)

      val model = trainer.fit(dataset)

      println(s"LOR done, coeffs non zeros: ${model.coefficients.numNonzeros}")
    } catch {
      case e: Exception =>
        e.printStackTrace()
    } finally {
      // println("Press ENTER to exit.")
      // System.in.read()
    }
    sc.stop()
  }

} 
Example 37
Source File: VSoftmaxRegressionSuite.scala    From spark-vlbfgs   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.classification

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg.{SparseMatrix, Vector, Vectors}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}

import scala.language.existentials


class VSoftmaxRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {

  import testImplicits._

  private val seed = 42
  @transient var multinomialDataset: Dataset[_] = _
  private val eps: Double = 1e-5

  override def beforeAll(): Unit = {
    super.beforeAll()

    multinomialDataset = {
      val nPoints = 50
      val coefficients = Array(
        -0.57997, 0.912083, -0.371077, -0.819866, 2.688191,
        -0.16624, -0.84355, -0.048509, -0.301789, 4.170682)

      val xMean = Array(5.843, 3.057, 3.758, 1.199)
      val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)

      val testData = LogisticRegressionSuite.generateMultinomialLogisticInput(
        coefficients, xMean, xVariance, addIntercept = true, nPoints, seed)

      val df = sc.parallelize(testData, 4).toDF().withColumn("weight", rand(seed))
      df.cache()
      println("softmax test data:")
      df.show(10, false)
      df
    }
  }

  test("test on multinomialDataset") {

    def b2s(b: Boolean): String = {
      if (b) "w/" else "w/o"
    }

    for (standardization <- Seq(false, true)) {
      for ((reg, elasticNet) <- Seq((0.0, 0.0), (2.3, 0.0), (0.3, 0.05), (0.01, 1.0))) {
        println()
        println(s"# test ${b2s(standardization)} standardization, reg=${reg}, elasticNet=${elasticNet}")

        val trainer = new LogisticRegression()
          .setFamily("multinomial")
          .setStandardization(standardization)
          .setWeightCol("weight")
          .setRegParam(reg)
          .setFitIntercept(false)
          .setElasticNetParam(elasticNet)

        val model = trainer.fit(multinomialDataset)

        val vtrainer = new VSoftmaxRegression()
          .setColsPerBlock(2)
          .setRowsPerBlock(5)
          .setColPartitions(2)
          .setRowPartitions(3)
          .setWeightCol("weight")
          .setGeneratingFeatureMatrixBuffer(2)
          .setStandardization(standardization)
          .setRegParam(reg)
          .setElasticNetParam(elasticNet)
        val vmodel = vtrainer.fit(multinomialDataset)

        println(s"VSoftmaxRegression coefficientMatrix:\n" +
          s"${vmodel.coefficientMatrix.asInstanceOf[SparseMatrix].toDense},\n" +
          s"ml.SoftmaxRegression coefficientMatrix:\n" +
          s"${model.coefficientMatrix}\n")

        assert(vmodel.coefficientMatrix ~== model.coefficientMatrix relTol eps)
      }
    }
  }
} 
Example 38
Source File: StreamingIncrementCommand.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.xsql.execution.command

import java.util.Locale

import org.apache.spark.SparkException
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.execution.streaming.StreamingRelationV2
import org.apache.spark.sql.sources.v2.StreamWriteSupport
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.xsql.DataSourceManager._
import org.apache.spark.sql.xsql.StreamingSinkType


case class StreamingIncrementCommand(plan: LogicalPlan) extends RunnableCommand {

  private var outputMode: OutputMode = OutputMode.Append
  // dummy
  override def output: Seq[AttributeReference] = Seq.empty
  // dummy
  override def producedAttributes: AttributeSet = plan.producedAttributes

  override def run(sparkSession: SparkSession): Seq[Row] = {
    import StreamingSinkType._
    val qe = new QueryExecution(sparkSession, new ConstructedStreaming(plan))
    val df = new Dataset(sparkSession, qe, RowEncoder(qe.analyzed.schema))
    plan.collectLeaves.head match {
      case StreamingRelationV2(_, _, extraOptions, _, _) =>
        val source = extraOptions.getOrElse(STREAMING_SINK_TYPE, DEFAULT_STREAMING_SINK)
        val sinkOptions = extraOptions.filter(_._1.startsWith(STREAMING_SINK_PREFIX)).map { kv =>
          val key = kv._1.substring(STREAMING_SINK_PREFIX.length)
          (key, kv._2)
        }
        StreamingSinkType.withName(source.toUpperCase(Locale.ROOT)) match {
          case CONSOLE =>
          case TEXT | PARQUET | ORC | JSON | CSV =>
            if (sinkOptions.get(STREAMING_SINK_PATH) == None) {
              throw new SparkException("Sink type is file, must config path")
            }
          case KAFKA =>
            if (sinkOptions.get(STREAMING_SINK_BOOTSTRAP_SERVERS) == None) {
              throw new SparkException("Sink type is kafka, must config bootstrap servers")
            }
            if (sinkOptions.get(STREAMING_SINK_TOPIC) == None) {
              throw new SparkException("Sink type is kafka, must config kafka topic")
            }
          case _ =>
            throw new SparkException(
              "Sink type is invalid, " +
                s"select from ${StreamingSinkType.values}")
        }
        val ds = DataSource.lookupDataSource(source, sparkSession.sessionState.conf)
        val disabledSources = sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",")
        val sink = ds.newInstance() match {
          case w: StreamWriteSupport if !disabledSources.contains(w.getClass.getCanonicalName) =>
            w
          case _ =>
            val ds = DataSource(
              sparkSession,
              className = source,
              options = sinkOptions.toMap,
              partitionColumns = Nil)
            ds.createSink(InternalOutputModes.Append)
        }
        val outputMode = InternalOutputModes(
          extraOptions.getOrElse(STREAMING_OUTPUT_MODE, DEFAULT_STREAMING_OUTPUT_MODE))
        val duration =
          extraOptions.getOrElse(STREAMING_TRIGGER_DURATION, DEFAULT_STREAMING_TRIGGER_DURATION)
        val trigger =
          extraOptions.getOrElse(STREAMING_TRIGGER_TYPE, DEFAULT_STREAMING_TRIGGER_TYPE) match {
            case STREAMING_MICRO_BATCH_TRIGGER => Trigger.ProcessingTime(duration)
            case STREAMING_ONCE_TRIGGER => Trigger.Once()
            case STREAMING_CONTINUOUS_TRIGGER => Trigger.Continuous(duration)
          }
        val query = sparkSession.sessionState.streamingQueryManager.startQuery(
          extraOptions.get("queryName"),
          extraOptions.get(STREAMING_CHECKPOINT_LOCATION),
          df,
          sinkOptions.toMap,
          sink,
          outputMode,
          useTempCheckpointLocation = source == DEFAULT_STREAMING_SINK,
          recoverFromCheckpointLocation = true,
          trigger = trigger)
        query.awaitTermination()
    }
    // dummy
    Seq.empty
  }
}

case class ConstructedStreaming(child: LogicalPlan) extends UnaryNode {
  override def output: Seq[Attribute] = child.output
} 
Example 39
Source File: Aggregator.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression


  def toColumn: TypedColumn[IN, OUT] = {
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    val expr =
      AggregateExpression(
        TypedAggregateExpression(this),
        Complete,
        isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
} 
Example 40
Source File: JsonUtils.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.json

import org.apache.spark.input.PortableDataStream
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.catalyst.json.JSONOptions

object JsonUtils {
  
  def sample(json: RDD[PortableDataStream], options: JSONOptions): RDD[PortableDataStream] = {
    require(options.samplingRatio > 0,
      s"samplingRatio (${options.samplingRatio}) should be greater than 0")
    if (options.samplingRatio > 0.99) {
      json
    } else {
      json.sample(withReplacement = false, options.samplingRatio, 1)
    }
  }
} 
Example 41
Source File: SaveIntoDataSourceCommand.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.CreatableRelationProvider


case class SaveIntoDataSourceCommand(
    query: LogicalPlan,
    dataSource: CreatableRelationProvider,
    options: Map[String, String],
    mode: SaveMode) extends RunnableCommand {

  override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    dataSource.createRelation(
      sparkSession.sqlContext, mode, options, Dataset.ofRows(sparkSession, query))

    Seq.empty[Row]
  }

  override def simpleString: String = {
    val redacted = SQLConf.get.redactOptions(options)
    s"SaveIntoDataSourceCommand ${dataSource}, ${redacted}, ${mode}"
  }
} 
Example 42
Source File: CSVUtils.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

object CSVUtils {
  
  def sample(csv: RDD[Array[String]], options: CSVOptions): RDD[Array[String]] = {
    require(options.samplingRatio > 0,
      s"samplingRatio (${options.samplingRatio}) should be greater than 0")
    if (options.samplingRatio > 0.99) {
      csv
    } else {
      csv.sample(withReplacement = false, options.samplingRatio, 1)
    }
  }
} 
Example 43
Source File: FrequentItems.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.stat

import scala.collection.mutable.{Map => MutableMap}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._

object FrequentItems extends Logging {

  
  def singlePassFreqItems(
      df: DataFrame,
      cols: Seq[String],
      support: Double): DataFrame = {
    require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.")
    val numCols = cols.length
    // number of max items to keep counts for
    val sizeOfMap = (1 / support).toInt
    val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
    val originalSchema = df.schema
    val colInfo: Array[(String, DataType)] = cols.map { name =>
      val index = originalSchema.fieldIndex(name)
      (name, originalSchema.fields(index).dataType)
    }.toArray

    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.treeAggregate(countMaps)(
      seqOp = (counts, row) => {
        var i = 0
        while (i < numCols) {
          val thisMap = counts(i)
          val key = row.get(i)
          thisMap.add(key, 1L)
          i += 1
        }
        counts
      },
      combOp = (baseCounts, counts) => {
        var i = 0
        while (i < numCols) {
          baseCounts(i).merge(counts(i))
          i += 1
        }
        baseCounts
      }
    )
    val justItems = freqItems.map(m => m.baseMap.keys.toArray)
    val resultRow = Row(justItems : _*)
    // append frequent Items to the column name for easy debugging
    val outputCols = colInfo.map { v =>
      StructField(v._1 + "_freqItems", ArrayType(v._2, false))
    }
    val schema = StructType(outputCols).toAttributes
    Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow)))
  }
} 
Example 44
Source File: cache.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.command

import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

case class CacheTableCommand(
    tableIdent: TableIdentifier,
    plan: Option[LogicalPlan],
    isLazy: Boolean) extends RunnableCommand {
  require(plan.isEmpty || tableIdent.database.isEmpty,
    "Database name is not allowed in CACHE TABLE AS SELECT")

  override protected def innerChildren: Seq[QueryPlan[_]] = plan.toSeq

  override def run(sparkSession: SparkSession): Seq[Row] = {
    plan.foreach { logicalPlan =>
      Dataset.ofRows(sparkSession, logicalPlan).createTempView(tableIdent.quotedString)
    }
    sparkSession.catalog.cacheTable(tableIdent.quotedString)

    if (!isLazy) {
      // Performs eager caching
      sparkSession.table(tableIdent).count()
    }

    Seq.empty[Row]
  }
}


case class UncacheTableCommand(
    tableIdent: TableIdentifier,
    ifExists: Boolean) extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val tableId = tableIdent.quotedString
    if (!ifExists || sparkSession.catalog.tableExists(tableId)) {
      sparkSession.catalog.uncacheTable(tableId)
    }
    Seq.empty[Row]
  }
}


  override def makeCopy(newArgs: Array[AnyRef]): ClearCacheCommand = ClearCacheCommand()
} 
Example 45
Source File: ConsoleWriter.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.streaming.sources

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.sources.v2.DataSourceOptions
import org.apache.spark.sql.sources.v2.writer.{DataWriterFactory, WriterCommitMessage}
import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter
import org.apache.spark.sql.types.StructType


class ConsoleWriter(schema: StructType, options: DataSourceOptions)
    extends StreamWriter with Logging {

  // Number of rows to display, by default 20 rows
  protected val numRowsToShow = options.getInt("numRows", 20)

  // Truncate the displayed data if it is too long, by default it is true
  protected val isTruncated = options.getBoolean("truncate", true)

  assert(SparkSession.getActiveSession.isDefined)
  protected val spark = SparkSession.getActiveSession.get

  def createWriterFactory(): DataWriterFactory[InternalRow] = PackedRowWriterFactory

  override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {
    // We have to print a "Batch" label for the epoch for compatibility with the pre-data source V2
    // behavior.
    printRows(messages, schema, s"Batch: $epochId")
  }

  def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {}

  protected def printRows(
      commitMessages: Array[WriterCommitMessage],
      schema: StructType,
      printMessage: String): Unit = {
    val rows = commitMessages.collect {
      case PackedRowCommitMessage(rs) => rs
    }.flatten

    // scalastyle:off println
    println("-------------------------------------------")
    println(printMessage)
    println("-------------------------------------------")
    // scalastyle:on println
    Dataset.ofRows(spark, LocalRelation(schema.toAttributes, rows))
      .show(numRowsToShow, isTruncated)
  }

  override def toString(): String = {
    s"ConsoleWriter[numRows=$numRowsToShow, truncate=$isTruncated]"
  }
} 
Example 46
Source File: TestCsvData.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.sql.{Dataset, Encoders, SparkSession}

private[csv] trait TestCsvData {
  protected def spark: SparkSession

  def sampledTestData: Dataset[String] = {
    spark.range(0, 100, 1).map { index =>
      val predefinedSample = Set[Long](2, 8, 15, 27, 30, 34, 35, 37, 44, 46,
        57, 62, 68, 72)
      if (predefinedSample.contains(index)) {
        index.toString
      } else {
        (index.toDouble + 0.1).toString
      }
    }(Encoders.STRING)
  }
} 
Example 47
Source File: GenericWordSpecSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.test

import org.scalatest.WordSpec

import org.apache.spark.sql.Dataset


class GenericWordSpecSuite extends WordSpec with SharedSparkSession {
  import testImplicits._

  private def ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS

  "A Simple Dataset" when {
    "looked at as complete rows" should {
      "have the specified number of elements" in {
        assert(8 === ds.count)
      }
      "have the specified number of unique elements" in {
        assert(8 === ds.distinct.count)
      }
    }
    "refined to specific columns" should {
      "have the specified number of elements in each column" in {
        assert(8 === ds.select("_1").count)
        assert(8 === ds.select("_2").count)
      }
      "have the correct number of distinct elements in each column" in {
        assert(8 === ds.select("_1").distinct.count)
        assert(4 === ds.select("_2").distinct.count)
      }
    }
  }
} 
Example 48
Source File: GenericFlatSpecSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.test

import org.scalatest.FlatSpec

import org.apache.spark.sql.Dataset


class GenericFlatSpecSuite extends FlatSpec with SharedSparkSession {
  import testImplicits._

  private def ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS

  "A Simple Dataset" should "have the specified number of elements" in {
    assert(8 === ds.count)
  }
  it should "have the specified number of unique elements" in {
      assert(8 === ds.distinct.count)
  }
  it should "have the specified number of elements in each column" in {
    assert(8 === ds.select("_1").count)
    assert(8 === ds.select("_2").count)
  }
  it should "have the correct number of distinct elements in each column" in {
    assert(8 === ds.select("_1").distinct.count)
    assert(4 === ds.select("_2").distinct.count)
  }
} 
Example 49
Source File: GenericFunSpecSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.test

import org.scalatest.FunSpec

import org.apache.spark.sql.Dataset


class GenericFunSpecSuite extends FunSpec with SharedSparkSession {
  import testImplicits._

  private def ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS

  describe("Simple Dataset") {
    it("should have the specified number of elements") {
      assert(8 === ds.count)
    }
    it("should have the specified number of unique elements") {
      assert(8 === ds.distinct.count)
    }
    it("should have the specified number of elements in each column") {
      assert(8 === ds.select("_1").count)
      assert(8 === ds.select("_2").count)
    }
    it("should have the correct number of distinct elements in each column") {
      assert(8 === ds.select("_1").distinct.count)
      assert(4 === ds.select("_2").distinct.count)
    }
  }
} 
Example 50
Source File: CassandraStorage.scala    From graphsense-transformation   with MIT License 5 votes vote down vote up
package at.ac.ait.storage

import com.datastax.spark.connector.rdd.ValidRDDType
import com.datastax.spark.connector.rdd.reader.RowReaderFactory
import com.datastax.spark.connector.writer.{RowWriterFactory}
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import org.apache.spark.sql.{Dataset, Encoder, SparkSession}
import scala.reflect.ClassTag

import at.ac.ait.Util._

class CassandraStorage(spark: SparkSession) {

  import spark.implicits._
  import com.datastax.spark.connector._

  def load[T <: Product: ClassTag: RowReaderFactory: ValidRDDType: Encoder](
      keyspace: String,
      tableName: String,
      columns: ColumnRef*
  ) = {
    spark.sparkContext.setJobDescription(s"Loading table ${tableName}")
    val table = spark.sparkContext.cassandraTable[T](keyspace, tableName)
    if (columns.isEmpty)
      table.toDS().as[T]
    else
      table.select(columns: _*).toDS().as[T]
  }

  def store[T <: Product: RowWriterFactory](
      keyspace: String,
      tableName: String,
      df: Dataset[T]
  ) = {

    spark.sparkContext.setJobDescription(s"Writing table ${tableName}")
    val dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
    val timestamp = LocalDateTime.now().format(dtf)
    println(s"[$timestamp] Writing table ${tableName}")
    time { df.rdd.saveToCassandra(keyspace, tableName) }
  }
} 
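A hypothetical usage sketch of the CassandraStorage helper above; the keyspace, table name and row case class are illustrative assumptions, not part of the original project, and the Cassandra connection host is assumed to be configured on the SparkSession.

import org.apache.spark.sql.{Dataset, SparkSession}
import at.ac.ait.storage.CassandraStorage

// Illustrative row type; any case class matching the Cassandra table's columns would do.
case class AddressStats(address: String, tx_count: Long)

object CassandraStorageUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("cassandra-storage-usage").getOrCreate()
    import spark.implicits._ // provides the Encoder required by load[T]

    val storage = new CassandraStorage(spark)

    // Read a Cassandra table into a typed Dataset.
    val stats: Dataset[AddressStats] = storage.load[AddressStats]("my_keyspace", "address_stats")

    // Write the Dataset back out to another table.
    storage.store("my_keyspace", "address_stats_copy", stats)

    spark.stop()
  }
}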
Example 51
Source File: StringMap.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{HandleInvalid, StringMapModel}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types._


    private val className = classOf[StringMap].getName

    override def load(path: String): StringMap = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      val dataPath = new Path(path, "data").toString

      val data = sparkSession.read.parquet(dataPath).select("labels", "handleInvalid", "defaultValue").head()
      val labels = data.getAs[Map[String, Double]](0)
      val handleInvalid = HandleInvalid.fromString(data.getAs[String](1))
      val defaultValue = data.getAs[Double](2)

      val model = new StringMapModel(labels, handleInvalid = handleInvalid, defaultValue = defaultValue)
      val transformer = new StringMap(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }

} 
Example 52
Source File: MathUnary.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType}
import org.apache.spark.sql.functions.udf


    private val className = classOf[MathUnary].getName

    override def load(path: String): MathUnary = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      val dataPath = new Path(path, "data").toString

      val data = sparkSession.read.parquet(dataPath).select("operation").head()
      val operation = data.getAs[String](0)

      val model = MathUnaryModel(UnaryOperation.forName(operation))
      val transformer = new MathUnary(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }

} 
Example 53
Source File: WordLengthFilter.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.WordLengthFilterModel
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators, Params}
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}



  final def getWordLength: Int = $(wordLength)
}

class WordLengthFilter(override val uid: String) extends Transformer
  with WordLengthFilterParams
  with DefaultParamsWritable {

  val defaultLength = 3
  var model: WordLengthFilterModel = new WordLengthFilterModel(defaultLength) //Initialize with default filter length 3

  def this(model: WordLengthFilterModel) = this(uid = Identifiable.randomUID("filter_words"))
  def this() = this(new WordLengthFilterModel)

  def setInputCol(value: String): this.type = set(inputCol, value)
  def setOutputCol(value: String): this.type = set(outputCol, value)
  def setWordLength(value: Int = defaultLength): this.type = set(wordLength, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    if(defaultLength != getWordLength) model = new WordLengthFilterModel(getWordLength)
    val filterWordsUdf = udf {
      (words: Seq[String]) => model(words)
    }

    dataset.withColumn($(outputCol), filterWordsUdf(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer =  defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    require(schema($(inputCol)).dataType.isInstanceOf[ArrayType],
      s"Input column must be of type ArrayType(StringType,true) but got ${schema($(inputCol)).dataType}")
    val inputFields = schema.fields

    require(!inputFields.exists(_.name == $(outputCol)),
      s"Output column ${$(outputCol)} already exists.")

    StructType(schema.fields :+ StructField($(outputCol), ArrayType(StringType, true)))

  }
}

object WordLengthFilter extends  DefaultParamsReadable[WordLengthFilter] {
  override def load(path: String): WordLengthFilter = super.load(path)
} 
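A hypothetical usage sketch of the WordLengthFilter transformer above; the input data, column names and word-length threshold are assumptions for illustration only.

import org.apache.spark.ml.mleap.feature.WordLengthFilter
import org.apache.spark.sql.SparkSession

object WordLengthFilterUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("word-length-filter-usage").getOrCreate()
    import spark.implicits._

    // A tiny tokenized corpus; each row carries an array of words.
    val docs = Seq(
      (1, Seq("a", "to", "spark", "dataset")),
      (2, Seq("of", "mleap", "transformer"))
    ).toDF("id", "words")

    val filter = new WordLengthFilter()
      .setInputCol("words")
      .setOutputCol("filtered")
      .setWordLength(3) // threshold handed to the underlying WordLengthFilterModel

    filter.transform(docs).show(false)

    spark.stop()
  }
}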
Example 54
Source File: MultinomialLabeler.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.MultinomialLabelerModel
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasFeaturesCol
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{udf, col}
import ml.combust.mleap.core.util.VectorConverters._


class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"),
                         val model: MultinomialLabelerModel) extends Transformer
  with HasFeaturesCol
  with HasProbabilitiesCol
  with HasLabelsCol {

  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
  def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value)
  def setLabelsCol(value: String): this.type = set(labelsCol, value)

  @org.apache.spark.annotation.Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val probabilitiesUdf = udf {
      (vector: Vector) => model.top(vector).map(_._1).toArray
    }

    val labelsUdf = udf {
      (vector: Vector) => model.topLabels(vector).toArray
    }

    dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))).
      withColumn($(labelsCol), labelsUdf(col($(featuresCol))))
  }

  override def copy(extra: ParamMap): Transformer =
    copyValues(new MultinomialLabeler(uid, model), extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT],
      s"Features column must be of type NumericType but got ${schema($(featuresCol)).dataType}")
    val inputFields = schema.fields
    require(!inputFields.exists(_.name == $(probabilitiesCol)),
      s"Output column ${$(probabilitiesCol)} already exists.")
    require(!inputFields.exists(_.name == $(labelsCol)),
      s"Output column ${$(labelsCol)} already exists.")

    StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)),
      StructField($(labelsCol), ArrayType(StringType))))
  }
} 
Example 55
Source File: ChangingDesign.scala    From Hands-On-Big-Data-Analytics-with-PySpark   with MIT License 5 votes vote down vote up
package com.tomekl007.chapter_3

import com.tomekl007.{UserData, UserTransaction}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.scalatest.FunSuite

class ChangingDesign extends FunSuite {
  val spark = SparkSession.builder().master("local[2]").getOrCreate()

  test("example of operation that is causing shuffle") {
    import spark.sqlContext.implicits._
    val userData =
      spark.sparkContext.makeRDD(List(
        UserData("user_1", "1"),
        UserData("user_2", "2"),
        UserData("user_4", "200")
      )).toDS()
    val repartitionedUserData = userData.repartition(userData("userId"))

    val transactionData =
      spark.sparkContext.makeRDD(List(
        UserTransaction("user_1", 100),
        UserTransaction("user_2", 300),
        UserTransaction("user_3", 1300)
      )).toDS()

    val repartitionedTransactionData = transactionData.repartition(transactionData("userId"))


    //when
    //data is already partitioned using join-column. Don't need to shuffle
    val res: Dataset[(UserData, UserTransaction)]
    = repartitionedUserData.joinWith(repartitionedTransactionData, userData("userId") === transactionData("userId"), "inner")


    //then
    res.show()
    assert(res.count() == 2)
  }
}
Example 56
package com.tomekl007.chapter_3

import com.tomekl007.{UserData, UserTransaction}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.scalatest.FunSuite

class TestingOperationsThatCausesShuffle extends FunSuite {
  val spark = SparkSession.builder().master("local[2]").getOrCreate()

  test("example of operation that is causing shuffle") {
    import spark.sqlContext.implicits._
    val userData =
      spark.sparkContext.makeRDD(List(
        UserData("user_1", "1"),
        UserData("user_2", "2"),
        UserData("user_4", "200")
      )).toDS()

    val transactionData =
      spark.sparkContext.makeRDD(List(
        UserTransaction("user_1", 100),
        UserTransaction("user_2", 300),
        UserTransaction("user_3", 1300)
      )).toDS()


    //shuffle: userData can stay on the current executors, but data from
    //transactionData needs to be send to those executors according to joinColumn
    //causing shuffle
    //when
    val res: Dataset[(UserData, UserTransaction)]
    = userData.joinWith(transactionData, userData("userId") === transactionData("userId"), "inner")


    //then
    res.show()
    assert(res.count() == 2)
  }
} 
Example 57
package org.scalaml.spark

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, SparkSession}


private[spark] object DatasetGenerator {

      // Generation of a dataset of type {Double, Double} with a by-name initialization function
  final def toDSPairDouble(
    numDataPoints: Int
  )(
    generator: Int => (Double, Double)
  )(implicit sessionLifeCycle: SessionLifeCycle): Dataset[(Double, Double)] =
    toDSPairDouble(Seq.tabulate(numDataPoints)(generator(_)))

    // Generation of a dataset of type {Double, Double} from a sequence of same type
  def toDSPairDouble(
    data: Seq[(Double, Double)]
  )(implicit sessionLifeCycle: SessionLifeCycle): Dataset[(Double, Double)] = {
    import sessionLifeCycle.sparkSession.implicits._
    data.toDS()
  }

    // Generation of a dataset of type Double
  def toDSDouble(data: Seq[Double])(implicit sessionLifeCycle: SessionLifeCycle): Dataset[Double] = {
    import sessionLifeCycle.sparkSession.implicits._
    data.toDS()
  }

    // Generation of a dataset of type Int
  def toDSInt(data: Seq[Int])(implicit sessionLifeCycle: SessionLifeCycle): Dataset[Int] = {
    import sessionLifeCycle.sparkSession.implicits._
    data.toDS()
  }
}

// --------------------------  EOF ---------------------------------------------- 
Example 58
Source File: OpBoston.scala    From transmogrifai-helloworld-sbt   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.hw.boston

import com.salesforce.op._
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.readers.CustomReader
import com.salesforce.op.stages.impl.regression.RegressionModelSelector
import com.salesforce.op.stages.impl.regression.RegressionModelsToTry._
import com.salesforce.op.stages.impl.tuning.DataSplitter
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}


object OpBoston extends OpAppWithRunner with BostonFeatures {

  ////////////////////////////////////////////////////////////////////////////////
  // READERS DEFINITION
  /////////////////////////////////////////////////////////////////////////////////

  val randomSeed = 42L

  def customRead(path: String)(implicit spark: SparkSession): RDD[BostonHouse] = {
    val myFile = spark.sparkContext.textFile(path)

    myFile.filter(_.nonEmpty).zipWithIndex.map { case (x, id) =>
      val words = x.replaceAll("\\s+", " ").replaceAll(s"^\\s+(?m)", "").replaceAll(s"(?m)\\s+$$", "").split(" ")
      BostonHouse(id.toInt, words(0).toDouble, words(1).toDouble, words(2).toDouble, words(3), words(4).toDouble,
        words(5).toDouble, words(6).toDouble, words(7).toDouble, words(8).toInt, words(9).toDouble,
        words(10).toDouble, words(11).toDouble, words(12).toDouble, words(13).toDouble)
    }
  }

  val trainingReader = new CustomReader[BostonHouse](key = _.rowId.toString) {
    def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[BostonHouse], Dataset[BostonHouse]] = Left {
      val Array(train, _) = customRead(getFinalReadPath(params)).randomSplit(weights = Array(0.9, 0.1), randomSeed)
      train
    }
  }

  val scoringReader = new CustomReader[BostonHouse](key = _.rowId.toString) {
    def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[BostonHouse], Dataset[BostonHouse]] = Left {
      val Array(_, test) = customRead(getFinalReadPath(params)).randomSplit(weights = Array(0.9, 0.1), randomSeed)
      test
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // WORKFLOW DEFINITION
  /////////////////////////////////////////////////////////////////////////////////

  val houseFeatures = Seq(crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat).transmogrify()

  val splitter = DataSplitter(seed = randomSeed)

  val prediction = RegressionModelSelector
    .withCrossValidation(
      dataSplitter = Some(splitter), seed = randomSeed,
      modelTypesToUse = Seq(OpGBTRegressor, OpRandomForestRegressor)
    ).setInput(medv, houseFeatures).getOutput()

  val workflow = new OpWorkflow().setResultFeatures(prediction)

  val evaluator = Evaluators.Regression().setLabelCol(medv).setPredictionCol(prediction)

  def runner(opParams: OpParams): OpWorkflowRunner =
    new OpWorkflowRunner(
      workflow = workflow,
      trainingReader = trainingReader,
      scoringReader = scoringReader,
      evaluationReader = Option(trainingReader),
      evaluator = Option(evaluator),
      scoringEvaluator = None,
      featureToComputeUpTo = Option(houseFeatures)
    )

} 
Example 59
Source File: GaussianProcessCommons.scala    From spark-gp   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.commons

import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
import breeze.optimize.LBFGSB
import org.apache.spark.ml.commons.kernel.{EyeKernel, Kernel, _}
import org.apache.spark.ml.commons.util.DiffFunctionMemoized
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.util.Instrumentation
import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Row}

private[ml] trait GaussianProcessCommons[F, E <: Predictor[F, E, M], M <: PredictionModel[F, M]]
  extends ProjectedGaussianProcessHelper {  this: Predictor[F, E, M] with GaussianProcessParams =>

  protected val getKernel : () => Kernel = () => $(kernel)() + $(sigma2).const * new EyeKernel

  protected def getPoints(dataset: Dataset[_]) = {
    dataset.select(col($(labelCol)), col($(featuresCol))).rdd.map {
      case Row(label: Double, features: Vector) => LabeledPoint(label, features)
    }
  }

  protected def groupForExperts(points: RDD[LabeledPoint]) = {
    val numberOfExperts = Math.round(points.count().toDouble / $(datasetSizeForExpert))
    points.zipWithIndex.map { case(instance, index) =>
      (index % numberOfExperts, instance)
    }.groupByKey().map(_._2)
  }

  protected def getExpertLabelsAndKernels(points: RDD[LabeledPoint]): RDD[(BDV[Double], Kernel)] = {
    groupForExperts(points).map { chunk =>
      val (labels, trainingVectors) = chunk.map(lp => (lp.label, lp.features)).toArray.unzip
      (BDV(labels: _*), getKernel().setTrainingVectors(trainingVectors))
    }
  }

  protected def projectedProcess(expertLabelsAndKernels: RDD[(BDV[Double], Kernel)],
                                 points: RDD[LabeledPoint],
                                 optimalHyperparameters: BDV[Double]) = {
    val activeSet = $(activeSetProvider)($(activeSetSize), expertLabelsAndKernels, points,
      getKernel, optimalHyperparameters, $(seed))

    points.unpersist()

    val (matrixKmnKnm, vectorKmny) = getMatrixKmnKnmAndVectorKmny(expertLabelsAndKernels, activeSet)

    expertLabelsAndKernels.unpersist()

    val optimalKernel = getKernel().setHyperparameters(optimalHyperparameters).setTrainingVectors(activeSet)

    // inv(sigma^2 K_mm + K_mn * K_nm) * K_mn * y
    val (magicVector, magicMatrix) = getMagicVector(optimalKernel,
      matrixKmnKnm, vectorKmny, activeSet, optimalHyperparameters)

    new GaussianProjectedProcessRawPredictor(magicVector, magicMatrix, optimalKernel)
  }

  
  protected def createModel(uid: String, rawPredictor: GaussianProjectedProcessRawPredictor) : M
}

class GaussianProjectedProcessRawPredictor private[commons] (val magicVector: BDV[Double],
                                                             val magicMatrix: BDM[Double],
                                                             val kernel: Kernel) extends Serializable {
  def predict(features: Vector): (Double, Double) = {
    val cross = kernel.crossKernel(features)
    val selfKernel = kernel.selfKernel(features)
    (cross * magicVector, selfKernel + cross * magicMatrix * cross.t)
  }
} 
Example 60
Source File: GaussianProcessRegression.scala    From spark-gp   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.regression

import breeze.linalg.{DenseVector => BDV, _}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.commons._
import org.apache.spark.ml.commons.kernel.Kernel
import org.apache.spark.ml.commons.util._
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{Identifiable, Instrumentation}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset


class GaussianProcessRegression(override val uid: String)
  extends Regressor[Vector, GaussianProcessRegression, GaussianProcessRegressionModel]
    with GaussianProcessParams
    with GaussianProcessCommons[Vector, GaussianProcessRegression, GaussianProcessRegressionModel] with Logging {

  def this() = this(Identifiable.randomUID("gaussProcessReg"))

  override protected def train(dataset: Dataset[_]): GaussianProcessRegressionModel = {
    val instr = Instrumentation.create(this, dataset)

    val points: RDD[LabeledPoint] = getPoints(dataset).cache()

    val expertLabelsAndKernels: RDD[(BDV[Double], Kernel)] = getExpertLabelsAndKernels(points).cache()

    val optimalHyperparameters = optimizeHypers(instr, expertLabelsAndKernels, likelihoodAndGradient)

    expertLabelsAndKernels.foreach(_._2.setHyperparameters(optimalHyperparameters))

    produceModel(instr,
      points, expertLabelsAndKernels, optimalHyperparameters)
  }

  private def likelihoodAndGradient(yAndK : (BDV[Double], Kernel), x : BDV[Double]) = {
    val (y: BDV[Double], kernel : Kernel) = yAndK
    kernel.setHyperparameters(x)
    val (k, derivative) = kernel.trainingKernelAndDerivative()
    val (_, logdet, kinv) = logDetAndInv(k)
    val alpha = kinv * y
    val likelihood = 0.5 * (y.t * alpha) + 0.5 * logdet

    val alphaAlphaTMinusKinv = alpha * alpha.t
    alphaAlphaTMinusKinv -= kinv

    val gradient = derivative.map(derivative => -0.5 * sum(derivative *= alphaAlphaTMinusKinv))
    (likelihood, BDV(gradient:_*))
  }

  override def copy(extra: ParamMap): GaussianProcessRegression = defaultCopy(extra)

  override protected def createModel(uid: String, rawPredictor: GaussianProjectedProcessRawPredictor): GaussianProcessRegressionModel = new GaussianProcessRegressionModel(uid, rawPredictor)
}

class GaussianProcessRegressionModel private[regression](override val uid: String,
          private val gaussianProjectedProcessRawPredictor: GaussianProjectedProcessRawPredictor)
  extends RegressionModel[Vector, GaussianProcessRegressionModel] {

  override protected def predict(features: Vector): Double = {
    gaussianProjectedProcessRawPredictor.predict(features)._1
  }

  override def copy(extra: ParamMap): GaussianProcessRegressionModel = {
    val newModel = copyValues(new GaussianProcessRegressionModel(uid, gaussianProjectedProcessRawPredictor), extra)
    newModel.setParent(parent)
  }
} 
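For orientation, here is a minimal usage sketch of the regressor above. It is not part of the original project: the SparkSession setup, the toy data, and the assumption that GaussianProcessParams carries workable defaults for the kernel and expert-partitioning parameters are all illustrative.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GaussianProcessRegression
import org.apache.spark.sql.SparkSession

object GaussianProcessRegressionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("gp-regression-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    // Predictor expects the default "label" / "features" columns.
    val training = Seq(
      (0.1, Vectors.dense(0.0, 1.0)),
      (0.9, Vectors.dense(1.0, 0.0)),
      (0.5, Vectors.dense(0.5, 0.5))
    ).toDF("label", "features")

    val gp = new GaussianProcessRegression()   // relies on defaults from GaussianProcessParams
    val model = gp.fit(training)               // Predictor.fit delegates to train(dataset) shown above
    model.transform(training).show()           // adds a "prediction" column via predict(features)

    spark.stop()
  }
}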
Example 61
Source File: DataFrameConverter.scala    From incubator-toree   with Apache License 2.0 5 votes vote down vote up
package org.apache.toree.utils

import org.apache.spark.sql.{Dataset, Row}
import org.apache.toree.plugins.Plugin
import play.api.libs.json.{JsObject, Json}

import scala.util.Try
import org.apache.toree.plugins.annotations.Init

import DataFrameConverter._

class DataFrameConverter extends Plugin with LogLike {
  @Init def init() = {
    register(this)
  }

  def convert(df: Dataset[Row], outputType: String, limit: Int = 10): Try[String] = {
    Try(
      outputType.toLowerCase() match {
        case "html" =>
          convertToHtml(df = df, limit = limit)
        case "json" =>
          convertToJson(df = df, limit = limit)
        case "csv" =>
          convertToCsv(df = df, limit = limit)
      }
    )
  }

  private def convertToHtml(df: Dataset[Row], limit: Int = 10): String = {
      val columnFields = df.schema.fieldNames.map(columnName => {
        s"<th>${columnName}</th>"
      }).reduce(_ + _)
      val columns = s"<tr>${columnFields}</tr>"
      val rows = df.rdd.map(row => {
        val fieldValues = row.toSeq.map(field => {
         s"<td>${fieldToString(field)}</td>"
        }).reduce(_ + _)
        s"<tr>${fieldValues}</tr>"
      }).take(limit).reduce(_ + _)
      s"<table>${columns}${rows}</table>"
  }

  private def convertToJson(df: Dataset[Row], limit: Int = 10): String = {
    val schema = Json.toJson(df.schema.fieldNames)
    val transformed = df.rdd.map(row =>
      row.toSeq.map(fieldToString).toArray)
    val rows = transformed.take(limit)
    JsObject(Seq(
      "columns" -> schema,
      "rows" -> Json.toJson(rows)
    )).toString()
  }

  private def convertToCsv(df: Dataset[Row], limit: Int = 10): String = {
      val headers = df.schema.fieldNames.reduce(_ + "," + _)
      val rows = df.rdd.map(row => {
        row.toSeq.map(fieldToString).reduce(_ + "," + _)
      }).take(limit).reduce(_ + "\n" + _)
      s"${headers}\n${rows}"
  }

}

object DataFrameConverter {

  def fieldToString(any: Any): String =
    any match {
      case null => "null"
      case seq: Seq[_] => seq.mkString("[", ", ", "]")
      case _ => any.toString
    }

} 
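A short, hedged sketch of calling the converter above directly, outside the Toree plugin machinery (whose registration step is skipped here); the DataFrame contents are illustrative.

import org.apache.spark.sql.SparkSession
import org.apache.toree.utils.DataFrameConverter

object DataFrameConverterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("converter-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b"), (3, "c")).toDF("id", "letter")
    val converter = new DataFrameConverter

    // convert returns a Try[String]; each output type renders at most `limit` rows.
    converter.convert(df, "json", limit = 2).foreach(println)
    converter.convert(df, "csv").foreach(println)
    converter.convert(df, "html").foreach(println)

    spark.stop()
  }
}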
Example 62
Source File: CustomSinkSuite.scala    From spark-structured-streaming-ml   with Apache License 2.0 5 votes vote down vote up
package com.highperformancespark.examples.structuredstreaming

import com.holdenkarau.spark.testing.DataFrameSuiteBase

import scala.collection.mutable.ListBuffer

import org.scalatest.FunSuite

import org.apache.spark._
import org.apache.spark.sql.{Dataset, DataFrame, Encoder, SQLContext}
import org.apache.spark.sql.execution.streaming.MemoryStream

class CustomSinkSuite extends FunSuite with DataFrameSuiteBase {

  test("really simple test of the custom sink") {
    import spark.implicits._
    val input = MemoryStream[String]
    val doubled = input.toDS().map(x => x + " " + x)
    val formatName = ("com.highperformancespark.examples" +
      "structuredstreaming.CustomSinkCollectorProvider")
    val query = doubled.writeStream
      .queryName("testCustomSinkBasic")
      .format(formatName)
      .start()
    val inputData = List("hi", "holden", "bye", "pandas")
    input.addData(inputData)
    assert(query.isActive === true)
    query.processAllAvailable()
    assert(query.exception === None)
    assert(Pandas.results(0) === inputData.map(x => x + " " + x))
  }
}

object Pandas{
  val results = new ListBuffer[Seq[String]]()
}

class CustomSinkCollectorProvider extends ForeachDatasetSinkProvider {
  override def func(df: DataFrame) {
    val spark = df.sparkSession
    import spark.implicits._
    Pandas.results += df.as[String].rdd.collect()
  }
} 
Example 63
Source File: StreamingKMeansSuite.scala    From spark-structured-streaming-ml   with Apache License 2.0 5 votes vote down vote up
package com.highperformancespark.examples.structuredstreaming

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.scalatest.FunSuite
import org.apache.log4j.{Level, Logger}

case class TestRow(features: Vector)

class StreamingKMeansSuite extends FunSuite with DataFrameSuiteBase {

  override def beforeAll(): Unit = {
    super.beforeAll()
    Logger.getLogger("org").setLevel(Level.OFF)
  }

  test("streaming model with one center should converge to true center") {
    import spark.implicits._
    val k = 1
    val dim = 5
    val clusterSpread = 0.1
    val seed = 63
    // TODO: this test is very flaky. The centers do not converge for some
    // (most?) random seeds
    val (batches, trueCenters) =
      StreamingKMeansSuite.generateBatches(100, 80, k, dim, clusterSpread, seed)
    val inputStream = MemoryStream[TestRow]
    val ds = inputStream.toDS()
    val skm = new StreamingKMeans().setK(k).setRandomCenters(dim, 0.01)
    val query = skm.evilTrain(ds.toDF())
    val streamingModels = batches.map { batch =>
      inputStream.addData(batch)
      query.processAllAvailable()
      skm.getModel
    }
    // TODO: use spark's testing suite
    streamingModels.last.centers.zip(trueCenters).foreach {
      case (center, trueCenter) =>
        val centers = center.toArray.mkString(",")
        val trueCenters = trueCenter.toArray.mkString(",")
        println(s"${centers} | ${trueCenters}")
        assert(center.toArray.zip(trueCenter.toArray).forall(
          x => math.abs(x._1 - x._2) < 0.1))
    }
    query.stop()
  }

  def compareBatchAndStreaming(
      batchModel: KMeansModel,
      streamingModel: StreamingKMeansModel,
      validationData: DataFrame): Unit = {
    assert(batchModel.clusterCenters === streamingModel.centers)
    // TODO: implement prediction comparison
  }

}

object StreamingKMeansSuite {

  def generateBatches(
      numPoints: Int,
      numBatches: Int,
      k: Int,
      d: Int,
      r: Double,
      seed: Int,
      initCenters: Array[Vector] = null):
      (IndexedSeq[IndexedSeq[TestRow]], Array[Vector]) = {
    val rand = scala.util.Random
    rand.setSeed(seed)
    val centers = initCenters match {
      case null => Array.fill(k)(Vectors.dense(Array.fill(d)(rand.nextGaussian())))
      case _ => initCenters
    }
    val data = (0 until numBatches).map { i =>
      (0 until numPoints).map { idx =>
        val center = centers(idx % k)
        val vec = Vectors.dense(
          Array.tabulate(d)(x => center(x) + rand.nextGaussian() * r))
        TestRow(vec)
      }
    }
    (data, centers)
  }
} 
Example 64
package com.spark.recommendation

import org.apache.spark.{sql, SparkConf}
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.{Dataset, SparkSession}

// The listing collapses the enclosing object and its helpers; the object name,
// Rating case class and parseRating below are reconstructions (assumptions)
// added so that getFeatures and main compile as shown.
object ALSFeatureExtraction {

  case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long)

  def parseRating(str: String): Rating = {
    val fields = str.split("::")
    assert(fields.size == 4)
    Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
  }

  val spark: SparkSession = SparkSession.builder()
    .appName("ALSFeatureExtraction")
    .master("local[*]")
    .getOrCreate()

  def getFeatures(): sql.DataFrame = {
    import spark.implicits._
    //val ratings = spark.read.textFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_05/data/ml-100k 2/u.data").map(parseRating).toDF()
    val ratings = spark.read.textFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_05/2.0.0/scala-spark-app/src/main/scala/com/spark/recommendation/sample_movielens_ratings.txt").map(parseRating).toDF()
    println(ratings.first())

//    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))
//    println(training.first())

    return ratings
  }

  def getSpark(): SparkSession = {
    return spark
  }

  def main(args: Array[String]) {
    getFeatures()
  }

} 
Example 65
Source File: Prettify.scala    From spark-testing-base   with Apache License 2.0 5 votes vote down vote up
package com.holdenkarau.spark.testing

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset}
import org.scalacheck.util.Pretty

trait Prettify {
  val maxNumberOfShownValues = 100

  implicit def prettyDataFrame(dataframe: DataFrame): Pretty =
    Pretty { _ => describeDataframe(dataframe)}

  implicit def prettyRDD(rdd: RDD[_]): Pretty =
    Pretty { _ => describeRDD(rdd)}

  implicit def prettyDataset(dataset: Dataset[_]): Pretty =
    Pretty { _ => describeDataset(dataset)}

  private def describeDataframe(dataframe: DataFrame) =
    s"""<DataFrame: schema = ${dataframe.toString}, size = ${dataframe.count()},
        |values = (${dataframe.take(maxNumberOfShownValues).mkString(", ")})>""".
      stripMargin.replace("\n", " ")

  private def describeRDD(rdd: RDD[_]) =
    s"""<RDD: size = ${rdd.count()},
        |values = (${rdd.take(maxNumberOfShownValues).mkString(", ")})>""".
      stripMargin.replace("\n", " ")

  private def describeDataset(dataset: Dataset[_]) =
    s"""<Dataset: schema = ${dataset.toString}, size = ${dataset.count()},
        |values = (${dataset.take(maxNumberOfShownValues).mkString(", ")})>""".
      stripMargin.replace("\n", " ")
}

object Prettify extends Prettify 
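A sketch of how these implicits are usually brought into scope so that ScalaCheck renders failing Dataset properties with the descriptions above; the test body itself is only a placeholder.

import com.holdenkarau.spark.testing.{DatasetGenerator, Prettify, SharedSparkContext}
import org.apache.spark.sql.SQLContext
import org.scalacheck.Arbitrary
import org.scalacheck.Prop.forAll
import org.scalatest.FunSuite
import org.scalatest.prop.Checkers

class PrettifyUsageSketch extends FunSuite with SharedSparkContext with Checkers with Prettify {

  test("Dataset properties report readable failure messages") {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // When a property fails, ScalaCheck picks up prettyDataset from the mixed-in
    // Prettify trait and prints the schema, size and first values of the Dataset.
    val property =
      forAll(DatasetGenerator.genDataset[String](sqlContext)(Arbitrary.arbitrary[String])) {
        dataset => dataset.count() >= 0
      }
    check(property)
  }
}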
Example 66
Source File: DatasetGenerator.scala    From spark-testing-base   with Apache License 2.0 5 votes vote down vote up
package com.holdenkarau.spark.testing

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, SQLContext}
import org.scalacheck.{Arbitrary, Gen}

import scala.reflect.ClassTag

object DatasetGenerator {

  
  def arbitrarySizedDataset[T: ClassTag : Encoder]
    (sqlCtx: SQLContext, minPartitions: Int = 1)
    (generator: Int => Gen[T]): Arbitrary[Dataset[T]] = {

    val rddGen: Gen[RDD[T]] =
      RDDGenerator.genSizedRDD[T](sqlCtx.sparkContext, minPartitions)(generator)
    val datasetGen: Gen[Dataset[T]] =
      rddGen.map(rdd => sqlCtx.createDataset(rdd))

    Arbitrary {
      datasetGen
    }
  }
} 
Example 67
Source File: SampleDatasetGeneratorTest.scala    From spark-testing-base   with Apache License 2.0 5 votes vote down vote up
package com.holdenkarau.spark.testing

import org.apache.spark.sql.{Dataset, SQLContext}
import org.scalacheck.{Gen, Arbitrary}
import org.scalacheck.Prop.forAll
import org.scalatest.FunSuite
import org.scalatest.prop.Checkers

class SampleDatasetGeneratorTest extends FunSuite
    with SharedSparkContext with Checkers {

  test("test generating Datasets[String]") {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val property =
      forAll(
        DatasetGenerator.genDataset[String](sqlContext)(
          Arbitrary.arbitrary[String])) {
        dataset => dataset.map(_.length).count() == dataset.count()
      }

    check(property)
  }

  test("test generating sized Datasets[String]") {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val property =
      forAll {
        DatasetGenerator.genSizedDataset[(Int, String)](sqlContext) { size =>
          Gen.listOfN(size, Arbitrary.arbitrary[Char]).map(l => (size, l.mkString))
        }
      }{
        dataset =>
          val tuples = dataset.collect()
          val value = dataset.map{ case (_, str) => str.length}
          tuples.forall{ case (size, str) => size == str.length} &&
          value.count() == dataset.count
      }

    check(property)
  }

  test("test generating Datasets[Custom Class]") {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val carGen: Gen[Dataset[Car]] =
      DatasetGenerator.genDataset[Car](sqlContext) {
        val generator: Gen[Car] = for {
          name <- Arbitrary.arbitrary[String]
          speed <- Arbitrary.arbitrary[Int]
        } yield (Car(name, speed))

        generator
    }

    val property =
      forAll(carGen) {
        dataset => dataset.map(_.speed).count() == dataset.count()
      }

    check(property)
  }
}

case class Car(name: String, speed: Int) 
Example 68
Source File: DatasetGeneratorSizeSpecial.scala    From spark-testing-base   with Apache License 2.0 5 votes vote down vote up
package com.holdenkarau.spark.testing

import org.apache.spark.sql.{Dataset, SQLContext}
import org.scalacheck.{Gen, Arbitrary}
import org.scalacheck.Prop.forAll
import org.scalatest.FunSuite
import org.scalatest.prop.Checkers

class DatasetGeneratorSizeSpecial extends FunSuite
    with SharedSparkContext with Checkers {

  test("test generating sized Datasets[Custom Class]") {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // In 2.3 List is fine, however prior to 2.1 the generator returns
    // a concrete sub type which isn't handled well.
    // This works in 1.6.1+ but we only test in 2.0+ because that's easier
    val carGen: Gen[Dataset[Seq[Car]]] =
      DatasetGenerator.genSizedDataset[Seq[Car]](sqlContext) { size =>
        val slowCarsTopNumber = math.ceil(size * 0.1).toInt
        def carGenerator(speed: Gen[Int]): Gen[Car] = for {
          name <- Arbitrary.arbitrary[String]
          speed <- speed
        } yield Car(name, speed)

        val cars: Gen[List[Car]] = for {
          slowCarsNumber: Int <- Gen.choose(0, slowCarsTopNumber)
          slowCars: List[Car] <- Gen.listOfN(slowCarsNumber, carGenerator(Gen.choose(0, 20)))
          normalSpeedCars: List[Car] <- Gen.listOfN(
            size - slowCarsNumber,
            carGenerator(Gen.choose(21, 150))
          )
        } yield {
          slowCars ++ normalSpeedCars
        }
        cars
      }

    val property =
      forAll(carGen.map(_.flatMap(identity))) {
        dataset =>
          val cars = dataset.collect()
          val dataSetSize  = cars.length
          val slowCars = cars.filter(_.speed < 21)
          slowCars.length <= dataSetSize * 0.1 &&
            cars.map(_.speed).length == dataSetSize
      }

    check(property)
  }
} 
Example 69
Source File: HashingTF.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
} 
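A usage sketch for the transformer above, assuming a local SparkSession and an already tokenized "words" column; the feature count is kept deliberately small for the demo.

import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.sql.SparkSession

object HashingTFSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("hashing-tf-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val docs = Seq(
      (0, Seq("spark", "datasets", "are", "typed")),
      (1, Seq("dataframes", "are", "untyped"))
    ).toDF("id", "words")

    val hashingTF = new HashingTF()
      .setInputCol("words")
      .setOutputCol("features")
      .setNumFeatures(1 << 10)   // small hash space; collisions are acceptable here
      .setBinary(false)          // term counts rather than 0/1 indicators

    hashingTF.transform(docs).select("id", "features").show(truncate = false)

    spark.stop()
  }
}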
Example 70
Source File: SQLTransformer.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType


  @Since("1.6.0")
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val tableName = Identifiable.randomUID(uid)
    dataset.createOrReplaceTempView(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val result = dataset.sparkSession.sql(realStatement)
    dataset.sparkSession.catalog.dropTempView(tableName)
    result
  }

  @Since("1.6.0")
  override def transformSchema(schema: StructType): StructType = {
    val spark = SparkSession.builder().getOrCreate()
    val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty))
    val dummyDF = spark.createDataFrame(dummyRDD, schema)
    val tableName = Identifiable.randomUID(uid)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    dummyDF.createOrReplaceTempView(tableName)
    val outputSchema = spark.sql(realStatement).schema
    spark.catalog.dropTempView(tableName)
    outputSchema
  }

  @Since("1.6.0")
  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
} 
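A usage sketch: the __THIS__ placeholder in the statement is swapped for a temp view over the input Dataset, exactly as the transform method above does; the data and column names are illustrative.

import org.apache.spark.ml.feature.SQLTransformer
import org.apache.spark.sql.SparkSession

object SQLTransformerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("sql-transformer-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")

    val sqlTrans = new SQLTransformer()
      .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

    sqlTrans.transform(df).show()   // appends the derived v3 and v4 columns

    spark.stop()
  }
}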
Example 71
Source File: BinaryClassificationEvaluator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType


  @Since("1.2.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "areaUnderROC")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    val scoreAndLabels =
      dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label)
        case Row(rawPrediction: Double, label: Double) => (rawPrediction, label)
      }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val metric = $(metricName) match {
      case "areaUnderROC" => metrics.areaUnderROC()
      case "areaUnderPR" => metrics.areaUnderPR()
    }
    metrics.unpersist()
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "areaUnderROC" => true
    case "areaUnderPR" => true
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): BinaryClassificationEvaluator = super.load(path)
} 
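A sketch of applying the evaluator above to a fitted classifier's output; the tiny logistic-regression pipeline exists only to produce a rawPrediction column.

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object BinaryClassificationEvaluatorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("bce-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val data = Seq(
      (0.0, Vectors.dense(0.0, 1.1)),
      (1.0, Vectors.dense(2.0, 1.0)),
      (0.0, Vectors.dense(0.1, 1.3)),
      (1.0, Vectors.dense(1.9, 0.8))
    ).toDF("label", "features")

    val predictions = new LogisticRegression().fit(data).transform(data)

    val evaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("rawPrediction")
      .setMetricName("areaUnderROC")        // or "areaUnderPR", per the match above

    println(s"AUC = ${evaluator.evaluate(predictions)}")

    spark.stop()
  }
}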
Example 72
Source File: MulticlassClassificationEvaluator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType


  @Since("1.5.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "f1")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels =
      dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(prediction: Double, label: Double) => (prediction, label)
      }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "f1" => metrics.weightedFMeasure
      case "weightedPrecision" => metrics.weightedPrecision
      case "weightedRecall" => metrics.weightedRecall
      case "accuracy" => metrics.accuracy
    }
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = true

  @Since("1.5.0")
  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object MulticlassClassificationEvaluator
  extends DefaultParamsReadable[MulticlassClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
} 
Example 73
Source File: RegressionEvaluator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}


  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
} 
Example 74
Source File: RWrapperUtils.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.r

import org.apache.spark.internal.Logging
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute}
import org.apache.spark.ml.feature.{RFormula, RFormulaModel}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.Dataset

private[r] object RWrapperUtils extends Logging {

  
  def getFeaturesAndLabels(
      rFormulaModel: RFormulaModel,
      data: Dataset[_]): (Array[String], Array[String]) = {
    val schema = rFormulaModel.transform(data).schema
    val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol))
      .attributes.get
    val features = featureAttrs.map(_.name.get)
    val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol))
      .asInstanceOf[NominalAttribute]
    val labels = labelAttr.values.get
    (features, labels)
  }
} 
Example 75
Source File: MultilayerPerceptronClassifierWrapper.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  val weights: Array[Double] = mlpModel.weights.toArray
  val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  
  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper]{

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
} 
Example 76
Source File: Transformer.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._


  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val transformUDF = udf(this.createTransformFunc, outputDataType)
    dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
} 
Example 77
Source File: ChiSqSelectorSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
  with DefaultReadWriteTest {

  @transient var dataset: Dataset[_] = _

  override def beforeAll(): Unit = {
    super.beforeAll()

    // Toy dataset, including the top feature for a chi-squared test.
    // These data are chosen such that each feature's test has a distinct p-value.
    // (The dataset construction and the individual tests are collapsed in this listing.)
  }

  val allParamSettings: Map[String, Any] = Map(
    "selectorType" -> "percentile",
    "numTopFeatures" -> 1,
    "percentile" -> 0.12,
    "outputCol" -> "myOutput"
  )
} 
Example 78
Source File: TokenizerSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new Tokenizer)
  }

  test("read/write") {
    val t = new Tokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }
}

class RegexTokenizerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.RegexTokenizerSuite._
  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new RegexTokenizer)
  }

  test("RegexTokenizer") {
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset0 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer0, dataset0)

    val dataset1 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ).toDF()
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")),
      TokenizerTestData("Te,st.  punct", Array("te,st.", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer2, dataset2)
  }

  test("RegexTokenizer with toLowercase false") {
    val tokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setToLowercase(false)
    val dataset = Seq(
      TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")),
      TokenizerTestData("java scala", Array("java", "scala"))
    ).toDF()
    testRegexTokenizer(tokenizer, dataset)
  }

  test("read/write") {
    val t = new RegexTokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMinTokenLength(2)
      .setGaps(false)
      .setPattern("hi")
      .setToLowercase(false)
    testDefaultReadWrite(t)
  }
}

object RegexTokenizerSuite extends SparkFunSuite {

  def testRegexTokenizer(t: RegexTokenizer, dataset: Dataset[_]): Unit = {
    t.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
} 
Example 79
Source File: NGramSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

@BeanInfo
case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])

class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.NGramSuite._
  import testImplicits._

  test("default behavior yields bigram features") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
    val dataset = Seq(NGramTestData(
      Array("Test", "for", "ngram", "."),
      Array("Test for", "for ngram", "ngram .")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("NGramLength=4 yields length 4 n-grams") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array("a b c d", "b c d e")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("empty input yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(Array(), Array())).toDF()
    testNGram(nGram, dataset)
  }

  test("input array < n yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(6)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array()
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("read/write") {
    val t = new NGram()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setN(3)
    testDefaultReadWrite(t)
  }
}

object NGramSuite extends SparkFunSuite {

  def testNGram(t: NGram, dataset: Dataset[_]): Unit = {
    t.transform(dataset)
      .select("nGrams", "wantedNGrams")
      .collect()
      .foreach { case Row(actualNGrams, wantedNGrams) =>
        assert(actualNGrams === wantedNGrams)
      }
  }
} 
Example 80
Source File: PredictorSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext {

  import PredictorSuite._

  test("should support all NumericType labels and not support other types") {
    val df = spark.createDataFrame(Seq(
      (0, Vectors.dense(0, 2, 3)),
      (1, Vectors.dense(0, 3, 9)),
      (0, Vectors.dense(0, 2, 6))
    )).toDF("label", "features")

    val types =
      Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0))

    val predictor = new MockPredictor()

    types.foreach { t =>
      predictor.fit(df.select(col("label").cast(t), col("features")))
    }

    intercept[IllegalArgumentException] {
      predictor.fit(df.select(col("label").cast(StringType), col("features")))
    }
  }
}

object PredictorSuite {

  class MockPredictor(override val uid: String)
    extends Predictor[Vector, MockPredictor, MockPredictionModel] {

    def this() = this(Identifiable.randomUID("mockpredictor"))

    override def train(dataset: Dataset[_]): MockPredictionModel = {
      require(dataset.schema("label").dataType == DoubleType)
      new MockPredictionModel(uid)
    }

    override def copy(extra: ParamMap): MockPredictor =
      throw new NotImplementedError()
  }

  class MockPredictionModel(override val uid: String)
    extends PredictionModel[Vector, MockPredictionModel] {

    def this() = this(Identifiable.randomUID("mockpredictormodel"))

    override def predict(features: Vector): Double =
      throw new NotImplementedError()

    override def copy(extra: ParamMap): MockPredictionModel =
      throw new NotImplementedError()
  }
} 
Example 81
Source File: SQLBuilderTest.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst

import scala.util.control.NonFatal

import org.apache.spark.sql.{DataFrame, Dataset, QueryTest}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.hive.test.TestHiveSingleton


abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton {
  protected def checkSQL(e: Expression, expectedSQL: String): Unit = {
    val actualSQL = e.sql
    try {
      assert(actualSQL === expectedSQL)
    } catch {
      case cause: Throwable =>
        fail(
          s"""Wrong SQL generated for the following expression:
             |
             |${e.prettyName}
             |
             |$cause
           """.stripMargin)
    }
  }

  protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = {
    val generatedSQL = try new SQLBuilder(plan).toSQL catch { case NonFatal(e) =>
      fail(
        s"""Cannot convert the following logical query plan to SQL:
           |
           |${plan.treeString}
         """.stripMargin)
    }

    try {
      assert(generatedSQL === expectedSQL)
    } catch {
      case cause: Throwable =>
        fail(
          s"""Wrong SQL generated for the following logical query plan:
             |
             |${plan.treeString}
             |
             |$cause
           """.stripMargin)
    }

    checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan))
  }

  protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = {
    checkSQL(df.queryExecution.analyzed, expectedSQL)
  }
} 
Example 82
Source File: Aggregator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression


  def toColumn: TypedColumn[IN, OUT] = {
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    val expr =
      AggregateExpression(
        TypedAggregateExpression(this),
        Complete,
        isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
} 
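The listing above only shows toColumn; for context, a minimal concrete Aggregator for Spark 2.x might look like the following sketch (the Purchase case class and its field names are made up for the example).

import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator

case class Purchase(customer: String, amount: Double)

// Sums the amount field of a typed Dataset[Purchase].
object AmountSum extends Aggregator[Purchase, Double, Double] {
  def zero: Double = 0.0
  def reduce(buffer: Double, p: Purchase): Double = buffer + p.amount
  def merge(b1: Double, b2: Double): Double = b1 + b2
  def finish(reduction: Double): Double = reduction
  def bufferEncoder: Encoder[Double] = Encoders.scalaDouble
  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

object AggregatorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("aggregator-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val ds = Seq(Purchase("a", 10.0), Purchase("b", 2.5), Purchase("a", 7.5)).toDS()

    // toColumn (shown above) wraps the aggregator in a TypedColumn usable with Dataset.select.
    val total = ds.select(AmountSum.toColumn.name("total")).first()
    println(s"total = $total")

    spark.stop()
  }
}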
Example 83
Source File: FrequentItems.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.stat

import scala.collection.mutable.{Map => MutableMap}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._

object FrequentItems extends Logging {

  
  def singlePassFreqItems(
      df: DataFrame,
      cols: Seq[String],
      support: Double): DataFrame = {
    require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.")
    val numCols = cols.length
    // number of max items to keep counts for
    val sizeOfMap = (1 / support).toInt
    val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
    val originalSchema = df.schema
    val colInfo: Array[(String, DataType)] = cols.map { name =>
      val index = originalSchema.fieldIndex(name)
      (name, originalSchema.fields(index).dataType)
    }.toArray

    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)(
      seqOp = (counts, row) => {
        var i = 0
        while (i < numCols) {
          val thisMap = counts(i)
          val key = row.get(i)
          thisMap.add(key, 1L)
          i += 1
        }
        counts
      },
      combOp = (baseCounts, counts) => {
        var i = 0
        while (i < numCols) {
          baseCounts(i).merge(counts(i))
          i += 1
        }
        baseCounts
      }
    )
    val justItems = freqItems.map(m => m.baseMap.keys.toArray)
    val resultRow = Row(justItems : _*)
    // append frequent Items to the column name for easy debugging
    val outputCols = colInfo.map { v =>
      StructField(v._1 + "_freqItems", ArrayType(v._2, false))
    }
    val schema = StructType(outputCols).toAttributes
    Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow)))
  }
} 
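FrequentItems is an internal helper; user code normally reaches it through DataFrameStatFunctions, roughly as sketched below with illustrative data.

import org.apache.spark.sql.SparkSession

object FreqItemsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("freq-items-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq(1, 1, 1, 2, 2, 3, 4, 1, 2).map(i => (i, s"item$i")).toDF("id", "name")

    // df.stat.freqItems delegates to FrequentItems.singlePassFreqItems shown above.
    // Items occurring in at least ~40% of the rows are reported (false positives are possible).
    df.stat.freqItems(Seq("id", "name"), support = 0.4).show(truncate = false)

    spark.stop()
  }
}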
Example 84
Source File: cache.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.command

import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

case class CacheTableCommand(
    tableIdent: TableIdentifier,
    plan: Option[LogicalPlan],
    isLazy: Boolean) extends RunnableCommand {
  require(plan.isEmpty || tableIdent.database.isEmpty,
    "Database name is not allowed in CACHE TABLE AS SELECT")

  override protected def innerChildren: Seq[QueryPlan[_]] = {
    plan.toSeq
  }

  override def run(sparkSession: SparkSession): Seq[Row] = {
    plan.foreach { logicalPlan =>
      Dataset.ofRows(sparkSession, logicalPlan).createTempView(tableIdent.quotedString)
    }
    sparkSession.catalog.cacheTable(tableIdent.quotedString)

    if (!isLazy) {
      // Performs eager caching
      sparkSession.table(tableIdent).count()
    }

    Seq.empty[Row]
  }
}


case class UncacheTableCommand(
    tableIdent: TableIdentifier,
    ifExists: Boolean) extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val tableId = tableIdent.quotedString
    try {
      sparkSession.catalog.uncacheTable(tableId)
    } catch {
      case _: NoSuchTableException if ifExists => // don't throw
    }
    Seq.empty[Row]
  }
}


case object ClearCacheCommand extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    sparkSession.catalog.clearCache()
    Seq.empty[Row]
  }
} 
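These runnable commands back the corresponding SQL statements; below is a brief sketch of exercising them through spark.sql (the table names are illustrative).

import org.apache.spark.sql.SparkSession

object CacheCommandsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("cache-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    Seq((1, "a"), (2, "b")).toDF("id", "letter").createOrReplaceTempView("letters")

    spark.sql("CACHE TABLE letters_cached AS SELECT * FROM letters")  // CacheTableCommand, eager by default
    spark.sql("CACHE LAZY TABLE letters")                             // isLazy = true: no count() is triggered
    spark.sql("UNCACHE TABLE letters")                                // UncacheTableCommand
    spark.sql("CLEAR CACHE")                                          // ClearCacheCommand

    spark.stop()
  }
}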
Example 85
Source File: XGBoost.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import eleflow.uberdata.models.UberXGBOOSTModel
import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{ArrayType, DoubleType, StructField, StructType}

import scala.reflect.ClassTag


class XGBoost[I](override val uid: String,
                 val models: RDD[(I, (UberXGBOOSTModel,
                   Seq[(ModelParamEvaluation[I])]))])(
  implicit kt: ClassTag[I],
  ord: Ordering[I] = null)
    extends ForecastBaseModel[XGBoostSmallModel[I]]
    with HasInputCol
    with HasOutputCol
    with DefaultParamsWritable
    with HasFeaturesCol
    with HasNFutures
    with HasGroupByCol {

  def this(
    models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))]
  )(implicit kt: ClassTag[I], ord: Ordering[I] ) =
    this(Identifiable.randomUID("xgboost"), models)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val schema = dataSet.schema
    val predSchema = transformSchema(schema)
    val joined = models.join(dataSet.rdd.map{case (r: Row) => (r.getAs[I]($(groupByCol).get), r)})

    val predictions = joined.map {
      case (id, ((bestModel, metrics), row)) =>
        val features = row.getAs[Array[org.apache.spark.ml.linalg.Vector]](
          IUberdataForecastUtil.FEATURES_COL_NAME
        )
        val label = DataTransformer.toFloat(row.getAs($(featuresCol)))
        val labelPoint = features.map { vec =>
          val array = vec.toArray.map(_.toFloat)
          LabeledPoint(label, null, array)
        }
        val matrix = new DMatrix(labelPoint.toIterator)
        val (ownFeaturesPrediction, forecast) = bestModel.boosterInstance
          .predict(matrix)
          .flatMap(_.map(_.toDouble))
          .splitAt(features.length)
        Row(
          row.toSeq :+ Vectors
            .dense(forecast) :+ SupportedAlgorithm.XGBoostAlgorithm.toString :+ bestModel.params
            .map(f => f._1 -> f._2.toString) :+ Vectors.dense(ownFeaturesPrediction): _*
        )
    }
    dataSet.sqlContext.createDataFrame(predictions, predSchema)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    schema.add(StructField($(outputCol), ArrayType(DoubleType)))
  }

  override def copy(extra: ParamMap): XGBoostSmallModel[I] = defaultCopy(extra)
} 
Example 86
Source File: TimeSeriesGenerator.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasGroupByCol
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, StructType}

import scala.reflect.ClassTag


  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val rdd = dataSet.rdd

    val sparkContext = dataSet.sqlContext.sparkContext
    val index = sparkContext.broadcast(dataSet.schema.fieldIndex($(timeCol).get))
    val labelColIndex =
      sparkContext.broadcast(dataSet.schema.fieldIndex($(groupByCol).get))
    val featuresColIndex =
      sparkContext.broadcast(dataSet.schema.fieldIndex($(featuresCol)))
    val grouped = rdd.map { case (row: Row) =>
      val timeColRow =
        IUberdataForecastUtil.convertColumnToLong(row, index.value)
      convertColumnToDouble(timeColRow, featuresColIndex)
    }.groupBy { row =>
      row.getAs[L](labelColIndex.value)
    }.map {
      case (key, values) =>
        val toBeUsed =
          values.toArray.sortBy(row => row.getAs[Long](index.value))
        (key, toBeUsed)
    }

    val toBeTrained = grouped.map {
      case (key, values) =>
        org.apache.spark.sql.Row(
          key,
          Vectors.dense(values.map(_.getAs[Double](featuresColIndex.value)))
        )
    }

    val trainSchema = transformSchema(dataSet.schema)
    dataSet.sqlContext.createDataFrame(toBeTrained, trainSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    val labelIndex = schema.fieldIndex($(groupByCol).get)
    StructType(
      Seq(
        schema.fields(labelIndex),
        StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT)
      )
    )
  }

  override def copy(extra: ParamMap): TimeSeriesGenerator[L] =
    defaultCopy(extra)

}

object TimeSeriesGenerator extends DefaultParamsReadable[TimeSeriesGenerator[_]] {

  override def load(path: String): TimeSeriesGenerator[_] = super.load(path)
} 
Example 87
Source File: XGBoostBigModel.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml


import com.cloudera.sparkts.models.UberXGBoostModel
import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.feature.{LabeledPoint => SparkLabeledPoint}
import org.apache.spark.ml.param.shared.{HasIdCol, HasLabelCol}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}


class XGBoostBigModel[I](val uid: String, val models: Seq[(ParamMap, XGBoostModel)])
    extends ForecastBaseModel[XGBoostBigModel[I]]
    with HasLabelCol
    with HasIdCol {

  def setLabelcol(label: String): this.type = set(labelCol, label)

  def setIdcol(id: String): this.type = set(idCol, id)

  override def copy(extra: ParamMap): XGBoostBigModel[I] = new XGBoostBigModel[I](uid, models)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
            row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
            )
      }
      .join(prediction)
      .map {
        case (id, (features, predictValue)) =>
          Row(id, features, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }

  protected def predict(dataSet: Dataset[_]) = {
    val features = dataSet.rdd.map { case (row: Row) =>
      val features = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
      val id = row.getAs[I]($(idCol))
      SparkLabeledPoint(DataTransformer.toFloat(id), features)
    }.cache
    val (_, model) = models.head
    UberXGBoostModel.labelPredict(features.map(_.features.toDense), booster = model)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(getPredictionSchema)

  protected def getPredictionSchema: Array[StructField] = {
    Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    )
  }
} 
Example 88
Source File: ArimaBestModel.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import com.cloudera.sparkts.models.TimeSeriesModel
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType


class ArimaBestModel[L, M <: TimeSeriesModel](
  override val uid: String,
  val bestPrediction: RDD[(L, M)],
  val validationMetrics: RDD[(L, Seq[ModelParamEvaluation[L]])]
) extends Model[ArimaBestModel[L, M]]
    with TimeSeriesBestModelFinderParam[L] {

  // TODO: evaluate whether this is needed
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)

    dataset.toDF()
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): ArimaBestModel[L, M] = {
    val copied =
      new ArimaBestModel[L, M](uid, bestPrediction, validationMetrics)
    copyValues(copied, extra)
  }
} 
Example 89
Source File: MovingAverage.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types._


  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(windowSize -> 3)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataSet.schema)
    val sparkContext = dataSet.sqlContext.sparkContext
    val inputType = outputSchema($(inputCol)).dataType
    val inputTypeBr = sparkContext.broadcast(inputType)
    val dataSetRdd = dataSet.rdd
    val inputColName = sparkContext.broadcast($(inputCol))
    val inputColIndex = dataSet.columns.indexOf($(inputCol))
    val inputColIndexBr = sparkContext.broadcast(inputColIndex)
    val windowSizeBr = sparkContext.broadcast($(windowSize))
    val maRdd = dataSetRdd.map { case (row: Row) =>
      val (array, rawValue) = if (inputTypeBr.value.isInstanceOf[VectorUDT]) {
        val vector =
          row.getAs[org.apache.spark.ml.linalg.Vector](inputColName.value)
        (vector.toArray, Vectors.dense(vector.toArray.drop(windowSizeBr.value - 1)))
      } else {
        val iterable = row.getAs[Iterable[Double]](inputColName.value)
        (iterable.toArray, Vectors.dense(iterable.toArray.drop(windowSizeBr.value - 1)))
      }
      val (before, after) = row.toSeq.splitAt(inputColIndexBr.value)
      Row(
        (before :+ rawValue) ++ after.tail :+ MovingAverageCalc
          .simpleMovingAverageArray(array, windowSizeBr.value): _*
      )
    }
    dataSet.sqlContext.createDataFrame(maRdd, outputSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema.add(StructField($(outputCol), ArrayType(DoubleType)))
  }

  override def copy(extra: ParamMap): MovingAverage[T] = defaultCopy(extra)
}

object MovingAverageCalc {
  private[ml] def simpleMovingAverageArray(values: Array[Double], period: Int): Array[Double] = {
    (for (i <- 1 to values.length)
      yield
      //TODO: roll back to the commented-out values(i) (with the right number of features) so the
      // moving average returns the raw feature values for the first positions of the calculation
      if (i < period) 0d //values(i)
      else values.slice(i - period, i).sum / period).toArray.dropWhile(_ == 0d)
  }
}

object MovingAverage extends DefaultParamsReadable[MovingAverage[_]] {

  override def load(path: String): MovingAverage[_] = super.load(path)
} 
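
A minimal usage sketch for the MovingAverage transformer above, assuming it exposes setInputCol and setWindowSize setters alongside the setOutputCol shown; the SparkSession, data and column names are illustrative and not part of the original source.

import org.apache.spark.ml.MovingAverage
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object MovingAverageDemo extends App {
  val spark = SparkSession.builder().master("local[*]").appName("moving-average-demo").getOrCreate()
  import spark.implicits._

  // Hypothetical input: one vector of observations per row.
  val df = Seq(Tuple1(Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0))).toDF("features")

  val ma = new MovingAverage[Double]()   // type parameter only matters for copy()
    .setInputCol("features")             // assumed setter from HasInputCol
    .setWindowSize(3)                    // assumed setter for the windowSize param
    .setOutputCol("features_ma")

  // The output keeps a truncated copy of the input vector and appends the moving-average array.
  ma.transform(df).show(false)

  spark.stop()
}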
Example 90
Source File: VectorizeEncoder.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.core.data.DataTransformer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, StructType}


class VectorizeEncoder(override val uid: String)
    extends Transformer
    with HasIdCol
    with HasTimeCol
    with HasInputCols
    with HasLabelCol
    with HasGroupByCol
    with HasOutputCol
    with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("vectorizer"))

  def setIdCol(input: String) = set(idCol, input)

  def setLabelCol(input: String) = set(labelCol, input)

  def setGroupByCol(toGroupBy: String) = set(groupByCol, Some(toGroupBy))

  def setInputCol(input: Array[String]) = set(inputCols, input)

  def setTimeCol(time: String) = set(timeCol, Some(time))

  def setOutputCol(output: String) = set(outputCol, output)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val context = dataSet.sqlContext.sparkContext
    val input = context.broadcast($(inputCols))
    val allColumnNames = dataSet.schema.map(_.name)

    val nonInputColumnIndexes = context.broadcast(
      allColumnNames.zipWithIndex.filter(
        f => !$(inputCols).contains(f._1) || f._1 == $(groupByCol).get || f._1 == $(idCol)
          || f._1 == $(timeCol).getOrElse("")))
    val result = dataSet.rdd.map { case (row: Row) =>
      val rowSeq = row.toSeq
      val nonInputColumns = nonInputColumnIndexes.value.map {
        case (_, index) => rowSeq(index)
      }
      val size = input.value.length
      val (values, indices) = input.value
        .filter(col => row.getAs(col) != null)
        .map { column =>
          DataTransformer.toDouble(row.getAs(column))
        }
        .zipWithIndex
        .filter(f => f._1 != 0d)
        .unzip
      Row(
        nonInputColumns :+ org.apache.spark.ml.linalg.Vectors
          .sparse(size, indices.toArray, values.toArray): _*
      )
    }
    val newSchema = transformSchema(dataSet.schema)
    dataSet.sqlContext.createDataFrame(result, newSchema)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(
      schema.filter(
        col =>
          !$(inputCols).contains(col.name) || col.name == $(groupByCol).getOrElse("") || col.name == $(idCol)
            || col.name == $(labelCol) || col.name == $(timeCol).getOrElse("")
      )
    ).add(StructField($(outputCol), new VectorUDT))
} 
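
A short usage sketch for VectorizeEncoder, relying only on the setters shown above. The SparkSession, the input DataFrame and the column names are illustrative assumptions; in the original project the encoder is driven by the surrounding forecasting pipeline.

import org.apache.spark.ml.VectorizeEncoder
import org.apache.spark.sql.SparkSession

object VectorizeEncoderDemo extends App {
  val spark = SparkSession.builder().master("local[*]").appName("vectorize-encoder-demo").getOrCreate()
  import spark.implicits._

  // Hypothetical sales table: id, date, group, label and two feature columns.
  val df = Seq(
    (1, "2024-01-01", "storeA", 10.0, 1.0, 2.0),
    (2, "2024-01-02", "storeA", 12.0, 0.0, 3.0)
  ).toDF("id", "date", "store", "sales", "feat1", "feat2")

  val encoder = new VectorizeEncoder()
    .setIdCol("id")
    .setTimeCol("date")
    .setGroupByCol("store")
    .setLabelCol("sales")
    .setInputCol(Array("feat1", "feat2")) // the columns to be collapsed into a sparse vector
    .setOutputCol("features")

  // Non-input columns are kept as-is; the input columns are replaced by one sparse vector.
  encoder.transform(df).show(false)

  spark.stop()
}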
Example 91
Source File: AllColumnsTimeSeriesGenerator.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}

import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset

import scala.reflect.ClassTag


  def setOutputCol(value: String): this.type = set(outputCol, value)

//  override def transform(dataSet: DataFrame): DataFrame = {
  override def transform(dataSet: Dataset[_] ): DataFrame = {
    val rdd = dataSet.rdd
    val sparkContext = dataSet.sqlContext.sparkContext
    val labelColIndex =
      sparkContext.broadcast(dataSet.schema.fieldIndex($(labelCol)))
    val keyValueDataSet = rdd.map { case (row: Row) =>
      Row(
        row.getAs[T](labelColIndex.value),
        row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol))
      )
    }
    val trainSchema = transformSchema(dataSet.schema)

    dataSet.sqlContext.createDataFrame(keyValueDataSet, trainSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    StructType(
      schema.filter(_.name == $(labelCol)).head +: Seq(
        StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT)
      )
    )
  }

  override def copy(extra: ParamMap): AllColumnsTimeSeriesGenerator[T, U] =
    defaultCopy(extra)
}

object AllColumnsTimeSeriesGenerator
    extends DefaultParamsReadable[AllColumnsTimeSeriesGenerator[_, _]] {

  override def load(path: String): AllColumnsTimeSeriesGenerator[_, _] =
    super.load(path)
} 
Example 92
Source File: HoltWintersEstimator.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import com.cloudera.sparkts.models.TimeSeriesModel

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.Dataset


class HoltWintersBestModel[T, M <: TimeSeriesModel](
  override val uid: String,
  val bestPrediction: RDD[(T, M)],
  val validationMetrics: RDD[(T, ModelParamEvaluation[T])]
) extends Model[HoltWintersBestModel[T, M]]
    with TimeSeriesBestModelFinderParam[T] {

  //TODO look for this method usage to see if it can be removed
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    dataset.toDF()
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): HoltWintersBestModel[T, M] = {
    val copied =
      new HoltWintersBestModel[T, M](uid, bestPrediction, validationMetrics)
    copyValues(copied, extra)
  }
} 
Example 93
Source File: XGBoostBigModelTimeSeries.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import java.sql.Timestamp

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasTimeCol
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}


class XGBoostBigModelTimeSeries[I](override val uid: String,
                                   override val models: Seq[(ParamMap, XGBoostModel)])
                                  extends XGBoostBigModel[I](uid, models) with HasTimeCol{

  def setTimecol(time: String): this.type = set(timeCol, Some(time))

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
            (row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME),
              row.getAs[java.sql.Timestamp]($(timeCol).get)))
      }
      .join(prediction)
      .map {
        case (id, ((features, time), predictValue)) =>
          Row(id, features, time, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }


  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField($(timeCol).get, TimestampType),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    ) )
} 
Example 94
Source File: HoltWintersBestModelFinder.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import com.cloudera.sparkts.models.UberHoltWintersModel
import org.apache.spark.ml.evaluation.TimeSeriesEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasGroupByCol
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset

import scala.reflect.ClassTag


class HoltWintersBestModelFinder[G](
  override val uid: String
)(implicit kt: ClassTag[G])
    extends HoltWintersBestModelEvaluation[G, HoltWintersModel[G]]
    with DefaultParamsWritable
    with HasGroupByCol
    with TimeSeriesBestModelFinder {

  def setTimeSeriesEvaluator(eval: TimeSeriesEvaluator[G]): this.type =
    set(timeSeriesEvaluator, eval)

  def setEstimatorParamMaps(value: Array[ParamMap]): this.type =
    set(estimatorParamMaps, value)

  def setNFutures(value: Int): this.type = set(nFutures, value)

  override def setValidationCol(value: String): this.type = set(validationCol, value)

  def setLabelCol(label: String): this.type = set(labelCol, label)

  def setGroupByCol(groupBy: String): this.type = set(groupByCol, Some(groupBy))

  def this()(implicit kt: ClassTag[G]) = this(Identifiable.randomUID("arima"))

  def modelEvaluation(
    idModels: RDD[(G, Row, Option[UberHoltWintersModel])]
  ): RDD[(G, (UberHoltWintersModel, ModelParamEvaluation[G]))] = {
    val eval = $(timeSeriesEvaluator)
    val broadcastEvaluator = idModels.context.broadcast(eval)
    idModels.filter(_._3.isDefined).map {
      case (id, row, models) =>
        val evaluatedModels = models.map { model =>
          holtWintersEvaluation(row, model, broadcastEvaluator, id)
        }.head
        log.warn(s"best model reach ${evaluatedModels._2.metricResult}")
        (id, evaluatedModels)
    }
  }

  override protected def train(dataSet: Dataset[_]): HoltWintersModel[G] = {
    val splitDs = split(dataSet, $(nFutures))
    val idModels = splitDs.rdd.map(train)
    new HoltWintersModel[G](uid, modelEvaluation(idModels))
      .setValidationCol($(validationCol))
      .asInstanceOf[HoltWintersModel[G]]
  }

  def train(row: Row): (G, Row, Option[UberHoltWintersModel]) = {
    val id = row.getAs[G]($(groupByCol).get)

    val result = try {
      val dense = row.getAs[org.apache.spark.ml.linalg.DenseVector]($(featuresCol))
      val ts: org.apache.spark.mllib.linalg.Vector = org.apache.spark.mllib.linalg.Vectors.dense(dense.toArray)
      Some(
        UberHoltWintersModel.fitModelWithBOBYQA(ts, $(nFutures))
      )
    } catch {
      case e: Exception =>
        log.error(
          s"Got the following Exception ${e.getLocalizedMessage} in id $id"
        )
        None
    }
    (id, row, result)
  }
}

object HoltWintersBestModelFinder extends DefaultParamsReadable[HoltWintersBestModelFinder[_]] {

  override def load(path: String): HoltWintersBestModelFinder[_] =
    super.load(path)
} 
Example 95
Source File: SparkTest.scala    From spark-records   with Apache License 2.0 5 votes vote down vote up
package examples.fancy_numbers

import com.swoop.spark.records._
import com.swoop.spark.test.SparkSqlSpec
import org.apache.spark.sql.Dataset
import org.apache.spark.storage.StorageLevel


class SparkTest extends ExampleSpec with SparkSqlSpec with TestNegative5To100 {

  lazy val dc = SimpleDriverContext(sc)
  lazy val jc = dc.jobContext(SimpleJobContext)
  lazy val ds = recordsDataset(-5 to 100, jc)
  lazy val records = ds.collect

  "in an integration test" - {
    implicit val env = FlatRecordEnvironment()
    val sqlContext = sqlc
    import sqlContext.implicits._

    behave like fancyRecordBuilder(records, jc)

    "should build records with Spark" in {
      ds.count should be(105)
    }
    "should filter error records" in {
      ds.errorRecords.count should be(6)
    }
    "should extract data from records" in {
      ds.recordData.count should be(99)
    }
    "should extract issues" in {
      ds.allIssues.count should be(8)
      ds.errorIssues.count should be(6)
    }
    "should demonstrate issueCounts() output" in {
      ds.issueCounts.show(false)
    }
    "should demonstrate errorIssueCounts() output" in {
      ds.errorIssueCounts.show(false)
    }
    "should demonstrate messageCounts() output" in {
      ds.messageCounts.show(false)
    }
    "should demonstrate errorMessageCounts() output" in {
      ds.errorMessageCounts.show(false)
    }
    "should demonstrate errorDetailCounts() output" in {
      ds.errorIssues.errorDetailCounts().show
    }
    "should demonstrate unknownErrorDetailCounts() output" in {
      ds.errorIssues.unknownErrorDetailCounts("examples.fancy_numbers").show
    }
    "should demonstrate errorDetails() output" in {
      ds.errorIssues.errorDetails().show
    }
    "should demonstrate unknownErrorDetails() output" in {
      ds.errorIssues.unknownErrorDetails("examples.fancy_numbers").show
    }
  }

  def recordsDataset(numbers: Seq[Int], jc: JobContext): Dataset[FancyNumberRecord] = {
    val sqlContext = sqlc
    import sqlContext.implicits._
    sqlc.createDataset(numbers)
      .mapPartitions(inputs => Example.buildRecords(inputs, jc))
      .persist(StorageLevel.MEMORY_ONLY)
  }

} 
Example 96
Source File: MergeProjection.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.command.mutation.merge

import java.sql.{Date, Timestamp}

import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, GenericRowWithSchema, InterpretedMutableProjection, Projection}
import org.apache.spark.sql.catalyst.util.DateTimeUtils


case class MergeProjection(
    @transient tableCols: Seq[String],
    @transient statusCol : String,
    @transient ds: Dataset[Row],
    @transient rltn: CarbonDatasourceHadoopRelation,
    @transient sparkSession: SparkSession,
    @transient mergeAction: MergeAction) {

  private val cutOffDate = Integer.MAX_VALUE >> 1

  val isUpdate = mergeAction.isInstanceOf[UpdateAction]
  val isDelete = mergeAction.isInstanceOf[DeleteAction]

  def apply(row: GenericRowWithSchema): InternalRow = {
    // TODO we can avoid these multiple conversions if this is added as a SparkPlan node.
    val values = row.values.map {
      case s: String => org.apache.spark.unsafe.types.UTF8String.fromString(s)
      case d: java.math.BigDecimal => org.apache.spark.sql.types.Decimal.apply(d)
      case b: Array[Byte] => org.apache.spark.unsafe.types.UTF8String.fromBytes(b)
      case d: Date => DateTimeUtils.fromJavaDate(d)
      case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t)
      case value => value
    }

    projection(new GenericInternalRow(values)).asInstanceOf[GenericInternalRow]
  }

  val (projection, output) = generateProjection

  private def generateProjection: (Projection, Array[Expression]) = {
    val existingDsOutput = rltn.carbonRelation.schema.toAttributes
    val colsMap = mergeAction match {
      case UpdateAction(updateMap) => updateMap
      case InsertAction(insertMap) => insertMap
      case _ => null
    }
    if (colsMap != null) {
      val output = new Array[Expression](tableCols.length)
      val expecOutput = new Array[Expression](tableCols.length)
      colsMap.foreach { case (k, v) =>
        val tableIndex = tableCols.indexOf(k.toString().toLowerCase)
        if (tableIndex < 0) {
          throw new CarbonMergeDataSetException(s"Mapping is wrong $colsMap")
        }
        output(tableIndex) = v.expr.transform {
          case a: Attribute if !a.resolved =>
            ds.queryExecution.analyzed.resolveQuoted(a.name,
              sparkSession.sessionState.analyzer.resolver).get
        }
        expecOutput(tableIndex) =
          existingDsOutput.find(_.name.equalsIgnoreCase(tableCols(tableIndex))).get
      }
      if (output.contains(null)) {
        throw new CarbonMergeDataSetException(s"Not all columns are mapped")
      }
      (new InterpretedMutableProjection(output++Seq(
        ds.queryExecution.analyzed.resolveQuoted(statusCol,
        sparkSession.sessionState.analyzer.resolver).get),
        ds.queryExecution.analyzed.output), expecOutput)
    } else {
      (null, null)
    }
  }
} 
Example 97
Source File: DeltaLoad.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.algo

import com.adidas.analytics.algo.DeltaLoad._
import com.adidas.analytics.algo.core.Algorithm
import com.adidas.analytics.algo.shared.DateComponentDerivation
import com.adidas.analytics.config.DeltaLoadConfiguration.PartitionedDeltaLoadConfiguration
import com.adidas.analytics.util.DataFrameUtils._
import com.adidas.analytics.util._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.slf4j.{Logger, LoggerFactory}


  private def getUpsertRecords(deltaRecords: Dataset[Row], resultColumns: Seq[String]): Dataset[Row] = {
    // Create partition window - Partitioning by delta records logical key (i.e. technical key of active records)
    val partitionWindow = Window
      .partitionBy(businessKey.map(col): _*)
      .orderBy(technicalKey.map(component => col(component).desc): _*)

    // Ranking & projection
    val rankedDeltaRecords = deltaRecords
      .withColumn(rankingColumnName, row_number().over(partitionWindow))
      .filter(upsertRecordsModesFilterFunction)

    rankedDeltaRecords
      .filter(rankedDeltaRecords(rankingColumnName) === 1)
      .selectExpr(resultColumns: _*)
  }

  protected def withDatePartitions(spark: SparkSession, dfs: DFSWrapper, dataFrames: Vector[DataFrame]): Vector[DataFrame] = {
    logger.info("Adding partitioning information if needed")
    try {
      dataFrames.map { df =>
        if (df.columns.toSeq.intersect(targetPartitions) != targetPartitions){
          df.transform(withDateComponents(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions))
        }
        else df
      }
    } catch {
      case e: Throwable =>
        logger.error("Cannot add partitioning information for data frames.", e)
        //TODO: Handle failure case properly
        throw new RuntimeException("Unable to transform data frames.", e)
    }
  }
}


object DeltaLoad {

  private val logger: Logger = LoggerFactory.getLogger(getClass)

  def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): DeltaLoad = {
    new DeltaLoad(spark, dfs, configLocation)
  }
} 
Example 98
Source File: PartitionHelpers.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.algo.core

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}


trait PartitionHelpers {

  protected def getDistinctPartitions(outputDataFrame: DataFrame, targetPartitions: Seq[String]): Dataset[Row] = {
    val targetPartitionsColumns: Seq[Column] = targetPartitions.map(partitionString => col(partitionString))

    outputDataFrame.select(targetPartitionsColumns: _*).distinct
  }

  protected def getParameterValue(row: Row, partitionString: String): String =
    createParameterValue(row.get(row.fieldIndex(partitionString)))

  protected def createParameterValue(partitionRawValue: Any): String =
    partitionRawValue match {
      case value: java.lang.Short => value.toString
      case value: java.lang.Integer => value.toString
      case value: scala.Predef.String => "'" + value + "'"
      case null => throw new Exception("Partition Value is null. No support for null partitions!")
      case value => throw new Exception("Unsupported partition DataType: " + value.getClass)
    }
} 
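
A small sketch of mixing the PartitionHelpers trait above into an object to enumerate the partition folders of an output DataFrame. The data and the year/month partition columns are made up for illustration.

import com.adidas.analytics.algo.core.PartitionHelpers
import org.apache.spark.sql.{DataFrame, SparkSession}

object PartitionHelpersDemo extends PartitionHelpers with App {
  val spark = SparkSession.builder().master("local[*]").appName("partition-helpers-demo").getOrCreate()
  import spark.implicits._

  // Hypothetical output data partitioned by year/month.
  val output: DataFrame = Seq(
    ("a", 2018, 1),
    ("b", 2018, 1),
    ("c", 2018, 2)
  ).toDF("value", "year", "month")

  // One row per distinct (year, month) combination.
  val partitions = getDistinctPartitions(output, Seq("year", "month"))

  // Build a "year=.../month=..." spec per distinct partition row.
  partitions.collect().foreach { row =>
    val spec = Seq("year", "month")
      .map(name => s"$name=${getParameterValue(row, name)}")
      .mkString("/")
    println(spec)
  }

  spark.stop()
}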
Example 99
Source File: RecoverPartitionsNativeIntegrationTest.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.integration

import com.adidas.utils.TestUtils._
import com.adidas.analytics.algo.AppendLoad
import com.adidas.utils.FileReader
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.sql.{Dataset, Encoders}
import org.scalatest.FeatureSpec
import org.scalatest.Matchers._

import scala.collection.JavaConverters._


class RecoverPartitionsNativeIntegrationTest extends FeatureSpec with BaseIntegrationTest {

  feature("Partitions can be updated with native spark.recoverPartitions()") {

    scenario("Using Append Load Algorithm with multiple source files") {
      val testResourceDir = "multiple_source_files"
      val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json")
      val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1")

      val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType]
      val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType]
      val dataReader = FileReader.newDSVFileReader(Some(targetSchema))
      val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema))

      val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema)
      setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader)
      prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv"))
      uploadParameters(testResourceDir)

      // checking pre-conditions
      spark.read.csv(sourceDirPath.toString).count() shouldBe 7
      targetTable.read().count() shouldBe 19

      fs.exists(targetPath20180101) shouldBe false
      fs.exists(headerPath20180101) shouldBe false

      // executing load
      AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run()

      // validating result
      val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true)
      val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true)
      val expectedDf = dataReader.read(spark, expectedDataLocation)
      val actualDf = targetTable.read()

      val producedPartitionsNumber: Dataset[String] = spark
        .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}")
        .as(Encoders.STRING)

      // MetaData Specific Tests
      val expectedPartitions: Dataset[String] = expectedPartitionsDataReader
        .read(spark, expectedPartitionsLocation)
        .as(Encoders.STRING)

      expectedPartitions.collectAsList().asScala.sorted.toSet should
        equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet)

      actualDf.hasDiff(expectedDf) shouldBe false

      spark
        .sql(s"DESCRIBE extended ${targetDatabase}.${tableName} PARTITION(year=2018,month=1,day=1)")
        .filter("col_name == 'Partition Statistics'")
        .head()
        .getAs[String]("data_type").contains("6 rows") shouldBe true

      fs.exists(targetPath20180101) shouldBe true
      fs.exists(headerPath20180101) shouldBe true
    }
  }


} 
Example 100
Source File: SparkRecoverPartitionsNativeIntegrationTest.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.integration

import com.adidas.utils.TestUtils._
import com.adidas.analytics.algo.AppendLoad
import com.adidas.utils.FileReader
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.sql.{Dataset, Encoders}
import org.scalatest.FeatureSpec
import org.scalatest.Matchers._

import scala.collection.JavaConverters._


class SparkRecoverPartitionsNativeIntegrationTest extends FeatureSpec with BaseIntegrationTest {

  feature("Partitions can be updated with native spark.recoverPartitions()") {

    scenario("Using Append Load Algorithm with multiple source files") {
      val testResourceDir = "multiple_source_files"
      val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json")
      val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1")

      val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType]
      val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType]
      val dataReader = FileReader.newDSVFileReader(Some(targetSchema))
      val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema))

      val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema)
      setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader)
      prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv"))
      uploadParameters(testResourceDir)

      // checking pre-conditions
      spark.read.csv(sourceDirPath.toString).count() shouldBe 7
      targetTable.read().count() shouldBe 19

      fs.exists(targetPath20180101) shouldBe false
      fs.exists(headerPath20180101) shouldBe false

      // executing load
      AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run()

      // validating result
      val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true)
      val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true)
      val expectedDf = dataReader.read(spark, expectedDataLocation)
      val actualDf = targetTable.read()

      val producedPartitionsNumber: Dataset[String] = spark
        .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}")
        .as(Encoders.STRING)

      // MetaData Specific Tests
      val expectedPartitions: Dataset[String] = expectedPartitionsDataReader
        .read(spark, expectedPartitionsLocation)
        .as(Encoders.STRING)

      expectedPartitions.collectAsList().asScala.sorted.toSet should
        equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet)

      actualDf.hasDiff(expectedDf) shouldBe false

      fs.exists(targetPath20180101) shouldBe true
      fs.exists(headerPath20180101) shouldBe true
    }
  }


} 
Example 101
Source File: SparkRecoverPartitionsCustomIntegrationTest.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.integration

import com.adidas.utils.TestUtils._
import com.adidas.analytics.algo.AppendLoad
import com.adidas.utils.FileReader
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.sql.{Dataset, Encoders}
import org.scalatest.FeatureSpec
import org.scalatest.Matchers._

import scala.collection.JavaConverters._


class SparkRecoverPartitionsCustomIntegrationTest extends FeatureSpec with BaseIntegrationTest {

  feature("Partitions can be updated programmatically using custom logic") {

    scenario("Using Append Load Algorithm with multiple source files") {
      val testResourceDir = "multiple_source_files"
      val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json")
      val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1")

      val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType]
      val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType]
      val dataReader = FileReader.newDSVFileReader(Some(targetSchema))
      val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema))

      val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema)
      setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader)
      prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv"))
      uploadParameters(testResourceDir)

      // checking pre-conditions
      spark.read.csv(sourceDirPath.toString).count() shouldBe 7
      targetTable.read().count() shouldBe 19

      fs.exists(targetPath20180101) shouldBe false
      fs.exists(headerPath20180101) shouldBe false

      // executing load
      AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run()

      // validating result
      val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true)
      val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true)
      val expectedDf = dataReader.read(spark, expectedDataLocation)
      val actualDf = targetTable.read()

      val producedPartitionsNumber: Dataset[String] = spark
        .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}")
        .as(Encoders.STRING)

      // MetaData Specific Tests
      val expectedPartitions: Dataset[String] = expectedPartitionsDataReader
        .read(spark, expectedPartitionsLocation)
        .as(Encoders.STRING)

      expectedPartitions.collectAsList().asScala.sorted.toSet should
        equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet)

      actualDf.hasDiff(expectedDf) shouldBe false

      fs.exists(targetPath20180101) shouldBe true
      fs.exists(headerPath20180101) shouldBe true
    }
  }


} 
Example 102
Source File: RecoverPartitionsCustomIntegrationTest.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.integration

import com.adidas.utils.TestUtils._
import com.adidas.analytics.algo.AppendLoad
import com.adidas.utils.FileReader
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.sql.{Dataset, Encoders}
import org.scalatest.FeatureSpec
import org.scalatest.Matchers._

import scala.collection.JavaConverters._


class RecoverPartitionsCustomIntegrationTest extends FeatureSpec with BaseIntegrationTest {

  feature("Partitions can be updated programmatically using custom logic") {

    scenario("Using Append Load Algorithm with multiple source files") {
      val testResourceDir = "multiple_source_files"
      val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json")
      val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1")

      val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType]
      val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType]
      val dataReader = FileReader.newDSVFileReader(Some(targetSchema))
      val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema))

      val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema)
      setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader)
      prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv"))
      uploadParameters(testResourceDir)

      // checking pre-conditions
      spark.read.csv(sourceDirPath.toString).count() shouldBe 7
      targetTable.read().count() shouldBe 19

      fs.exists(targetPath20180101) shouldBe false
      fs.exists(headerPath20180101) shouldBe false

      // executing load
      AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run()

      // validating result
      val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true)
      val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true)
      val expectedDf = dataReader.read(spark, expectedDataLocation)
      val actualDf = targetTable.read()

      val producedPartitionsNumber: Dataset[String] = spark
        .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}")
        .as(Encoders.STRING)

      // MetaData Specific Tests
      val expectedPartitions: Dataset[String] = expectedPartitionsDataReader
        .read(spark, expectedPartitionsLocation)
        .as(Encoders.STRING)

      expectedPartitions.collectAsList().asScala.sorted.toSet should
        equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet)

      actualDf.hasDiff(expectedDf) shouldBe false

      spark
        .sql(s"DESCRIBE extended ${targetDatabase}.${tableName} PARTITION(year=2018,month=1,day=1)")
        .filter("col_name == 'Partition Statistics'")
        .head()
        .getAs[String]("data_type").contains("6 rows") shouldBe true

      fs.exists(targetPath20180101) shouldBe true
      fs.exists(headerPath20180101) shouldBe true
    }
  }


} 
Example 103
Source File: SparkRecoverPartitionsCustomTest.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.unit

import com.adidas.analytics.util.SparkRecoverPartitionsCustom
import com.adidas.utils.SparkSessionWrapper
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Dataset, Row}
import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers, PrivateMethodTester}

import scala.collection.JavaConverters._

class SparkRecoverPartitionsCustomTest extends FunSuite
  with SparkSessionWrapper
  with PrivateMethodTester
  with Matchers
  with BeforeAndAfterAll{

  test("test conversion of String Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    val result = customSparkRecoverPartitions invokePrivate createParameterValue("theValue")

    result should be("'theValue'")
  }

  test("test conversion of Short Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Short.valueOf("2"))

    result should be("2")
  }

  test("test conversion of Integer Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Integer.valueOf("4"))

    result should be("4")
  }

  test("test conversion of null Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    an [Exception] should be thrownBy {
      customSparkRecoverPartitions invokePrivate createParameterValue(null)
    }
  }

  test("test conversion of not supported Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    an [Exception] should be thrownBy {
      customSparkRecoverPartitions invokePrivate createParameterValue(false)
    }
  }

  test("test HiveQL statements Generation") {
    val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(
      tableName="test",
      targetPartitions = Seq("country","district")
    )

    val rowsInput = Seq(
      Row(1, "portugal", "porto"),
      Row(2, "germany", "herzogenaurach"),
      Row(3, "portugal", "coimbra")
    )

    val inputSchema = StructType(
      List(
        StructField("number", IntegerType, nullable = true),
        StructField("country", StringType, nullable = true),
        StructField("district", StringType, nullable = true)
      )
    )

    val expectedStatements: Seq[String] = Seq(
      "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='porto')",
      "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='germany',district='herzogenaurach')",
      "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='coimbra')"
    )

    val testDataset: Dataset[Row] = spark.createDataset(rowsInput)(RowEncoder(inputSchema))

    val createParameterValue = PrivateMethod[Dataset[String]]('generateAddPartitionStatements)

    val producedStatements: Seq[String] = (customSparkRecoverPartitions invokePrivate createParameterValue(testDataset))
      .collectAsList()
      .asScala

    expectedStatements.sorted.toSet should equal(producedStatements.sorted.toSet)
  }

  override def afterAll(): Unit = {
    spark.stop()
  }

} 
Example 104
Source File: RecoverPartitionsCustomTest.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.unit

import com.adidas.analytics.util.RecoverPartitionsCustom
import com.adidas.utils.SparkSessionWrapper
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Dataset, Row}
import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers, PrivateMethodTester}

import scala.collection.JavaConverters._

class RecoverPartitionsCustomTest extends FunSuite
  with SparkSessionWrapper
  with PrivateMethodTester
  with Matchers
  with BeforeAndAfterAll{

  test("test conversion of String Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    val result = customSparkRecoverPartitions invokePrivate createParameterValue("theValue")

    result should be("'theValue'")
  }

  test("test conversion of Short Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Short.valueOf("2"))

    result should be("2")
  }

  test("test conversion of Integer Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Integer.valueOf("4"))

    result should be("4")
  }

  test("test conversion of null Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    an [Exception] should be thrownBy {
      customSparkRecoverPartitions invokePrivate createParameterValue(null)
    }
  }

  test("test conversion of not supported Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    an [Exception] should be thrownBy {
      customSparkRecoverPartitions invokePrivate createParameterValue(false)
    }
  }

  test("test HiveQL statements Generation") {
    val customSparkRecoverPartitions = RecoverPartitionsCustom(
      tableName="test",
      targetPartitions = Seq("country","district")
    )

    val rowsInput = Seq(
      Row(1, "portugal", "porto"),
      Row(2, "germany", "herzogenaurach"),
      Row(3, "portugal", "coimbra")
    )

    val inputSchema = StructType(
      List(
        StructField("number", IntegerType, nullable = true),
        StructField("country", StringType, nullable = true),
        StructField("district", StringType, nullable = true)
      )
    )

    val expectedStatements: Seq[String] = Seq(
      "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='porto')",
      "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='germany',district='herzogenaurach')",
      "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='coimbra')"
    )

    val testDataset: Dataset[Row] = spark.createDataset(rowsInput)(RowEncoder(inputSchema))

    val createParameterValue = PrivateMethod[Dataset[String]]('generateAddPartitionStatements)

    val producedStatements: Seq[String] = (customSparkRecoverPartitions invokePrivate createParameterValue(testDataset))
      .collectAsList()
      .asScala

    expectedStatements.sorted.toSet should equal(producedStatements.sorted.toSet)
  }

  override def afterAll(): Unit = {
    spark.stop()
  }

} 
Example 105
Source File: SqsSource.scala    From bahir   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.streaming.sqs

import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.execution.datasources.{DataSource, LogicalRelation}
import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.execution.streaming.FileStreamSource._
import org.apache.spark.sql.types.StructType


class SqsSource(sparkSession: SparkSession,
                metadataPath: String,
                options: Map[String, String],
                override val schema: StructType) extends Source with Logging {

  private val sourceOptions = new SqsSourceOptions(options)

  private val hadoopConf = sparkSession.sessionState.newHadoopConf()

  private val metadataLog =
    new FileStreamSourceLog(FileStreamSourceLog.VERSION, sparkSession, metadataPath)
  private var metadataLogCurrentOffset = metadataLog.getLatest().map(_._1).getOrElse(-1L)

  private val maxFilesPerTrigger = sourceOptions.maxFilesPerTrigger

  private val maxFileAgeMs: Long = sourceOptions.maxFileAgeMs

  private val fileFormatClassName = sourceOptions.fileFormatClassName

  private val shouldSortFiles = sourceOptions.shouldSortFiles

  private val sqsClient = new SqsClient(sourceOptions, hadoopConf)

  metadataLog.allFiles().foreach { entry =>
    sqsClient.sqsFileCache.add(entry.path, MessageDescription(entry.timestamp, true, ""))
  }
  sqsClient.sqsFileCache.purge()

  logInfo(s"maxFilesPerBatch = $maxFilesPerTrigger, maxFileAgeMs = $maxFileAgeMs")

   
  private def fetchMaxOffset(): FileStreamSourceOffset = synchronized {
    val batchFiles = sqsClient.sqsFileCache.getUncommittedFiles(maxFilesPerTrigger, shouldSortFiles)

    if (batchFiles.nonEmpty) {
      metadataLogCurrentOffset += 1
      metadataLog.add(metadataLogCurrentOffset, batchFiles.map {
        case (path, timestamp, receiptHandle) =>
          FileEntry(path = path, timestamp = timestamp, batchId = metadataLogCurrentOffset)
      }.toArray)
      logInfo(s"Log offset set to $metadataLogCurrentOffset with ${batchFiles.size} new files")
      val messageReceiptHandles = batchFiles.map {
        case (path, timestamp, receiptHandle) =>
          sqsClient.sqsFileCache.markCommitted(path)
          logDebug(s"New file: $path")
          receiptHandle
      }.toList
      sqsClient.addToDeleteMessageQueue(messageReceiptHandles)
    }

    val numPurged = sqsClient.sqsFileCache.purge()

    if (!sqsClient.deleteMessageQueue.isEmpty) {
      sqsClient.deleteMessagesFromQueue()
    }

    logTrace(
      s"""
         |Number of files selected for batch = ${batchFiles.size}
         |Number of files purged from tracking map = $numPurged
       """.stripMargin)

    FileStreamSourceOffset(metadataLogCurrentOffset)
  }

  override def getOffset: Option[Offset] = Some(fetchMaxOffset()).filterNot(_.logOffset == -1)

  override def commit(end: Offset): Unit = {
    // No-op for now; SqsSource currently garbage-collects files based on timestamp
    // and the value of the maxFileAge parameter.
  }

  override def stop(): Unit = {
    if (!sqsClient.sqsScheduler.isTerminated) {
      sqsClient.sqsScheduler.shutdownNow()
    }
  }

  override def toString: String = s"SqsSource[${sqsClient.sqsUrl}]"

} 
Example 106
Source File: Cleaner.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package functions.clean

import com.hankcs.hanlp.HanLP
import config.paramconf.{HasOutputCol, HasInputCol}
import functions.MySchemaUtils
import functions.clean.chinese.BCConvert
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset}



// NOTE: the original listing drops the class declaration and parameter definitions together with
// their Scaladoc; the skeleton below is a minimal reconstruction based on the getters and defaults
// used further down (the setter names are assumptions).
class Cleaner(override val uid: String)
  extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("cleaner"))

  val fanjan: Param[String] = new Param[String](this, "fanjan", "traditional/simplified Chinese conversion: f2j, j2f or none")
  val quanban: Param[String] = new Param[String](this, "quanban", "full-width/half-width conversion: q2b, b2q or none")
  val minLineLen: IntParam = new IntParam(this, "minLineLen", "minimum length of a line to keep")

  def setFanJian(value: String): this.type = set(fanjan, value)
  def setQuanBan(value: String): this.type = set(quanban, value)
  def setMinLineLen(value: Int): this.type = set(minLineLen, value)

  def getFanJian: String = $(fanjan)
  def getQuanBan: String = $(quanban)
  def getMinLineLen: Int = $(minLineLen)

  setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1)

  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema, logging = true)

    val cleanFunc = udf {line: String =>
      var cleaned = ""
      getFanJian match {
        case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line)
        case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line)
        case _ => cleaned = line
      }

      getQuanBan match {
        case "q2b" => cleaned = BCConvert.qj2bj(cleaned)
        case "b2q" => cleaned = BCConvert.bj2qj(cleaned)
        case _ => cleaned = cleaned
      }

      cleaned
    }

    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record =>
      val outputIndex = record.fieldIndex($(outputCol))
      record.getString(outputIndex).length >= getMinLineLen
    }
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.typeName.equals(StringType.typeName),
      s"Input type must be StringType but got $inputType.")
    MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable)
  }
}


object Cleaner extends DefaultParamsReadable[Cleaner] {
  override def load(path: String): Cleaner = super.load(path)
} 
Example 107
Source File: CUDAUtils.scala    From GPUEnabler   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.gpuenabler

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType
import org.apache.spark.internal.Logging

object CUDAUtils {

  def DS = Dataset
  type _ds[T] = Dataset[T]

  def getLogicalPlan[T](ds: Dataset[T]) = {
    ds.logicalPlan
  }

  def getAttributes(st: StructType) = {
    st.toAttributes
  }

  type _Logging = Logging

  def md5HashObj(obj: AnyRef) : String = {
    val text = obj.toString()
    java.security.MessageDigest.getInstance("MD5").digest(text.getBytes)
      .map(0xFF & _).map {
      "%02x".format(_)
    }.foldLeft("") {
      _ + _
    } + "_" + obj.hashCode()
  }
} 
Example 108
Source File: LightPipeline.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp

import org.apache.spark.ml.{PipelineModel, Transformer}
import org.apache.spark.sql.{DataFrame, Dataset}

import scala.collection.JavaConverters._

class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddingsVectors: Boolean = false) {

  private var ignoreUnsupported = false

  def setIgnoreUnsupported(v: Boolean): Unit = ignoreUnsupported = v
  def getIgnoreUnsupported: Boolean = ignoreUnsupported

  def getStages: Array[Transformer] = pipelineModel.stages

  def transform(dataFrame: Dataset[_]): DataFrame = pipelineModel.transform(dataFrame)

  def fullAnnotate(target: String, startWith: Map[String, Seq[Annotation]] = Map.empty[String, Seq[Annotation]]): Map[String, Seq[Annotation]] = {
    getStages.foldLeft(startWith)((annotations, transformer) => {
      transformer match {
        case documentAssembler: DocumentAssembler =>
          annotations.updated(documentAssembler.getOutputCol, documentAssembler.assemble(target, Map.empty[String, String]))
        case lazyAnnotator: AnnotatorModel[_] if lazyAnnotator.getLazyAnnotator => annotations
        case recursiveAnnotator: HasRecursiveTransform[_] with AnnotatorModel[_] =>
          val combinedAnnotations =
            recursiveAnnotator.getInputCols.foldLeft(Seq.empty[Annotation])((inputs, name) => inputs ++ annotations.getOrElse(name, Nil))
          annotations.updated(recursiveAnnotator.getOutputCol, recursiveAnnotator.annotate(combinedAnnotations, pipelineModel))
        case annotator: AnnotatorModel[_] =>
          val combinedAnnotations =
            annotator.getInputCols.foldLeft(Seq.empty[Annotation])((inputs, name) => inputs ++ annotations.getOrElse(name, Nil))
          annotations.updated(annotator.getOutputCol, annotator.annotate(combinedAnnotations))
        case finisher: Finisher =>
          annotations.filterKeys(finisher.getInputCols.contains)
        case rawModel: RawAnnotator[_] =>
          if (ignoreUnsupported) annotations
          else throw new IllegalArgumentException(s"model ${rawModel.uid} does not support LightPipeline." +
            s" Call setIgnoreUnsupported(boolean) on LightPipeline to ignore")
        case pipeline: PipelineModel =>
          new LightPipeline(pipeline, parseEmbeddingsVectors).fullAnnotate(target, annotations)
        case _ => annotations
      }
    })
  }

  def fullAnnotate(targets: Array[String]): Array[Map[String, Seq[Annotation]]] = {
    targets.par.map(target => {
      fullAnnotate(target)
    }).toArray
  }

  def fullAnnotateJava(target: String): java.util.Map[String, java.util.List[JavaAnnotation]] = {
    fullAnnotate(target).mapValues(_.map(aa =>
      JavaAnnotation(aa.annotatorType, aa.begin, aa.end, aa.result, aa.metadata.asJava)).asJava).asJava
  }

  def fullAnnotateJava(targets: java.util.ArrayList[String]): java.util.List[java.util.Map[String, java.util.List[JavaAnnotation]]] = {
    targets.asScala.par.map(target => {
      fullAnnotateJava(target)
    }).toList.asJava
  }

  def annotate(target: String): Map[String, Seq[String]] = {
    fullAnnotate(target).mapValues(_.map(a => {
      a.annotatorType match {
        case (AnnotatorType.WORD_EMBEDDINGS |
             AnnotatorType.SENTENCE_EMBEDDINGS) if (parseEmbeddingsVectors) =>  a.embeddings.mkString(" ")
        case _ => a.result
      }
    }))
  }

  def annotate(targets: Array[String]): Array[Map[String, Seq[String]]] = {
    targets.par.map(target => {
      annotate(target)
    }).toArray
  }

  def annotateJava(target: String): java.util.Map[String, java.util.List[String]] = {
    annotate(target).mapValues(_.asJava).asJava
  }

  def annotateJava(targets: java.util.ArrayList[String]): java.util.List[java.util.Map[String, java.util.List[String]]] = {
    targets.asScala.par.map(target => {
      annotateJava(target)
    }).toList.asJava
  }

} 
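
A usage sketch for LightPipeline: wrap an already fitted PipelineModel and annotate plain strings without building a DataFrame. Only the constructor and annotate methods shown above are used; the saved-pipeline path is hypothetical and an active SparkSession is assumed for PipelineModel.load.

import com.johnsnowlabs.nlp.LightPipeline
import org.apache.spark.ml.PipelineModel

// Assumes a spark-nlp pipeline was trained and saved earlier at this (hypothetical) path.
val pipelineModel = PipelineModel.load("/tmp/my_nlp_pipeline")

val lightPipeline = new LightPipeline(pipelineModel)

// Single document: map of output column -> annotation results as strings.
val annotations: Map[String, Seq[String]] = lightPipeline.annotate("Spark NLP makes light work of text.")
annotations.foreach { case (column, values) => println(s"$column -> ${values.mkString(", ")}") }

// Batch of documents, annotated in parallel via the Array overload above.
val batch: Array[Map[String, Seq[String]]] =
  lightPipeline.annotate(Array("First document.", "Second document."))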
Example 109
Source File: AnnotatorApproach.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp

import com.johnsnowlabs.storage.HasStorage
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.{Estimator, Model, PipelineModel, Transformer}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.types.{ArrayType, MetadataBuilder, StructField, StructType}
import org.apache.spark.ml.util.DefaultParamsWritable


  override final def transformSchema(schema: StructType): StructType = {
    require(validate(schema), s"Wrong or missing inputCols annotators in $uid.\n" +
      msgHelper(schema) +
      s"\nMake sure such annotators exist in your pipeline, " +
      s"with the right output names and that they have following annotator types: " +
      s"${inputAnnotatorTypes.mkString(", ")}")
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", outputAnnotatorType)
    val outputFields = schema.fields :+
      StructField(getOutputCol, ArrayType(Annotation.dataType), nullable = false, metadataBuilder.build)
    StructType(outputFields)
  }
} 
Example 110
Source File: RecursivePipeline.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp

import org.apache.spark.internal.Logging
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset}

import scala.collection.mutable.ListBuffer

class RecursivePipeline(override val uid: String, baseStages: Array[PipelineStage]) extends Pipeline {

  def this() = this(Identifiable.randomUID("RECURSIVE_PIPELINE"), Array.empty)

  def this(uid: String) = this(uid, Array.empty)

  def this(pipeline: Pipeline) = this(pipeline.uid, pipeline.getStages)

  this.setStages(baseStages)

  
  override def fit(dataset: Dataset[_]): PipelineModel = {
    transformSchema(dataset.schema, logging = true)
    val theStages = $(stages)
    var indexOfLastEstimator = -1
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      stage match {
        case _: Estimator[_] =>
          indexOfLastEstimator = index
        case _ =>
      }
    }
    var curDataset = dataset
    val transformers = ListBuffer.empty[Transformer]
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      if (index <= indexOfLastEstimator) {
        val transformer = stage match {
          case estimator: HasRecursiveFit[_] =>
            estimator.recursiveFit(curDataset, new Pipeline(uid).setStages(transformers.toArray).fit(dataset))
          case estimator: Estimator[_] =>
            estimator.fit(curDataset)
          case t: Transformer =>
            t
          case _ =>
            throw new IllegalArgumentException(
              s"Does not support stage $stage of type ${stage.getClass}")
        }
        if (index < indexOfLastEstimator) {
          curDataset = transformer.transform(curDataset)
        }
        transformers += transformer
      } else {
        transformers += stage.asInstanceOf[Transformer]
      }
    }

    createPipeline(dataset, transformers.toArray)
  }

}

class RecursivePipelineModel(override val uid: String, innerPipeline: PipelineModel)
  extends Model[RecursivePipelineModel] with MLWritable with Logging {

  def this(pipeline: PipelineModel) = this(pipeline.uid, pipeline)

  // drops right at most because is itself included
  private def createRecursiveAnnotators(dataset: Dataset[_]): PipelineModel =
    new Pipeline(uid).setStages(innerPipeline.stages.dropRight(1)).fit(dataset)

  override def copy(extra: ParamMap): RecursivePipelineModel = {
    new RecursivePipelineModel(uid, innerPipeline.copy(extra))
  }

  override def write: MLWriter = {
    innerPipeline.write
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    innerPipeline.stages.foldLeft(dataset.toDF)((cur, transformer) => transformer match {
      case t: HasRecursiveTransform[_] => t.recursiveTransform(cur, createRecursiveAnnotators(dataset))
      case t: AnnotatorModel[_] if t.getLazyAnnotator => cur
      case t: Transformer => t.transform(cur)
    })
  }

  override def transformSchema(schema: StructType): StructType = {
    innerPipeline.transformSchema(schema)
  }
} 
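
A brief sketch of driving the RecursivePipeline above in place of a plain Pipeline. The DocumentAssembler and Tokenizer stages are standard spark-nlp components assumed to be available; the training data is illustrative. fit() behaves like Pipeline.fit(), except that recursive estimators also receive the partially built PipelineModel.

import com.johnsnowlabs.nlp.{DocumentAssembler, RecursivePipeline}
import com.johnsnowlabs.nlp.annotators.Tokenizer
import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("recursive-pipeline-demo").getOrCreate()
import spark.implicits._

val trainingData = Seq("Recursive pipelines behave like regular pipelines.").toDF("text")

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val tokenizer = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")

// Used exactly like a Pipeline; recursive annotators get the partially fitted model during fit().
val model: PipelineModel = new RecursivePipeline()
  .setStages(Array(documentAssembler, tokenizer))
  .fit(trainingData)

model.transform(trainingData).select("token.result").show(false)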
Example 111
Source File: BigTextMatcher.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp.annotators.btm

import com.johnsnowlabs.collections.StorageSearchTrie
import com.johnsnowlabs.nlp.AnnotatorType.{TOKEN, DOCUMENT, CHUNK}
import com.johnsnowlabs.nlp.annotators.TokenizerModel
import com.johnsnowlabs.nlp.serialization.StructFeature
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import com.johnsnowlabs.nlp.AnnotatorApproach
import com.johnsnowlabs.storage.Database.Name
import com.johnsnowlabs.storage.{Database, HasStorage, RocksDBConnection, StorageWriter}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.Dataset

class BigTextMatcher(override val uid: String) extends AnnotatorApproach[BigTextMatcherModel] with HasStorage {

  def this() = this(Identifiable.randomUID("ENTITY_EXTRACTOR"))

  override val inputAnnotatorTypes = Array(DOCUMENT, TOKEN)

  override val outputAnnotatorType: AnnotatorType = CHUNK

  override val description: String = "Extracts entities from target dataset given in a text file"

  val mergeOverlapping = new BooleanParam(this, "mergeOverlapping", "whether to merge overlapping matched chunks. Defaults to false")
  val tokenizer = new StructFeature[TokenizerModel](this, "tokenizer")

  setDefault(inputCols, Array(TOKEN))
  setDefault(caseSensitive, true)
  setDefault(mergeOverlapping, false)

  def setTokenizer(tokenizer: TokenizerModel): this.type = set(this.tokenizer, tokenizer)

  def getTokenizer: TokenizerModel = $$(tokenizer)

  def setMergeOverlapping(v: Boolean): this.type = set(mergeOverlapping, v)

  def getMergeOverlapping: Boolean = $(mergeOverlapping)

  // Loads entity phrases from the external text file into the storage search trie through the provided writers
  private def loadEntities(path: String, writers: Map[Database.Name, StorageWriter[_]]): Unit = {
    val inputFiles: Seq[Iterator[String]] =
      ResourceHelper.parseLinesIterator(ExternalResource(path, ReadAs.TEXT, Map()))
    inputFiles.foreach { inputFile => {
      StorageSearchTrie.load(inputFile, writers, get(tokenizer))
    }}
  }

  override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): BigTextMatcherModel = {
    new BigTextMatcherModel()
      .setInputCols($(inputCols))
      .setOutputCol($(outputCol))
      .setCaseSensitive($(caseSensitive))
      .setStorageRef($(storageRef))
      .setMergeOverlapping($(mergeOverlapping))
  }

  override protected def createWriter(database: Name, connection: RocksDBConnection): StorageWriter[_] = {
    database match {
      case Database.TMVOCAB => new TMVocabReadWriter(connection, $(caseSensitive))
      case Database.TMEDGES => new TMEdgesReadWriter(connection, $(caseSensitive))
      case Database.TMNODES => new TMNodesWriter(connection)
    }
  }

  override protected def index(
                                fitDataset: Dataset[_],
                                storageSourcePath: Option[String],
                                readAs: Option[ReadAs.Value],
                                writers: Map[Database.Name, StorageWriter[_]],
                                readOptions: Option[Map[String, String]]
                              ): Unit = {
    require(readAs.get == ReadAs.TEXT, "BigTextMatcher only supports TEXT input formats at the moment.")
    loadEntities(storageSourcePath.get, writers)
  }

  override protected val databases: Array[Name] = BigTextMatcherModel.databases
}

object BigTextMatcher extends DefaultParamsReadable[BigTextMatcher] 
Example 112
Source File: ChunkTokenizer.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, TOKEN}
import com.johnsnowlabs.nlp.util.io.ResourceHelper
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.Dataset

class ChunkTokenizer(override val uid: String) extends Tokenizer {

  def this() = this(Identifiable.randomUID("CHUNK_TOKENIZER"))

  override val inputAnnotatorTypes: Array[String] = Array(CHUNK)

  override val outputAnnotatorType: AnnotatorType = TOKEN

  override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): TokenizerModel = {
    val ruleFactory = buildRuleFactory

    val processedExceptions = get(exceptionsPath)
      .map(er => ResourceHelper.parseLines(er))
      .getOrElse(Array.empty[String]) ++ get(exceptions).getOrElse(Array.empty[String])

    val raw = new ChunkTokenizerModel()
      .setCaseSensitiveExceptions($(caseSensitiveExceptions))
      .setTargetPattern($(targetPattern))
      .setRules(ruleFactory)

    if (processedExceptions.nonEmpty)
      raw.setExceptions(processedExceptions)
    else
      raw
  }

}

object ChunkTokenizer extends DefaultParamsReadable[ChunkTokenizer] 
Example 113
Source File: CoNLLGenerator.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.util

import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

import scala.collection.mutable.ArrayBuffer
import scala.util.Try

object CoNLLGenerator {

  def exportConllFiles(spark: SparkSession, filesPath: String, pipelineModel: PipelineModel, outputPath: String): Unit = {
    import spark.implicits._ //for toDS and toDF
    val data = spark.sparkContext.wholeTextFiles(filesPath).toDS.toDF("filename", "text")
    exportConllFiles(data, pipelineModel, outputPath)
  }

  def exportConllFiles(spark: SparkSession, filesPath: String, pipelinePath: String, outputPath: String): Unit = {
    val model = PipelineModel.load(pipelinePath)
    exportConllFiles(spark, filesPath, model, outputPath)
  }

  def exportConllFiles(data: DataFrame, pipelineModel: PipelineModel, outputPath: String): Unit = {
    val POSdataset = pipelineModel.transform(data)
    exportConllFiles(POSdataset, outputPath)
  }

  def exportConllFiles(data: DataFrame, pipelinePath: String, outputPath: String): Unit = {
    val model = PipelineModel.load(pipelinePath)
    exportConllFiles(data, model, outputPath)
  }

  def exportConllFiles(data: DataFrame, outputPath: String): Unit = {
    import data.sparkSession.implicits._ //for udf
    var dfWithNER = data
    //if data does not contain ner column, add "O" as default
    if (Try(data("finished_ner")).isFailure){
      def OArray = (len: Int) => Array.fill(len)("O") // create an array of len "O"s
      val makeOArray = data.sparkSession.udf.register("finished_pos", OArray)
      dfWithNER=data.withColumn("finished_ner", makeOArray(size(col("finished_pos"))))
    }

    val newPOSDataset = dfWithNER.select("finished_token", "finished_pos", "finished_token_metadata", "finished_ner").
      as[(Array[String], Array[String], Array[(String, String)], Array[String])]
    val CoNLLDataset = makeConLLFormat(newPOSDataset)
    CoNLLDataset.coalesce(1).write.format("com.databricks.spark.csv").
      options(scala.collection.Map("delimiter" -> " ", "emptyValue" -> "")).
      save(outputPath)
  }


  def makeConLLFormat(newPOSDataset : Dataset[(Array[String], Array[String], Array[(String, String)], Array[String])]) ={
    import newPOSDataset.sparkSession.implicits._ //for row casting
    newPOSDataset.flatMap(row => {
      val newColumns: ArrayBuffer[(String, String, String, String)] = ArrayBuffer()
      val columns = ((row._1 zip row._2), row._3.map(_._2.toInt), row._4).zipped.map{case (a,b, c) => (a._1, a._2, b, c)}
      var sentenceId = 1
      newColumns.append(("", "", "", ""))
      newColumns.append(("-DOCSTART-", "-X-", "-X-", "O"))
      newColumns.append(("", "", "", ""))
      columns.foreach(a => {
        if (a._3 != sentenceId){
          newColumns.append(("", "", "", ""))
          sentenceId = a._3
        }
        newColumns.append((a._1, a._2, a._2, a._4))
      })
      newColumns
    })
  }

} 
Example 114
Source File: HasStorageRef.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.storage

import com.johnsnowlabs.nlp.ParamsAndFeaturesWritable
import org.apache.spark.ml.param.Param
import org.apache.spark.sql.Dataset

trait HasStorageRef extends ParamsAndFeaturesWritable {

  val storageRef = new Param[String](this, "storageRef", "storage unique identifier")

  setDefault(storageRef, this.uid)

  def createDatabaseConnection(database: Database.Name): RocksDBConnection =
    RocksDBConnection.getOrCreate(database, $(storageRef))

  def setStorageRef(value: String): this.type = {
    if (get(storageRef).nonEmpty)
      throw new UnsupportedOperationException(s"Cannot override storage ref on $this. " +
        s"Please re-use current ref: $getStorageRef")
    set(this.storageRef, value)
  }
  def getStorageRef: String = $(storageRef)

  def validateStorageRef(dataset: Dataset[_], inputCols: Array[String], annotatorType: String): Unit = {
    require(isDefined(storageRef), "This Annotator does not have a storage reference defined. This could be an outdated " +
      "model or an incorrectly created one. Make sure storageRef param is defined and set.")
    require(HasStorageRef.getStorageRefFromInput(dataset, inputCols, annotatorType) == $(storageRef),
      s"Found an input column with storage metadata, but its ref does not match the ref this annotator requires. " +
        s"Make sure you are loading the annotator with ref: ${$(storageRef)}")
  }

}

object HasStorageRef {
  def getStorageRefFromInput(dataset: Dataset[_], inputCols: Array[String], annotatorType: String): String = {
    val storageCol = dataset.schema.fields
      .find(f => inputCols.contains(f.name) && f.metadata.getString("annotatorType") == annotatorType)
      .getOrElse(throw new Exception(s"Could not find a column of type $annotatorType. Make sure your pipeline is correct."))
      .name

    val storage_meta = dataset.select(storageCol).schema.fields.head.metadata

    require(storage_meta.contains("ref"), s"Could not find a ref name in column $storageCol. " +
      s"Make sure $storageCol was created appropriately with a valid storageRef")

    storage_meta.getString("ref")
  }
} 
Example 115
Source File: DataBuilder.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp

import com.johnsnowlabs.nlp.training.CoNLL
import org.apache.spark.sql.{Dataset, Row}
import org.scalatest._


object DataBuilder extends FlatSpec with BeforeAndAfterAll { this: Suite =>

  import SparkAccessor.spark.implicits._

  def basicDataBuild(content: String*)(implicit cleanupMode: String = "disabled"): Dataset[Row] = {
    val data = SparkAccessor.spark.sparkContext.parallelize(content).toDS().toDF("text")
    AnnotatorBuilder.withDocumentAssembler(data, cleanupMode)
  }

  def multipleDataBuild(content: Seq[String]): Dataset[Row] = {
    val data = SparkAccessor.spark.sparkContext.parallelize(content).toDS().toDF("text")
    AnnotatorBuilder.withDocumentAssembler(data)
  }

  def buildNerDataset(datasetContent: String): Dataset[Row] = {
    val lines = datasetContent.split("\n")
    val data = CoNLL(conllLabelIndex = 1)
      .readDatasetFromLines(lines, SparkAccessor.spark).toDF
    AnnotatorBuilder.withDocumentAssembler(data)
  }

  def loadParquetDataset(path: String) =
    SparkAccessor.spark.read.parquet(path)
} 
Example 116
Source File: BigTextMatcherBehaviors.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp.annotators.btm

import com.johnsnowlabs.nlp.{Annotation, AnnotatorBuilder}
import org.apache.spark.sql.{Dataset, Row}
import org.scalatest._

trait BigTextMatcherBehaviors { this: FlatSpec =>

  def fullBigTextMatcher(dataset: => Dataset[Row]) {
    "A BigTextMatcher Annotator" should "successfully transform data" in {
      AnnotatorBuilder.withFullBigTextMatcher(dataset)
        .collect().foreach {
        row =>
          row.getSeq[Row](3)
            .map(Annotation(_))
            .foreach {
              case entity: Annotation if entity.annotatorType == "entity" =>
                println(entity, entity.end)
              case _ => ()
            }
      }
    }
  }
} 
Example 117
Source File: DependencyParserBehaviors.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp.annotators.parser.dep

import com.johnsnowlabs.nlp._
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.scalatest.FlatSpec
import com.johnsnowlabs.util.PipelineModels
import org.apache.spark.ml.Pipeline

trait DependencyParserBehaviors { this: FlatSpec =>


  def initialAnnotations(testDataSet: Dataset[Row]): Unit = {
    val fixture = createFixture(testDataSet)
    it should "add annotations" in {
      assert(fixture.dependencies.count > 0, "Annotations count should be greater than 0")
    }

    it should "add annotations with the correct annotationType" in {
      fixture.depAnnotations.foreach { a =>
        assert(a.annotatorType == AnnotatorType.DEPENDENCY, s"Annotation type should be ${AnnotatorType.DEPENDENCY}")
      }
    }

    it should "annotate each token" in {
      assert(fixture.tokenAnnotations.size == fixture.depAnnotations.size, s"Every token should be annotated")
    }

    it should "annotate each word with a head" in {
      fixture.depAnnotations.foreach { a =>
        assert(a.result.nonEmpty, s"Result should have a head")
      }
    }

    it should "annotate each word with the correct indexes" in {
      fixture.depAnnotations
        .zip(fixture.tokenAnnotations)
        .foreach { case (dep, token) => assert(dep.begin == token.begin && dep.end == token.end, s"Token and word should have equal indexes") }
    }
  }

  private def createFixture(testDataSet: Dataset[Row]) = new {
    val dependencies: DataFrame = testDataSet.select("dependency")
    val depAnnotations: Seq[Annotation] = dependencies
      .collect
      .flatMap { r => r.getSeq[Row](0) }
      .map { r =>
        Annotation(r.getString(0), r.getInt(1), r.getInt(2), r.getString(3), r.getMap[String, String](4))
      }
    val tokens: DataFrame = testDataSet.select("token")
    val tokenAnnotations: Seq[Annotation] = tokens
      .collect
      .flatMap { r => r.getSeq[Row](0) }
      .map { r =>
        Annotation(r.getString(0), r.getInt(1), r.getInt(2), r.getString(3), r.getMap[String, String](4))
      }
  }

  def relationshipsBetweenWordsPredictor(testDataSet: Dataset[Row], pipeline: Pipeline): Unit = {

    val emptyDataSet = PipelineModels.dummyDataset

    val dependencyParserModel = pipeline.fit(emptyDataSet)

    it should "train a model" in {
      val model = dependencyParserModel.stages.last.asInstanceOf[DependencyParserModel]
      assert(model.isInstanceOf[DependencyParserModel])
    }

    val dependencyParserDataFrame = dependencyParserModel.transform(testDataSet)
    //dependencyParserDataFrame.collect()
    dependencyParserDataFrame.select("dependency").show(false)

    it should "predict relationships between words" in {
      assert(dependencyParserDataFrame.isInstanceOf[DataFrame])
    }

  }

} 
Example 118
Source File: TokenizerBehaviors.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.{Annotation, AnnotatorBuilder, AnnotatorType}
import org.apache.spark.sql.{Dataset, Row}
import org.scalatest._

import scala.language.reflectiveCalls

trait TokenizerBehaviors { this: FlatSpec =>

  def fixture(dataset: => Dataset[Row]) = new {
    val df = AnnotatorBuilder.withTokenizer(AnnotatorBuilder.withTokenizer(dataset))
    val documents = df.select("document")
    val sentences = df.select("sentence")
    val tokens = df.select("token")
    val sentencesAnnotations = sentences
      .collect
      .flatMap { r => r.getSeq[Row](0) }
      .map { a => Annotation(a.getString(0), a.getInt(1), a.getInt(2), a.getString(3), a.getMap[String, String](4)) }
    val tokensAnnotations = tokens
      .collect
      .flatMap { r => r.getSeq[Row](0)}
      .map { a => Annotation(a.getString(0), a.getInt(1), a.getInt(2), a.getString(3), a.getMap[String, String](4)) }

    val docAnnotations = documents
      .collect
      .flatMap { r => r.getSeq[Row](0)}
      .map { a => Annotation(a.getString(0), a.getInt(1), a.getInt(2), a.getString(3), a.getMap[String, String](4)) }

    val corpus = docAnnotations
      .map(d => d.result)
      .mkString("")
  }

  def fullTokenizerPipeline(dataset: => Dataset[Row]) {
    "A Tokenizer Annotator" should "successfully transform data" in {
      val f = fixture(dataset)
      assert(f.tokensAnnotations.nonEmpty, "Tokenizer should add annotators")
    }

    it should "annotate using the annotatorType of token" in {
      val f = fixture(dataset)
      assert(f.tokensAnnotations.nonEmpty, "Tokenizer should add annotators")
      f.tokensAnnotations.foreach { a =>
        assert(a.annotatorType == AnnotatorType.TOKEN, "Tokenizer annotations type should be equal to 'token'")
      }
    }

    it should "annotate with the correct word indexes" in {
      val f = fixture(dataset)
      f.tokensAnnotations.foreach { a =>
        val token = a.result
        val sentenceToken = f.corpus.slice(a.begin, a.end + 1)
        assert(sentenceToken == token, s"Word ($sentenceToken) from sentence at (${a.begin},${a.end}) should be equal to token ($token) inside the corpus ${f.corpus}")
      }
    }
  }
} 
Example 119
Source File: RankingMetricFormatter.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._
import ws.vinta.albedo.evaluators.RankingEvaluator._

class RankingMetricFormatter(override val uid: String, val sourceType: String)
  extends Transformer with DefaultParamsWritable {

  def this(sourceType: String) = {
    this(Identifiable.randomUID("rankingMetricFormatter"), sourceType)
  }

  val userCol = new Param[String](this, "userCol", "User column name")

  def getUserCol: String = $(userCol)

  def setUserCol(value: String): this.type = set(userCol, value)
  setDefault(userCol -> "user")

  val itemCol = new Param[String](this, "itemCol", "Item column name")

  def getItemCol: String = $(itemCol)

  def setItemCol(value: String): this.type = set(itemCol, value)
  setDefault(itemCol -> "item")

  val predictionCol = new Param[String](this, "predictionCol", "Prediction column name")

  def getPredictionCol: String = $(predictionCol)

  def setPredictionCol(value: String): this.type = set(predictionCol, value)
  setDefault(predictionCol -> "prediction")

  val topK = new IntParam(this, "topK", "Recommend top-k items for every user")

  def getTopK: Int = $(topK)

  def setTopK(value: Int): this.type = set(topK, value)
  setDefault(topK -> 15)

  override def transformSchema(schema: StructType): StructType = {
    Map($(userCol) -> IntegerType, $(itemCol) -> IntegerType)
      .foreach{
        case(columnName: String, expectedDataType: DataType) => {
          val actualDataType = schema(columnName).dataType
          require(actualDataType.equals(expectedDataType), s"Column $columnName must be of type $expectedDataType but was actually $actualDataType.")
        }
      }

    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    sourceType match {
      case "als" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), col($(predictionCol)).desc, $(topK)))
      case "lr" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), toArrayUDF(col($(predictionCol))).getItem(1).desc, $(topK)))
    }
  }

  override def copy(extra: ParamMap): RankingMetricFormatter = {
    val copied = new RankingMetricFormatter(uid, sourceType)
    copyValues(copied, extra)
  }
}

object RankingMetricFormatter extends DefaultParamsReadable[RankingMetricFormatter] 
Example 120
Source File: SQLTransformer.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.feature

import com.tencent.angel.sona.ml.Transformer
import com.tencent.angel.sona.ml.param.{Param, ParamMap}
import com.tencent.angel.sona.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType

/**
  * Implements the transformations which are defined by SQL statement.
  * Currently we only support SQL syntax like 'SELECT ... FROM __THIS__ ...'
  * where '__THIS__' represents the underlying table of the input dataset.
  * The select clause specifies the fields, constants, and expressions to display in
  * the output; it can be any select clause that Spark SQL supports. Users can also
  * use Spark SQL built-in functions and UDFs to operate on these selected columns.
  * For example, [[SQLTransformer]] supports statements like:
  * {{{
  *  SELECT a, a + b AS a_b FROM __THIS__
  *  SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5
  *  SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b
  * }}}
  */
class SQLTransformer(override val uid: String) extends Transformer
  with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("sql"))

  /**
    * SQL statement parameter. The statement is provided in string form.
    *
    * @group param
    */
  final val statement: Param[String] = new Param[String](this, "statement", "SQL statement")

  
  def setStatement(value: String): this.type = set(statement, value)

  
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val tableName = Identifiable.randomUID(uid)
    dataset.createOrReplaceTempView(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val result = dataset.sparkSession.sql(realStatement)
    // Call SessionCatalog.dropTempView to avoid unpersisting the possibly cached dataset.
    dataset.sparkSession.catalog.dropTempView(tableName)
    // Compatible.sessionstate.catalog.dropTempView(tableName)
    result
  }

  override def transformSchema(schema: StructType): StructType = {
    val spark = SparkSession.builder().getOrCreate()
    val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty))
    val dummyDF = spark.createDataFrame(dummyRDD, schema)
    val tableName = Identifiable.randomUID(uid)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    dummyDF.createOrReplaceTempView(tableName)
    val outputSchema = spark.sql(realStatement).schema
    spark.catalog.dropTempView(tableName)
    outputSchema
  }

  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}


object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {
  override def load(path: String): SQLTransformer = super.load(path)
} 
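A minimal usage sketch for this SQLTransformer, assuming a SparkSession is available; the columns and the statement are illustrative and simply exercise the behaviour described in the scaladoc above:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("SQLTransformerSketch").master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")

// __THIS__ is replaced with a temporary view over the input dataset before the statement runs
val sqlTrans = new SQLTransformer()
  .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

sqlTrans.transform(df).show()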
Example 121
Source File: DatasetUtil.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import org.apache.spark.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata}
import org.apache.spark.sql.{Column, DataFrame, Dataset}


object DatasetUtil {
  def withColumns[T](ds: Dataset[T],
                     colNames: Seq[String],
                     cols: Seq[Column],
                     metadata: Seq[Metadata]): DataFrame = {
    require(colNames.size == cols.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of columns: ${cols.size}")
    require(colNames.size == metadata.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of metadata elements: ${metadata.size}")

    val sparkSession = ds.sparkSession
    val queryExecution = ds.queryExecution
    val resolver = sparkSession.sessionState.analyzer.resolver
    val output = queryExecution.analyzed.output

    checkColumnNameDuplication(colNames,
      "in given column names",
      sparkSession.sessionState.conf.caseSensitiveAnalysis)

    val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) =>
      colName -> col.as(colName, metadata)
    }.toMap

    val replacedAndExistingColumns = output.map { field =>
      columnMap.find { case (colName, _) =>
        resolver(field.name, colName)
      } match {
        case Some((colName: String, col: Column)) => col.as(colName)
        case _ => new Column(field)
      }
    }

    val newColumns = columnMap.filter { case (colName, col) =>
      !output.exists(f => resolver(f.name, colName))
    }.map { case (colName, col) => col.as(colName) }

    ds.select(replacedAndExistingColumns ++ newColumns: _*)
  }

  def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = {
    withColumns(ds, Seq(colName), Seq(col), Seq(metadata))
  }

  private def checkColumnNameDuplication(columnNames: Seq[String], colType: String,
                                         caseSensitiveAnalysis: Boolean): Unit = {
    val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase)
    if (names.distinct.length != names.length) {
      val duplicateColumns = names.groupBy(identity).collect {
        case (x, ys) if ys.length > 1 => s"`$x`"
      }
      throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}")
    }
  }

  /**
    * Cast a column in a Dataset to Vector type.
    *
    * The supported data types of the input column are
    * - Vector
    * - float/double type Array.
    *
    * Note: The returned column does not have Metadata.
    *
    * @param dataset input DataFrame
    * @param colName column name.
    * @return Vector column
    */
  def columnToVector(dataset: Dataset[_], colName: String): Column = {
    val columnDataType = dataset.schema(colName).dataType
    columnDataType match {
      case _: VectorUDT => col(colName)
      case fdt: ArrayType =>
        val transferUDF = fdt.elementType match {
          case _: FloatType => udf(f = (vector: Seq[Float]) => {
            val inputArray = Array.fill[Double](vector.size)(0.0)
            vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble)
            Vectors.dense(inputArray)
          })
          case _: DoubleType => udf((vector: Seq[Double]) => {
            Vectors.dense(vector.toArray)
          })
          case other =>
            throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector")
        }
        transferUDF(col(colName))
      case other =>
        throw new IllegalArgumentException(s"$other column cannot be cast to Vector")
    }
  }

} 
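A brief sketch of columnToVector, assuming the project's linalg types are on the classpath; the column names are illustrative. A float-array column is cast to a Vector column and attached back to the DataFrame:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("DatasetUtilSketch").master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq(
  (1L, Array(0.1f, 0.2f, 0.3f)),
  (2L, Array(0.4f, 0.5f, 0.6f))
).toDF("id", "features_raw")

// columnToVector returns a Column expression; note the resulting column carries no Metadata
val withVectors = df.withColumn("features", DatasetUtil.columnToVector(df, "features_raw"))
withVectors.printSchema()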
Example 122
Source File: StructuredStreamingKafkaSample.scala    From kafka-scala-api   with Apache License 2.0 5 votes vote down vote up
package com.example.structured_streaming

import org.apache.spark.sql.{Dataset, SparkSession}

object StructuredStreamingKafkaSample extends App {

  val sparkSession = SparkSession
    .builder
    .master("local")
    .appName("kafka")
    .getOrCreate()

  sparkSession.sparkContext.setLogLevel("ERROR")

  import sparkSession.implicits._

  val kafkaDF = sparkSession
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "127.0.0.1:9092")
    .option("subscribe", "structured_topic")
    .load()

  val data: Dataset[(String, String)] = kafkaDF.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .as[(String, String)]
  kafkaDF.printSchema()

  data.writeStream
    .outputMode("append")
    .format("console")
    .start()
    .awaitTermination()

} 
Example 123
Source File: DataPreprocess.scala    From xgbspark-text-classification   with Apache License 2.0 5 votes vote down vote up
package com.lenovo.ml

import org.apache.spark.sql.{SparkSession, DataFrame, Dataset}
import scala.collection.mutable
import scala.util.matching.Regex
import org.ansj.library.DicLibrary
import org.ansj.recognition.impl.StopRecognition
import org.ansj.splitWord.analysis.DicAnalysis


object DataPreprocess {
  def textCleaner(sparkSession: SparkSession, rawText: DataFrame): Dataset[String] = {
    // Filter timestamps, URLs and email addresses out of the text
    val regex1 = new Regex("""[-—0-9a-z]+[:]+[0-9a-z]+[:]?""")
    val regex2 = new Regex("""[0-9]+年|[0-9]+月|[0-9]+[日]|[0-9]+[天]|[0-9]+[号]|[0-9]+[次]""")
    val regex3 = new Regex("""http[s]?://[a-z0-9./?=_-]+""")
    val regex4 = new Regex("""[0-9_a-z]+([-+.][0-9_a-z]+)*@[0-9_a-z]+([-.][0-9_a-z]+)*\.[0-9_a-z]+([-.][0-9_a-z]+)*""")

    import sparkSession.implicits._
    rawText.map(x => x.toString).map(x => x.substring(1,x.length - 1).toLowerCase).map(x => regex1.replaceAllIn(x,""))
      .map(x => regex2.replaceAllIn(x,"")).map(x => regex3.replaceAllIn(x,"")).map(x => regex4.replaceAllIn(x,""))
  }

  def segWords(sparkSession: SparkSession, stopWordsPath: String, dictionaryPath: String, synonymWordsPath: String,
               singleWordsPath: String, rawText: DataFrame): DataFrame = {
    val filter = new StopRecognition()
    // Configure the parts of speech to treat as stop words
    filter.insertStopNatures("w","ns","nr","t","r","u","e","y","o")
    // Load the stop-word list
    val stopWords = sparkSession.sparkContext.textFile(stopWordsPath).cache()
    stopWords.collect().foreach{line => filter.insertStopWords(line)}
    // Load the user-defined dictionary
    val dictionary = sparkSession.sparkContext.textFile(dictionaryPath).cache()
    dictionary.collect().foreach{line => DicLibrary.insert(DicLibrary.DEFAULT, line)}
    stopWords.collect().foreach{line => DicLibrary.insert(DicLibrary.DEFAULT, line)}
    // Build the synonym table
    val synonymWords = sparkSession.sparkContext.textFile(synonymWordsPath).cache()
    var synonymMap: Map[String, String] = Map()
    synonymWords.collect().foreach{line =>
      val data = line.split(" ",2)
      synonymMap = synonymMap + (data(0) -> data(1))
    }
    // Build the single-character whitelist
    val singleWords = sparkSession.sparkContext.textFile(singleWordsPath).cache()
    val singleWhiteList: mutable.Set[String] = mutable.Set()
    singleWords.collect().foreach{line => singleWhiteList.add(line)}

    // Broadcast the dictionaries to all worker nodes
    val stop = sparkSession.sparkContext.broadcast(filter)
    val dic = sparkSession.sparkContext.broadcast(DicLibrary.get(DicLibrary.DEFAULT))
    val synonym = sparkSession.sparkContext.broadcast(synonymMap)
    val single = sparkSession.sparkContext.broadcast(singleWhiteList)

    // Read the text data, clean it, then segment it into words
    import sparkSession.implicits._
    textCleaner(sparkSession, rawText).map { x =>
      val parse = DicAnalysis.parse(x, dic.value).recognition(stop.value)
      // Extract the segmented words without part-of-speech tags
      val words = for(i<-Range(0,parse.size())) yield parse.get(i).getName
      val filterWords = words.map(_.trim).filter(x => x.length > 1 || single.value.contains(x))
      filterWords.map(x => if(synonym.value.contains(x)) synonym.value(x) else x).mkString(" ")
    }.toDF("words")
  }
} 
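A hypothetical invocation sketch; the resource paths below are placeholders rather than files from the original project, and the ansj segmentation library must be on the classpath:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("DataPreprocessSketch").master("local[*]").getOrCreate()

val rawText = spark.read.text("data/corpus.txt").toDF("text")

val segmented = DataPreprocess.segWords(
  spark,
  "dict/stop_words.txt",      // stop-word list (placeholder path)
  "dict/user_dictionary.txt", // custom dictionary (placeholder path)
  "dict/synonyms.txt",        // synonym table (placeholder path)
  "dict/single_words.txt",    // single-character whitelist (placeholder path)
  rawText)

segmented.show(5, truncate = false)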
Example 124
Source File: SimpleVectorAssembler.scala    From albedo   with MIT License 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkException
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row}

import scala.collection.mutable.ArrayBuilder

class SimpleVectorAssembler(override val uid: String)
  extends Transformer with HasInputCols with HasOutputCol with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("simpleVectorAssembler"))

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)

    val schema = dataset.schema
    val assembleFunc = udf { r: Row =>
      SimpleVectorAssembler.assemble(r.toSeq: _*)
    }
    val args = $(inputCols).map { c =>
      schema(c).dataType match {
        case DoubleType => dataset(c)
        case _: VectorUDT => dataset(c)
        case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid")
      }
    }

    dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputColNames = $(inputCols)
    val outputColName = $(outputCol)
    val inputDataTypes = inputColNames.map(name => schema(name).dataType)
    inputDataTypes.foreach {
      case _: NumericType | BooleanType =>
      case t if t.isInstanceOf[VectorUDT] =>
      case other =>
        throw new IllegalArgumentException(s"Data type $other is not supported.")
    }
    if (schema.fieldNames.contains(outputColName)) {
      throw new IllegalArgumentException(s"Output column $outputColName already exists.")
    }
    StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true))
  }

  override def copy(extra: ParamMap): SimpleVectorAssembler = defaultCopy(extra)
}

object SimpleVectorAssembler extends DefaultParamsReadable[SimpleVectorAssembler] {
  override def load(path: String): SimpleVectorAssembler = super.load(path)

  def assemble(vv: Any*): Vector = {
    val indices = ArrayBuilder.make[Int]
    val values = ArrayBuilder.make[Double]
    var cur = 0
    vv.foreach {
      case v: Double =>
        if (v != 0.0) {
          indices += cur
          values += v
        }
        cur += 1
      case vec: Vector =>
        vec.foreachActive { case (i, v) =>
          if (v != 0.0) {
            indices += cur + i
            values += v
          }
        }
        cur += vec.size
      case null =>
        // TODO: output Double.NaN?
        throw new SparkException("Values to assemble cannot be null.")
      case o =>
        throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.")
    }
    Vectors.sparse(cur, indices.result(), values.result()).compressed
  }
} 
Example 125
Source File: ALSRecommender.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.recommenders

import com.github.fommil.netlib.F2jBLAS
import org.apache.spark.ml.recommendation.ALSModel
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.settings

class ALSRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("alsRecommender"))
  }

  private def alsModel: ALSModel = {
    val alsModelPath = s"${settings.dataDir}/${settings.today}/alsModel.parquet"
    ALSModel.load(alsModelPath)
  }

  def blockify(factors: Dataset[(Int, Array[Float])], blockSize: Int = 4096): Dataset[Seq[(Int, Array[Float])]] = {
    import factors.sparkSession.implicits._
    factors.mapPartitions(_.grouped(blockSize))
  }

  override def source = "als"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    import userDF.sparkSession.implicits._

    val activeUsers = userDF.select(col($(userCol)).alias("id"))
    val userFactors = alsModel.userFactors.join(activeUsers, Seq("id"))
    val itemFactors = alsModel.itemFactors
    val rank = alsModel.rank
    val num = $(topK)

    val userFactorsBlocked = blockify(userFactors.as[(Int, Array[Float])])
    val itemFactorsBlocked = blockify(itemFactors.as[(Int, Array[Float])])
    val ratings = userFactorsBlocked.crossJoin(itemFactorsBlocked)
      .as[(Seq[(Int, Array[Float])], Seq[(Int, Array[Float])])]
      .flatMap { case (srcIter, dstIter) =>
        val m = srcIter.size
        val n = math.min(dstIter.size, num)
        val output = new Array[(Int, Int, Float)](m * n)
        var i = 0
        val pq = new BoundedPriorityQueue[(Int, Float)](num)(Ordering.by(_._2))
        srcIter.foreach { case (srcId, srcFactor) =>
          dstIter.foreach { case (dstId, dstFactor) =>
            val score = new F2jBLAS().sdot(rank, srcFactor, 1, dstFactor, 1)
            pq += dstId -> score
          }
          pq.foreach { case (dstId, score) =>
            output(i) = (srcId, dstId, score)
            i += 1
          }
          pq.clear()
        }
        output.toSeq
      }

    ratings
      .toDF($(userCol), $(itemCol), $(scoreCol))
      .withColumn($(sourceCol), lit(source))
  }
} 
Example 126
Source File: CurationRecommender.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.recommenders

import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import ws.vinta.albedo.utils.DatasetUtils._

class CurationRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("curationRecommender"))
  }

  override def source = "curation"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    implicit val spark: SparkSession = userDF.sparkSession
    import spark.implicits._

    val rawStarringDS = loadRawStarringDS().cache()

    val curatorIds = Array(652070, 1912583, 59990, 646843, 28702) // vinta, saiday, tzangms, fukuball, wancw
    val curatedRepoDF = rawStarringDS
      .select($"repo_id", $"starred_at")
      .where($"user_id".isin(curatorIds: _*))
      .groupBy($"repo_id")
      .agg(max($"starred_at").alias("starred_at"))
      .orderBy($"starred_at".desc)
      .limit($(topK))
      .cache()

    def calculateScoreUDF = udf((starred_at: java.sql.Timestamp) => {
      starred_at.getTime / 1000.0
    })

    userDF
      .select($(userCol))
      .crossJoin(curatedRepoDF)
      .select(col($(userCol)), $"repo_id".alias($(itemCol)), calculateScoreUDF($"starred_at").alias($(scoreCol)))
      .withColumn($(sourceCol), lit(source))
  }
} 
Example 127
Source File: PopularityRecommender.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.recommenders

import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import ws.vinta.albedo.utils.DatasetUtils._

class PopularityRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("popularityRecommender"))
  }

  override def source = "popularity"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    implicit val spark: SparkSession = userDF.sparkSession
    import spark.implicits._

    val popularRepoDF = loadPopularRepoDF()
      .limit($(topK))
      .cache()

    def calculateScoreUDF = udf((stargazers_count: Int, created_at: java.sql.Timestamp) => {
      val valueScore = math.round(math.log10(stargazers_count) * 1000.0) / 1000.0
      val timeScore = (created_at.getTime / 1000.0) / (60 * 60 * 24 * 30 * 12) / 5.0
      valueScore + timeScore
    })

    userDF
      .select($(userCol))
      .crossJoin(popularRepoDF)
      .select(col($(userCol)), $"repo_id".alias($(itemCol)), calculateScoreUDF($"repo_stargazers_count", $"repo_created_at").alias($(scoreCol)))
      .withColumn($(sourceCol), lit(source))
  }
} 
Example 128
Source File: ContentRecommender.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.recommenders

import org.apache.http.HttpHost
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.elasticsearch.action.search.SearchRequest
import org.elasticsearch.client.{RestClient, RestHighLevelClient}
import org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item
import org.elasticsearch.index.query.QueryBuilders._
import org.elasticsearch.search.SearchHit
import org.elasticsearch.search.builder.SearchSourceBuilder
import ws.vinta.albedo.closures.DBFunctions._

class ContentRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("contentRecommender"))
  }

  val enableEvaluationMode = new Param[Boolean](this, "enableEvaluationMode", "Should be enable for evaluation only")

  def getEnableEvaluationMode: Boolean = $(enableEvaluationMode)

  def setEnableEvaluationMode(value: Boolean): this.type = set(enableEvaluationMode, value)
  setDefault(enableEvaluationMode -> false)

  override def source = "content"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    import userDF.sparkSession.implicits._

    val userRecommendedItemDF = userDF
      .as[Int]
      .flatMap {
        case (userId) => {
          // When a More Like This query is issued with document ids,
          // Elasticsearch excludes those same document ids from the results,
          // which is not appropriate when evaluating the recommender,
          // so we use the user's subsequent k starred repos as the query condition instead
          val limit = $(topK)
          val offset = if ($(enableEvaluationMode)) $(topK) else 0
          val repoIds = selectUserStarredRepos(userId, limit, offset)

          val lowClient = RestClient.builder(new HttpHost("127.0.0.1", 9200, "http")).build()
          val highClient = new RestHighLevelClient(lowClient)

          val fields = Array("description", "full_name", "language", "topics")
          val texts = Array("")
          val items = repoIds.map((itemId: Int) => new Item("repo", "repo_info_doc", itemId.toString))
          val queryBuilder = moreLikeThisQuery(fields, texts, items)
            .minTermFreq(2)
            .maxQueryTerms(50)

          val searchSourceBuilder = new SearchSourceBuilder()
          searchSourceBuilder.query(queryBuilder)
          searchSourceBuilder.size($(topK))
          searchSourceBuilder.from(0)

          val searchRequest = new SearchRequest()
          searchRequest.indices("repo")
          searchRequest.types("repo_info_doc")
          searchRequest.source(searchSourceBuilder)

          val searchResponse = highClient.search(searchRequest)
          val hits = searchResponse.getHits
          val searchHits = hits.getHits

          val userItemScoreTuples = searchHits.map((searchHit: SearchHit) => {
            val itemId = searchHit.getId.toInt
            val score = searchHit.getScore
            (userId, itemId, score)
          })

          lowClient.close()

          userItemScoreTuples
        }
      }
      .toDF($(userCol), $(itemCol), $(scoreCol))
      .withColumn($(sourceCol), lit(source))

    userRecommendedItemDF
  }
} 
Example 129
Source File: UserRepoTransformer.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._

class UserRepoTransformer(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("userRepoTransformer"))
  }

  val inputCols: StringArrayParam = new StringArrayParam(this, "inputCols", "Input column names")

  def getInputCols: Array[String] = $(inputCols)

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)

  override def transformSchema(schema: StructType): StructType = {
    $(inputCols).foreach((inputColName: String) => {
      require(schema.fieldNames.contains(inputColName), s"Input column $inputColName must exist.")
    })

    val newFields: Array[StructField] = Array(
      StructField("repo_language_index_in_user_recent_repo_languages", IntegerType, nullable = false),
      StructField("repo_language_count_in_user_recent_repo_languages", IntegerType, nullable = false)
    )
    StructType(schema.fields ++ newFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    import dataset.sparkSession.implicits._

    dataset
      .withColumn("repo_language_index_in_user_recent_repo_languages", repoLanguageIndexInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages"))
      .withColumn("repo_language_count_in_user_recent_repo_languages", repoLanguageCountInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages"))
  }

  override def copy(extra: ParamMap): UserRepoTransformer = {
    defaultCopy(extra)
  }
}

object UserRepoTransformer extends DefaultParamsReadable[UserRepoTransformer] 
Example 130
Source File: Evaluator.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.evaluation

import com.tencent.angel.sona.ml.param.{ParamMap, Params}
import org.apache.spark.sql.Dataset

/**
 * :: DeveloperApi ::
 * Abstract class for evaluators that compute metrics from predictions.
 */
abstract class Evaluator extends Params {

  /**
   * Evaluates model output and returns a scalar metric.
   * The value of [[isLargerBetter]] specifies whether larger values are better.
   *
   * @param dataset a dataset that contains labels/observations and predictions.
   * @param paramMap parameter map that specifies the input columns and output metrics
   * @return metric
   */

  def evaluate(dataset: Dataset[_], paramMap: ParamMap): Double = {
    this.copy(paramMap).evaluate(dataset)
  }

  /**
   * Evaluates model output and returns a scalar metric.
   * The value of [[isLargerBetter]] specifies whether larger values are better.
   *
   * @param dataset a dataset that contains labels/observations and predictions.
   * @return metric
   */

  def evaluate(dataset: Dataset[_]): Double

  /**
   * Indicates whether the metric returned by `evaluate` should be maximized (true, default)
   * or minimized (false).
   * A given evaluator may support multiple metrics which may be maximized or minimized.
   */

  def isLargerBetter: Boolean = true


  override def copy(extra: ParamMap): Evaluator
} 
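As a rough sketch of the contract, a concrete evaluator only has to implement evaluate(dataset) and copy(extra). The class below is a hypothetical exact-match evaluator, not part of the library, and it assumes the dataset exposes label and prediction columns:

import com.tencent.angel.sona.ml.evaluation.Evaluator
import com.tencent.angel.sona.ml.param.ParamMap
import com.tencent.angel.sona.ml.util._
import org.apache.spark.sql.Dataset

class ExactMatchEvaluator(override val uid: String) extends Evaluator {

  def this() = this(Identifiable.randomUID("exactMatchEval"))

  // Fraction of rows where the prediction equals the label; larger is better (the default)
  override def evaluate(dataset: Dataset[_]): Double = {
    val total = dataset.count().toDouble
    if (total == 0.0) 0.0 else dataset.where("prediction = label").count() / total
  }

  override def copy(extra: ParamMap): ExactMatchEvaluator = defaultCopy(extra)
}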
Example 131
Source File: IntermediateCacher.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

class IntermediateCacher(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("intermediateCacher"))
  }

  val inputCols = new StringArrayParam(this, "inputCols", "Input column names")

  def getInputCols: Array[String] = $(inputCols)

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)
  setDefault(inputCols -> Array.empty[String])

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    val intermediateDF = if ($(inputCols).isEmpty) dataset.toDF() else dataset.select($(inputCols).map(col(_)): _*)
    intermediateDF.cache()
  }

  override def copy(extra: ParamMap): IntermediateCacher = {
    defaultCopy(extra)
  }
}

object IntermediateCacher extends DefaultParamsReadable[IntermediateCacher] 
Example 132
Source File: DataFrameConverter.scala    From incubator-toree   with Apache License 2.0 5 votes vote down vote up
package org.apache.toree.utils

import org.apache.spark.sql.{Dataset, Row}
import org.apache.toree.plugins.Plugin
import play.api.libs.json.{JsObject, Json}

import scala.util.Try
import org.apache.toree.plugins.annotations.Init

import DataFrameConverter._

class DataFrameConverter extends Plugin with LogLike {
  @Init def init() = {
    register(this)
  }

  def convert(df: Dataset[Row], outputType: String, limit: Int = 10): Try[String] = {
    Try(
      outputType.toLowerCase() match {
        case "html" =>
          convertToHtml(df = df, limit = limit)
        case "json" =>
          convertToJson(df = df, limit = limit)
        case "csv" =>
          convertToCsv(df = df, limit = limit)
      }
    )
  }

  private def convertToHtml(df: Dataset[Row], limit: Int = 10): String = {
      val columnFields = df.schema.fieldNames.map(columnName => {
        s"<th>${columnName}</th>"
      }).reduce(_ + _)
      val columns = s"<tr>${columnFields}</tr>"
      val rows = df.rdd.map(row => {
        val fieldValues = row.toSeq.map(field => {
         s"<td>${fieldToString(field)}</td>"
        }).reduce(_ + _)
        s"<tr>${fieldValues}</tr>"
      }).take(limit).reduce(_ + _)
      s"<table>${columns}${rows}</table>"
  }

  private def convertToJson(df: Dataset[Row], limit: Int = 10): String = {
    val schema = Json.toJson(df.schema.fieldNames)
    val transformed = df.rdd.map(row =>
      row.toSeq.map(fieldToString).toArray)
    val rows = transformed.take(limit)
    JsObject(Seq(
      "columns" -> schema,
      "rows" -> Json.toJson(rows)
    )).toString()
  }

  private def convertToCsv(df: Dataset[Row], limit: Int = 10): String = {
      val headers = df.schema.fieldNames.reduce(_ + "," + _)
      val rows = df.rdd.map(row => {
        row.toSeq.map(fieldToString).reduce(_ + "," + _)
      }).take(limit).reduce(_ + "\n" + _)
      s"${headers}\n${rows}"
  }

}

object DataFrameConverter {

  def fieldToString(any: Any): String =
    any match {
      case null => "null"
      case seq: Seq[_] => seq.mkString("[", ", ", "]")
      case _ => any.toString
    }

} 
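A short usage sketch for the converter; the DataFrame contents and the chosen output type are arbitrary:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("DataFrameConverterSketch").master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq(("alice", 29), ("bob", 31)).toDF("name", "age")

// convert returns a scala.util.Try[String]; an unsupported output type surfaces as a Failure
val converter = new DataFrameConverter()
converter.convert(df, outputType = "json", limit = 10).foreach(println)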
Example 133
Source File: K-Centers.scala    From Clustering4Ever   with Apache License 2.0 5 votes vote down vote up
package org.clustering4ever.clustering.kcenters.dataset

		@annotation.tailrec
		def go(cpt: Int, haveAllCentersConverged: Boolean, centers: List[(Int, V)]): List[(Int, V)] = {
			val preUpdatedCenters = data.groupByKey( cz => obtainNearestCenterID(cz.v, centers, metric) )(encoderInt)
				.mapGroups(computeCenters)(encoder)
				.collect
				.sortBy(_._1)
				.toList
			val alignedOldCenters = preUpdatedCenters.map{ case (oldClusterID, _) => centers(oldClusterID) }
			val updatedCenters = preUpdatedCenters.zipWithIndex.map{ case ((oldClusterID, center), newClusterID) => (newClusterID, center) }
			val shiftingEnough = areCentersNotMovingEnough(updatedCenters, alignedOldCenters, minShift, metric)
			if(cpt < maxIterations && !shiftingEnough) {
				go(cpt + 1, shiftingEnough, updatedCenters)
			}
			else {
				updatedCenters
			}
		}

		immutable.HashMap(go(0, false, centers):_*)

	}
} 
Example 134
Source File: K-Means.scala    From Clustering4Ever   with Apache License 2.0 5 votes vote down vote up
package org.clustering4ever.clustering.kcenters.rdd

	final def fit[D <: ContinuousDistance](
		data: RDD[Array[Double]],
		k: Int,
		metric: D,
		minShift: Double,
		maxIterations: Int,
		persistanceLVL: StorageLevel
		): KMeansModel[D] = {
		KMeans(k, metric, minShift, maxIterations, persistanceLVL).fit(scalarDataWithIDToClusterizable(data.zipWithIndex))
	}
} 
Example 135
Source File: Utils.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.profilers

import au.com.bytecode.opencsv.CSVParser
import com.bizo.mighty.csv.{CSVReader, CSVReaderSettings}
import org.apache.spark.sql.Dataset

object Utils {

  case class Field(idx: Int, value: String)

  def readAs[T](filename: String)(implicit settings: CSVReaderSettings, mf: Manifest[T]): Iterator[T] = {
    val is = getClass.getResourceAsStream(filename)
    CSVReader(is)(settings) {
      CSVReader.convertRow[T]
    }
  }

  def split(ds: Dataset[String], delimiter: String = ","): Dataset[Array[String]] = {
    import ds.sparkSession.implicits._
    ds.mapPartitions({ lines =>
      val parser = new CSVParser(delimiter.charAt(0))
      lines map parser.parseLine
    })
  }

  def buildColumns(ds: Dataset[Array[String]]): Dataset[Field] = {
    import ds.sparkSession.implicits._
    ds.flatMap({ values =>
      values.zipWithIndex.map({ case (value, col) =>
        Field(col, value)
      })
    })
  }

} 
Example 136
Source File: AsciiProfiler.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.profilers.raw

import io.gzet.profilers.Utils._
import org.apache.spark.sql.{Dataset, Row}

case class AsciiProfiler(asciiMap: Map[Int, Ascii]) {

  def profile(df: Dataset[String]): Dataset[AsciiReport] = {

    val charset = df.sparkSession.sparkContext.broadcast(asciiMap)

    import df.sparkSession.implicits._

    val charCount = df.flatMap(_.toCharArray.map(_.asInstanceOf[Int]))
      .groupByKey(t => t)
      .count()
      .withColumnRenamed("value", "tmp")
      .withColumnRenamed("count(1)", "count")

    charCount.map({ case Row(octet: Int, count: Long) =>
      val ascii = charset.value.getOrElse(octet, Ascii("NA", "NA", "NA", "NA", "NA"))
      AsciiReport(
        ascii.binary,
        ascii.description,
        count
      )
    })
  }
}

object AsciiProfiler {
  def apply(): AsciiProfiler = {
    val ascii = readAs[Ascii]("/ascii.csv").toList
    AsciiProfiler(ascii.map(a => (a.octet.toInt, a)).toMap)
  }
}

case class Ascii(
                  symbol: String,
                  octet: String,
                  hex: String,
                  binary: String,
                  description: String
                )

case class AsciiReport(
                        binary: String,
                        ascii: String,
                        metricValue: Double
                      ) 
Example 137
Source File: RowProfiler.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.profilers.raw

import org.apache.spark.sql.Dataset

case class RowProfiler() {

  def profile(df: Dataset[String]): Dataset[RowReport] = {
    import df.sparkSession.implicits._
    val report = RowReport(df.count().toDouble)
    df.sparkSession.createDataset[RowReport](
      Seq(report)
    )
  }
}

case class RowReport(
                      metricValue: Double
                    ) 
Example 138
Source File: StructuralProfiler.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.profilers.raw

import au.com.bytecode.opencsv.CSVParser
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Row}

case class StructuralProfiler(delimiter: String = ",") {

  def profile(df: Dataset[String]): Dataset[StructuralReport] = {

    import df.sparkSession.implicits._

    val rows = df.mapPartitions({ lines =>
      val parser = new CSVParser(delimiter.charAt(0))
      lines.map(line => (parser.parseLine(line).length, line))
    })

    val fieldCount = rows.groupByKey({ case (fields, line) =>
      fields
    }).count()
      .withColumnRenamed("value", "fields")
      .withColumnRenamed("count(1)", "count")

    val fieldLine = rows.groupByKey({ case (fields, line) =>
      fields
    }).reduceGroups({ (v1, v2) => v1 }).map({ case (fields, (_, line)) =>
      (fields, line)
    })
      .withColumnRenamed("_1", "_fieldLine_")
      .withColumnRenamed("_2", "line")

    fieldCount.join(fieldLine, col("fields") === col("_fieldLine_"))
      .drop("_fieldLine_")
      .map({ case Row(columns: Int, count: Long, line: String) =>
        StructuralReport(
          columns,
          count,
          line
        )
      })
  }
}

case class StructuralReport(
                             fields: Int,
                             metricValue: Double,
                             description: String
                           ) 
Example 139
Source File: EmptinessProfiler.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.profilers.field

import io.gzet.profilers.Utils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.Dataset

import scalaz.Scalaz._

case class EmptinessProfiler() {

  def profile(df: Dataset[Array[String]]): Dataset[EmptinessReport] = {

    import df.sparkSession.implicits._

    val features = Utils.buildColumns(df)

    features.map(f => (f.idx, StringUtils.isNotEmpty(f.value))).groupByKey({ case (column, isNotEmpty) =>
      (column, isNotEmpty)
    }).count().map({ case ((column, isNotEmpty), count) =>
      (column, Map(isNotEmpty -> count))
    }).groupByKey({ case (column, map) =>
      column
    }).reduceGroups({ (v1, v2) =>
      (v1._1, v1._2 |+| v2._2)
    }).map({ case (col, (_, map)) =>
      val emptiness = map.getOrElse(false, 0L) / (map.getOrElse(true, 0L) + map.getOrElse(false, 0L)).toDouble
      EmptinessReport(
        col,
        emptiness
      )
    })

  }

}

case class EmptinessReport(
                            field: Int,
                            metricValue: Double
                          ) 
Example 140
Source File: DataFrameFunctions.scala    From spark-flow   with Apache License 2.0 5 votes vote down vote up
package com.bloomberg.sparkflow.dc

import org.apache.spark.sql.{Column, Dataset, Row}


class DataFrameFunctions(self: DC[Row]) {

    def join(right: DC[Row]): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right)
      }
      val hashTarget = Seq("join")
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }

    def join(right: DC[Row], usingColumn: String): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right, usingColumn)
      }
      val hashTarget = Seq("join", usingColumn)
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }

    def join(right: DC[Row], joinExprs: Column): DC[Row] = join(right, joinExprs, "inner")

    def join(right: DC[Row], joinExprs: Column, joinType: String): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right, joinExprs, joinType)
      }
      val hashTarget = Seq("join", joinType, joinExprs.toString())
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }


} 
Example 141
Source File: WordCount.scala    From Scalaprof   with GNU General Public License v2.0 5 votes vote down vote up
package edu.neu.csye._7200

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}

object WordCount extends App {

  def wordCount(lines: RDD[String],separator: String) = {
    lines.flatMap(_.split(separator))
         .map((_,1))
         .reduceByKey(_ + _)
  }

  def wordCount2(lines: RDD[String], separator: String) = {
    lines.flatMap(_.split(separator))
        .filter(!_.contains("He"))
        .map(_.replace(",", ""))
        .map((_,1))
        .reduceByKey(_ + _)
  }

  def wordCount3(lines: RDD[String], separator: String) = {
    lines.flatMap(_.split(separator))
        .filter(myFilter(_, "He"))
        .map(myReplacer _)
        .map((_,1))
        .reduceByKey(_ + _)
  }

  def myFilter(input: String, keyword: String) = !input.contains(keyword)

  def myReplacer(input: String) = input.replace(",","")

  case class Word(word: String, count: Int)

  def createWordDS(ds: Dataset[String], separator: String)(implicit spark:SparkSession) = {
    import spark.implicits._
    ds.flatMap(_.split(separator))
      .map((_,1))
      .map(Word.tupled)
      .as[Word]
  }

  //For Spark 1.0-1.9
  val sc = new SparkContext(new SparkConf().setAppName("WordCount").setMaster("local[*]"))

  wordCount(sc.textFile("input//WordCount.txt")," ").collect().foreach(println(_))

  sc.stop()

  //For Spark 2.0+
  implicit val spark = SparkSession
    .builder()
    .appName("WordCount")
    .master("local[*]")
    .getOrCreate()

  wordCount(spark.read.textFile("input//WordCount.txt").rdd," ").collect().foreach(println(_))

  //Spark SQL example
  val wordDS = createWordDS(spark.read.textFile("input//WordCount.txt")," ")

  wordDS.createTempView("words")
  wordDS.cache()

  spark.sql("select word, count(*) from words group by word").show(10)

  spark.stop()
} 
Example 142
Source File: SuicidalMonkeyProcessor.scala    From pipelines-examples   with Apache License 2.0 5 votes vote down vote up
package pipelines.example

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode

import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet }
import pipelines.spark.sql.SQLImplicits._

class SuicidalMonkeyProcessor extends SparkStreamlet {
  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.key.toString)
  val shape = StreamletShape(in, out)

  val rng = scala.util.Random
  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val outStream = process(readStream(in))
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }

    private def process(inDataset: Dataset[Data]): Dataset[Data] = {
      inDataset.mapPartitions { iter ⇒
        // Monkey business: this block makes the current executor crash with a certain probability.
        // Comment out the sys.exit call below to see the pipeline run without failures.
        if (rng.nextDouble() < SequenceSettings.FailureProbability) {
          sys.exit(-1)
        }
        iter
      }

    }
  }

} 
Example 143
Source File: SparkSequenceGeneratorIngress.scala    From pipelines-examples   with Apache License 2.0 5 votes vote down vote up
package pipelines.example

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.streaming.OutputMode

import pipelines.streamlets._
import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet }
import pipelines.spark.sql.SQLImplicits._

class SparkSequenceGeneratorIngress extends SparkStreamlet {
  val out = AvroOutlet[Data]("out", d ⇒ d.key.toString)
  val shape = StreamletShape(out)

  val RecordsPerSecond = IntegerConfigParameter(
    "records-per-second",
    "Records per second to process.",
    Some(50))

  override def configParameters = Vector(RecordsPerSecond)

  override def createLogic() = new SparkStreamletLogic {
    val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key)

    override def buildStreamingQueries = {
      writeStream(process, out, OutputMode.Append).toQueryExecution
    }

    private def process: Dataset[Data] = {
      session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .load()
        .withColumn("key", ($"value" / SequenceSettings.GroupSize).cast(LongType))
        .as[Data]
    }
  }
} 
Example 144
Source File: MovingAverageSparklet.scala    From pipelines-examples   with Apache License 2.0 5 votes vote down vote up
package pipelines.example

import pipelines.streamlets.StreamletShape

import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet }

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import pipelines.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

class MovingAverageSparklet extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Agg]("out", _.src)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }

    private def process(inDataset: Dataset[Data]): Dataset[Agg] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withWatermark("ts", "1 minutes")
        .groupBy(window($"ts", "1 minute", "30 seconds"), $"src", $"gauge").agg(avg($"value") as "avg")
      query.select($"src", $"gauge", $"avg" as "value").as[Agg]
    }
  }

} 
Example 145
Source File: SparkRandomGenDataIngress.scala    From pipelines-examples   with Apache License 2.0 5 votes vote down vote up
package pipelines.example

import java.sql.Timestamp

import scala.util.Random

import pipelines.streamlets.{ IntegerConfigParameter, StreamletShape }
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode

import pipelines.spark.sql.SQLImplicits._

case class Rate(timestamp: Timestamp, value: Long)

class SparkRandomGenDataIngress extends SparkStreamlet {
  val out = AvroOutlet[Data]("out", d ⇒ d.src)
  val shape = StreamletShape(out)

  val RecordsPerSecond = IntegerConfigParameter(
    "records-per-second",
    "Records per second to produce.",
    Some(50))

  override def configParameters = Vector(RecordsPerSecond)

  override def createLogic() = new SparkStreamletLogic {

    override def buildStreamingQueries = {
      writeStream(process, out, OutputMode.Append).toQueryExecution
    }

    private def process: Dataset[Data] = {

      val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key)

      val gaugeGen: () ⇒ String = () ⇒ if (Random.nextDouble() < 0.5) "oil" else "gas"

      val rateStream = session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .load()
        .as[Rate]

      rateStream.map {
        case Rate(timestamp, value) ⇒ Data(s"src-${value % 100}", timestamp.getTime, gaugeGen(), Random.nextDouble() * value)
      }
    }
  }
} 
Example 146
Source File: CallStatsAggregator.scala    From pipelines-examples   with Apache License 2.0 5 votes vote down vote up
package pipelines.examples.carly.aggregator

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

import pipelines.streamlets._
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.streaming.OutputMode
import pipelines.spark.sql.SQLImplicits._
import org.apache.log4j.{ Level, Logger }

import pipelines.examples.carly.data._
class CallStatsAggregator extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  //tag::docs-schemaAware-example[]
  val in = AvroInlet[CallRecord]("in")
  val out = AvroOutlet[AggregatedCallStats]("out", _.startTime.toString)
  val shape = StreamletShape(in, out)
  //end::docs-schemaAware-example[]

  val GroupByWindow = DurationConfigParameter(
    "group-by-window",
    "Window duration for the moving average computation",
    Some("1 minute"))

  val Watermark = DurationConfigParameter(
    "watermark",
    "Late events watermark duration: how long to wait for late events",
    Some("1 minute"))

  override def configParameters = Vector(GroupByWindow, Watermark)
  override def createLogic = new SparkStreamletLogic {
    val watermark = context.streamletConfig.getDuration(Watermark.key)
    val groupByWindow = context.streamletConfig.getDuration(GroupByWindow.key)

    //tag::docs-aggregationQuery-example[]
    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Update).toQueryExecution
    }

    private def process(inDataset: Dataset[CallRecord]): Dataset[AggregatedCallStats] = {
      val query =
        inDataset
          .withColumn("ts", $"timestamp".cast(TimestampType))
          .withWatermark("ts", s"${watermark.toMillis()} milliseconds")
          .groupBy(window($"ts", s"${groupByWindow.toMillis()} milliseconds"))
          .agg(avg($"duration") as "avgCallDuration", sum($"duration") as "totalCallDuration")
          .withColumn("windowDuration", $"window.end".cast(LongType) - $"window.start".cast(LongType))

      query
        .select($"window.start".cast(LongType) as "startTime", $"windowDuration", $"avgCallDuration", $"totalCallDuration")
        .as[AggregatedCallStats]
    }
    //end::docs-aggregationQuery-example[]
  }
} 
Example 147
Source File: CallRecordGeneratorIngress.scala    From pipelines-examples   with Apache License 2.0 5 votes vote down vote up
package pipelines.examples.carly.aggregator

import java.sql.Timestamp

import scala.util.Random
import scala.concurrent.duration._

import org.apache.spark.sql.{ Dataset, SparkSession }
import org.apache.spark.sql.streaming.OutputMode

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.LongType

import pipelines.streamlets._
import pipelines.streamlets.avro._
import pipelines.spark.sql.SQLImplicits._
import pipelines.examples.carly.data.CallRecord
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.log4j.{ Level, Logger }

case class Rate(timestamp: Timestamp, value: Long)

class CallRecordGeneratorIngress extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  val RecordsPerSecond = IntegerConfigParameter(
    "records-per-second",
    "Records per second to process.",
    Some(50))

  override def configParameters = Vector(RecordsPerSecond)

  val out = AvroOutlet[CallRecord]("out", _.user)
  val shape = StreamletShape(out)

  override def createLogic() = new SparkStreamletLogic {
    val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key)
    override def buildStreamingQueries = {
      val outStream = DataGenerator.mkData(super.session, recordsPerSecond)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }
  }
}

object DataGenerator {
  def mkData(session: SparkSession, recordsPerSecond: Int): Dataset[CallRecord] = {
    // do we need to expose this through configuration?

    val MaxTime = 2.hours.toMillis
    val MaxUsers = 100000
    val TS0 = new java.sql.Timestamp(0)
    val ZeroTimestampProb = 0.05 // error rate

    // Random Data Generator
    val usersUdf = udf(() ⇒ "user-" + Random.nextInt(MaxUsers))
    val directionUdf = udf(() ⇒ if (Random.nextDouble() < 0.5) "incoming" else "outgoing")

    // Time-biased randomized filter - 1/2 hour cycles
    val sinTime: Long ⇒ Double = t ⇒ Math.sin((t / 1000 % 1800) * 1.0 / 1800 * Math.PI)
    val timeBoundFilter: Long ⇒ Double ⇒ Boolean = t ⇒ prob ⇒ (sinTime(t) + 0.5) > prob
    val timeFilterUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ timeBoundFilter(ts.getTime)(rng))
    val zeroTimestampUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ {
      if (rng < ZeroTimestampProb) {
        TS0
      } else {
        ts
      }
    })

    val rateStream = session.readStream
      .format("rate")
      .option("rowsPerSecond", recordsPerSecond)
      .load()
      .as[Rate]

    val randomDataset = rateStream.withColumn("rng", rand()).withColumn("tsRng", rand())
    val sampledData = randomDataset.where(timeFilterUdf($"timestamp", $"rng"))
      .withColumn("user", usersUdf())
      .withColumn("other", usersUdf())
      .withColumn("direction", directionUdf())
      .withColumn("duration", (round(abs(rand()) * MaxTime)).cast(LongType))
      .withColumn("updatedTimestamp", zeroTimestampUdf($"timestamp", $"tsRng"))
      .select($"user", $"other", $"direction", $"duration", $"updatedTimestamp" as "timestamp")
      .as[CallRecord]
    sampledData
  }
} 
Example 148
Source File: IdentitySparkProcessor1.scala    From pipelines-examples   with Apache License 2.0 5 votes vote down vote up
package pipelines.example

import pipelines.streamlets.StreamletShape

import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet }

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import pipelines.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

class IdentitySparkProcessor1 extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.src)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      writeStream(readStream(in).map(d ⇒ d.copy(t1 = TimeOps.nowAsOption)), out, OutputMode.Append).toQueryExecution
    }
  }
} 
Example 149
Source File: SparkRandomGenDataIngress.scala    From pipelines-examples   with Apache License 2.0 5 votes vote down vote up
package pipelines.example

import java.sql.Timestamp

import scala.util.Random

import pipelines.streamlets.{ DurationConfigParameter, IntegerConfigParameter, StreamletShape }
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.{ OutputMode, Trigger }

import pipelines.spark.sql.SQLImplicits._

case class Rate(timestamp: Timestamp, value: Long)

class SparkRandomGenDataIngress extends SparkStreamlet {
  val out = AvroOutlet[Data]("out", d ⇒ d.src)
  val shape = StreamletShape(out)

  val RecordsPerSecond = IntegerConfigParameter(
    "records-per-second",
    "Records per second to produce.",
    Some(50))

  val RampUpTime = DurationConfigParameter(
    "ramp-up-time",
    "Time to reach max records per second.",
    Some("0 seconds"))

  override def configParameters = Vector(RecordsPerSecond, RampUpTime)

  override def createLogic() = new SparkStreamletLogic {

    override def buildStreamingQueries = {
      writeStream(process, out, OutputMode.Append).toQueryExecution
    }

    private def process: Dataset[Data] = {

      val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key)
      val rampUpTime = context.streamletConfig.getDuration(RampUpTime.key, java.util.concurrent.TimeUnit.SECONDS)
      println(s"Using rampup time of $rampUpTime seconds")

      val gaugeGen: () ⇒ String = () ⇒ if (Random.nextDouble() < 0.5) "oil" else "gas"

      val rateStream = session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .option("rampUpTime", s"${rampUpTime}s")
        .load()
        .as[Rate]

      rateStream.map {
        case Rate(timestamp, value) ⇒ Data(s"src-${value % 1000}", timestamp.getTime, None, None, gaugeGen(), value)
      }
    }
  }
} 
Example 150
Source File: MyDatasetFunc.scala    From Apache-Spark-2x-Machine-Learning-Cookbook   with MIT License 5 votes vote down vote up
package spark.ml.cookbook.chapter3

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Dataset, SparkSession}
//import spark.ml.cookbook.chapter3.{Car, MyDatasetData}

//import scala.collection.mutable
import scala.collection.mutable.ListBuffer

object MyDatasetFunc {

  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("mydatasetfunc")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    import spark.implicits._

    val cars = spark.createDataset(MyDatasetData.carData)
    cars.show(false)

    val modelData = cars.groupByKey(_.make).mapGroups({
      case (make, car) => {
        val carModel = new ListBuffer[String]()
        car.map(_.model).foreach({
            c =>  carModel += c
        })
        (make, carModel)
      }
    })

    modelData.show(false)

    spark.stop()
  }
} 
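
Car and MyDatasetData come from the same chapter and are not shown in this snippet. A minimal hypothetical stand-in, consistent with how they are used above (a make field, a model field, and a carData collection), could look like this:

// Hypothetical stand-in definitions, not the book's originals
case class Car(make: String, model: String, price: Double)

object MyDatasetData {
  val carData = Seq(
    Car("Tesla", "Model S", 71000.0),
    Car("Tesla", "Model 3", 35000.0),
    Car("Audi", "A4", 37400.0)
  )
}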
Example 151
Source File: CardinalityProfiler.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.profilers.field

import io.gzet.profilers.Utils
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Row}

import scala.collection.mutable
import scalaz.Scalaz._

case class CardinalityProfiler(topN: Int = 5) {

  def profile(df: Dataset[Array[String]]): Dataset[CardinalityReport] = {

    val total = df.sparkSession.sparkContext.broadcast(df.count())

    import df.sparkSession.implicits._

    val features = Utils.buildColumns(df)

    val topNValues = features.groupByKey({ field =>
      field
    }).count().map({ case (field, count) =>
      (field.idx, Map(field.value -> count))
    }).groupByKey({ case (column, map) =>
      column
    }).reduceGroups({ (v1, v2) =>
      val m1 = v1._2
      val m2 = v2._2
      val m = (m1 |+| m2).toSeq.sortBy(_._2).reverse
      (v1._1, m.take(math.min(m.size, topN)).toMap)
    }).map({ case (column, (_, map)) =>
      val top = map.keySet.toArray
      (column, top)
    })
      .withColumnRenamed("_1", "_topNValues_")
      .withColumnRenamed("_2", "description")

    val cardinalities = features.distinct().groupByKey(_.idx).count().map({
      case (column, distinctValues) =>
        val cardinality = distinctValues / total.value.toDouble
        (column, cardinality)
    })
      .withColumnRenamed("_1", "column")
      .withColumnRenamed("_2", "cardinality")

    cardinalities.join(topNValues, col("column") === col("_topNValues_"))
      .drop("_topNValues_")
      .map({ case Row(column: Int, cardinality: Double, description: mutable.WrappedArray[String]) =>
        CardinalityReport(
          column,
          cardinality,
          description.toArray
        )
      })

  }

}

case class CardinalityReport(
                              field: Int,
                              metricValue: Double,
                              description: Array[String]
                            ) 
Example 152
Source File: MultiGroupedTransformDC.scala    From spark-flow   with Apache License 2.0 5 votes vote down vote up
package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow.serialization.Hashing
import org.apache.spark.sql.{Dataset, Encoder, KeyValueGroupedDataset, SparkSession}

import scala.concurrent.duration.Duration
import scala.concurrent.{Await, Future}
import scala.reflect.ClassTag
import scala.concurrent.ExecutionContext.Implicits.global



class MultiGroupedTransformDC[K, V, U, T: ClassTag]
(left: KeyValueGroupedDC[K, V],
 right: KeyValueGroupedDC[K, U],
 f: (KeyValueGroupedDataset[K, V], KeyValueGroupedDataset[K, U]) => Dataset[T])
(implicit tEncoder: Encoder[T]) extends DC[T](tEncoder, Seq(left, right)) {

  override def computeDataset(spark: SparkSession) = {
    val leftFuture = Future{left.get(spark)}
    val rightFuture = Future{right.get(spark)}
    val ld = Await.result(leftFuture, Duration.Inf)
    val rd = Await.result(rightFuture, Duration.Inf)
    val dataset = f(ld, rd)
    dataset
  }

  override def computeSignature() = {
    Hashing.hashString(left.getSignature + right.getSignature + Hashing.hashClass(f))
  }

} 
Example 153
Source File: DatasetTransformDC.scala    From spark-flow   with Apache License 2.0 5 votes vote down vote up
package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow.serialization.Hashing._
import org.apache.spark.sql.{Dataset, Encoder, SparkSession}


private[sparkflow] class DatasetTransformDC[U, T]
(encoder: Encoder[U], val prev: DC[T], f: (Dataset[T]) => Dataset[U], hashTargets: Seq[String]) extends DC[U](encoder, Seq(prev)) {
  //
  //  def this(prev: DC[T], f: Dataset[T] => Dataset[U], hashTarget: AnyRef)(implicit tEncoder: Encoder[T], uEncoder: Encoder[U])  = {
  //    this(prev, uEncoder, f, Seq(hashClass(hashTarget)))
  //  }
  //
  //  def this(prev: DC[T], f: Dataset[T] => Dataset[U], hashTarget: AnyRef, hashTargets: Seq[String])(implicit tEncoder: Encoder[T], uEncoder: Encoder[U])  = {
  //    this(prev,uEncoder,  f, hashClass(hashTarget) +: hashTargets)
  //  }

  def computeDataset(spark: SparkSession) = {
    val dataset = f(prev.getDataset(spark))
    dataset
  }

  override def computeSignature() = {
    hashString(prev.getSignature + hashSeq(hashTargets))
  }

} 
Example 154
Source File: DataFrameWordCountTest.scala    From apache-spark-test   with Apache License 2.0 5 votes vote down vote up
package com.github.dnvriend.spark.dataframe

import com.github.dnvriend.TestSpec
import org.apache.spark.sql.{ DataFrame, Dataset }

class DataFrameWordCountTest extends TestSpec {
  it should "wordcount alice in wonderland" in withSparkSession { spark =>
    import org.apache.spark.sql.functions._
    import spark.implicits._
    val lines: Dataset[String] = spark.read.text(TestSpec.AliceInWonderlandText).as[String]
    lines.count shouldBe 3599 // alice in wonderland contains 3599 lines
    val words: DataFrame = lines.flatMap((line: String) => line.split(" ")).map(_.trim).filter(_.nonEmpty).toDF("word")
    words.count() shouldBe 26467 // there are 26,467 words in the book, excluding spaces
    val wordCount: Dataset[(String, Long)] =
      words.groupBy('word).agg(count('word).as("count")).orderBy('count.desc).as[(String, Long)].cache

    wordCount.take(1).head shouldBe ("the", 1505) // the word 'the' is used 1505 times
    wordCount.filter(lower('word) === "alice").take(1).head shouldBe ("Alice", 221)
    wordCount.filter(lower('word) === "queen").take(1).head shouldBe ("Queen", 34)
    wordCount.filter(lower('word) === "rabbit").take(1).head shouldBe ("Rabbit", 29)
    wordCount.filter(lower('word) === "cheshire").take(1).head shouldBe ("Cheshire", 6)
  }
} 
Example 155
Source File: DatasetTest.scala    From apache-spark-test   with Apache License 2.0 5 votes vote down vote up
package com.github.dnvriend.spark.dataset

import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark._
import com.github.dnvriend.spark.datasources.person.Person
import org.apache.spark.sql.Dataset

class DatasetTest extends TestSpec {
  lazy val xs = Seq(
    Person(1, "foo", 30),
    Person(2, "bar", 21),
    Person(3, "baz", 25),
    Person(4, "jaz", 40),
    Person(5, "bab", 50)
  )

  it should "typed Dataset operations: count" in withSparkSession { spark =>
    import spark.implicits._
    val ds: Dataset[Person] = xs.toDS()
    ds.count() shouldBe 5
  }

  it should "untyped Dataset operations: (aka DataFrame, everything is a Row)" in withSparkSession { spark =>
    import spark.implicits._
    val ds = xs.toDS()
    ds.createOrReplaceTempView("people")
    ds.sqlContext.sql("SELECT COUNT(*) FROM people") // Array[Row]
      .head.getLong(0) shouldBe 5
  }

  it should "count SQL, convert back to typed with .as[Long]" in withSparkSession { spark =>
    import spark.implicits._
    val ds = xs.toDS()
    ds.createOrReplaceTempView("people")
    ds.sqlContext.sql("SELECT COUNT(*) FROM people").as[Long].head() shouldBe 5
  }

  it should "count using dataset operations" in withSparkSession { spark =>
    import spark.implicits._
    val ds = xs.toDS()
    ds.count() shouldBe 5
  }

  it should "filter a ds" in withSparkSession { spark =>
    import spark.implicits._
    val ds = xs.toDS()
    ds.filter(_.age < 30).count shouldBe 2
    ds.filter(_.age > 30).count shouldBe 2
    ds.filter(_.age >= 30).count shouldBe 3
  }

  it should "load people parquet" in withSparkSession { spark =>
    val people = spark.read.parquet(TestSpec.PeopleParquet)
    people.count shouldBe 5
  }

  it should "load purchase_items parquet" in withSparkSession { spark =>
    val people = spark.read.parquet(TestSpec.PurchaseItems)
    people.count shouldBe 25
  }

  it should "load transactions parquet" in withSparkSession { spark =>
    import spark.implicits._
    val tx = spark.read.parquet(TestSpec.Transactions).as[Transaction]
    tx.count shouldBe 1000
  }
} 
Example 156
Source File: WithUtils.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.common

import java.io.Closeable
import java.util.concurrent.locks.Lock

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset
import org.apache.spark.storage.StorageLevel

import scala.util.control.NonFatal

object WithUtils {

  
  def withCloseable[T <: Closeable, R](closeable: T)(func: T => R): R = {
    var triedToClose = false
    try {
      func(closeable)
    } catch {
      case NonFatal(e) =>
        try {
          closeable.close()
        } catch {
          case NonFatal(t) =>
            e.addSuppressed(t)
        }
        triedToClose = true
        throw e
    } finally {
      // if we haven't tried to close it in the exception handler, try here.
      if (!triedToClose) {
        closeable.close()
      }
    }
  }

  def withLock[T](lock: Lock)(f: => T): T = {
    lock.lock()
    try {
      f
    } finally {
      lock.unlock()
    }
  }

  def withCachedRDD[T, U](rdd: RDD[T])(f: RDD[T] => U): U = {
    // Caching in MEMORY_ONLY (or even MEMORY_AND_DISK) can result in OOMs
    rdd.persist(StorageLevel.DISK_ONLY)
    try {
      f(rdd)
    } finally {
      rdd.unpersist()
    }
  }

  def withCachedDataset[T, U](ds: Dataset[T])(f: Dataset[T] => U): U = {
    // Caching in MEMORY_ONLY (or even MEMORY_AND_DISK) can result in OOMs
    ds.persist(StorageLevel.DISK_ONLY)
    try {
      f(ds)
    } finally {
      ds.unpersist()
    }
  }
} 
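
A brief usage sketch for withCachedDataset (hypothetical; it assumes an existing Dataset[String] named lines): the dataset is persisted to disk only for the duration of the passed function and unpersisted afterwards, even if the function throws.

val distinctLines: Long = WithUtils.withCachedDataset(lines) { cached =>
  cached.distinct().count()
}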
Example 157
Source File: KCore.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.graph.kcore
import com.tencent.angel.sona.context.PSContext
import org.apache.spark.SparkContext
import com.tencent.angel.sona.graph.params._
import com.tencent.angel.sona.ml.Transformer
import com.tencent.angel.sona.ml.param.ParamMap
import com.tencent.angel.sona.ml.util.Identifiable
import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.storage.StorageLevel

class KCore(override val uid: String) extends Transformer
  with HasSrcNodeIdCol with HasDstNodeIdCol with HasOutputNodeIdCol with HasOutputCoreIdCol
  with HasStorageLevel with HasPartitionNum with HasPSPartitionNum with HasUseBalancePartition {

  def this() = this(Identifiable.randomUID("KCore"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val edges = dataset.select($(srcNodeIdCol), $(dstNodeIdCol)).rdd
      .map(row => (row.getLong(0), row.getLong(1)))
      .filter(e => e._1 != e._2)

    edges.persist(StorageLevel.DISK_ONLY)

    val maxId = edges.map(e => math.max(e._1, e._2)).max() + 1
    val minId = edges.map(e => math.min(e._1, e._2)).min()
    val nodes = edges.flatMap(e => Iterator(e._1, e._2))
    val numEdges = edges.count()

    println(s"minId=$minId maxId=$maxId numEdges=$numEdges level=${$(storageLevel)}")

    // Start PS and init the model
    println("start to run ps")
    PSContext.getOrCreate(SparkContext.getOrCreate())

    val model = KCorePSModel.fromMinMax(minId, maxId, nodes, $(psPartitionNum), $(useBalancePartition))
    var graph = edges.flatMap(e => Iterator((e._1, e._2), (e._2, e._1)))
      .groupByKey($(partitionNum))
      .mapPartitionsWithIndex((index, edgeIter) =>
        Iterator(KCoreGraphPartition.apply(index, edgeIter)))

    graph.persist($(storageLevel))
    graph.foreachPartition(_ => Unit)
    graph.foreach(_.initMsgs(model))

    var curIteration = 0
    var numMsgs = model.numMsgs()
    var prev = graph
    println(s"numMsgs=$numMsgs")

    do {
      curIteration += 1
      graph = prev.map(_.process(model, numMsgs, curIteration == 1))
      graph.persist($(storageLevel))
      graph.count()
      prev.unpersist(true)
      prev = graph
      model.resetMsgs()
      numMsgs = model.numMsgs()
      println(s"curIteration=$curIteration numMsgs=$numMsgs")
    } while (numMsgs > 0)

    val retRDD = graph.map(_.save()).flatMap{case (nodes,cores) => nodes.zip(cores)}
      .map(r => Row.fromSeq(Seq[Any](r._1, r._2)))

    dataset.sparkSession.createDataFrame(retRDD, transformSchema(dataset.schema))
  }

  override def transformSchema(schema: StructType): StructType = {
    StructType(Seq(
      StructField(s"${$(outputNodeIdCol)}", LongType, nullable = false),
      StructField(s"${$(outputCoreIdCol)}", IntegerType, nullable = false)
    ))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

} 
Example 158
Source File: Correlation.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.stat

import org.apache.spark.linalg.{SQLDataTypes, Vector}

import scala.collection.JavaConverters._
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.types.{StructField, StructType}

/**
 * API for correlation functions in MLlib, compatible with DataFrames and Datasets.
 *
 * The functions in this package generalize the functions in [[org.apache.spark.sql.Dataset#stat]]
 * to spark.ml's Vector types.
 */
object Correlation {

  /**
   * :: Experimental ::
   * Compute the correlation matrix for the input Dataset of Vectors using the specified method.
   * Methods currently supported: `pearson` (default), `spearman`.
   *
   * @param dataset A dataset or a dataframe
   * @param column The name of the column of vectors for which the correlation coefficient needs
   *               to be computed. This must be a column of the dataset, and it must contain
   *               Vector objects.
   * @param method String specifying the method to use for computing correlation.
   *               Supported: `pearson` (default), `spearman`
   * @return A dataframe that contains the correlation matrix of the column of vectors. This
   *         dataframe contains a single row and a single column of name
   *         '$METHODNAME($COLUMN)'.
   * @throws IllegalArgumentException if the column is not a valid column in the dataset, or if
   *                                  the content of this column is not of type Vector.
   *
   *  Here is how to access the correlation coefficient:
   *  {{{
   *    val data: Dataset[Vector] = ...
   *    val Row(coeff: Matrix) = Correlation.corr(data, "value").head
   *    // coeff now contains the Pearson correlation matrix.
   *  }}}
   *
   * @note For Spearman, a rank correlation, we need to create an RDD[Double] for each column
   * and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector],
   * which is fairly costly. Cache the input Dataset before calling corr with `method = "spearman"`
   * to avoid recomputing the common lineage.
   */

  def corr(dataset: Dataset[_], column: String, method: String): DataFrame = {
    val rdd = dataset.select(column).rdd.map {
      case Row(v: Vector) => v
    }
    val oldM = Statistics.corr(rdd, method)
    val name = s"$method($column)"
    val schema = StructType(Array(StructField(name, SQLDataTypes.MatrixType, nullable = false)))
    dataset.sparkSession.createDataFrame(Seq(Row(oldM)).asJava, schema)
  }

  /**
   * Compute the Pearson correlation matrix for the input Dataset of Vectors.
   */

  def corr(dataset: Dataset[_], column: String): DataFrame = {
    corr(dataset, column, "pearson")
  }
} 
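
A concrete variant of the scaladoc usage above (hypothetical; it assumes an existing SparkSession named spark and that a Vectors factory is available next to Vector in org.apache.spark.linalg in this build):

import org.apache.spark.linalg.Vectors
import spark.implicits._

val df = Seq(
  Tuple1(Vectors.dense(1.0, 2.0, 3.0)),
  Tuple1(Vectors.dense(4.0, 6.0, 8.0)),
  Tuple1(Vectors.dense(9.0, 1.0, 5.0))
).toDF("value")

Correlation.corr(df, "value", "pearson").show(false)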
Example 159
Source File: Estimator.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml

import com.tencent.angel.sona.ml.param.{ParamMap, ParamPair}
import scala.annotation.varargs
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.sql.Dataset

/**
 * :: DeveloperApi ::
 * Abstract class for estimators that fit models to data.
 */
@DeveloperApi
abstract class Estimator[M <: Model[M]] extends PipelineStage {

  /**
   * Fits a single model to the input data with optional parameters.
   *
   * @param dataset input dataset
   * @param firstParamPair the first param pair, overrides embedded params
   * @param otherParamPairs other param pairs.  These values override any specified in this
   *                        Estimator's embedded ParamMap.
   * @return fitted model
   */
  @varargs
  def fit(dataset: Dataset[_], firstParamPair: ParamPair[_], otherParamPairs: ParamPair[_]*): M = {
    val map = new ParamMap()
      .put(firstParamPair)
      .put(otherParamPairs: _*)
    fit(dataset, map)
  }

  /**
   * Fits a single model to the input data with provided parameter map.
   *
   * @param dataset input dataset
   * @param paramMap Parameter map.
   *                 These values override any specified in this Estimator's embedded ParamMap.
   * @return fitted model
   */
  def fit(dataset: Dataset[_], paramMap: ParamMap): M = {
    copy(paramMap).fit(dataset)
  }

  /**
   * Fits a model to the input data.
   */
  def fit(dataset: Dataset[_]): M

  /**
   * Fits multiple models to the input data with multiple sets of parameters.
   * The default implementation uses a for loop on each parameter map.
   * Subclasses could override this to optimize multi-model training.
   *
   * @param dataset input dataset
   * @param paramMaps An array of parameter maps.
   *                  These values override any specified in this Estimator's embedded ParamMap.
   * @return fitted models, matching the input parameter maps
   */
  def fit(dataset: Dataset[_], paramMaps: Array[ParamMap]): Seq[M] = {
    paramMaps.map(fit(dataset, _))
  }

  override def copy(extra: ParamMap): Estimator[M]
} 
Example 160
Source File: RegressionEvaluator.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.evaluation

import com.tencent.angel.sona.ml.evaluation.evaluating.RegressionSummaryImpl
import com.tencent.angel.sona.ml.param.{Param, ParamMap, ParamValidators}
import com.tencent.angel.sona.ml.param.shared.{HasLabelCol, HasPredictionCol}
import com.tencent.angel.sona.ml.util._
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{DoubleType, FloatType}
import org.apache.spark.sql.util.SONASchemaUtils


/**
 * :: Experimental ::
 * Evaluator for regression, which expects two input columns: prediction and label.
 */
final class RegressionEvaluator(override val uid: String)
  extends Evaluator with HasPredictionCol with HasLabelCol with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("regEval"))

  /**
   * Param for metric name in evaluation. Supports:
   *  - `"rmse"` (default): root mean squared error
   *  - `"mse"`: mean squared error
   *  - `"r2"`: R^2^ metric
   *  - `"mae"`: mean absolute error
   *
   * @group param
   */
  val metricName: Param[String] = {
    val allowedParams = ParamValidators.inArray(Array("mse", "rmse", "r2", "mae"))
    new Param(this, "metricName", "metric name in evaluation (mse|rmse|r2|mae)", allowedParams)
  }

  
  def getMetricName: String = $(metricName)

  
  def setMetricName(value: String): this.type = set(metricName, value)

  
  def setPredictionCol(value: String): this.type = set(predictionCol, value)

  
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SONASchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SONASchemaUtils.checkNumericType(schema, $(labelCol))

    val summary = new RegressionSummaryImpl(dataset.toDF(), $(predictionCol), $(labelCol))
    val metrics = summary.regMetrics

    val metric = $(metricName) match {
      case "rmse" => summary.rmse
      case "mse" => summary.mse
      case "r2" => summary.r2
      case "mae" => summary.absDiff
    }

    metric
  }

  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}


object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {
  override def load(path: String): RegressionEvaluator = super.load(path)
} 
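
A minimal usage sketch (hypothetical; it assumes an existing SparkSession named spark and that the shared-param defaults are the usual prediction and label column names):

import spark.implicits._
val predictions = Seq(
  (2.5, 3.0),
  (0.0, -0.5),
  (2.0, 2.0)
).toDF("prediction", "label")

val rmse = new RegressionEvaluator()
  .setMetricName("rmse")
  .evaluate(predictions)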
Example 161
Source File: GenericFunSpecSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.test

import org.scalatest.FunSpec

import org.apache.spark.sql.Dataset


class GenericFunSpecSuite extends FunSpec with SharedSparkSession {
  import testImplicits._

  private def ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS

  describe("Simple Dataset") {
    it("should have the specified number of elements") {
      assert(8 === ds.count)
    }
    it("should have the specified number of unique elements") {
      assert(8 === ds.distinct.count)
    }
    it("should have the specified number of elements in each column") {
      assert(8 === ds.select("_1").count)
      assert(8 === ds.select("_2").count)
    }
    it("should have the correct number of distinct elements in each column") {
      assert(8 === ds.select("_1").distinct.count)
      assert(4 === ds.select("_2").distinct.count)
    }
  }
} 
Example 162
Source File: A_1_BasicOperation.scala    From wow-spark   with MIT License 5 votes vote down vote up
package com.sev7e0.wow.structured_streaming

import java.sql.Timestamp

import org.apache.spark.sql.types.{DoubleType, StringType, StructType, TimestampType}
import org.apache.spark.sql.{Dataset, SparkSession}

object A_1_BasicOperation {

  // Date/time fields must use java.sql.Timestamp in the case class; Catalyst represents it as TimestampType
  case class DeviceData(device: String, deviceType: String, signal: Double, time: Timestamp)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName(A_1_BasicOperation.getClass.getName)
      .master("local")
      .getOrCreate()
    val timeStructType = new StructType().add("device", StringType)
      .add("deviceType", StringType)
      .add("signal", BooleanType)
      .add("time", TimestampType)

    val dataFrame = spark.read.json("src/main/resources/sparkresource/device.json")
    import spark.implicits._
    val ds: Dataset[DeviceData] = dataFrame.as[DeviceData]

    // Untyped (DataFrame, SQL-like) query
    dataFrame.select("device").where("signal>10").show()
    // Typed (Dataset) query
    ds.filter(_.signal > 10).map(_.device).show()

    // Untyped groupBy with a count per device type
    dataFrame.groupBy("deviceType").count().show()


    import org.apache.spark.sql.expressions.scalalang.typed
    // Typed aggregation: average signal value per device type
    ds.groupByKey(_.deviceType).agg(typed.avg(_.signal)).show()

    // Alternatively, register a temporary view and query it with SQL
    dataFrame.createOrReplaceTempView("device")
    spark.sql("select * from device").show()

    // isStreaming indicates whether the Dataset contains streaming data
    println(dataFrame.isStreaming)
  }
} 
Example 163
Source File: Writer.scala    From bdg-sequila   with Apache License 2.0 5 votes vote down vote up
package org.biodatageeks.sequila.tests.pileup

import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}

object Writer {

  val mapToString = (map: Map[Byte, Short]) => {
    if (map == null)
      "null"
    else
      map.map({
        case (k, v) => k.toChar -> v
      }).toSeq.sortBy(_._1).mkString.replace(" -> ", ":")
  }

  def saveToFile(spark: SparkSession, res: Dataset[Row], path: String) = {
    spark.udf.register("mapToString", mapToString)
    res
      .selectExpr("contig", "pos_start", "pos_end", "ref", "cast(coverage as int)", "mapToString(alts)")
      .coalesce(1)
      .write
      .mode(SaveMode.Overwrite)
      .csv(path)
  }
} 
Example 164
Source File: PileupTestBase.scala    From bdg-sequila   with Apache License 2.0 5 votes vote down vote up
package org.biodatageeks.sequila.tests.pileup

import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext}
import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
import org.apache.spark.sql.types.{IntegerType, ShortType, StringType, StructField, StructType}
import org.scalatest.{BeforeAndAfter, FunSuite}

class PileupTestBase extends FunSuite
  with DataFrameSuiteBase
  with BeforeAndAfter
  with SharedSparkContext{

  val sampleId = "NA12878.multichrom.md"
  val samResPath: String = getClass.getResource("/multichrom/mdbam/samtools.pileup").getPath
  val referencePath: String = getClass.getResource("/reference/Homo_sapiens_assembly18_chr1_chrM.small.fasta").getPath
  val bamPath: String = getClass.getResource(s"/multichrom/mdbam/${sampleId}.bam").getPath
  val cramPath : String = getClass.getResource(s"/multichrom/mdcram/${sampleId}.cram").getPath
  val tableName = "reads_bam"
  val tableNameCRAM = "reads_cram"

  val schema: StructType = StructType(
    List(
      StructField("contig", StringType, nullable = true),
      StructField("position", IntegerType, nullable = true),
      StructField("reference", StringType, nullable = true),
      StructField("coverage", ShortType, nullable = true),
      StructField("pileup", StringType, nullable = true),
      StructField("quality", StringType, nullable = true)
    )
  )
  before {
    System.setProperty("spark.kryo.registrator", "org.biodatageeks.sequila.pileup.serializers.CustomKryoRegistrator")
    spark
      .conf.set("spark.sql.shuffle.partitions",1) //FIXME: In order to get orderBy in Samtools tests working - related to exchange partitions stage
    spark.sql(s"DROP TABLE IF EXISTS $tableName")
    spark.sql(
      s"""
         |CREATE TABLE $tableName
         |USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource
         |OPTIONS(path "$bamPath")
         |
      """.stripMargin)

    spark.sql(s"DROP TABLE IF EXISTS $tableNameCRAM")
    spark.sql(
      s"""
         |CREATE TABLE $tableNameCRAM
         |USING org.biodatageeks.sequila.datasources.BAM.CRAMDataSource
         |OPTIONS(path "$cramPath", refPath "$referencePath" )
         |
      """.stripMargin)

    val mapToString = (map: Map[Byte, Short]) => {
      if (map == null)
        "null"
      else
        map.map({
          case (k, v) => k.toChar -> v}).mkString.replace(" -> ", ":")
    }

    val byteToString = ((byte: Byte) => byte.toString)

    spark.udf.register("mapToString", mapToString)
    spark.udf.register("byteToString", byteToString)
  }

} 
Example 165
Source File: KafkaSource.scala    From Spark-Structured-Streaming-Examples   with Apache License 2.0 5 votes vote down vote up
package kafka

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.types.{StringType, _}
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper


// The enclosing object is assumed for this extracted snippet, mirroring the KafkaSink object in Example 166.
object KafkaSource extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

    def read(startingOption: String = "startingOffsets", partitionsAndOffsets: String = "earliest") : Dataset[SimpleSongAggregationKafka] = {
      log.warn("Reading from Kafka")

      spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", KafkaService.topicName)
      .option("enable.auto.commit", false) // Cannot be set to true in Spark Strucutured Streaming https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#kafka-specific-configurations
      .option("group.id", "Structured-Streaming-Examples")
      .option("failOnDataLoss", false) // when starting a fresh kafka (default location is temporary (/tmp) and cassandra is not (var/lib)), we have saved different offsets in Cassandra than real offsets in kafka (that contains nothing)
      .option(startingOption, partitionsAndOffsets) //this only applies when a new query is started and that resuming will always pick up from where the query left off
      .load()
      .withColumn(KafkaService.radioStructureName, // nested structure with our json
        from_json($"value".cast(StringType), KafkaService.schemaOutput) //From binary to JSON object
      ).as[SimpleSongAggregationKafka]
      .filter(_.radioCount != null) //TODO find a better way to filter bad json
  }
} 
Example 166
Source File: KafkaSink.scala    From Spark-Structured-Streaming-Examples   with Apache License 2.0 5 votes vote down vote up
package kafka

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.types.{StringType, _}
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper

object KafkaSink extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  def writeStream(staticInputDS: Dataset[SimpleSongAggregation]) : StreamingQuery = {
    log.warn("Writing to Kafka")
    staticInputDS
      .select(to_json(struct($"*")).cast(StringType).alias("value"))
      .writeStream
      .outputMode("update")
      .format("kafka")
      .option("kafka.bootstrap.servers", KafkaService.bootstrapServers)
      .queryName("Kafka - Count number of broadcasts for a title/artist by radio")
      .option("topic", "test")
      .start()
  }

  
  def debugStream(staticKafkaInputDS: Dataset[SimpleSongAggregationKafka]) = {
    staticKafkaInputDS
      .writeStream
      .queryName("Debug Stream Kafka")
      .format("console")
      .start()
  }
} 
Example 167
Source File: CassandraSink.scala    From Spark-Structured-Streaming-Examples   with Apache License 2.0 5 votes vote down vote up
package cassandra.StreamSinkProvider

import cassandra.{CassandraDriver, CassandraKafkaMetadata}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.functions.max
import spark.SparkHelper
import cassandra.CassandraDriver
import com.datastax.spark.connector._
import kafka.KafkaMetadata
import log.LazyLogger
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.types.LongType
import radio.SimpleSongAggregation


// The enclosing sink class is assumed for this extracted snippet; only the offset-saving helper
// was kept, so addBatch below is a sketch (the original presumably also persists the batch rows
// to Cassandra before recording the Kafka offsets).
class CassandraSink() extends Sink with LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  override def addBatch(batchId: Long, df: DataFrame): Unit = {
    log.warn(s"CassandraSink - received batch $batchId")
    saveKafkaMetaData(df)
  }

  private def saveKafkaMetaData(df: DataFrame) = {
    val kafkaMetadata = df
      .groupBy($"partition")
      .agg(max($"offset").cast(LongType).as("offset"))
      .as[KafkaMetadata]

    log.warn("Saving Kafka Metadata (partition and offset per topic (only one in our example)")
    kafkaMetadata.show()

    kafkaMetadata.rdd.saveToCassandra(CassandraDriver.namespace,
      CassandraDriver.kafkaMetadata,
      SomeColumns("partition", "offset")
    )

    //Otherway to save offset inside Cassandra
    //kafkaMetadata.collect().foreach(CassandraKafkaMetadata.save)
  }
} 
Example 168
Source File: ElasticSink.scala    From Spark-Structured-Streaming-Examples   with Apache License 2.0 5 votes vote down vote up
package elastic

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
import radio.{SimpleSongAggregation, Song}
import org.elasticsearch.spark.sql.streaming._
import org.elasticsearch.spark.sql._
import org.elasticsearch.spark.sql.streaming.EsSparkSqlStreamingSink

object ElasticSink {
  def writeStream(ds: Dataset[Song] ) : StreamingQuery = {
    ds   //Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark
      .writeStream
      .outputMode(OutputMode.Append) //Only mode for ES
      .format("org.elasticsearch.spark.sql") //es
      .queryName("ElasticSink")
      .start("test/broadcast") //ES index
  }

} 
Example 169
Source File: MapGroupsWithState.scala    From Spark-Structured-Streaming-Examples   with Apache License 2.0 5 votes vote down vote up
package mapGroupsWithState

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.types.StringType
import spark.SparkHelper
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}
import radio.{ArtistAggregationState, SimpleSongAggregation, SimpleSongAggregationKafka}

object MapGroupsWithState extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._


  def updateArtistStateWithEvent(state: ArtistAggregationState, artistCount : SimpleSongAggregation) = {
    log.warn("MapGroupsWithState - updateArtistStateWithEvent")
    if(state.artist == artistCount.artist) {
      ArtistAggregationState(state.artist, state.count + artistCount.count)
    } else {
      state
    }
  }

  def updateAcrossEvents(artist:String,
                         inputs: Iterator[SimpleSongAggregation],
                         oldState: GroupState[ArtistAggregationState]): ArtistAggregationState = {

    var state: ArtistAggregationState = if (oldState.exists)
      oldState.get
    else
      ArtistAggregationState(artist, 1L)

    // for every rows, let's count by artist the number of broadcast, instead of counting by artist, title and radio
    for (input <- inputs) {
      state = updateArtistStateWithEvent(state, input)
      oldState.update(state)
    }

    state
  }


  
  def write(ds: Dataset[SimpleSongAggregationKafka] ) = {
    ds.select($"radioCount.title", $"radioCount.artist", $"radioCount.radio", $"radioCount.count")
      .as[SimpleSongAggregation]
      .groupByKey(_.artist)
      .mapGroupsWithState(GroupStateTimeout.NoTimeout)(updateAcrossEvents) //we can control what should be done with the state when no update is received after a timeout.
      .writeStream
      .outputMode(OutputMode.Update())
      .format("console")
      .queryName("mapGroupsWithState - counting artist broadcast")
      .start()
  }
} 
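
This sink composes with the Kafka source from Example 165: KafkaSource.read() produces exactly the Dataset[SimpleSongAggregationKafka] that write expects. A hypothetical wiring (object name as reconstructed in Example 165):

val kafkaStream = KafkaSource.read()               // earliest offsets by default
val query = MapGroupsWithState.write(kafkaStream)  // console sink, update mode
query.awaitTermination()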
Example 170
Source File: ParquetService.scala    From Spark-Structured-Streaming-Examples   with Apache License 2.0 5 votes vote down vote up
package parquetHelper

import log.LazyLogger
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types._
import radio.{SimpleSongAggregation, Song}
import spark.SparkHelper

object ParquetService extends LazyLogger {
  val pathRadioStationSongs = "data/allRadioPartitionByRadioAndDate.parquet"
  val pathRadioES = "data/broadcast.parquet"

  private val spark = SparkHelper.getSparkSession()
  import spark.implicits._

  val schema = new StructType()
    .add("timestamp", TimestampType)
    .add("title", StringType)
    .add("artist", StringType)
    .add("radio", StringType)
    .add("humanDate", LongType)
    .add("hour", IntegerType)
    .add("minute", IntegerType)
    .add("allArtists", StringType)
    .add("year", IntegerType)
    .add("month", IntegerType)
    .add("day", IntegerType)

  def batchWay() = {
    //Classic  Batch way
    val batchWay =
      spark
        .read
        .schema(ParquetService.schema)
        .parquet(pathRadioStationSongs)
        .where($"artist" === "Drake")
        .groupBy($"radio", $"artist",  $"title")
        .count()
        .orderBy("count")
        .as[Song]

    batchWay.show()

    batchWay
  }

  def streamingWay() : Dataset[SimpleSongAggregation] = {
    log.warn("Starting to stream events from Parquet files....")

    spark
      .readStream
      .schema(ParquetService.schema)
      .option("maxFilesPerTrigger", 1000)  // Treat a sequence of files as a stream by picking one file at a time
      .parquet(pathRadioStationSongs)
      .as[Song]
      .where($"artist" === "Drake")
      .groupBy($"radio", $"artist",  $"title")
      .count()
      .as[SimpleSongAggregation]
  }

  def streamEachEvent : Dataset[Song]  = {
    spark
      .readStream
      .schema(ParquetService.schema)
      .option("maxFilesPerTrigger", 1000)  // Treat a sequence of files as a stream by picking one file at a time
      .parquet(pathRadioES)
      .as[Song]
      .where($"artist" === "Drake")
      .withWatermark("timestamp", "10 minutes")
      .as[Song]
  }

  //Process stream on console to debug only
  def debugStream(staticInputDF: DataFrame) = {
    staticInputDF.writeStream
      .format("console")
      .outputMode("complete")
      .queryName("Console - Count number of broadcasts for a title/artist by radio")
      .start()
  }
} 
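
streamEachEvent pairs naturally with the Elastic sink from Example 168, since both operate on Dataset[Song]; a hypothetical wiring:

val esQuery = ElasticSink.writeStream(ParquetService.streamEachEvent)
esQuery.awaitTermination()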
Example 171
Source File: Aggregator.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression
import org.apache.spark.sql.{DataFrame, Dataset, Encoder, TypedColumn}


// The enclosing abstract class and its abstract members are assumed for this extracted
// snippet, following the Spark 1.6-era Aggregator API (zero / reduce / merge / finish).
abstract class Aggregator[-I, B, O] extends Serializable {

  // Initial value for the intermediate buffer.
  def zero: B

  // Fold one input element into the buffer.
  def reduce(b: B, a: I): B

  // Combine two partially aggregated buffers.
  def merge(b1: B, b2: B): B

  // Produce the final output from the buffer.
  def finish(reduction: B): O

  def toColumn(
      implicit bEncoder: Encoder[B],
      cEncoder: Encoder[O]): TypedColumn[I, O] = {
    val expr =
      new AggregateExpression(
        TypedAggregateExpression(this),
        Complete,
        false)

    new TypedColumn[I, O](expr, encoderFor[O])
  }
} 
Example 172
Source File: RichSparkFunctions.scala    From lighthouse   with Apache License 2.0 5 votes vote down vote up
package be.dataminded.lighthouse.pipeline

import com.typesafe.scalalogging.LazyLogging
import org.apache.spark.sql.{Dataset, Encoder}
import org.apache.spark.storage.StorageLevel

import scala.reflect.ClassTag

object RichSparkFunctions extends LazyLogging {

  class DatasetSparkFunction[A <: Dataset[_]: ClassTag](function: SparkFunction[A]) {

    
    def printSchema(): SparkFunction[A] =
      function.map { dataSet =>
        dataSet.printSchema()
        dataSet
      }

    def as[T: Encoder]: SparkFunction[Dataset[T]] = function.map(_.as[T])

    def cache(storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY): SparkFunction[A] =
      function.map {
        _.persist(storageLevel)
      }

    def dropCache(): SparkFunction[A] =
      function.map {
        _.unpersist()
      }

    def write(sink: Sink, sinks: Sink*): SparkFunction[A] = {
      if (sinks.isEmpty) function.map { data =>
        sink.write(data); data
      }
      else (sink +: sinks).foldLeft(function.cache())((f, sink) => f.write(sink))
    }

    def count(): SparkFunction[Long] = {
      function.map { dataSet =>
        val n = dataSet.count()
        logger.debug(s"The data set produced $n rows")
        n
      }
    }
  }
} 
Example 173
Source File: GenericFlatSpecSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.test

import org.scalatest.FlatSpec

import org.apache.spark.sql.Dataset


class GenericFlatSpecSuite extends FlatSpec with SharedSparkSession {
  import testImplicits._

  private def ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS

  "A Simple Dataset" should "have the specified number of elements" in {
    assert(8 === ds.count)
  }
  it should "have the specified number of unique elements" in {
      assert(8 === ds.distinct.count)
  }
  it should "have the specified number of elements in each column" in {
    assert(8 === ds.select("_1").count)
    assert(8 === ds.select("_2").count)
  }
  it should "have the correct number of distinct elements in each column" in {
    assert(8 === ds.select("_1").distinct.count)
    assert(4 === ds.select("_2").distinct.count)
  }
} 
Example 174
Source File: GenericWordSpecSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.test

import org.scalatest.WordSpec

import org.apache.spark.sql.Dataset


class GenericWordSpecSuite extends WordSpec with SharedSparkSession {
  import testImplicits._

  private def ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS

  "A Simple Dataset" when {
    "looked at as complete rows" should {
      "have the specified number of elements" in {
        assert(8 === ds.count)
      }
      "have the specified number of unique elements" in {
        assert(8 === ds.distinct.count)
      }
    }
    "refined to specific columns" should {
      "have the specified number of elements in each column" in {
        assert(8 === ds.select("_1").count)
        assert(8 === ds.select("_2").count)
      }
      "have the correct number of distinct elements in each column" in {
        assert(8 === ds.select("_1").distinct.count)
        assert(4 === ds.select("_2").distinct.count)
      }
    }
  }
} 
Example 175
Source File: cache.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.command

import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

case class CacheTableCommand(
    tableIdent: TableIdentifier,
    plan: Option[LogicalPlan],
    isLazy: Boolean) extends RunnableCommand {
  require(plan.isEmpty || tableIdent.database.isEmpty,
    "Database name is not allowed in CACHE TABLE AS SELECT")

  override protected def innerChildren: Seq[QueryPlan[_]] = plan.toSeq

  override def run(sparkSession: SparkSession): Seq[Row] = {
    plan.foreach { logicalPlan =>
      Dataset.ofRows(sparkSession, logicalPlan).createTempView(tableIdent.quotedString)
    }
    sparkSession.catalog.cacheTable(tableIdent.quotedString)

    if (!isLazy) {
      // Performs eager caching
      sparkSession.table(tableIdent).count()
    }

    Seq.empty[Row]
  }
}


case class UncacheTableCommand(
    tableIdent: TableIdentifier,
    ifExists: Boolean) extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val tableId = tableIdent.quotedString
    if (!ifExists || sparkSession.catalog.tableExists(tableId)) {
      sparkSession.catalog.uncacheTable(tableId)
    }
    Seq.empty[Row]
  }
}


case class ClearCacheCommand() extends RunnableCommand {

  // Removes all cached tables and query results from the in-memory cache.
  override def run(sparkSession: SparkSession): Seq[Row] = {
    sparkSession.catalog.clearCache()
    Seq.empty[Row]
  }

  override def makeCopy(newArgs: Array[AnyRef]): ClearCacheCommand = ClearCacheCommand()
} 
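These commands are normally reached through the SQL interface rather than constructed directly. A short sketch of the statements that plan into them, assuming a SparkSession named spark and an existing table t:

    spark.sql("CACHE TABLE t")                              // CacheTableCommand, eager
    spark.sql("CACHE LAZY TABLE t_copy AS SELECT * FROM t") // CacheTableCommand with a plan, lazy
    spark.sql("UNCACHE TABLE IF EXISTS t_copy")             // UncacheTableCommand
    spark.sql("CLEAR CACHE")                                // ClearCacheCommand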
Example 176
Source File: FrequentItems.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.stat

import scala.collection.mutable.{Map => MutableMap}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._

object FrequentItems extends Logging {

  
  def singlePassFreqItems(
      df: DataFrame,
      cols: Seq[String],
      support: Double): DataFrame = {
    require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.")
    val numCols = cols.length
    // number of max items to keep counts for
    val sizeOfMap = (1 / support).toInt
    val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
    val originalSchema = df.schema
    val colInfo: Array[(String, DataType)] = cols.map { name =>
      val index = originalSchema.fieldIndex(name)
      (name, originalSchema.fields(index).dataType)
    }.toArray

    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.treeAggregate(countMaps)(
      seqOp = (counts, row) => {
        var i = 0
        while (i < numCols) {
          val thisMap = counts(i)
          val key = row.get(i)
          thisMap.add(key, 1L)
          i += 1
        }
        counts
      },
      combOp = (baseCounts, counts) => {
        var i = 0
        while (i < numCols) {
          baseCounts(i).merge(counts(i))
          i += 1
        }
        baseCounts
      }
    )
    val justItems = freqItems.map(m => m.baseMap.keys.toArray)
    val resultRow = Row(justItems : _*)
    // append frequent Items to the column name for easy debugging
    val outputCols = colInfo.map { v =>
      StructField(v._1 + "_freqItems", ArrayType(v._2, false))
    }
    val schema = StructType(outputCols).toAttributes
    Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow)))
  }
} 
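User code usually reaches this helper through DataFrameStatFunctions instead of calling singlePassFreqItems directly. A minimal sketch, assuming a DataFrame df with columns a and b:

    // Delegates to FrequentItems.singlePassFreqItems under the hood.
    val freq = df.stat.freqItems(Seq("a", "b"), 0.4)
    freq.show() // a_freqItems and b_freqItems hold arrays of the frequent values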
Example 177
Source File: SaveIntoDataSourceCommand.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.CreatableRelationProvider


case class SaveIntoDataSourceCommand(
    query: LogicalPlan,
    dataSource: CreatableRelationProvider,
    options: Map[String, String],
    mode: SaveMode) extends RunnableCommand {

  override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    dataSource.createRelation(
      sparkSession.sqlContext, mode, options, Dataset.ofRows(sparkSession, query))

    Seq.empty[Row]
  }

  override def simpleString: String = {
    val redacted = SQLConf.get.redactOptions(options)
    s"SaveIntoDataSourceCommand ${dataSource}, ${redacted}, ${mode}"
  }
} 
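This command is produced by the DataFrameWriter for sources that implement CreatableRelationProvider. A hedged sketch of a user-facing call that ends up here, using the JDBC source as one such provider; the URL, table name and user are placeholders:

    df.write
      .format("jdbc")
      .option("url", "jdbc:postgresql://localhost:5432/warehouse")
      .option("dbtable", "public.target_table")
      .option("user", "spark")
      .mode("append")
      .save()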
Example 178
Source File: JsonUtils.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.json

import org.apache.spark.input.PortableDataStream
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.catalyst.json.JSONOptions

object JsonUtils {
  
  def sample(json: RDD[PortableDataStream], options: JSONOptions): RDD[PortableDataStream] = {
    require(options.samplingRatio > 0,
      s"samplingRatio (${options.samplingRatio}) should be greater than 0")
    if (options.samplingRatio > 0.99) {
      json
    } else {
      json.sample(withReplacement = false, options.samplingRatio, 1)
    }
  }
} 
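The sampling above supports JSON schema inference. A minimal sketch of the reader option that drives it, assuming the samplingRatio option is honoured by the Spark version in use and using a placeholder path:

    // Infer the schema from roughly 10% of the input rather than a full pass.
    val events = spark.read
      .option("samplingRatio", "0.1")
      .json("/data/events/*.json")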
Example 179
Source File: Aggregator.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression


@Experimental
@InterfaceStability.Evolving
abstract class Aggregator[-IN, BUF, OUT] extends Serializable {

  // zero/reduce/merge/finish define the aggregation; the two encoders describe
  // the intermediate buffer type and the output type.
  def zero: BUF
  def reduce(b: BUF, a: IN): BUF
  def merge(b1: BUF, b2: BUF): BUF
  def finish(reduction: BUF): OUT
  def bufferEncoder: Encoder[BUF]
  def outputEncoder: Encoder[OUT]

  def toColumn: TypedColumn[IN, OUT] = {
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    val expr =
      AggregateExpression(
        TypedAggregateExpression(this),
        Complete,
        isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
} 
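A sketch of a concrete subclass in the style of the typed-aggregator examples from the Spark documentation; the Purchase case class and the column name are made up for illustration:

    import org.apache.spark.sql.{Encoder, Encoders}
    import org.apache.spark.sql.expressions.Aggregator

    case class Purchase(user: String, amount: Double)

    object SumAmount extends Aggregator[Purchase, Double, Double] {
      def zero: Double = 0.0
      def reduce(acc: Double, p: Purchase): Double = acc + p.amount
      def merge(a: Double, b: Double): Double = a + b
      def finish(total: Double): Double = total
      def bufferEncoder: Encoder[Double] = Encoders.scalaDouble
      def outputEncoder: Encoder[Double] = Encoders.scalaDouble
    }

    // purchases: Dataset[Purchase]
    // purchases.select(SumAmount.toColumn.name("total_amount")).show()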
Example 180
Source File: KafkaContinuousSourceSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.kafka010

import java.util.Properties
import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._
import scala.collection.mutable
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.{StreamTest, Trigger}
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}

// Run tests in KafkaSourceSuiteBase in continuous execution mode.
class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest

class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest {
  import testImplicits._

  override val brokerProps = Map("auto.create.topics.enable" -> "false")

  test("subscribing topic by pattern with topic deletions") {
    val topicPrefix = newTopic()
    val topic = topicPrefix + "-seems"
    val topic2 = topicPrefix + "-bad"
    testUtils.createTopic(topic, partitions = 5)
    testUtils.sendMessages(topic, Array("-1"))
    require(testUtils.getLatestOffsets(Set(topic)).size === 5)

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", testUtils.brokerAddress)
      .option("kafka.metadata.max.age.ms", "1")
      .option("subscribePattern", s"$topicPrefix-.*")
      .option("failOnDataLoss", "false")

    val kafka = reader.load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]
    val mapped = kafka.map(kv => kv._2.toInt + 1)

    testStream(mapped)(
      makeSureGetOffsetCalled,
      AddKafkaData(Set(topic), 1, 2, 3),
      CheckAnswer(2, 3, 4),
      Execute { query =>
        testUtils.deleteTopic(topic)
        testUtils.createTopic(topic2, partitions = 5)
        eventually(timeout(streamingTimeout)) {
          assert(
            query.lastExecution.logical.collectFirst {
              case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
            }.exists { r =>
              // Ensure the new topic is present and the old topic is gone.
              r.knownPartitions.exists(_.topic == topic2)
            },
            s"query never reconfigured to new topic $topic2")
        }
      },
      AddKafkaData(Set(topic2), 4, 5, 6),
      CheckAnswer(2, 3, 4, 5, 6, 7)
    )
  }
}

class KafkaContinuousSourceStressForDontFailOnDataLossSuite
    extends KafkaSourceStressForDontFailOnDataLossSuite {
  override protected def startStream(ds: Dataset[Int]) = {
    ds.writeStream
      .format("memory")
      .queryName("memory")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
} 
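Outside the test harness, the same continuous-mode pipeline can be written directly against the Kafka source and sink. A hedged sketch where the broker address, topics and checkpoint location are placeholders:

    import org.apache.spark.sql.streaming.Trigger

    val input = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "broker:9092")
      .option("subscribe", "input-topic")
      .load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

    val query = input.writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "broker:9092")
      .option("topic", "output-topic")
      .option("checkpointLocation", "/tmp/kafka-continuous-checkpoint")
      .trigger(Trigger.Continuous("1 second"))
      .start()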
Example 181
Source File: PredictorSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasWeightCol
import org.apache.spark.ml.util._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext {

  import PredictorSuite._

  test("should support all NumericType labels and weights, and not support other types") {
    val df = spark.createDataFrame(Seq(
      (0, 1, Vectors.dense(0, 2, 3)),
      (1, 2, Vectors.dense(0, 3, 9)),
      (0, 3, Vectors.dense(0, 2, 6))
    )).toDF("label", "weight", "features")

    val types =
      Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0))

    val predictor = new MockPredictor().setWeightCol("weight")

    types.foreach { t =>
      predictor.fit(df.select(col("label").cast(t), col("weight").cast(t), col("features")))
    }

    intercept[IllegalArgumentException] {
      predictor.fit(df.select(col("label").cast(StringType), col("weight"), col("features")))
    }

    intercept[IllegalArgumentException] {
      predictor.fit(df.select(col("label"), col("weight").cast(StringType), col("features")))
    }
  }
}

object PredictorSuite {

  class MockPredictor(override val uid: String)
    extends Predictor[Vector, MockPredictor, MockPredictionModel] with HasWeightCol {

    def this() = this(Identifiable.randomUID("mockpredictor"))

    def setWeightCol(value: String): this.type = set(weightCol, value)

    override def train(dataset: Dataset[_]): MockPredictionModel = {
      require(dataset.schema("label").dataType == DoubleType)
      require(dataset.schema("weight").dataType == DoubleType)
      new MockPredictionModel(uid)
    }

    override def copy(extra: ParamMap): MockPredictor =
      throw new NotImplementedError()
  }

  class MockPredictionModel(override val uid: String)
    extends PredictionModel[Vector, MockPredictionModel] {

    def this() = this(Identifiable.randomUID("mockpredictormodel"))

    override def predict(features: Vector): Double =
      throw new NotImplementedError()

    override def copy(extra: ParamMap): MockPredictionModel =
      throw new NotImplementedError()
  }
} 
Example 182
Source File: CallStatsAggregator.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package cloudflow.callrecordaggregator

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.streaming.OutputMode
import cloudflow.spark.sql.SQLImplicits._
import org.apache.log4j.{ Level, Logger }

import carly.data._
class CallStatsAggregator extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  //tag::docs-schemaAware-example[]
  val in    = AvroInlet[CallRecord]("in")
  val out   = AvroOutlet[AggregatedCallStats]("out", _.startTime.toString)
  val shape = StreamletShape(in, out)
  //end::docs-schemaAware-example[]

  val GroupByWindow = DurationConfigParameter("group-by-window", "Window duration for the moving average computation", Some("1 minute"))

  val Watermark = DurationConfigParameter("watermark", "Late events watermark duration: how long to wait for late events", Some("1 minute"))

  override def configParameters = Vector(GroupByWindow, Watermark)
  override def createLogic = new SparkStreamletLogic {
    val watermark     = Watermark.value
    val groupByWindow = GroupByWindow.value

    //tag::docs-aggregationQuery-example[]
    override def buildStreamingQueries = {
      val dataset   = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Update).toQueryExecution
    }
    //end::docs-aggregationQuery-example[]

    private def process(inDataset: Dataset[CallRecord]): Dataset[AggregatedCallStats] = {
      val query =
        inDataset
          .withColumn("ts", $"timestamp".cast(TimestampType))
          .withWatermark("ts", s"${watermark.toMillis()} milliseconds")
          .groupBy(window($"ts", s"${groupByWindow.toMillis()} milliseconds"))
          .agg(avg($"duration").as("avgCallDuration"), sum($"duration").as("totalCallDuration"))
          .withColumn("windowDuration", $"window.end".cast(LongType) - $"window.start".cast(LongType))

      query
        .select($"window.start".cast(LongType).as("startTime"), $"windowDuration", $"avgCallDuration", $"totalCallDuration")
        .as[AggregatedCallStats]
    }
  }
} 
Example 183
Source File: CSVProfiler.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.profilers

import io.gzet.profilers.field.{CardinalityProfiler, EmptinessProfiler, MaskBasedProfiler, PredefinedMasks}
import io.gzet.profilers.raw.{AsciiProfiler, RowProfiler, StructuralProfiler}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.elasticsearch.spark.sql._

object CSVProfiler {

  Logger.getLogger("akka").setLevel(Level.WARN)
  Logger.getLogger("org").setLevel(Level.WARN)

  val HEADER = Array(
    "rowId",
    "firstName",
    "lastName",
    "email",
    "gender",
    "ipAddress",
    "shaPass"
  )

  def main(args: Array[String]) {

    val spark = SparkSession.builder().appName("Profiler").getOrCreate()
    import spark.implicits._

    val rawDf: Dataset[String] = spark.read.text(args.head).map(_.getAs[String](0))
    rawDf.cache()
    rawDf.count()

    val tabDf: Dataset[Array[String]] = Utils.split(rawDf, delimiter = ",")

    val sources = spark.sparkContext.broadcast(rawDf.inputFiles)
    val ingestTime = spark.sparkContext.broadcast(new java.util.Date().getTime)

    val headers = spark.sparkContext.broadcast(HEADER.zipWithIndex.map(_.swap).toMap)

    RowProfiler.apply().profile(rawDf).map({ report =>
      ("row.count", report.metricValue, Map[String, String]())
    }).union(AsciiProfiler.apply().profile(rawDf).map({ report =>
      ("row.ascii", report.metricValue, Map(Tags.ASCII_NAME -> report.ascii, Tags.ASCII_BINARY -> report.binary))
    })).union(StructuralProfiler.apply(delimiter = ",").profile(rawDf).map({ report =>
      ("field.count", report.metricValue, Map(Tags.EXTRA -> report.description, Tags.FIELD_COUNT -> report.fields.toString))
    })).union(EmptinessProfiler.apply().profile(tabDf).map({ report =>
      ("field.emptiness", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString))
    })).union(CardinalityProfiler.apply(topN = 5).profile(tabDf).map({ report =>
      ("field.cardinality", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(",")))
    })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.ASCIICLASS_LOWGRAIN).profile(tabDf).map({ report =>
      ("field.ascii.low", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(",")))
    })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.ASCIICLASS_HIGHGRAIN).profile(tabDf).map({ report =>
      ("field.ascii.high", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(",")))
    })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.POP_CHECKS).profile(tabDf).map({ report =>
      ("field.pop.check", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(",")))
    })).union(MaskBasedProfiler.apply(topN = 5, PredefinedMasks.CLASS_FREQS).profile(tabDf).map({ report =>
      ("field.class.freq", report.metricValue, Map(Tags.FIELD_IDX -> report.field.toString, Tags.MASK -> report.mask, Tags.EXTRA -> report.description.map(l => s"[$l]").mkString(",")))
    })).map({ case (metricName, metricValue, tags) =>
      val newTags = {
        if (tags.contains(Tags.FIELD_IDX)) {
          val fieldIdx = tags.get(Tags.FIELD_IDX).get.toInt
          val fieldName = headers.value.getOrElse(fieldIdx, "NA")
          tags ++ Map(Tags.FIELD_NAME -> fieldName)
        } else {
          tags
        }
      }

      ReportBuilder.create
        .withName(metricName)
        .withMetric(metricValue)
        .withSources(sources.value)
        .withTime(ingestTime.value)
        .withTags(newTags)
        .build

    }).toDF().saveToEs("profiler/mock")

  }

} 
Example 184
Source File: JsonDynamicDeserializer.scala    From gimel   with Apache License 2.0 5 votes vote down vote up
package com.paypal.gimel.deserializers.generic

import org.apache.spark.sql.{DataFrame, Dataset}

import com.paypal.gimel.deserializers.generic.conf.{GenericDeserializerConfigs, GenericDeserializerConfiguration, GenericDeserializerConstants}
import com.paypal.gimel.serde.common.Deserializer


class JsonDynamicDeserializer extends Deserializer {
  override def deserialize(dataframe: DataFrame, props: Map[String, Any] = Map.empty): DataFrame = {
    val conf = new GenericDeserializerConfiguration(props)
    if (!dataframe.columns.contains(conf.columnToDeserialize)) {
      throw new IllegalArgumentException(
        s"""
           | Column to Deserialize does not exist in dataframe --> ${conf.columnToDeserialize}
           | Please set the property ${GenericDeserializerConfigs.columnToDeserializeKey}
           | Note: Default value is "${GenericDeserializerConstants.columnToDeserialize}"
         """.stripMargin
      )
    } else {
      val sparkSession = dataframe.sparkSession
      import sparkSession.implicits._
      val deserializedDS: Dataset[String] = dataframe.map { eachRow => eachRow.getAs(conf.columnToDeserialize).asInstanceOf[Array[Byte]].map(_.toChar).mkString }
      sparkSession.read.json(deserializedDS)
    }
  }
} 
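A minimal usage sketch, assuming an inbound DataFrame named kafkaDf whose value column holds JSON payloads as byte arrays; the column name is a placeholder:

    val deserializer = new JsonDynamicDeserializer
    val props = Map(GenericDeserializerConfigs.columnToDeserializeKey -> "value")
    val jsonDf = deserializer.deserialize(kafkaDf, props)
    jsonDf.printSchema() // schema inferred from the JSON payloads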
Example 185
Source File: JsonStaticDeserializer.scala    From gimel   with Apache License 2.0 5 votes vote down vote up
package com.paypal.gimel.deserializers.generic

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._

import com.paypal.gimel.deserializers.generic.conf.{GenericDeserializerConfigs, GenericDeserializerConfiguration, GenericDeserializerConstants}
import com.paypal.gimel.serde.common.Deserializer
import com.paypal.gimel.serde.common.utils.SQLDataTypesUtils


class JsonStaticDeserializer extends Deserializer {
  override def deserialize(dataframe: DataFrame, props: Map[String, Any] = Map.empty): DataFrame = {
    val conf = new GenericDeserializerConfiguration(props)
    if (!dataframe.columns.contains(conf.columnToDeserialize)) {
      throw new IllegalArgumentException(
        s"""
           | Column to Deserialize does not exist in dataframe --> ${conf.columnToDeserialize}
           | Please set the property ${GenericDeserializerConfigs.columnToDeserializeKey}
           | Note: Default value is "${GenericDeserializerConstants.columnToDeserialize}"
         """.stripMargin
      )
    } else {
      if (conf.fieldsBindToJson.isEmpty) {
        throw new Exception ("You need to provide fields in json by setting " + GenericDeserializerConfigs.fieldsBindToJson + " property.")
      } else {
        val schema = SQLDataTypesUtils.getSchemaFromBindToFieldsJson(conf.fieldsBindToJson)
        dataframe.selectExpr("cast (" + conf.columnToDeserialize + " as string) as json")
          .select(from_json(col("json"), schema).as("data")).select("data.*")
      }
    }
  }
} 
Example 186
Source File: TestSparkStreamletContext.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package cloudflow.spark
package testkit

import java.nio.file.attribute.FileAttribute

import com.typesafe.config._

import scala.reflect.runtime.universe._
import scala.concurrent.duration._
import org.apache.spark.sql.{ Dataset, Encoder, SparkSession }
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.{ OutputMode, StreamingQuery, Trigger }
import cloudflow.streamlets._
import org.apache.spark.sql.catalyst.InternalRow


class TestSparkStreamletContext(override val streamletRef: String,
                                session: SparkSession,
                                inletTaps: Seq[SparkInletTap[_]],
                                outletTaps: Seq[SparkOutletTap[_]],
                                override val config: Config = ConfigFactory.empty)
    extends SparkStreamletContext(StreamletDefinition("appId", "appVersion", streamletRef, "streamletClass", List(), List(), config),
                                  session) {
  val ProcessingTimeInterval = 1500.milliseconds
  override def readStream[In](inPort: CodecInlet[In])(implicit encoder: Encoder[In], typeTag: TypeTag[In]): Dataset[In] =
    inletTaps
      .find(_.portName == inPort.name)
      .map(_.instream.asInstanceOf[MemoryStream[In]].toDF.as[In])
      .getOrElse(throw TestContextException(inPort.name, s"Bad test context, could not find source for inlet ${inPort.name}"))

  override def writeStream[Out](stream: Dataset[Out],
                                outPort: CodecOutlet[Out],
                                outputMode: OutputMode)(implicit encoder: Encoder[Out], typeTag: TypeTag[Out]): StreamingQuery = {
    // RateSource can only work with a microBatch query because it contains no data at time zero.
    // Trigger.Once requires data at start to work.
    val trigger = if (isRateSource(stream)) {
      Trigger.ProcessingTime(ProcessingTimeInterval)
    } else {
      Trigger.Once()
    }
    val streamingQuery = outletTaps
      .find(_.portName == outPort.name)
      .map { outletTap ⇒
        stream.writeStream
          .outputMode(outputMode)
          .format("memory")
          .trigger(trigger)
          .queryName(outletTap.queryName)
          .start()
      }
      .getOrElse(throw TestContextException(outPort.name, s"Bad test context, could not find destination for outlet ${outPort.name}"))
    streamingQuery
  }

  override def checkpointDir(dirName: String): String = {
    val fileAttributes: Array[FileAttribute[_]] = Array()
    val tmpDir                                  = java.nio.file.Files.createTempDirectory("spark-test", fileAttributes: _*)
    tmpDir.toFile.getAbsolutePath
  }

  private def isRateSource(stream: Dataset[_]): Boolean = {
    import org.apache.spark.sql.execution.command.ExplainCommand
    val explain = ExplainCommand(stream.queryExecution.logical, true)
    val res     = session.sessionState.executePlan(explain).executedPlan.executeCollect()
    res.exists((row: InternalRow) => row.getString(0).contains("org.apache.spark.sql.execution.streaming.sources.RateStreamProvider"))
  }

}

case class TestContextException(portName: String, msg: String) extends RuntimeException(msg) 
Example 187
Source File: SparkAvroDecoder.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package cloudflow.spark.avro

import org.apache.log4j.Logger

import java.io.ByteArrayOutputStream

import scala.reflect.runtime.universe._

import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord }
import org.apache.avro.io.{ DecoderFactory, EncoderFactory }
import org.apache.spark.sql.{ Dataset, Encoder, Row }
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.StructType
import org.apache.avro.Schema

import cloudflow.spark.sql.SQLImplicits._

case class EncodedKV(key: String, value: Array[Byte])

case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) {

  val encoder: Encoder[T]                           = implicitly[Encoder[T]]
  val sqlSchema: StructType                         = encoder.schema
  val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema)
  @transient lazy val _avroSchema                   = new Schema.Parser().parse(avroSchema)
  @transient lazy val rowConverter                  = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema)
  @transient lazy val datumReader                   = new GenericDatumReader[GenericRecord](_avroSchema)
  @transient lazy val decoder                       = DecoderFactory.get
  def decode(bytes: Array[Byte]): Row = {
    val binaryDecoder = decoder.binaryDecoder(bytes, null)
    val record        = datumReader.read(null, binaryDecoder)
    rowConverter(record).asInstanceOf[GenericRow]
  }

}


case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) {

  @transient lazy val log = Logger.getLogger(getClass.getName)

  val BufferSize = 5 * 1024 // 5 Kb

  val encoder                     = implicitly[Encoder[T]]
  val sqlSchema                   = encoder.schema
  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)

  val recordName                = "topLevelRecord" // ???
  val recordNamespace           = "recordNamespace" // ???
  @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace)

  // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage
  def rowToBytes(row: Row): Array[Byte] = {
    val genRecord = converter(row).asInstanceOf[GenericRecord]
    if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord")
    val datumWriter   = new GenericDatumWriter[GenericRecord](_avroSchema)
    val avroEncoder   = EncoderFactory.get
    val byteArrOS     = new ByteArrayOutputStream(BufferSize)
    val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null)
    datumWriter.write(genRecord, binaryEncoder)
    binaryEncoder.flush()
    byteArrOS.toByteArray
  }

  def encode(dataset: Dataset[T]): Dataset[Array[Byte]] =
    dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]]

  // Note to self: I'm not sure how heavy this chain of transformations is
  def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = {
    val encoder             = encoderFor[T]
    implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind()
    dataset.map { value ⇒
      val key         = keyFun(value)
      val internalRow = encoder.toRow(value)
      val row         = rowEncoder.fromRow(internalRow)
      val bytes       = rowToBytes(row)
      EncodedKV(key, bytes)
    }
  }

} 
Example 188
Source File: SparkEgressSpec.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package cloudflow.spark

import org.apache.spark.sql.{ Dataset, Encoder, SparkSession }
import org.apache.spark.sql.streaming.{ OutputMode, Trigger }
import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.avro._
import cloudflow.spark.testkit._
import cloudflow.spark.sql.SQLImplicits._

class SparkEgressSpec extends SparkScalaTestSupport {
  "SparkEgress" should {
    "materialize streaming data to sink" in {

      val testKit = SparkStreamletTestkit(session)

      def asCollection[T: Encoder](session: SparkSession, queryName: String): List[T] =
        session.sql(s"select * from $queryName").as[T].collect().toList

      val instance = new MySparkEgress()

      // setup inlet tap on inlet port
      val in: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in)

      // build data and send to inlet tap
      val data = (1 to 10).map(i ⇒ Data(i, s"name$i"))
      in.addData(data)

      val run = testKit.run(instance, Seq(in), Seq.empty)
      run.failures mustBe ('empty)
      run.totalRows mustBe (20)
      val r1 = asCollection[String](session, "allNames")
      val r2 = asCollection[String](session, "allNamesUpper")

      // assert
      r1 must contain("name1")
      r2 must contain("NAME1")
    }
  }
}

class MySparkEgress extends SparkStreamlet {
  val in    = AvroInlet[Data]("in")
  val shape = StreamletShape(in)
  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries =
      process(readStream(in))

    private def process(inDataset: Dataset[Data]): StreamletQueryExecution = {
      val q1 = inDataset
        .map { d ⇒
          d.name
        }
        .writeStream
        .format("memory")
        .option("truncate", false)
        .queryName("allNames")
        .outputMode(OutputMode.Append())
        .trigger(Trigger.Once)
        .start()

      val q2 = inDataset
        .map { d ⇒
          d.name.toUpperCase
        }
        .writeStream
        .format("memory")
        .option("truncate", false)
        .queryName("allNamesUpper")
        .outputMode(OutputMode.Append())
        .trigger(Trigger.Once)
        .start()
      StreamletQueryExecution(q1, q2)
    }
  }
} 
Example 189
Source File: SparkJoin3Spec.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package cloudflow.spark

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode
import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.avro._
import cloudflow.spark.testkit._
import cloudflow.spark.sql.SQLImplicits._

class SparkJoin3Spec extends SparkScalaTestSupport {

  "SparkJoin3" should {
    "process streaming data" in {

      val testKit = SparkStreamletTestkit(session)

      val instance = new MySparkJoin3()

      // setup inlet tap on inlet port
      val in0: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in0)
      val in1: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in1)
      val in2: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in2)

      // setup outlet tap on outlet port
      val out: SparkOutletTap[Simple] = testKit.outletAsTap[Simple](instance.out)

      // build data and send to inlet tap
      val List(d1, d2, d3) = (1 to 30).map(i ⇒ Data(i, s"name$i")).sliding(10, 10).toList
      in0.addData(d1)
      in1.addData(d2)
      in2.addData(d3)

      val run = testKit.run(instance, Seq(in0, in1, in2), Seq(out))
      run.totalRows must be(30)

      // get data from outlet tap
      val results = out.asCollection(session)

      // assert
      results must contain(Simple("name1"))
      results must contain(Simple("name11"))
      results must contain(Simple("name21"))
      (results must have).length(30)
    }
  }
}
// create sparkStreamlet
class MySparkJoin3 extends SparkStreamlet {
  // comment: all inlets could be in different formats, one proto, one avro, one csv..
  val in0 = AvroInlet[Data]("in0")
  val in1 = AvroInlet[Data]("in1")
  val in2 = AvroInlet[Data]("in2")
  val out = AvroOutlet[Simple]("out", _.name)

  val shape = StreamletShape(out).withInlets(in0, in1, in2)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val dataset0                   = readStream(in0)
      val dataset1                   = readStream(in1)
      val dataset2                   = readStream(in2)
      val outStream: Dataset[Simple] = process(dataset0, dataset1, dataset2)
      val query                      = writeStream(outStream, out, OutputMode.Append)
      StreamletQueryExecution(query)
    }

    private def process(in0: Dataset[Data], in1: Dataset[Data], in2: Dataset[Data]): Dataset[Simple] =
      in0.union(in1.union(in2)).select($"name").as[Simple]
  }
} 
Example 190
Source File: SparkIngressSpec.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package cloudflow.spark

import scala.collection.immutable.Seq
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.execution.streaming.MemoryStream
import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.avro._
import cloudflow.spark.testkit._
import cloudflow.spark.sql.SQLImplicits._

class SparkIngressSpec extends SparkScalaTestSupport {

  "SparkIngress" should {
    "produce elements to its outlet" in {

      val testKit  = SparkStreamletTestkit(session)
      val instance = new MySparkIngress()

      // setup outlet tap on outlet port
      val out: SparkOutletTap[Data] = testKit.outletAsTap[Data](instance.out)

      val run = testKit.run(instance, Seq.empty, Seq(out))

      // get processed rows from the run
      run.totalRows must be(10)

      // get data from outlet tap
      val results = out.asCollection(session)

      // assert
      results must contain(Data(1, "name1"))
    }
  }
}
// create sparkStreamlet
class MySparkIngress extends SparkStreamlet {
  val out   = AvroOutlet[Data]("out", d ⇒ d.id.toString)
  val shape = StreamletShape(out)

  override def createLogic() = new SparkStreamletLogic {
    private def process: Dataset[Data] = {
      implicit val sqlCtx = session.sqlContext
      val data            = (1 to 10).map(i ⇒ Data(i, s"name$i"))
      val m               = MemoryStream[Data]
      m.addData(data)
      m.toDF.as[Data]
    }
    override def buildStreamingQueries = {
      val outStream: Dataset[Data] = process
      require(outStream.isStreaming, "The Dataset created by an Ingress must be a Streaming Dataset")
      val query = writeStream(outStream, out, OutputMode.Append)
      StreamletQueryExecution(query)
    }
  }
} 
Example 191
Source File: SparkRandomGenIngress.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package cloudflow.sparkdoc

import scala.util.Random

import cloudflow.spark._
import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.sql.SQLImplicits._

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode
import java.sql.Timestamp

class SparkRandomGenDataIngress extends SparkStreamlet {
  val out   = AvroOutlet[Data]("out", d ⇒ d.key)
  val shape = StreamletShape(out)

  case class Rate(timestamp: Timestamp, value: Long)

  override def createLogic() = new SparkStreamletLogic {

    override def buildStreamingQueries =
      writeStream(process, out, OutputMode.Append).toQueryExecution

    private def process: Dataset[Data] = {

      val recordsPerSecond = 10

      val keyGen: () ⇒ String = () ⇒ if (Random.nextDouble() < 0.5) "keyOne" else "keyTwo"

      val rateStream = session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .load()
        .as[Rate]

      rateStream.map {
        case Rate(_, value) ⇒ Data(keyGen(), value.toInt)
      }
    }
  }
} 
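The rate source used above also works outside a streamlet. A minimal sketch of the same generator as a plain structured-streaming job, with the console sink standing in for the Avro outlet; the rows-per-second value is a placeholder:

    import java.sql.Timestamp
    import scala.util.Random
    import org.apache.spark.sql.SparkSession

    case class Rate(timestamp: Timestamp, value: Long)

    val spark = SparkSession.builder().master("local[*]").appName("rate-demo").getOrCreate()
    import spark.implicits._

    val keyed = spark.readStream
      .format("rate")
      .option("rowsPerSecond", 10)
      .load()
      .as[Rate]
      .map(r => (if (Random.nextDouble() < 0.5) "keyOne" else "keyTwo", r.value))
      .toDF("key", "value")

    keyed.writeStream.format("console").start().awaitTermination()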
Example 192
Source File: ClusteringEvaluatorSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Dataset


class ClusteringEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  @transient var irisDataset: Dataset[_] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    irisDataset = spark.read.format("libsvm").load("../data/mllib/iris_libsvm.txt")
  }

  test("params") {
    ParamsSuite.checkParams(new ClusteringEvaluator)
  }

  test("read/write") {
    val evaluator = new ClusteringEvaluator()
      .setPredictionCol("myPrediction")
      .setFeaturesCol("myLabel")
    testDefaultReadWrite(evaluator)
  }

  
  test("squared euclidean Silhouette") {
    val evaluator = new ClusteringEvaluator()
        .setFeaturesCol("features")
        .setPredictionCol("label")

    assert(evaluator.evaluate(irisDataset) ~== 0.6564679231 relTol 1e-5)
  }

  test("number of clusters must be greater than one") {
    val singleClusterDataset = irisDataset.where($"label" === 0.0)
    val evaluator = new ClusteringEvaluator()
      .setFeaturesCol("features")
      .setPredictionCol("label")

    val e = intercept[AssertionError]{
      evaluator.evaluate(singleClusterDataset)
    }
    assert(e.getMessage.contains("Number of clusters must be greater than one"))
  }

} 
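A sketch of the usual pairing of this evaluator with a clustering model, close to the Spark ML documentation example; the input path and k are placeholders:

    import org.apache.spark.ml.clustering.KMeans
    import org.apache.spark.ml.evaluation.ClusteringEvaluator

    val data = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    val model = new KMeans().setK(2).setSeed(1L).fit(data)
    val predictions = model.transform(data)

    val evaluator = new ClusteringEvaluator()
      .setFeaturesCol("features")
      .setPredictionCol("prediction")

    println(s"Silhouette = ${evaluator.evaluate(predictions)}")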
Example 193
Source File: MovingAverageSparklet.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package sensors

import cloudflow.streamlets.StreamletShape

import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

class MovingAverageSparklet extends SparkStreamlet {

  val in    = AvroInlet[Data]("in")
  val out   = AvroOutlet[Agg]("out", _.src)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val dataset   = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }

    private def process(inDataset: Dataset[Data]): Dataset[Agg] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withWatermark("ts", "1 minutes")
        .groupBy(window($"ts", "1 minute", "30 seconds"), $"src", $"gauge")
        .agg(avg($"value").as("avg"))
      query.select($"src", $"gauge", $"avg".as("value")).as[Agg]
    }
  }

} 
Example 194
Source File: CallStatsAggregator.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package carly.aggregator

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.streaming.OutputMode
import cloudflow.spark.sql.SQLImplicits._
import org.apache.log4j.{ Level, Logger }

import carly.data._
class CallStatsAggregator extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  //tag::docs-schemaAware-example[]
  val in    = AvroInlet[CallRecord]("in")
  val out   = AvroOutlet[AggregatedCallStats]("out", _.startTime.toString)
  val shape = StreamletShape(in, out)
  //end::docs-schemaAware-example[]

  val GroupByWindow = DurationConfigParameter("group-by-window", "Window duration for the moving average computation", Some("1 minute"))

  val Watermark = DurationConfigParameter("watermark", "Late events watermark duration: how long to wait for late events", Some("1 minute"))

  override def configParameters = Vector(GroupByWindow, Watermark)
  override def createLogic = new SparkStreamletLogic {
    val watermark     = Watermark.value
    val groupByWindow = GroupByWindow.value
//    val t0 = System.currentTimeMillis() // serialization error!

    //tag::docs-aggregationQuery-example[]
    override def buildStreamingQueries = {
      val dataset   = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Update).toQueryExecution
    }

    private def process(inDataset: Dataset[CallRecord]): Dataset[AggregatedCallStats] = {
      val query =
        inDataset
          .withColumn("ts", $"timestamp".cast(TimestampType))
          .withWatermark("ts", s"${watermark.toMillis()} milliseconds")
          .groupBy(window($"ts", s"${groupByWindow.toMillis()} milliseconds"))
          .agg(avg($"duration").as("avgCallDuration"), sum($"duration").as("totalCallDuration"))
          .withColumn("windowDuration", $"window.end".cast(LongType) - $"window.start".cast(LongType))

      query
        .select($"window.start".cast(LongType).as("startTime"), $"windowDuration", $"avgCallDuration", $"totalCallDuration")
        .as[AggregatedCallStats]
    }
    //end::docs-aggregationQuery-example[]
  }
} 
Example 195
Source File: CallRecordGeneratorIngress.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package carly.aggregator

import java.sql.Timestamp

import scala.util.Random
import scala.concurrent.duration._

import org.apache.spark.sql.{ Dataset, SparkSession }
import org.apache.spark.sql.streaming.OutputMode

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.LongType

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.sql.SQLImplicits._
import carly.data.CallRecord
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.log4j.{ Level, Logger }

case class Rate(timestamp: Timestamp, value: Long)

class CallRecordGeneratorIngress extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  val RecordsPerSecond = IntegerConfigParameter("records-per-second", "Records per second to process.", Some(50))

  override def configParameters = Vector(RecordsPerSecond)

  val out   = AvroOutlet[CallRecord]("out", _.user)
  val shape = StreamletShape(out)

  override def createLogic() = new SparkStreamletLogic {
    val recordsPerSecond = RecordsPerSecond.value
    override def buildStreamingQueries = {
      val outStream = DataGenerator.mkData(super.session, recordsPerSecond)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }
  }
}

object DataGenerator {
  def mkData(session: SparkSession, recordsPerSecond: Int): Dataset[CallRecord] = {
    // do we need to expose this through configuration?

    val MaxTime           = 2.hours.toMillis
    val MaxUsers          = 100000
    val TS0               = new java.sql.Timestamp(0)
    val ZeroTimestampProb = 0.05 // error rate

    // Random Data Generator
    val usersUdf     = udf(() ⇒ "user-" + Random.nextInt(MaxUsers))
    val directionUdf = udf(() ⇒ if (Random.nextDouble() < 0.5) "incoming" else "outgoing")

    // Time-biased randomized filter - 1/2 hour cycles
    val sinTime: Long ⇒ Double                   = t ⇒ Math.sin((t / 1000 % 1800) * 1.0 / 1800 * Math.PI)
    val timeBoundFilter: Long ⇒ Double ⇒ Boolean = t ⇒ prob ⇒ (sinTime(t) + 0.5) > prob
    val timeFilterUdf                            = udf((ts: java.sql.Timestamp, rng: Double) ⇒ timeBoundFilter(ts.getTime)(rng))
    val zeroTimestampUdf = udf { (ts: java.sql.Timestamp, rng: Double) ⇒
      if (rng < ZeroTimestampProb) {
        TS0
      } else {
        ts
      }
    }

    val rateStream = session.readStream
      .format("rate")
      .option("rowsPerSecond", recordsPerSecond)
      .load()
      .as[Rate]

    val randomDataset = rateStream.withColumn("rng", rand()).withColumn("tsRng", rand())
    val sampledData = randomDataset
      .where(timeFilterUdf($"timestamp", $"rng"))
      .withColumn("user", usersUdf())
      .withColumn("other", usersUdf())
      .withColumn("direction", directionUdf())
      .withColumn("duration", (round(abs(rand()) * MaxTime)).cast(LongType))
      .withColumn("updatedTimestamp", zeroTimestampUdf($"timestamp", $"tsRng"))
      .select($"user", $"other", $"direction", $"duration", $"updatedTimestamp".as("timestamp"))
      .as[CallRecord]
    sampledData
  }
} 
Example 196
Source File: SparkOutput.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package swissknife.spark

import cloudflow.streamlets.StreamletShape

import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

import swissknife.data.Data

class SparkOutput extends SparkStreamlet {

  val in    = AvroInlet[Data]("in")
  val shape = StreamletShape(in)

  override def createLogic() = new SparkStreamletLogic {
    val sparkLocality = context.session.conf.getOption("spark.locality.wait").getOrElse("")
    val feedbackMsg = s"locality=[$sparkLocality]"

    override def buildStreamingQueries = {
      val query   = readStream(in)
        .withColumn("payload", lit(feedbackMsg)) // we add this to the output to make it observable from the outside
        .writeStream
        .format("console")
        .option("truncate","false")
        .start
      query.toQueryExecution
    }
  }

} 
Example 197
Source File: SparkCounter.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package swissknife.spark

import cloudflow.streamlets.{StreamletShape, StringConfigParameter}
import cloudflow.streamlets.avro._
import cloudflow.spark.{SparkStreamlet, SparkStreamletLogic}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode
import swissknife.data.Data

class SparkCounter extends SparkStreamlet {

  val in    = AvroInlet[Data]("in")
  val out   = AvroOutlet[Data]("out", _.src)
  val shape = StreamletShape(in, out)

  val configurableMessage = StringConfigParameter("configurable-message", "Configurable message.", Some("spark-original"))

  override def configParameters = Vector(configurableMessage)

  override def createLogic() = new SparkStreamletLogic {
    val msg = configurableMessage.value
    override def buildStreamingQueries = {
      val dataset   = readStream(in)
      val outStream = process(dataset, msg)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }

    private def process(inDataset: Dataset[Data], message: String): Dataset[Data] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withColumn("updated_src", concat($"src", lit("-spark")))
        .withWatermark("ts", "0 seconds")
        .groupBy(window($"ts", "5 seconds"), $"updated_src")
        .agg(max($"count").as("count"))
      query.select($"updated_src".as("src"), $"window.start".as("timestamp"), lit(message).as("payload"), $"count").as[Data]
    }
  }

} 
Example 198
Source File: SparkDataGenerator.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package swissknife.spark

import java.sql.Timestamp

import cloudflow.streamlets.{ IntegerConfigParameter, StreamletShape }
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.functions._

import cloudflow.spark.sql.SQLImplicits._

import swissknife.data.Data

case class Rate(timestamp: Timestamp, value: Long)

class SparkDataGenerator extends SparkStreamlet {
  val out   = AvroOutlet[Data]("out", d ⇒ d.src)
  val shape = StreamletShape(out)

  val RecordsPerSecond = IntegerConfigParameter("records-per-second", "Records per second to produce.", Some(1))

  override def configParameters = Vector(RecordsPerSecond)

  override def createLogic() = new SparkStreamletLogic {

    override def buildStreamingQueries =
      writeStream(process, out, OutputMode.Append).toQueryExecution

    private def process: Dataset[Data] = {
      val recordsPerSecond = RecordsPerSecond.value
      session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .load()
        .select(lit("origin").as("src"), $"timestamp", lit("").as("payload"), $"value".as("count"))
        .as[Data]
    }
  }
} 
Example 199
Source File: RichSparkFunctionsSpec.scala    From lighthouse   with Apache License 2.0 5 votes vote down vote up
package be.dataminded.lighthouse.pipeline

import java.io.ByteArrayOutputStream

import be.dataminded.lighthouse.testing.SharedSparkSession
import better.files._
import org.apache.spark.sql.Dataset
import org.apache.spark.storage.StorageLevel
import org.scalatest.BeforeAndAfter
import org.scalatest.funspec.AnyFunSpec
import org.scalatest.matchers.should.Matchers

class RichSparkFunctionsSpec extends AnyFunSpec with Matchers with SharedSparkSession with BeforeAndAfter {

  import spark.implicits._

  describe("SparkFunctions with a DataSet inside should have extra functionality") {

    val function = SparkFunction.of(Seq(1, 2, 3, 4, 5).toDS())

    it("can cache") {
      function.cache().run(spark).storageLevel should equal(StorageLevel.MEMORY_ONLY)
    }

    it("can drop the cache") {
      function.cache().dropCache().run(spark).storageLevel should equal(StorageLevel.NONE)
    }

    it("can be written to a sink") {
      function.write(OrcSink("target/output/orc")).run(spark)

      file"target/output/orc".exists should be(true)
    }

    it("can be written to multiple sinks") {
      function.write(OrcSink("target/output/orc"), OrcSink("target/output/orc2")).run(spark)

      file"target/output/orc".exists should be(true)
      file"target/output/orc2".exists should be(true)
    }

    it("is being cached when writing to multiple sinks for performance") {
      val result = function.write(OrcSink("target/output/orc"), OrcSink("target/output/orc2")).run(spark)

      result.storageLevel should equal(StorageLevel.MEMORY_ONLY)
    }

    it("can easily be counted") {
      function.count().run(spark) should equal(5)
    }

    it("can print the schema") {
      val stream = new ByteArrayOutputStream()
      Console.withOut(stream) {
        function.printSchema().run(spark)
      }
      stream.toString() should include("value: integer (nullable = false)")
    }

    it("can be be used as a Dataset") {
      function.as[Int].run(spark) shouldBe a[Dataset[_]]
    }
  }

  after {
    file"target/output/orc".delete(true)
    file"target/output/orc2".delete(true)
  }
} 
Example 200
Source File: AvroDataLink.scala    From lighthouse   with Apache License 2.0 5 votes vote down vote up
package be.dataminded.lighthouse.datalake

import org.apache.spark.sql.{DataFrame, Dataset, SaveMode}

class AvroDataLink(
    val path: LazyConfig[String],
    saveMode: SaveMode = SaveMode.Overwrite,
    partitionedBy: List[String] = List.empty,
    options: Map[String, String] = Map.empty
) extends PathBasedDataLink {

  override def doRead(path: String): DataFrame = {
    spark.read
      .format("com.databricks.spark.avro")
      .options(options)
      .load(path)
  }

  override def doWrite[T](dataset: Dataset[T], path: String): Unit = {
    dataset.write
      .format("com.databricks.spark.avro")
      .partitionBy(partitionedBy: _*)
      .options(options)
      .mode(saveMode)
      .save(path)
  }
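Under the hood this link delegates to the spark-avro data source. A sketch of the equivalent direct reader/writer calls, with placeholder paths:

    val df = spark.read
      .format("com.databricks.spark.avro")
      .load("/data/in/events")

    df.write
      .format("com.databricks.spark.avro")
      .mode("overwrite")
      .save("/data/out/events")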
}