org.apache.spark.sql.DataFrame Scala Examples

The following examples show how to use org.apache.spark.sql.DataFrame. Each example is drawn from an open-source project; the source file, project, and license are listed above the code.
Example 1
Source File: StreamingConsumer.scala    From Scala-Programming-Projects   with MIT License
package coinyser

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.functions._

object StreamingConsumer {
  def fromJson(df: DataFrame): Dataset[Transaction] = {
    import df.sparkSession.implicits._
    val schema = Seq.empty[Transaction].toDS().schema
    df.select(from_json(col("value").cast("string"), schema).alias("v"))
      .select("v.*").as[Transaction]
  }

  def transactionStream(implicit spark: SparkSession, config: KafkaConfig): Dataset[Transaction] =
    fromJson(spark.readStream.format("kafka")
      .option("kafka.bootstrap.servers", config.bootStrapServers)
      .option("startingoffsets", "earliest")
      .option("subscribe", config.transactionsTopic)
      .load()
    )

} 
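
For context, a minimal sketch of wiring this consumer to a console sink is shown below; it assumes the surrounding coinyser project supplies the SparkSession and KafkaConfig implicits and that the spark-sql-kafka package is on the classpath:

package coinyser

import org.apache.spark.sql.SparkSession

object StreamingConsumerSketch {
  // Prints parsed transactions as they arrive; the caller provides the project's
  // KafkaConfig (bootstrap servers, topic) and an active SparkSession.
  def start()(implicit spark: SparkSession, config: KafkaConfig): Unit =
    StreamingConsumer.transactionStream
      .writeStream
      .format("console")
      .outputMode("append")
      .start()
      .awaitTermination()
}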
Example 2
Source File: MultilayerPerceptronClassifierWrapper.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel,
    val labelCount: Long,
    val layers: Array[Int],
    val weights: Array[Double]
  ) extends MLWritable {

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
  }

  
  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val labelCount = (rMetadata \ "labelCount").extract[Long]
      val layers = (rMetadata \ "layers").extract[Array[Int]]
      val weights = (rMetadata \ "weights").extract[Array[Double]]

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline, labelCount, layers, weights)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("labelCount" -> instance.labelCount) ~
        ("layers" -> instance.layers.toSeq) ~
        ("weights" -> instance.weights.toArray.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
} 
Example 3
Source File: DefaultSource.scala    From spark-snowflake   with Apache License 2.0
package net.snowflake.spark.snowflake

import net.snowflake.spark.snowflake.streaming.SnowflakeSink
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_SHORT_NAME
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.slf4j.LoggerFactory


// The original file wraps these methods in spark-snowflake's DefaultSource class; the class
// header below is an assumption added for readability, and the read-path
// createRelation(sqlContext, parameters) overload called at the end is not shown in this excerpt.
class DefaultSource(jdbcWrapper: JDBCWrapper)
  extends RelationProvider
    with CreatableRelationProvider
    with StreamSinkProvider
    with DataSourceRegister {

  private val log = LoggerFactory.getLogger(getClass)

  override def shortName(): String = SNOWFLAKE_SOURCE_SHORT_NAME
  override def createRelation(sqlContext: SQLContext,
                              saveMode: SaveMode,
                              parameters: Map[String, String],
                              data: DataFrame): BaseRelation = {

    val params = Parameters.mergeParameters(parameters)
    // check spark version for push down
    if (params.autoPushdown) {
      SnowflakeConnectorUtils.checkVersionAndEnablePushdown(
        sqlContext.sparkSession
      )
    }
    // pass parameters to pushdown functions
    pushdowns.setGlobalParameter(params)
    val table = params.table.getOrElse {
      throw new IllegalArgumentException(
        "For save operations you must specify a Snowfake table name with the 'dbtable' parameter"
      )
    }

    def tableExists: Boolean = {
      val conn = jdbcWrapper.getConnector(params)
      try {
        jdbcWrapper.tableExists(conn, table.toString)
      } finally {
        conn.close()
      }
    }

    val (doSave, dropExisting) = saveMode match {
      case SaveMode.Append => (true, false)
      case SaveMode.Overwrite => (true, true)
      case SaveMode.ErrorIfExists =>
        if (tableExists) {
          sys.error(
            s"Table $table already exists! (SaveMode is set to ErrorIfExists)"
          )
        } else {
          (true, false)
        }
      case SaveMode.Ignore =>
        if (tableExists) {
          log.info(s"Table $table already exists -- ignoring save request.")
          (false, false)
        } else {
          (true, false)
        }
    }

    if (doSave) {
      val updatedParams = parameters.updated("overwrite", dropExisting.toString)
      new SnowflakeWriter(jdbcWrapper)
        .save(
          sqlContext,
          data,
          saveMode,
          Parameters.mergeParameters(updatedParams)
        )

    }

    createRelation(sqlContext, parameters)
  }

  override def createSink(sqlContext: SQLContext,
                          parameters: Map[String, String],
                          partitionColumns: Seq[String],
                          outputMode: OutputMode): Sink =
    new SnowflakeSink(sqlContext, parameters, partitionColumns, outputMode)
} 
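
A hedged sketch of the write path this source handles; the connection options are placeholders for the connector's standard sfURL/sfUser/... keys:

import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME
import org.apache.spark.sql.{SaveMode, SparkSession}

object SnowflakeWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("snowflake-write-sketch").getOrCreate()

    // Placeholder credentials; replace with real account settings.
    val sfOptions = Map(
      "sfURL" -> "<account>.snowflakecomputing.com",
      "sfUser" -> "<user>",
      "sfPassword" -> "<password>",
      "sfDatabase" -> "<database>",
      "sfSchema" -> "PUBLIC",
      "sfWarehouse" -> "<warehouse>"
    )

    spark.range(10).toDF("id")
      .write
      .format(SNOWFLAKE_SOURCE_NAME)    // resolves to the DefaultSource above
      .options(sfOptions)
      .option("dbtable", "test_table")  // required by createRelation (see the error message above)
      .mode(SaveMode.Overwrite)         // Overwrite -> doSave = true, dropExisting = true
      .save()
  }
}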
Example 4
Source File: DataFrameExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

import java.io.File

import scopt.OptionParser

import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.util.Utils


object DataFrameExample {

  case class Params(input: String = "data/mllib/sample_libsvm_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DataFrameExample") {
      head("DataFrameExample: an example app using DataFrame for ML.")
      opt[String]("input")
        .text(s"input path to dataframe")
        .action((x, c) => c.copy(input = x))
      checkConfig { params =>
        success
      }
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val spark = SparkSession
      .builder
      .appName(s"DataFrameExample with $params")
      .getOrCreate()

    // Load input data
    println(s"Loading LIBSVM file with UDT from ${params.input}.")
    val df: DataFrame = spark.read.format("libsvm").load(params.input).cache()
    println("Schema from LIBSVM:")
    df.printSchema()
    println(s"Loaded training data as a DataFrame with ${df.count()} records.")

    // Show statistical summary of labels.
    val labelSummary = df.describe("label")
    labelSummary.show()

    // Convert features column to an RDD of vectors.
    val features = df.select("features").rdd.map { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(Vectors.fromML(feat)),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    // Save the records in a parquet file.
    val tmpDir = Utils.createTempDir()
    val outputDir = new File(tmpDir, "dataframe").toString
    println(s"Saving to $outputDir as Parquet file.")
    df.write.parquet(outputDir)

    // Load the records back.
    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDF = spark.read.parquet(outputDir)
    println(s"Schema from Parquet:")
    newDF.printSchema()

    spark.stop()
  }
}
// scalastyle:on println 
Example 5
Source File: JdbcRelationProvider.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val url = jdbcOptions.url
    val table = jdbcOptions.table
    val createTableOptions = jdbcOptions.createTableOptions
    val isTruncate = jdbcOptions.isTruncate

    val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, url, table)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              saveTable(df, url, table, jdbcOptions)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, table)
              createTable(df.schema, url, table, createTableOptions, conn)
              saveTable(df, url, table, jdbcOptions)
            }

          case SaveMode.Append =>
            saveTable(df, url, table, jdbcOptions)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, jdbcOptions)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
} 
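
The corresponding DataFrame API call that exercises this provider's save path; URL, table, and credentials are placeholders, and the JDBC driver must be on the classpath:

import org.apache.spark.sql.{SaveMode, SparkSession}

object JdbcWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("jdbc-write-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    df.write
      .format("jdbc")                                   // shortName() of JdbcRelationProvider
      .option("url", "jdbc:postgresql://localhost/db")  // placeholder connection URL
      .option("dbtable", "public.example")              // placeholder table name
      .option("user", "user")
      .option("password", "password")
      .mode(SaveMode.Append)                            // Append -> saveTable on the existing table
      .save()
  }
}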
Example 6
Source File: Cleaner.scala    From cleanframes   with Apache License 2.0
package cleanframes

import org.apache.spark.sql.{Column, DataFrame, functions}
import shapeless.labelled.FieldType
import shapeless.{::, HList, HNil, LabelledGeneric, Lazy, Witness}

trait Cleaner[A] {
  def clean(frame: DataFrame, name: Option[String], alias: Option[String]): List[Column]
}

object Cleaner {
  def apply[A](frame: DataFrame, name: Option[String], alias: Option[String])(implicit env: Cleaner[A]): DataFrame = {
    frame.select(
      env.clean(frame, name, alias): _*
    )
  }

  def materialize[A](func: (DataFrame, Option[String], Option[String]) => List[Column]): Cleaner[A] = new Cleaner[A] {
    override def clean(frame: DataFrame, name: Option[String], alias: Option[String]): List[Column] = func(frame, name, alias)
  }

  implicit val hnilCleaner: Cleaner[HNil] = materialize((_, _, _) => Nil)

  implicit def genericObjectCleaner[A, H <: HList](implicit
                                                   gen: LabelledGeneric.Aux[A, H],
                                                   hCleaner: Lazy[Cleaner[H]]): Cleaner[A] =
    materialize((frame, name, alias) => {
      val structColumn = functions.struct(
        hCleaner.value.clean(frame, name, alias): _*
      )

      List(
        alias
          .map(structColumn.as)
          .getOrElse(structColumn)
      )
    })

  implicit def hlistObjectCleaner[K <: Symbol, H, T <: HList](implicit
                                                              witness: Witness.Aux[K],
                                                              hCleaner: Lazy[Cleaner[H]],
                                                              tCleaner: Cleaner[T]): Cleaner[FieldType[K, H] :: T] = {
    val fieldName: String = witness.value.name

    materialize { (frame, name, alias) =>

      val columnName = alias match {
        case None |
             Some(`reserved_root_level_alias`) => fieldName
        case Some(alias) => s"$alias.$fieldName"
      }

      val hColumns = hCleaner.value.clean(frame, Some(columnName), alias = Some(fieldName))
      val tColumns = tCleaner.clean(frame, name, alias)
      hColumns ::: tColumns
    }
  }
} 
Example 7
Source File: OnErrorSuite.scala    From spark-snowflake   with Apache License 2.0
package net.snowflake.spark.snowflake

import net.snowflake.client.jdbc.SnowflakeSQLException
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME
import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class OnErrorSuite extends IntegrationSuiteBase {
  lazy val table = s"spark_test_table_$randomSuffix"

  lazy val schema = new StructType(
    Array(StructField("var", StringType, nullable = false))
  )

  lazy val df: DataFrame = sparkSession.createDataFrame(
    sc.parallelize(
      Seq(Row("{\"dsadas\nadsa\":12311}"), Row("{\"abc\":334}")) // invalid json key
    ),
    schema
  )

  override def beforeAll(): Unit = {
    super.beforeAll()
    jdbcUpdate(s"create or replace table $table(var variant)")
  }

  override def afterAll(): Unit = {
    jdbcUpdate(s"drop table $table")
    super.afterAll()
  }

  test("continue_on_error off") {

    assertThrows[SnowflakeSQLException] {
      df.write
        .format(SNOWFLAKE_SOURCE_NAME)
        .options(connectorOptionsNoTable)
        .option("dbtable", table)
        .mode(SaveMode.Append)
        .save()
    }
  }

  test("continue_on_error on") {
    df.write
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .option("continue_on_error", "on")
      .option("dbtable", table)
      .mode(SaveMode.Append)
      .save()

    val result = sparkSession.read
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .option("dbtable", table)
      .load()

    assert(result.collect().length == 1)
  }

} 
Example 8
Source File: MNISTBenchmark.scala    From spark-knn   with Apache License 2.0
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable


object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val ns = if(args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if(args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if(args.length >= 3) args(2).toInt else 10
    val models = if(args.length >=4) args(3).split(',') else Array("tree","naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    //read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset =  MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() //force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if(models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if(models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }
    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame = dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
} 
Example 9
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)

  }
} 
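
A hedged sketch of invoking this pipeline; the tiny in-memory DataFrame and its column names are illustrative only:

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession
import org.sparksamples.classification.stumbleupon.RandomForestPipeline

object RandomForestPipelineSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("rf-pipeline-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    // Toy data: a numeric label plus two numeric features.
    val df = Seq(
      (0.0, 1.2, 3.4),
      (1.0, 0.3, 2.1),
      (0.0, 2.2, 0.4),
      (1.0, 0.9, 1.8)
    ).toDF("label", "f1", "f2")

    val assembler = new VectorAssembler()
      .setInputCols(Array("f1", "f2"))
      .setOutputCol("features")

    // Trains the StringIndexer + VectorAssembler + RandomForest pipeline and prints accuracy.
    RandomForestPipeline.randomForestPipeline(assembler, df)
  }
}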
Example 10
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object GradientBoostedTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val gbt = new GBTClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxIter(10)

    stages += vectorAssembler
    stages += gbt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s"  Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv")
  }

  def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }

} 
Example 11
Source File: SparkPFASuiteBase.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.pfa

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.SparkConf
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.scalactic.Equality
import org.scalatest.FunSuite

abstract class SparkPFASuiteBase extends FunSuite with DataFrameSuiteBase with PFATestUtils {

  val sparkTransformer: Transformer
  val input: Array[String]
  val expectedOutput: Array[String]

  val sparkConf =  new SparkConf().
    setMaster("local[*]").
    setAppName("test").
    set("spark.ui.enabled", "false").
    set("spark.app.id", appID).
    set("spark.driver.host", "localhost")
  override lazy val spark = SparkSession.builder().config(sparkConf).getOrCreate()
  override val reuseContextIfPossible = true

  // Converts column containing a vector to an array
  def withColumnAsArray(df: DataFrame, colName: String) = {
    val vecToArray = udf { v: Vector => v.toArray }
    df.withColumn(colName, vecToArray(df(colName)))
  }

  def withColumnAsArray(df: DataFrame, first: String, others: String*) = {
    val vecToArray = udf { v: Vector => v.toArray }
    var result = df.withColumn(first, vecToArray(df(first)))
    others.foreach(c => result = result.withColumn(c, vecToArray(df(c))))
    result
  }

  // Converts column containing a vector to a sparse vector represented as a map
  def getColumnAsSparseVectorMap(df: DataFrame, colName: String) = {
    val vecToMap = udf { v: Vector => v.toSparse.indices.map(i => (i.toString, v(i))).toMap }
    df.withColumn(colName, vecToMap(df(colName)))
  }

}

abstract class Result

object ApproxEquality extends ApproxEquality

trait ApproxEquality {

  import org.scalactic.Tolerance._
  import org.scalactic.TripleEquals._

  implicit val seqApproxEq: Equality[Seq[Double]] = new Equality[Seq[Double]] {
    override def areEqual(a: Seq[Double], b: Any): Boolean = {
      b match {
        case d: Seq[Double] =>
          a.zip(d).forall { case (l, r) => l === r +- 0.001 }
        case _ =>
          false
      }
    }
  }

  implicit val vectorApproxEq: Equality[Vector] = new Equality[Vector] {
    override def areEqual(a: Vector, b: Any): Boolean = {
      b match {
        case v: Vector =>
          a.toArray.zip(v.toArray).forall { case (l, r) => l === r +- 0.001 }
        case _ =>
          false
      }
    }
  }
} 
Example 12
Source File: TopTransformer.scala    From modelmatrix   with Apache License 2.0
package com.collective.modelmatrix.transform

import com.collective.modelmatrix.CategoricalColumn.AllOther
import com.collective.modelmatrix.transform.FeatureTransformationError.{FeatureColumnNotFound, UnsupportedTransformDataType}
import com.collective.modelmatrix.{CategoricalColumn, ModelFeature}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._
import org.slf4j.LoggerFactory

import scalaz.{@@, \/}
import scalaz.syntax.either._

class TopTransformer(input: DataFrame @@ Transformer.Features) extends CategoricalTransformer(input) {

  private val log = LoggerFactory.getLogger(classOf[TopTransformer])

  private val supportedDataTypes = Seq(ShortType, IntegerType, LongType, DoubleType, StringType)

  def validate: PartialFunction[ModelFeature, FeatureTransformationError \/ TypedModelFeature] = {
    case f@ModelFeature(_, _, _, _, Top(_, _)) if featureDataType(f.feature).isEmpty =>
      FeatureColumnNotFound(f.feature).left

    case f@ModelFeature(_, _, _, _, Top(_, _))
      if featureDataType(f.feature).isDefined && supportedDataTypes.contains(featureDataType(f.feature).get) =>
      TypedModelFeature(f, featureDataType(f.feature).get).right

    case f@ModelFeature(_, _, _, _, t@Top(_, _)) =>
      UnsupportedTransformDataType(f.feature, featureDataType(f.feature).get, t).left
  }

  def transform(feature: TypedModelFeature): Seq[CategoricalColumn] = {
    require(feature.feature.transform.isInstanceOf[Top], s"Illegal transform type: ${feature.feature.transform}")

    val ModelFeature(_, _, f, _, Top(cover, allOther)) = feature.feature

    log.info(s"Calculate top transformation for feature: ${feature.feature.feature}. " +
      s"Cover: $cover. " +
      s"All other: $allOther. " +
      s"Extract type: ${feature.extractType}")

    // Group and count by extract value
    val df = scalaz.Tag.unwrap(input)

    val grouped: DataFrame = df.filter(df(f).isNotNull).groupBy(f).count()

    val featureValues: Seq[Value] = grouped.collect().toSeq.map { row =>
      val value = row.get(0)
      val cnt = row.getLong(1)
      Value(value, cnt)
    }

    log.debug(s"Collected '$f' values: ${featureValues.size}")

    val topValues = featureValues.sortBy(_.count)(implicitly[Ordering[Long]].reverse)

    // Get number of columns below cover threshold
    val threshold = (cover / 100) * topValues.map(_.count).sum
    val columnsBelowThreshold = topValues.map(_.count).scanLeft(0L)((cum, cnt) => cum + cnt).takeWhile(_ < threshold).size

    // Transform categorical values
    val valueColumns = topValues.take(columnsBelowThreshold).foldLeft(Scan()) {
      case (state@Scan(columnId, cumulativeCnt, columns), value) =>
        val column = valueColumn(feature.extractType)(columnId, cumulativeCnt, value)
        Scan(column.columnId, column.cumulativeCount, columns :+ column)
    }

    // Get all other columns if required
    val allOtherColumns = if (allOther) {
      val allOtherCnt = topValues.drop(columnsBelowThreshold).map(_.count).sum
      Seq(AllOther(valueColumns.columnId + 1, allOtherCnt, valueColumns.cumulativeCnt + allOtherCnt))
    } else Seq.empty

    // Add them together
    valueColumns.columns ++ allOtherColumns.filter(_.count > 0)
  }
} 
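
A hedged usage sketch; it assumes a DataFrame that already contains the extracted feature column ("ad_type" here) and reuses the 'top' feature definition shown in ModelFeatureSpec below:

import com.collective.modelmatrix.ModelFeature
import com.collective.modelmatrix.transform.{Top, TopTransformer, Transformer}
import org.apache.spark.sql.{DataFrame, SparkSession}
import scalaz.Tag

object TopTransformerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("top-transformer-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    // Toy input: one column per extracted feature.
    val df: DataFrame = Seq("banner", "banner", "video", "native").toDF("ad_type")

    // Tag the DataFrame as transformer input, matching the constructor above.
    val input = Tag[DataFrame, Transformer.Features](df)

    // Same shape as the 'top' feature parsed in ModelFeatureSpec (Example 16).
    val adType = ModelFeature(true, "advertisement", "ad_type", "type", Top(95.0, true))

    val transformer = new TopTransformer(input)
    transformer.validate(adType).foreach { typed =>
      transformer.transform(typed).foreach(println)
    }
  }
}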
Example 13
Source File: IndexTransformer.scala    From modelmatrix   with Apache License 2.0
package com.collective.modelmatrix.transform

import com.collective.modelmatrix.CategoricalColumn.AllOther
import com.collective.modelmatrix.{CategoricalColumn, ModelFeature}
import com.collective.modelmatrix.transform.FeatureTransformationError.{UnsupportedTransformDataType, FeatureColumnNotFound}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._
import org.slf4j.LoggerFactory

import scalaz.{@@, \/}
import scalaz.syntax.either._

class IndexTransformer(input: DataFrame @@ Transformer.Features) extends CategoricalTransformer(input) {

  private val log = LoggerFactory.getLogger(classOf[IndexTransformer])

  private val supportedDataTypes = Seq(ShortType, IntegerType, LongType, DoubleType, StringType)

  def validate: PartialFunction[ModelFeature, FeatureTransformationError \/ TypedModelFeature] = {
    case f@ModelFeature(_, _, _, _, Index(_, _)) if featureDataType(f.feature).isEmpty =>
      FeatureColumnNotFound(f.feature).left

    case f@ModelFeature(_, _, _, _, Index(_, _))
      if featureDataType(f.feature).isDefined && supportedDataTypes.contains(featureDataType(f.feature).get) =>
      TypedModelFeature(f, featureDataType(f.feature).get).right

    case f@ModelFeature(_, _, _, _, t@Index(_, _)) =>
      UnsupportedTransformDataType(f.feature, featureDataType(f.feature).get, t).left
  }

  def transform(feature: TypedModelFeature): Seq[CategoricalColumn] = {
    require(feature.feature.transform.isInstanceOf[Index], s"Illegal transform type: ${feature.feature.transform}")

    val ModelFeature(_, _, f, _, Index(support, allOther)) = feature.feature

    log.info(s"Calculate index transformation for feature: ${feature.feature.feature}. " +
      s"Support: $support. " +
      s"All other: $allOther. " +
      s"Extract type: ${feature.extractType}")

    val df = scalaz.Tag.unwrap(input)

    import org.apache.spark.sql.functions._

    // Group and count by extract value
    val grouped: DataFrame = df.filter(df(f).isNotNull).groupBy(f).count()

    // Get support threshold
    val totalCount = grouped.sumOf("count")
    val threshold = (support / 100) * totalCount

    // Collect only support values
    val supportValues: Seq[Value] = grouped.filter(col("count") > threshold).collect().toSeq.map { row =>
      val value = row.get(0)
      val cnt = row.getLong(1)
      Value(value, cnt)
    }
    val topSupportValues = supportValues.sortBy(_.count)(implicitly[Ordering[Long]].reverse)

    log.debug(s"Collected '$f' support values: ${supportValues.size}")

    // Transform categorical values
    val valueColumns = topSupportValues.foldLeft(Scan()) {
      case (state@Scan(columnId, cumulativeCnt, columns), value) =>
        val column = valueColumn(feature.extractType)(columnId, cumulativeCnt, value)
        Scan(column.columnId, column.cumulativeCount, columns :+ column)
    }

    // Get all other column if required
    val allOtherColumns = if (allOther && support < 100.0) {
      // Count for values that are not in support set
      val allOtherCnt = grouped.filter(col("count") <= threshold).sumOf("count")
      Seq(AllOther(valueColumns.columnId + 1, allOtherCnt, valueColumns.cumulativeCnt + allOtherCnt))
    } else Seq.empty

    // Add them together
    valueColumns.columns ++ allOtherColumns.filter(_.count > 0)
  }
} 
Example 14
Source File: Transformers.scala    From modelmatrix   with Apache License 2.0
package com.collective.modelmatrix.transform

import com.collective.modelmatrix.ModelFeature
import org.apache.spark.sql.{SQLContext, DataFrame}

import scalaz._

trait Transformers {

  protected class Transformers(input: DataFrame @@ Transformer.Features)(implicit sqlContext: SQLContext) {

    val identity = new IdentityTransformer(input)
    val top = new TopTransformer(input)
    val index = new IndexTransformer(input)
    val bins = new BinsTransformer(input)

    private val unknownFeature: PartialFunction[ModelFeature, FeatureTransformationError \/ TypedModelFeature] = {
      case feature => sys.error(s"Feature can't be validated by any of transformers: $feature")
    }

    def validate(feature: ModelFeature): FeatureTransformationError \/ TypedModelFeature =
      (identity.validate orElse
        top.validate orElse
        index.validate orElse
        bins.validate orElse
        unknownFeature
        )(feature)

  }
} 
Example 15
Source File: BinsTransformer.scala    From modelmatrix   with Apache License 2.0
package com.collective.modelmatrix.transform

import com.collective.modelmatrix.BinColumn.BinValue
import com.collective.modelmatrix.transform.FeatureTransformationError.{FeatureColumnNotFound, UnsupportedTransformDataType}
import com.collective.modelmatrix.{BinColumn, ModelFeature}
import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._
import org.slf4j.LoggerFactory

import scalaz._
import scalaz.syntax.either._

class BinsTransformer(input: DataFrame @@ Transformer.Features) extends Transformer(input) with Binner {

  private val log = LoggerFactory.getLogger(classOf[BinsTransformer])

  private val config = ConfigFactory.load()

  private val sampleSize = config.getLong("modelmatrix.transform.bins.sample-size")

  private val supportedDataTypes = Seq(ShortType, IntegerType, LongType, DoubleType)

  protected case class Scan(columnId: Int = 0, columns: Seq[BinValue] = Seq.empty)

  def validate: PartialFunction[ModelFeature, FeatureTransformationError \/ TypedModelFeature] = {
    case f@ModelFeature(_, _, _, _, Bins(_, _, _)) if featureDataType(f.feature).isEmpty =>
      FeatureColumnNotFound(f.feature).left

    case f@ModelFeature(_, _, _, _, Bins(_, _, _))
      if featureDataType(f.feature).isDefined && supportedDataTypes.contains(featureDataType(f.feature).get) =>
      TypedModelFeature(f, featureDataType(f.feature).get).right

    case f@ModelFeature(_, _, _, _, b@Bins(_, _, _)) =>
      UnsupportedTransformDataType(f.feature, featureDataType(f.feature).get, b).left
  }

  def transform(feature: TypedModelFeature): Seq[BinColumn] = {
    require(feature.feature.transform.isInstanceOf[Bins],
      s"Illegal transform type: ${feature.feature.transform}")

    val ModelFeature(_, _, f, _, Bins(nbins, minPoints, minPct)) = feature.feature

    log.info(s"Calculate bins transformation for feature: ${feature.feature.feature}. " +
      s"Bins: $nbins. " +
      s"Min points: $minPoints. " +
      s"Min percentage: $minPct. " +
      s"Extract type: ${feature.extractType}")

    val df = scalaz.Tag.unwrap(input)
    val inputSize = df.count()
    val fraction = if (sampleSize >= inputSize) 1.0D else sampleSize.toDouble / inputSize
    val sample = df.select(f).filter(df(f).isNotNull).sample(withReplacement = false, fraction)

    // Collect sample values
    val x = sample.collect().map {
      case row if feature.extractType == ShortType => row.getShort(0).toDouble
      case row if feature.extractType == IntegerType => row.getInt(0).toDouble
      case row if feature.extractType == LongType => row.getLong(0).toDouble
      case row if feature.extractType == DoubleType => row.getDouble(0)
    }

    log.debug(s"Collected sample size of: ${x.length}")

    // Doesn't make any sense to do binning if no enough sample points available
    require(x.length > nbins * 10,
      s"Number of sample points for binning is too small")

    // Find optimal split
    val bins = optimalSplit(x, nbins, minPoints, minPct)
    log.debug(s"Calculated optimal split: ${bins.size}. " +
      s"Bins: ${bins.map(bin => s"${bin.count} in [${bin.low}, ${bin.high})").mkString(", ")}")

    require(bins.size >= 2,
      s"Got less than 2 bins, probably sample size is too small or data is too skewed")

    // Transform bins to Bin columns
    val scan = bins.foldLeft(Scan()) {
      case (state@Scan(columnId, cols), bin) =>
        val column = BinColumn.BinValue(columnId + 1, bin.low, bin.high, bin.count, x.length)
        Scan(column.columnId, cols :+ column)
    }

    val columns = scan.columns

    // Update first and last bins to catch out-of-sample values
    BinColumn.toLowerBin(columns.head) +: columns.drop(1).dropRight(1) :+ BinColumn.toUpperBin(columns.last)
  }
} 
Example 16
Source File: ModelFeatureSpec.scala    From modelmatrix   with Apache License 2.0
package com.collective.modelmatrix

import com.collective.modelmatrix.transform.{Bins, Identity, Index, Top}
import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.SqlParser
import org.scalatest.FlatSpec

import scalaz.syntax.validation._

class ModelFeatureSpec extends FlatSpec {

  private val isActive = true
  private val notActive = false

  private val isAllOther = true
  private val features = ConfigFactory.load("./matrix-model.conf").getConfig("features")

  "Model Feature" should "parse 'identity' feature" in {
    val adNetwork = ModelFeature.parse("ad_network", features.getConfig("ad_network"))
    assert(adNetwork == ModelFeature(
      isActive,
      "advertisement",
      "ad_network",
      "network",
      Identity
    ).successNel)
  }

  it should "parse 'top' feature" in {
    val adType = ModelFeature.parse("ad_type", features.getConfig("ad_type"))
    assert(adType == ModelFeature(
      isActive,
      "advertisement",
      "ad_type",
      "type",
      Top(95.0, isAllOther)
    ).successNel)
  }

  it should "parse 'index' feature" in {
    val adSize = ModelFeature.parse("ad_size", features.getConfig("ad_size"))
    assert(adSize == ModelFeature(
      isActive,
      "advertisement",
      "ad_size",
      "size",
      Index(0.5, isAllOther)
    ).successNel)
  }

  it should "parse 'bins' feature" in {
    val adPerformance = ModelFeature.parse("ad_performance", features.getConfig("ad_performance"))
    assert(adPerformance == ModelFeature(
      isActive,
      "performance",
      "ad_performance",
      "pct_clicks",
      Bins(10, 100, 1.0)
    ).successNel)
  }

  it should "parse extract expression" in {
    val popDensity = ModelFeature.parse("pop_density", features.getConfig("pop_density"))
    assert(popDensity.isSuccess)
  }

  it should "fail to parse bad extract expression" in {
    val popDensity = ModelFeature.parse("pop_density_err", features.getConfig("pop_density_err"))
    assert(popDensity.isFailure)
  }

  it should "parse deactivated feature" in {
    val adVisibility = ModelFeature.parse("ad_visibility", features.getConfig("ad_visibility"))
    assert(adVisibility == ModelFeature(
      notActive,
      "advertisement",
      "ad_visibility",
      "visibility",
      Top(95.0, isAllOther)
    ).successNel)
  }

  it should "fail on wrong transformation type" in {
    val adTag = ModelFeature.parse("ad_tag", features.getConfig("ad_tag"))
    assert(adTag == "Unknown transform type: magic-transform".failureNel)
  }

  it should "fail on wrong transformation parameter" in {
    val adPosition = ModelFeature.parse("ad_position", features.getConfig("ad_position"))
    assert(adPosition.isFailure)
  }

} 
Example 17
Source File: Sink.scala    From modelmatrix   with Apache License 2.0
package com.collective.modelmatrix.cli

import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext}

import scala.util.{Failure, Success, Try}

sealed trait Sink {
  def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit
}

object Sink {
  private val hive = "hive://(.*)".r
  private val parquet = "parquet://(.*)".r

  def validate(sink: String): Either[String, Unit] = {
    Try(apply(sink)) match {
      case Success(s) => Right(())
      case Failure(err) => Left(s"Unsupported sink type: $sink")
    }
  }

  def apply(sink: String): Sink = sink match {
    case hive(table) => HiveSink(table)
    case parquet(path) => ParquetSink(path)
  }
}

object NoSink extends Sink {
  def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = {
    sys.error(s"Sink is not defined")
  }

  override def toString: String = "Sink is not defined"
}

case class HiveSink(
  tableName: String
) extends Sink {

  def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = {
    df.saveAsTable(tableName, SaveMode.Overwrite)
  }

  override def toString: String =
    s"Hive table: $tableName"
}

case class ParquetSink(
  path: String
) extends Sink {

  def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = {
    df.saveAsParquetFile(path)
  }

  override def toString: String =
    s"Parquet: $path"
} 
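
A short sketch of resolving a sink string and saving through it; the parquet path is a placeholder, and the underlying saveAsParquetFile call is the Spark 1.x API this project targets:

package com.collective.modelmatrix.cli

import org.apache.spark.sql.{DataFrame, SQLContext}

object SinkSketch {
  // "parquet://..." resolves to ParquetSink, "hive://..." to HiveSink.
  def save(df: DataFrame)(implicit sqlContext: SQLContext): Unit =
    Sink("parquet:///tmp/model-matrix/features").saveDataFrame(df)
}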
Example 18
Source File: Source.scala    From modelmatrix   with Apache License 2.0
package com.collective.modelmatrix.cli

import org.apache.spark.sql.{DataFrame, SQLContext}

import scala.util.{Failure, Success, Try}

sealed trait Source {
  def asDataFrame(implicit sqlContext: SQLContext): DataFrame
}

object Source {
  private val hive = "hive://(.*)".r
  private val parquet = "parquet://(.*)".r

  def validate(source: String): Either[String, Unit] = {
    Try(apply(source)) match {
      case Success(s) => Right(())
      case Failure(err) => Left(s"Unsupported source type: $source")
    }
  }

  def apply(source: String): Source = source match {
    case hive(table) => HiveSource(table)
    case parquet(path) => ParquetSource(path)
  }
}

object NoSource extends Source {
  def asDataFrame(implicit sqlContext: SQLContext): DataFrame = {
    sys.error(s"Source is not defined")
  }

  override def toString: String = "Source is not defined"
}

case class HiveSource(
  tableName: String
) extends Source {

  def asDataFrame(implicit sqlContext: SQLContext): DataFrame = {
    sqlContext.sql(s"SELECT * FROM $tableName")
  }

  override def toString: String = {
    s"Hive table: $tableName"
  }

}

case class ParquetSource(
  path: String
) extends Source {

  def asDataFrame(implicit sqlContext: SQLContext): DataFrame = {
    sqlContext.parquetFile(path)
  }

  override def toString: String = {
    s"Parquet: $path"
  }

} 
Example 19
Source File: LogisticRegressionRecommender.scala    From wordpress-posts-recommender   with Apache License 2.0
package wordpressworkshop

import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

case class LogisticRegressionRecommender(training: DataFrame) {

  val lr = new LogisticRegression()
  val paramMap = ParamMap(lr.maxIter -> 20)
                 .put(lr.regParam -> 0.01)
                 .put(lr.probabilityCol -> "probability")

  val model: LogisticRegressionModel = lr.fit(training, paramMap)

  def metrics(testData: DataFrame) = {
    val predictionAndLabels: RDD[(Double, Double)] =
      model.transform(testData).map(row => row.getAs[Vector]("probability")(1) -> row.getAs[Double]("label"))

    new BinaryClassificationMetrics(predictionAndLabels)
  }

  def likeScores(testData: DataFrame): RDD[(Long, Long, Double)] =
    model.transform(testData)
    .map(row => (row.getAs[Long]("userId"), row.getAs[Long]("postId"), row.getAs[Vector]("probability")(1)))
} 
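
A hedged usage sketch; it assumes training and test DataFrames that already carry the label, features, userId and postId columns referenced above (the project targets the Spark 1.x API, where DataFrame.map returns an RDD):

package wordpressworkshop

import org.apache.spark.sql.DataFrame

object RecommenderSketch {
  def evaluate(training: DataFrame, test: DataFrame): Unit = {
    val recommender = LogisticRegressionRecommender(training)

    // Area under the ROC curve on the held-out data.
    println(s"AUC: ${recommender.metrics(test).areaUnderROC()}")

    // (userId, postId, like-probability) triples for ranking.
    recommender.likeScores(test).take(10).foreach(println)
  }
}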
Example 20
Source File: IrisKMeansClustering.scala    From spark-spec   with MIT License
package com.github.mrpowers.spark.spec.ml.clustering

import com.github.mrpowers.spark.spec.Config
import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.DataFrame

object IrisKMeansClustering
  extends SparkSessionWrapper {

  val irisDF = spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(Config.get("irisData"))

  val Array(trainingDF, testDF) = irisDF.randomSplit(Array(0.7, 0.3), seed = 12345)

  def withVectorizedFeatures(
    featureColNames: Array[String] = Array("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"),
    outputColName: String = "features"
  )(df: DataFrame): DataFrame = {
    val assembler: VectorAssembler = new VectorAssembler()
      .setInputCols(featureColNames)
      .setOutputCol(outputColName)
    assembler.transform(df)
  }

  def model(df: DataFrame = trainingDF): KMeansModel = {
    val trainFeatures: DataFrame = df
      .transform(withVectorizedFeatures())

    new KMeans()
      .setK(3) // # of clusters
      .setSeed(2L)
      .fit(trainFeatures)
  }

  def persistModel(): Unit = {
    model().save("./tmp/iris_kMeans_model/")
  }

} 
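
A sketch of scoring held-out rows with the fitted model, mirroring the spec in Example 23; it assumes the project's Config points at the iris CSV used above:

package com.github.mrpowers.spark.spec.ml.clustering

import org.apache.spark.sql.DataFrame

object IrisKMeansScoringSketch {
  def main(args: Array[String]): Unit = {
    // Vectorize the held-out rows and assign each one to a cluster.
    val testFeatures: DataFrame = IrisKMeansClustering.testDF
      .transform(IrisKMeansClustering.withVectorizedFeatures())
      .select("features")

    IrisKMeansClustering.model()
      .transform(testFeatures)
      .show()
  }
}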
Example 21
Source File: TitanicLogisticRegression.scala    From spark-spec   with MIT License
package com.github.mrpowers.spark.spec.ml.classification

import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.DataFrame

object TitanicLogisticRegression extends SparkSessionWrapper {

  def withVectorizedFeatures(
    featureColNames: Array[String] = Array("Gender", "Age", "SibSp", "Parch", "Fare"),
    outputColName: String = "features"
  )(df: DataFrame): DataFrame = {
    val assembler: VectorAssembler = new VectorAssembler()
      .setInputCols(featureColNames)
      .setOutputCol(outputColName)
    assembler.transform(df)
  }

  def withLabel(
    inputColName: String = "Survived",
    outputColName: String = "label"
  )(df: DataFrame) = {
    val labelIndexer: StringIndexer = new StringIndexer()
      .setInputCol(inputColName)
      .setOutputCol(outputColName)

    labelIndexer
      .fit(df)
      .transform(df)
  }

  def model(df: DataFrame = TitanicData.trainingDF()): LogisticRegressionModel = {
    val trainFeatures: DataFrame = df
      .transform(withVectorizedFeatures())
      .transform(withLabel())
      .select("features", "label")

    // only uses the features and label columns
    new LogisticRegression()
      .fit(trainFeatures)
  }

  def persistModel(): Unit = {
    model().save("./tmp/titanic_model/")
  }

} 
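
A similar sketch for scoring the Titanic test set with the fitted logistic regression; it assumes the test CSVs referenced by TitanicData (Example 22) are present:

package com.github.mrpowers.spark.spec.ml.classification

object TitanicScoringSketch {
  def main(args: Array[String]): Unit = {
    // Prepare the test rows the same way as the training rows.
    val testFeatures = TitanicData.testDF()
      .transform(TitanicLogisticRegression.withVectorizedFeatures())
      .transform(TitanicLogisticRegression.withLabel())
      .select("features", "label")

    TitanicLogisticRegression.model()
      .transform(testFeatures)
      .select("label", "prediction")
      .show()
  }
}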
Example 22
Source File: TitanicData.scala    From spark-spec   with MIT License
package com.github.mrpowers.spark.spec.ml.classification

import org.apache.spark.sql.functions._
import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper
import org.apache.spark.sql.DataFrame

object TitanicData extends SparkSessionWrapper {

  def trainingDF(
    titanicDataDirName: String = "./src/test/resources/titanic/"
  ): DataFrame = {
    spark
      .read
      .option("header", "true")
      .csv(titanicDataDirName + "train.csv")
      .withColumn(
        "Gender",
        when(
          col("Sex").equalTo("male"), 0
        )
          .when(col("Sex").equalTo("female"), 1)
          .otherwise(null)
      )
      .select(
        col("Gender").cast("double"),
        col("Survived").cast("double"),
        col("Pclass").cast("double"),
        col("Age").cast("double"),
        col("SibSp").cast("double"),
        col("Parch").cast("double"),
        col("Fare").cast("double")
      )
      .filter(
        col("Gender").isNotNull &&
          col("Survived").isNotNull &&
          col("Pclass").isNotNull &&
          col("Age").isNotNull &&
          col("SibSp").isNotNull &&
          col("Parch").isNotNull &&
          col("Fare").isNotNull
      )
  }

  def testDF(
    titanicDataDirName: String = "./src/test/resources/titanic/"
  ): DataFrame = {
    val rawTestDF = spark
      .read
      .option("header", "true")
      .csv(titanicDataDirName + "test.csv")

    val genderSubmissionDF = spark
      .read
      .option("header", "true")
      .csv(titanicDataDirName + "gender_submission.csv")

    rawTestDF
      .join(
        genderSubmissionDF,
        Seq("PassengerId")
      )
      .withColumn(
        "Gender",
        when(col("Sex").equalTo("male"), 0)
          .when(col("Sex").equalTo("female"), 1)
          .otherwise(null)
      )
      .select(
        col("Gender").cast("double"),
        col("Survived").cast("double"),
        col("Pclass").cast("double"),
        col("Age").cast("double"),
        col("SibSp").cast("double"),
        col("Parch").cast("double"),
        col("Fare").cast("double")
      )
      .filter(
        col("Gender").isNotNull &&
          col("Pclass").isNotNull &&
          col("Age").isNotNull &&
          col("SibSp").isNotNull &&
          col("Parch").isNotNull &&
          col("Fare").isNotNull
      )

  }

} 
Example 23
Source File: IrisKMeansClusteringSpec.scala    From spark-spec   with MIT License
package com.github.mrpowers.spark.spec.ml.clustering

import com.github.mrpowers.spark.daria.sql.SparkSessionExt._
import com.github.mrpowers.spark.fast.tests.ColumnComparer
import com.github.mrpowers.spark.spec.SparkSessionTestWrapper
import org.apache.spark.ml.evaluation.ClusteringEvaluator
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType
import org.scalatest.FunSpec

class IrisKMeansClusteringSpec
  extends FunSpec
  with SparkSessionTestWrapper
  with ColumnComparer {

  describe("withVectorizedFeatures") {

    it("converts all the features to a vector without blowing up") {

      val df = spark.createDF(
        List(
          (5.1, 3.5, 1.4, 0.2)
        ), List(
          ("SepalLengthCm", DoubleType, true),
          ("SepalWidthCm", DoubleType, true),
          ("PetalLengthCm", DoubleType, true),
          ("PetalWidthCm", DoubleType, true)
        )
      ).transform(IrisKMeansClustering.withVectorizedFeatures())

      df.show()
      df.printSchema()

    }

  }

  describe("model") {

    it("prints the cluster centers") {

      println("Cluster Centers: ")
      IrisKMeansClustering.model().clusterCenters.foreach(println)

    }

    it("trains a KMeans Clustering model that's Silhouette with squared euclidean distance above 0.70 percent") {

      val trainData: DataFrame = IrisKMeansClustering.trainingDF
        .transform(IrisKMeansClustering.withVectorizedFeatures())
        .select("features")

      val testData: DataFrame = IrisKMeansClustering.testDF
        .transform(IrisKMeansClustering.withVectorizedFeatures())
        .select("features")

      val predictions: DataFrame = IrisKMeansClustering
        .model()
        .transform(testData)
        .select(
          col("features"),
          col("prediction")
        )

      val res = new ClusteringEvaluator()
        .evaluate(predictions)

      assert(res >= 0.60)
    }

  }

} 
Example 24
Source File: package.scala    From spark-iqmulus   with Apache License 2.0
package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame, Row }
import org.apache.spark.sql.types.StructType

package object ply {

  
  implicit class PlyDataFrameReader(reader: DataFrameReader) {
    def ply: String => DataFrame = reader.format("fr.ign.spark.iqmulus.ply").load
  }

  implicit class PlyDataFrame(df: DataFrame) {
    def saveAsPly(location: String, littleEndian: Boolean = true) = {
      val df_id = df.drop("pid").drop("fid")
      val schema = df_id.schema
      val saver = (key: Int, iter: Iterator[Row]) =>
        Iterator(iter.saveAsPly(s"$location/$key.ply", schema, littleEndian))
      df_id.rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class PlyRowIterator(iter: Iterator[Row]) {
    def saveAsPly(
      filename: String,
      schema: StructType,
      littleEndian: Boolean
    ) = {
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val rows = iter.toArray
      val count = rows.size.toLong
      val header = new PlyHeader(filename, littleEndian, Map("vertex" -> ((count, schema))))
      val dos = new java.io.DataOutputStream(f);
      dos.write(header.toString.getBytes)
      val ros = new RowOutputStream(dos, littleEndian, schema)
      rows.foreach(ros.write)
      dos.close
      header
    }
  }
} 
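
The implicit classes above are intended to be brought in with a package import; a hedged sketch with placeholder paths:

import fr.ign.spark.iqmulus.ply._
import org.apache.spark.sql.SparkSession

object PlySketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ply-sketch").getOrCreate()

    // PlyDataFrameReader adds .ply to DataFrameReader.
    val points = spark.read.ply("hdfs:///data/cloud.ply")

    // PlyDataFrame adds .saveAsPly; one PLY file is written per partition.
    points.saveAsPly("hdfs:///out/cloud", littleEndian = true)
  }
}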
Example 25
Source File: package.scala    From spark-iqmulus   with Apache License 2.0
package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame, Row }
import org.apache.spark.sql.types.{ FloatType, StructType }

package object xyz {

  
  implicit class XyzDataFrameReader(reader: DataFrameReader) {
    def xyz: String => DataFrame = reader.format("fr.ign.spark.iqmulus.xyz").load
  }

  implicit class XyzDataFrame(df: DataFrame) {
    def saveAsXyz(location: String) = {
      val df_id = df.drop("id")
      require(df_id.schema.fieldNames.take(3) sameElements Array("x", "y", "z"))
      require(df_id.schema.fields.map(_.dataType).take(3).forall(_ == FloatType))
      val saver = (key: Int, iter: Iterator[Row]) => Iterator(iter.saveXyz(s"$location/$key.xyz"))
      df_id.rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class XyzRowIterator(iter: Iterator[Row]) {
    def saveXyz(filename: String) = {
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val dos = new java.io.DataOutputStream(f)
      var count = 0L
      iter.foreach(row => { count += 1; dos.writeBytes(row.mkString("", "\t", "\n")) })
      dos.close
      (filename, count)
    }
  }
} 
Example 26
Source File: package.scala    From spark-iqmulus   with Apache License 2.0
package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame }
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.Row

package object las {

  
  implicit class LasDataFrameReader(reader: DataFrameReader) {
    def las: String => DataFrame = reader.format("fr.ign.spark.iqmulus.las").load
  }

  implicit class LasDataFrame(df: DataFrame) {
    def saveAsLas(
      location: String,
      formatOpt: Option[Byte] = None,
      version: Version = Version(),
      scale: Array[Double] = Array(0.01, 0.01, 0.01),
      offset: Array[Double] = Array(0, 0, 0)
    ) = {
      val format = formatOpt.getOrElse(LasHeader.formatFromSchema(df.schema))
      val schema = LasHeader.schema(format) // no user types for now
      val cols = schema.fieldNames.intersect(df.schema.fieldNames)
      val saver = (key: Int, iter: Iterator[Row]) =>
        Iterator(iter.saveAsLas(s"$location/$key.las", schema, format, scale, offset, version))
      df.select(cols.head, cols.tail: _*).rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class LasRowIterator(iter: Iterator[Row]) {
    def saveAsLas(
      filename: String, schema: StructType, format: Byte,
      scale: Array[Double], offset: Array[Double], version: Version = Version()
    ) = {
      // materialize the partition to access it in a single pass, TODO workaround that 
      val rows = iter.toArray
      val count = rows.length.toLong
      val pmin = Array.fill[Double](3)(Double.PositiveInfinity)
      val pmax = Array.fill[Double](3)(Double.NegativeInfinity)
      val countByReturn = Array.fill[Long](15)(0)
      rows.foreach { row =>
        val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble
        val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble
        val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble
        val ret = row.getAs[Byte]("flags") & 0x3
        countByReturn(ret) += 1
        pmin(0) = Math.min(pmin(0), x)
        pmin(1) = Math.min(pmin(1), y)
        pmin(2) = Math.min(pmin(2), z)
        pmax(0) = Math.max(pmax(0), x)
        pmax(1) = Math.max(pmax(1), y)
        pmax(2) = Math.max(pmax(2), z)
      }
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val header = new LasHeader(filename, format, count, pmin, pmax, scale, offset,
        version = version, pdr_return_nb = countByReturn)
      val dos = new java.io.DataOutputStream(f);
      header.write(dos)
      val ros = new RowOutputStream(dos, littleEndian = true, schema)
      rows.foreach(ros.write)
      dos.close
      header
    }
  }
} 
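
As with the PLY helpers, here is a short hedged sketch of how the LAS implicits might be used; the paths, scale, and offset values below are illustrative only.

import fr.ign.spark.iqmulus.las._

// Hypothetical input/output locations; `spark` is an existing SparkSession.
val lasDF = spark.read.las("/data/lidar/tiles")
lasDF.saveAsLas(
  "/data/lidar/out",
  scale = Array(0.001, 0.001, 0.001),
  offset = Array(0, 0, 0)
)
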
Example 27
Source File: DataFrameConverterSpec.scala    From incubator-toree   with Apache License 2.0 5 votes vote down vote up
package org.apache.toree.utils

import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row}
import org.mockito.Mockito._
import org.scalatest.mock.MockitoSugar
import org.scalatest.{BeforeAndAfterAll, FunSpec, Matchers}
import play.api.libs.json.{JsArray, JsString, Json}
import test.utils.SparkContextProvider

import scala.collection.mutable

class DataFrameConverterSpec extends FunSpec with MockitoSugar with Matchers with BeforeAndAfterAll {

  lazy val spark = SparkContextProvider.sparkContext

  override protected def afterAll(): Unit = {
    spark.stop()
    super.afterAll()
  }

  val dataFrameConverter: DataFrameConverter = new DataFrameConverter
  val mockDataFrame = mock[DataFrame]
  val mockRdd = spark.parallelize(Seq(Row(new mutable.WrappedArray.ofRef(Array("test1", "test2")), 2, null)))
  val mockStruct = mock[StructType]
  val columns = Seq("foo", "bar").toArray

  doReturn(mockStruct).when(mockDataFrame).schema
  doReturn(columns).when(mockStruct).fieldNames
  doReturn(mockRdd).when(mockDataFrame).rdd

  describe("DataFrameConverter") {
    describe("#convert") {
      it("should convert to a valid JSON object") {
        val someJson = dataFrameConverter.convert(mockDataFrame, "json")
        val jsValue = Json.parse(someJson.get)
        jsValue \ "columns" should be (JsArray(Seq(JsString("foo"), JsString("bar"))))
        jsValue \ "rows" should be (JsArray(Seq(
          JsArray(Seq(JsString("[test1, test2]"), JsString("2"), JsString("null")))
        )))
      }
      it("should convert to csv") {
        val csv = dataFrameConverter.convert(mockDataFrame, "csv").get
        val values = csv.split("\n")
        values(0) shouldBe "foo,bar"
        values(1) shouldBe "[test1, test2],2,null"
      }
      it("should convert to html") {
        val html = dataFrameConverter.convert(mockDataFrame, "html").get
        html.contains("<th>foo</th>") should be(true)
        html.contains("<th>bar</th>") should be(true)
        html.contains("<td>[test1, test2]</td>") should be(true)
        html.contains("<td>2</td>") should be(true)
        html.contains("<td>null</td>") should be(true)
      }
      it("should limit the selection when converting") {

        val someLimited = dataFrameConverter.convert(mockDataFrame, "csv", 1)
        val limitedLines = someLimited.get.split("\n")
        limitedLines.length should be(2)
      }
      it("should return a Failure for invalid types") {
        val result = dataFrameConverter.convert(mockDataFrame, "Invalid Type")
        result.isFailure should be(true)
      }
    }
  }
} 
Example 28
Source File: HBaseSource.scala    From hbase-connectors   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.example.datasources

import org.apache.hadoop.hbase.spark.datasources.HBaseTableCatalog
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
case class HBaseRecord(
  col0: String,
  col1: Boolean,
  col2: Double,
  col3: Float,
  col4: Int,
  col5: Long,
  col6: Short,
  col7: String,
  col8: Byte)

@InterfaceAudience.Private
object HBaseRecord {
  def apply(i: Int): HBaseRecord = {
    val s = s"""row${"%03d".format(i)}"""
    HBaseRecord(s,
      i % 2 == 0,
      i.toDouble,
      i.toFloat,
      i,
      i.toLong,
      i.toShort,
      s"String$i extra",
      i.toByte)
  }
}

@InterfaceAudience.Private
object HBaseSource {
  val cat = s"""{
                |"table":{"namespace":"default", "name":"HBaseSourceExampleTable"},
                |"rowkey":"key",
                |"columns":{
                |"col0":{"cf":"rowkey", "col":"key", "type":"string"},
                |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"},
                |"col2":{"cf":"cf2", "col":"col2", "type":"double"},
                |"col3":{"cf":"cf3", "col":"col3", "type":"float"},
                |"col4":{"cf":"cf4", "col":"col4", "type":"int"},
                |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"},
                |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"},
                |"col7":{"cf":"cf7", "col":"col7", "type":"string"},
                |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"}
                |}
                |}""".stripMargin

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("HBaseSourceExample")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)

    import sqlContext.implicits._

    def withCatalog(cat: String): DataFrame = {
      sqlContext
        .read
        .options(Map(HBaseTableCatalog.tableCatalog->cat))
        .format("org.apache.hadoop.hbase.spark")
        .load()
    }

    val data = (0 to 255).map { i =>
      HBaseRecord(i)
    }

    sc.parallelize(data).toDF.write.options(
      Map(HBaseTableCatalog.tableCatalog -> cat, HBaseTableCatalog.newTable -> "5"))
      .format("org.apache.hadoop.hbase.spark")
      .save()

    val df = withCatalog(cat)
    df.show()
    df.filter($"col0" <= "row005")
      .select($"col0", $"col1").show
    df.filter($"col0" === "row005" || $"col0" <= "row005")
      .select($"col0", $"col1").show
    df.filter($"col0" > "row250")
      .select($"col0", $"col1").show
    df.registerTempTable("table1")
    val c = sqlContext.sql("select count(col1) from table1 where col0 < 'row050'")
    c.show()
  }
} 
Example 29
Source File: IntermediateCacher.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

class IntermediateCacher(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("intermediateCacher"))
  }

  val inputCols = new StringArrayParam(this, "inputCols", "Input column names")

  def getInputCols: Array[String] = $(inputCols)

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)
  setDefault(inputCols -> Array.empty[String])

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    val intermediateDF = if ($(inputCols).isEmpty) dataset.toDF() else dataset.select($(inputCols).map(col(_)): _*)
    intermediateDF.cache()
  }

  override def copy(extra: ParamMap): IntermediateCacher = {
    defaultCopy(extra)
  }
}

object IntermediateCacher extends DefaultParamsReadable[IntermediateCacher] 
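
A hedged sketch of wiring IntermediateCacher into an ML workflow; the column names and the input DataFrame are assumptions for illustration.

import org.apache.spark.ml.Pipeline

val cacher = new IntermediateCacher()
  .setInputCols(Array("user_id", "repo_id", "starring"))  // hypothetical columns

// Use it directly ...
val cachedDF = cacher.transform(rawStarringDF)            // rawStarringDF is assumed to exist
// ... or as a pipeline stage that caches the projected columns between stages.
val pipeline = new Pipeline().setStages(Array(cacher))
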
Example 30
Source File: RankingMetricFormatter.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._
import ws.vinta.albedo.evaluators.RankingEvaluator._

class RankingMetricFormatter(override val uid: String, val sourceType: String)
  extends Transformer with DefaultParamsWritable {

  def this(sourceType: String) = {
    this(Identifiable.randomUID("rankingMetricFormatter"), sourceType)
  }

  val userCol = new Param[String](this, "userCol", "User column name")

  def getUserCol: String = $(userCol)

  def setUserCol(value: String): this.type = set(userCol, value)
  setDefault(userCol -> "user")

  val itemCol = new Param[String](this, "itemCol", "Item column name")

  def getItemCol: String = $(itemCol)

  def setItemCol(value: String): this.type = set(itemCol, value)
  setDefault(itemCol -> "item")

  val predictionCol = new Param[String](this, "predictionCol", "Prediction column name")

  def getPredictionCol: String = $(predictionCol)

  def setPredictionCol(value: String): this.type = set(predictionCol, value)
  setDefault(predictionCol -> "prediction")

  val topK = new IntParam(this, "topK", "Recommend top-k items for every user")

  def getTopK: Int = $(topK)

  def setTopK(value: Int): this.type = set(topK, value)
  setDefault(topK -> 15)

  override def transformSchema(schema: StructType): StructType = {
    Map($(userCol) -> IntegerType, $(itemCol) -> IntegerType)
      .foreach{
        case(columnName: String, expectedDataType: DataType) => {
          val actualDataType = schema(columnName).dataType
          require(actualDataType.equals(expectedDataType), s"Column $columnName must be of type $expectedDataType but was actually $actualDataType.")
        }
      }

    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    sourceType match {
      case "als" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), col($(predictionCol)).desc, $(topK)))
      case "lr" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), toArrayUDF(col($(predictionCol))).getItem(1).desc, $(topK)))
    }
  }

  override def copy(extra: ParamMap): RankingMetricFormatter = {
    val copied = new RankingMetricFormatter(uid, sourceType)
    copyValues(copied, extra)
  }
}

object RankingMetricFormatter extends DefaultParamsReadable[RankingMetricFormatter] 
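
A hedged usage sketch: formatting ALS predictions into per-user top-k item lists before ranking evaluation. The column names and the input DataFrame are assumptions.

val formatter = new RankingMetricFormatter("als")
  .setUserCol("user_id")
  .setItemCol("repo_id")
  .setPredictionCol("prediction")
  .setTopK(30)

val userPredictedItemsDF = formatter.transform(alsPredictionDF)  // alsPredictionDF is assumed to exist
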
Example 31
Source File: UserRepoTransformer.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._

class UserRepoTransformer(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("userRepoTransformer"))
  }

  val inputCols: StringArrayParam = new StringArrayParam(this, "inputCols", "Input column names")

  def getInputCols: Array[String] = $(inputCols)

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)

  override def transformSchema(schema: StructType): StructType = {
    $(inputCols).foreach((inputColName: String) => {
      require(schema.fieldNames.contains(inputColName), s"Input column $inputColName must exist.")
    })

    val newFields: Array[StructField] = Array(
      StructField("repo_language_index_in_user_recent_repo_languages", IntegerType, nullable = false),
      StructField("repo_language_count_in_user_recent_repo_languages", IntegerType, nullable = false)
    )
    StructType(schema.fields ++ newFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    import dataset.sparkSession.implicits._

    dataset
      .withColumn("repo_language_index_in_user_recent_repo_languages", repoLanguageIndexInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages"))
      .withColumn("repo_language_count_in_user_recent_repo_languages", repoLanguageCountInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages"))
  }

  override def copy(extra: ParamMap): UserRepoTransformer = {
    defaultCopy(extra)
  }
}

object UserRepoTransformer extends DefaultParamsReadable[UserRepoTransformer] 
Example 32
Source File: ContentRecommender.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.recommenders

import org.apache.http.HttpHost
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.elasticsearch.action.search.SearchRequest
import org.elasticsearch.client.{RestClient, RestHighLevelClient}
import org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item
import org.elasticsearch.index.query.QueryBuilders._
import org.elasticsearch.search.SearchHit
import org.elasticsearch.search.builder.SearchSourceBuilder
import ws.vinta.albedo.closures.DBFunctions._

class ContentRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("contentRecommender"))
  }

  val enableEvaluationMode = new Param[Boolean](this, "enableEvaluationMode", "Should be enable for evaluation only")

  def getEnableEvaluationMode: Boolean = $(enableEvaluationMode)

  def setEnableEvaluationMode(value: Boolean): this.type = set(enableEvaluationMode, value)
  setDefault(enableEvaluationMode -> false)

  override def source = "content"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    import userDF.sparkSession.implicits._

    val userRecommendedItemDF = userDF
      .as[Int]
      .flatMap {
        case (userId) => {
          // When the More Like This query is issued with document ids,
          // the documents used as the query conditions are filtered out of the results.
          // That is not what we want during evaluation,
          // so we use the user's subsequent k starred repos as the query condition instead.
          val limit = $(topK)
          val offset = if ($(enableEvaluationMode)) $(topK) else 0
          val repoIds = selectUserStarredRepos(userId, limit, offset)

          val lowClient = RestClient.builder(new HttpHost("127.0.0.1", 9200, "http")).build()
          val highClient = new RestHighLevelClient(lowClient)

          val fields = Array("description", "full_name", "language", "topics")
          val texts = Array("")
          val items = repoIds.map((itemId: Int) => new Item("repo", "repo_info_doc", itemId.toString))
          val queryBuilder = moreLikeThisQuery(fields, texts, items)
            .minTermFreq(2)
            .maxQueryTerms(50)

          val searchSourceBuilder = new SearchSourceBuilder()
          searchSourceBuilder.query(queryBuilder)
          searchSourceBuilder.size($(topK))
          searchSourceBuilder.from(0)

          val searchRequest = new SearchRequest()
          searchRequest.indices("repo")
          searchRequest.types("repo_info_doc")
          searchRequest.source(searchSourceBuilder)

          val searchResponse = highClient.search(searchRequest)
          val hits = searchResponse.getHits
          val searchHits = hits.getHits

          val userItemScoreTuples = searchHits.map((searchHit: SearchHit) => {
            val itemId = searchHit.getId.toInt
            val score = searchHit.getScore
            (userId, itemId, score)
          })

          lowClient.close()

          userItemScoreTuples
        }
      }
      .toDF($(userCol), $(itemCol), $(scoreCol))
      .withColumn($(sourceCol), lit(source))

    userRecommendedItemDF
  }
} 
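
A hedged usage sketch; it assumes the Recommender base class (not shown in this listing) exposes the topK and column params used above, and that a Dataset of user ids is available.

val contentRecommender = new ContentRecommender()
  .setEnableEvaluationMode(true)  // offset the query so held-out repos are not also used as query conditions

val userRecommendationsDF = contentRecommender.recommendForUsers(testUserDF)  // testUserDF is assumed to exist
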
Example 33
Source File: PopularityRecommender.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.recommenders

import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import ws.vinta.albedo.utils.DatasetUtils._

class PopularityRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("popularityRecommender"))
  }

  override def source = "popularity"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    implicit val spark: SparkSession = userDF.sparkSession
    import spark.implicits._

    val popularRepoDF = loadPopularRepoDF()
      .limit($(topK))
      .cache()

    def calculateScoreUDF = udf((stargazers_count: Int, created_at: java.sql.Timestamp) => {
      val valueScore = math.round(math.log10(stargazers_count) * 1000.0) / 1000.0
      val timeScore = (created_at.getTime / 1000.0) / (60 * 60 * 24 * 30 * 12) / 5.0
      valueScore + timeScore
    })

    userDF
      .select($(userCol))
      .crossJoin(popularRepoDF)
      .select(col($(userCol)), $"repo_id".alias($(itemCol)), calculateScoreUDF($"repo_stargazers_count", $"repo_created_at").alias($(scoreCol)))
      .withColumn($(sourceCol), lit(source))
  }
} 
Example 34
Source File: CurationRecommender.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.recommenders

import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import ws.vinta.albedo.utils.DatasetUtils._

class CurationRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("curationRecommender"))
  }

  override def source = "curation"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    implicit val spark: SparkSession = userDF.sparkSession
    import spark.implicits._

    val rawStarringDS = loadRawStarringDS().cache()

    val curatorIds = Array(652070, 1912583, 59990, 646843, 28702) // vinta, saiday, tzangms, fukuball, wancw
    val curatedRepoDF = rawStarringDS
      .select($"repo_id", $"starred_at")
      .where($"user_id".isin(curatorIds: _*))
      .groupBy($"repo_id")
      .agg(max($"starred_at").alias("starred_at"))
      .orderBy($"starred_at".desc)
      .limit($(topK))
      .cache()

    def calculateScoreUDF = udf((starred_at: java.sql.Timestamp) => {
      starred_at.getTime / 1000.0
    })

    userDF
      .select($(userCol))
      .crossJoin(curatedRepoDF)
      .select(col($(userCol)), $"repo_id".alias($(itemCol)), calculateScoreUDF($"starred_at").alias($(scoreCol)))
      .withColumn($(sourceCol), lit(source))
  }
} 
Example 35
Source File: ALSRecommender.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.recommenders

import com.github.fommil.netlib.F2jBLAS
import org.apache.spark.ml.recommendation.ALSModel
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.settings

class ALSRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("alsRecommender"))
  }

  private def alsModel: ALSModel = {
    val alsModelPath = s"${settings.dataDir}/${settings.today}/alsModel.parquet"
    ALSModel.load(alsModelPath)
  }

  def blockify(factors: Dataset[(Int, Array[Float])], blockSize: Int = 4096): Dataset[Seq[(Int, Array[Float])]] = {
    import factors.sparkSession.implicits._
    factors.mapPartitions(_.grouped(blockSize))
  }

  override def source = "als"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    import userDF.sparkSession.implicits._

    val activeUsers = userDF.select(col($(userCol)).alias("id"))
    val userFactors = alsModel.userFactors.join(activeUsers, Seq("id"))
    val itemFactors = alsModel.itemFactors
    val rank = alsModel.rank
    val num = $(topK)

    val userFactorsBlocked = blockify(userFactors.as[(Int, Array[Float])])
    val itemFactorsBlocked = blockify(itemFactors.as[(Int, Array[Float])])
    val ratings = userFactorsBlocked.crossJoin(itemFactorsBlocked)
      .as[(Seq[(Int, Array[Float])], Seq[(Int, Array[Float])])]
      .flatMap { case (srcIter, dstIter) =>
        val m = srcIter.size
        val n = math.min(dstIter.size, num)
        val output = new Array[(Int, Int, Float)](m * n)
        var i = 0
        val pq = new BoundedPriorityQueue[(Int, Float)](num)(Ordering.by(_._2))
        srcIter.foreach { case (srcId, srcFactor) =>
          dstIter.foreach { case (dstId, dstFactor) =>
            val score = new F2jBLAS().sdot(rank, srcFactor, 1, dstFactor, 1)
            pq += dstId -> score
          }
          pq.foreach { case (dstId, score) =>
            output(i) = (srcId, dstId, score)
            i += 1
          }
          pq.clear()
        }
        output.toSeq
      }

    ratings
      .toDF($(userCol), $(itemCol), $(scoreCol))
      .withColumn($(sourceCol), lit(source))
  }
} 
Example 36
Source File: SimpleVectorAssembler.scala    From albedo   with MIT License 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkException
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row}

import scala.collection.mutable.ArrayBuilder



// Assumed class header: the original declaration is not shown in this snippet,
// so a minimal one is supplied here to make it compile.
class SimpleVectorAssembler(override val uid: String)
  extends Transformer with HasInputCols with HasOutputCol with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("simpleVectorAssembler"))

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)

    val schema = dataset.schema
    val assembleFunc = udf { r: Row =>
      SimpleVectorAssembler.assemble(r.toSeq: _*)
    }
    val args = $(inputCols).map { c =>
      schema(c).dataType match {
        case DoubleType => dataset(c)
        case _: VectorUDT => dataset(c)
        case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid")
      }
    }

    dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputColNames = $(inputCols)
    val outputColName = $(outputCol)
    val inputDataTypes = inputColNames.map(name => schema(name).dataType)
    inputDataTypes.foreach {
      case _: NumericType | BooleanType =>
      case t if t.isInstanceOf[VectorUDT] =>
      case other =>
        throw new IllegalArgumentException(s"Data type $other is not supported.")
    }
    if (schema.fieldNames.contains(outputColName)) {
      throw new IllegalArgumentException(s"Output column $outputColName already exists.")
    }
    StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true))
  }

  override def copy(extra: ParamMap): SimpleVectorAssembler = defaultCopy(extra)
}

object SimpleVectorAssembler extends DefaultParamsReadable[SimpleVectorAssembler] {
  override def load(path: String): SimpleVectorAssembler = super.load(path)

  def assemble(vv: Any*): Vector = {
    val indices = ArrayBuilder.make[Int]
    val values = ArrayBuilder.make[Double]
    var cur = 0
    vv.foreach {
      case v: Double =>
        if (v != 0.0) {
          indices += cur
          values += v
        }
        cur += 1
      case vec: Vector =>
        vec.foreachActive { case (i, v) =>
          if (v != 0.0) {
            indices += cur + i
            values += v
          }
        }
        cur += vec.size
      case null =>
        // TODO: output Double.NaN?
        throw new SparkException("Values to assemble cannot be null.")
      case o =>
        throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.")
    }
    Vectors.sparse(cur, indices.result(), values.result()).compressed
  }
} 
Example 37
Source File: CouchbaseSink.scala    From couchbase-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.couchbase.spark.sql.streaming

import com.couchbase.spark.Logging
import org.apache.spark.sql.{DataFrame, SaveMode}
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.sql.types.StringType
import com.couchbase.spark.sql._
import com.couchbase.spark._
import com.couchbase.client.core.CouchbaseException
import com.couchbase.client.java.document.JsonDocument
import com.couchbase.client.java.document.json.JsonObject
import scala.concurrent.duration._



class CouchbaseSink(options: Map[String, String]) extends Sink with Logging {

  override def addBatch(batchId: Long, data: DataFrame): Unit = {
    val bucketName = options.get("bucket").orNull
    val idFieldName = options.getOrElse("idField", DefaultSource.DEFAULT_DOCUMENT_ID_FIELD)
    val removeIdField = options.getOrElse("removeIdField", "true").toBoolean
    val timeout = options.get("timeout").map(v => Duration(v.toLong, MILLISECONDS))

    val createDocument = options.get("expiry").map(_.toInt)
      .map(expiry => (id: String, content: JsonObject) => JsonDocument.create(id, expiry, content))
      .getOrElse((id: String, content: JsonObject) => JsonDocument.create(id, content))

    data.toJSON
      .queryExecution
      .toRdd
      .map(_.get(0, StringType).asInstanceOf[UTF8String].toString())
      .map { rawJson =>
          val encoded = JsonObject.fromJson(rawJson)
          val id = encoded.get(idFieldName)

          if (id == null) {
              throw new Exception(s"Could not find ID field $idFieldName in $encoded")
          }

          if (removeIdField) {
              encoded.removeKey(idFieldName)
          }

          createDocument(id.toString, encoded)
      }
      .saveToCouchbase(bucketName, StoreMode.UPSERT, timeout)
  }

} 
Example 38
Source File: DataFrameReaderFunctions.scala    From couchbase-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.couchbase.spark.sql

import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, DataFrameReader}

class DataFrameReaderFunctions(@transient val dfr: DataFrameReader) extends Serializable {

  // Assumed format name: the original definition is not shown in this snippet,
  // but `buildFrame` needs it and the connector's DefaultSource is used elsewhere on this page.
  private val source = "com.couchbase.spark.sql.DefaultSource"

  
  private def buildFrame(options: Map[String, String] = null, schema: StructType = null,
    schemaFilter: Option[Filter] = null): DataFrame = {
    val builder = dfr
      .format(source)
      .schema(schema)

    val filter = schemaFilter.map(N1QLRelation.filterToExpression)
    if (filter.isDefined) {
      builder.option("schemaFilter", filter.get)
    }

    if (options != null) {
      builder.options(options)
    }

    builder.load()
  }

} 
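
This class backs the couchbase(...) reader extension used in the specs further down this page; a minimal hedged sketch of that usage follows.

import com.couchbase.spark.sql._
import org.apache.spark.sql.sources.EqualTo

// `spark` is an existing SparkSession configured for Couchbase.
val airlines = spark.sqlContext.read
  .couchbase(EqualTo("type", "airline"), Map("bucket" -> "travel-sample"))
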
Example 39
Source File: N1qlSpec.scala    From couchbase-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.couchbase.spark.n1ql

import com.couchbase.client.core.CouchbaseException
import com.couchbase.client.java.error.QueryExecutionException
import com.couchbase.client.java.query.N1qlQuery
import org.apache.spark.{SparkConf, SparkContext, SparkException}
import org.apache.spark.sql.sources.EqualTo
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.scalatest._
import com.couchbase.spark._
import com.couchbase.spark.connection.CouchbaseConnection
import com.couchbase.spark.sql.N1QLRelation
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import scala.util.control.NonFatal

class N1qlSpec extends FunSuite with Matchers with BeforeAndAfterAll {

  private val master = "local[2]"
  private val appName = "cb-int-specs1"

  private var spark: SparkSession = _


  override def beforeAll(): Unit = {
    spark = SparkSession
      .builder()
      .master(master)
      .appName(appName)
      .config("spark.couchbase.username", "Administrator")
      .config("spark.couchbase.password", "password")
      // Open 2 buckets as tests below rely on it
      .config("com.couchbase.bucket.default", "")
      .config("com.couchbase.bucket.travel-sample", "")
      .getOrCreate()
  }

  override def afterAll(): Unit = {
    CouchbaseConnection().stop()
    spark.stop()
  }

  test("Creating N1QLRelation with default bucket, when two buckets exist, should fail") {
    assertThrows[IllegalStateException] {
      spark.read
        .format("com.couchbase.spark.sql.DefaultSource")
        .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline")))
        .option("schemaFilter", "`type` = 'airline'")
        .schema(StructType(StructField("name", StringType) :: Nil))
        .load()
    }
  }

  test("Creating N1QLRelation with non-default bucket, when two buckets exist, should succeed") {
    spark.read
      .format("com.couchbase.spark.sql.DefaultSource")
      .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline")))
      .option("schemaFilter", "`type` = 'airline'")
      .option("bucket", "travel-sample")
      .schema(StructType(StructField("name", StringType) :: Nil))
      .load()
  }

  test("N1QL failures should fail the Observable") {
    try {
      spark.sparkContext
        .couchbaseQuery(N1qlQuery.simple("BAD QUERY"), bucketName = "default")
        .collect()
        .foreach(println)
      fail()
    }
    catch {
      case e: SparkException =>
        assert (e.getCause.isInstanceOf[QueryExecutionException])
        val err = e.getCause.asInstanceOf[QueryExecutionException]
        assert (err.getMessage == "syntax error - at QUERY")
      case NonFatal(e) =>
        println(e)
        fail()
    }
  }
} 
Example 40
Source File: CouchbaseDataFrameSpec.scala    From couchbase-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.couchbase.spark.sql

import com.couchbase.spark.connection.CouchbaseConnection
import org.apache.avro.generic.GenericData.StringType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession}
import org.apache.spark.sql.sources.EqualTo
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest._
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class CouchbaseDataFrameSpec extends FlatSpec with Matchers with BeforeAndAfterAll {

  private val master = "local[2]"
  private val appName = "cb-int-specs1"

  private var spark: SparkSession = null


  override def beforeAll(): Unit = {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)
      .set("spark.couchbase.nodes", "127.0.0.1")
      .set("com.couchbase.username", "Administrator")
      .set("com.couchbase.password", "password")
      .set("com.couchbase.bucket.default", "")
      .set("com.couchbase.bucket.travel-sample", "")
    spark = SparkSession.builder().config(conf).getOrCreate()

    loadData()
  }

  override def afterAll(): Unit = {
    CouchbaseConnection().stop()
    spark.stop()
  }

  def loadData(): Unit = {

  }

  "If two buckets are used and the bucket is specified the API" should
    "not fail" in {
    val ssc = spark.sqlContext
    ssc.read.couchbase(EqualTo("type", "airline"), Map("bucket" -> "travel-sample"))
  }

  "The DataFrame API" should "infer the schemas" in {
    val ssc = spark.sqlContext
    import com.couchbase.spark.sql._

    val airline = ssc.read.couchbase(EqualTo("type", "airline"), Map("bucket" -> "travel-sample"))
    val airport = ssc.read.couchbase(EqualTo("type", "airport"), Map("bucket" -> "travel-sample"))
    val route = ssc.read.couchbase(EqualTo("type", "route"), Map("bucket" -> "travel-sample"))
    val landmark = ssc.read.couchbase(EqualTo("type", "landmark"), Map("bucket" -> "travel-sample"))


    airline
      .limit(10)
      .write
      .mode(SaveMode.Overwrite)
      .couchbase(Map("bucket" -> "default"))

    // TODO: validate schemas which are inferred on a field and type basis

  }

  it should "write and ignore" in {
    val ssc = spark.sqlContext
    import com.couchbase.spark.sql._

    // create df, write it twice
    val data = ("Michael", 28, true)
    val df = ssc.createDataFrame(spark.sparkContext.parallelize(Seq(data)))

    df.write
      .mode(SaveMode.Ignore)
      .couchbase(options = Map("idField" -> "_1", "bucket" -> "default"))

    df.write
      .mode(SaveMode.Ignore)
      .couchbase(options = Map("idField" -> "_1", "bucket" -> "default"))
  }

  it should "filter based on a function" in {
    val ssc = spark.sqlContext
    import com.couchbase.spark.sql._

    val airlineBySubstrCountry: DataFrame = ssc.read.couchbase(
      EqualTo("'substr(country, 0, 6)'", "United"), Map("bucket" -> "travel-sample"))

    airlineBySubstrCountry.count() should equal(6797)
  }

} 
Example 41
Source File: CustomConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.DataFrame

import CustomConstraint.{FailureMsg, SuccessMsg}

import scala.util.Try

case class CustomConstraint(name: String,
                            constraintFunction: DataFrame => Either[FailureMsg, SuccessMsg]
                           ) extends Constraint {

  val fun = (df: DataFrame) => {
    val tryFun = Try(constraintFunction(df))
    val messagePrefix = s"Custom constraint '$name'"
    val message = tryFun.map {
      case Left(failureMsg) => s"$messagePrefix failed: $failureMsg"
      case Right(successMsg) => s"$messagePrefix succeeded: $successMsg"
    }.recover {
      case throwable => s"$messagePrefix errored: $throwable"
    }.get
    val status = ConstraintUtil.tryToStatus[Either[FailureMsg, SuccessMsg]](tryFun, _.isRight)
    CustomConstraintResult(this, message, status)
  }

}

case class CustomConstraintResult(constraint: CustomConstraint,
                                  message: String,
                                  status: ConstraintStatus) extends ConstraintResult[CustomConstraint]

object CustomConstraint {

  type SuccessMsg = String
  type FailureMsg = String

} 
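
A hedged usage sketch: a custom constraint that checks a DataFrame is non-empty. The constraint would normally be run through ddq's Check API; here the constraint function is invoked directly for illustration.

val nonEmpty = CustomConstraint(
  name = "non-empty",
  constraintFunction = df => if (df.count() > 0) Right("DataFrame has rows") else Left("DataFrame is empty")
)

val result = nonEmpty.fun(someDF)  // someDF is an assumed DataFrame; result.message describes the outcome
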
Example 42
Source File: FunctionalDependencyConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class FunctionalDependencyConstraint(determinantSet: Seq[String],
                                          dependentSet: Seq[String]) extends Constraint {

  require(determinantSet.nonEmpty, "determinantSet must not be empty")
  require(dependentSet.nonEmpty, "dependentSet must not be empty")

  val fun = (df: DataFrame) => {
    val determinantColumns = determinantSet.map(columnName => new Column(columnName))
    val dependentColumns = dependentSet.map(columnName => new Column(columnName))
    val maybeRelevantSelection = Try(df.select(determinantColumns ++ dependentColumns: _*))

    val maybeDeterminantValueCounts = maybeRelevantSelection.map(_.distinct.groupBy(determinantColumns: _*).count)
    val maybeViolatingDeterminantValuesCount = maybeDeterminantValueCounts.map(_.filter(new Column("count") =!= 1).count)
    FunctionalDependencyConstraintResult(
      constraint = this,
      data = maybeViolatingDeterminantValuesCount.toOption.map(FunctionalDependencyConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeViolatingDeterminantValuesCount, _ == 0)
    )
  }

}

case class FunctionalDependencyConstraintResult(constraint: FunctionalDependencyConstraint,
                                                data: Option[FunctionalDependencyConstraintResultData],
                                                status: ConstraintStatus) extends ConstraintResult[FunctionalDependencyConstraint] {

  val message: String = {
    val maybeFailedRows = data.map(_.failedRows)
    val maybeRowPluralS = maybeFailedRows.map(failedRows => if (failedRows == 1) "" else "s")
    val dependentSet = constraint.dependentSet
    val determinantString = s"${constraint.determinantSet.mkString(", ")}"
    val dependentString = s"${dependentSet.mkString(", ")}"
    val (columnPluralS, columnVerb) = if (dependentSet.size == 1) ("", "is") else ("s", "are")
    (status, maybeFailedRows, maybeRowPluralS) match {
      case (ConstraintSuccess, Some(0), _) =>
        s"Column$columnPluralS $dependentString $columnVerb functionally dependent on $determinantString."
      case (ConstraintFailure, Some(failedRows), Some(rowPluralS)) =>
        s"Column$columnPluralS $dependentString $columnVerb not functionally dependent on " +
        s"$determinantString ($failedRows violating determinant value$rowPluralS)."
      case (ConstraintError(throwable), None, None) =>
        s"Checking whether column$columnPluralS $dependentString $columnVerb functionally " +
          s"dependent on $determinantString failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class FunctionalDependencyConstraintResultData(failedRows: Long) 
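
A short sketch with hypothetical column names: checking that zip_code functionally determines city.

val zipDeterminesCity = FunctionalDependencyConstraint(
  determinantSet = Seq("zip_code"),
  dependentSet = Seq("city")
)

val result = zipDeterminesCity.fun(addressDF)  // addressDF is an assumed DataFrame
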
Example 43
Source File: NumberOfRowsConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.functions.count
import org.apache.spark.sql.{Column, DataFrame}

case class NumberOfRowsConstraint private[ddq] (expected: Column) extends Constraint {

  val fun = (df: DataFrame) => {
    val countDf = df.agg(count(new Column("*")).as(NumberOfRowsConstraint.countKey))
    val actual = countDf.collect().map(_.getLong(0)).apply(0)
    val satisfied = countDf.select(expected).collect().map(_.getBoolean(0)).apply(0)
    NumberOfRowsConstraintResult(
      constraint = this,
      actual = actual,
      status = if (satisfied) ConstraintSuccess else ConstraintFailure
    )
  }

}

object NumberOfRowsConstraint {

  private[constraints] val countKey: String = "count"

  def apply(expected: Column => Column): NumberOfRowsConstraint = {
    new NumberOfRowsConstraint(expected(new Column(countKey)))
  }

  def greaterThan(expected: Int): NumberOfRowsConstraint = {
    NumberOfRowsConstraint(_ > expected)
  }

  def lessThan(expected: Int): NumberOfRowsConstraint = {
    NumberOfRowsConstraint(_ < expected)
  }

  def equalTo(expected: Int): NumberOfRowsConstraint = {
    NumberOfRowsConstraint(_ === expected)
  }

}

case class NumberOfRowsConstraintResult(constraint: NumberOfRowsConstraint,
                                        actual: Long,
                                        status: ConstraintStatus) extends ConstraintResult[NumberOfRowsConstraint] {

  val message: String = {
    val expected = constraint.expected
    status match {
      case ConstraintSuccess => s"The number of rows satisfies $expected."
      case ConstraintFailure => s"The actual number of rows $actual does not satisfy $expected."
      case default => throw IllegalConstraintResultException(this)
    }
  }

} 
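
The companion object's factory methods cover the common comparisons; a short hedged sketch with an assumed input DataFrame:

val atLeast1000Rows = NumberOfRowsConstraint.greaterThan(999)
val exactly42Rows = NumberOfRowsConstraint.equalTo(42)

val result = atLeast1000Rows.fun(eventsDF)  // eventsDF is an assumed DataFrame
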
Example 44
Source File: AlwaysNullConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class AlwaysNullConstraint(columnName: String) extends Constraint {

  override val fun = (df: DataFrame) => {
    val tryNotNullCount = Try(df.filter(new Column(columnName).isNotNull).count)
    AlwaysNullConstraintResult(
      constraint = this,
      status = ConstraintUtil.tryToStatus[Long](tryNotNullCount, _ == 0),
      data = tryNotNullCount.toOption.map(AlwaysNullConstraintResultData)
    )
  }

}

case class AlwaysNullConstraintResult(constraint: AlwaysNullConstraint,
                                      status: ConstraintStatus,
                                      data: Option[AlwaysNullConstraintResultData]
                                     ) extends ConstraintResult[AlwaysNullConstraint] {

  val message: String = {
    val columnName = constraint.columnName
    val maybeNonNullRows = data.map(_.nonNullRows)
    val maybePluralS = maybeNonNullRows.map(n => if (n == 1) "" else "s")
    (status, maybeNonNullRows, maybePluralS) match {
      case (ConstraintError(throwable), None, None) =>
        s"Checking column $columnName for being always null failed: $throwable"
      case (ConstraintSuccess, Some(0), Some(pluralS)) =>
        s"Column $columnName is always null."
      case (ConstraintFailure, Some(nonNullRows), Some(pluralS)) =>
        s"Column $columnName contains $nonNullRows non-null row$pluralS (should always be null)."
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class AlwaysNullConstraintResultData(nonNullRows: Long) 
Example 45
Source File: StringColumnConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.DataFrame

import scala.util.Try

case class StringColumnConstraint(constraintString: String) extends Constraint {

  val fun = (df: DataFrame) => {
    val maybeSucceedingRows = Try(df.filter(constraintString).count)
    val count = df.count
    val maybeFailingRows = maybeSucceedingRows.map(succeedingRows => count - succeedingRows)
    StringColumnConstraintResult(
      constraint = this,
      data = maybeFailingRows.toOption.map(StringColumnConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeFailingRows, _ == 0)
    )
  }

}

case class StringColumnConstraintResult(constraint: StringColumnConstraint,
                                        data: Option[StringColumnConstraintResultData],
                                        status: ConstraintStatus) extends ConstraintResult[StringColumnConstraint] {

  val message: String = ColumnConstraintUtil.createColumnConstraintMessage(
    status = status,
    constraintResult = this,
    constraintString = constraint.constraintString,
    maybeViolatingRows = data.map(_.failedRows)
  )

}

case class StringColumnConstraintResultData(failedRows: Long) 
Example 46
Source File: DateFormatConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import java.text.SimpleDateFormat

import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class DateFormatConstraint(columnName: String,
                                formatString: String) extends Constraint {

  val fun = (df: DataFrame) => {
    val cannotBeDate = udf((column: String) =>
      column != null && Try {
        val format = new SimpleDateFormat(formatString)
        format.setLenient(false)
        format.parse(column)
      }.isFailure)
    val maybeCannotBeDateCount = Try(df.filter(cannotBeDate(new Column(columnName))).count)
    DateFormatConstraintResult(
      this,
      data = maybeCannotBeDateCount.toOption.map(DateFormatConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeCannotBeDateCount, _ == 0)
    )
  }

}

case class DateFormatConstraintResult(constraint: DateFormatConstraint,
                                      data: Option[DateFormatConstraintResultData],
                                      status: ConstraintStatus) extends ConstraintResult[DateFormatConstraint] {

  val message: String = {
    val format = constraint.formatString
    val columnName = constraint.columnName
    val maybeFailedRows = data.map(_.failedRows)
    val maybePluralS = maybeFailedRows.map(failedRows => if (failedRows == 1) "" else "s")
    val maybeVerb = maybeFailedRows.map(failedRows => if (failedRows == 1) "is" else "are")
    (status, maybeFailedRows, maybePluralS, maybeVerb) match {
      case (ConstraintSuccess, Some(0), _, _) =>
        s"Column $columnName is formatted by $format."
      case (ConstraintFailure, Some(failedRows), Some(pluralS), Some(verb)) =>
        s"Column $columnName contains $failedRows row$pluralS that $verb not formatted by $format."
      case (ConstraintError(throwable), None, None, None) =>
        s"Checking whether column $columnName is formatted by $format failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }

  }

}

case class DateFormatConstraintResultData(failedRows: Long) 
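
A short sketch with a hypothetical column: verifying that created_at values parse with a SimpleDateFormat pattern.

val isoDates = DateFormatConstraint("created_at", "yyyy-MM-dd")
val result = isoDates.fun(ordersDF)  // ordersDF is an assumed DataFrame
println(result.message)
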
Example 47
Source File: TypeConversionConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class TypeConversionConstraint(columnName: String,
                                    convertedType: DataType) extends Constraint {

  val fun = (df: DataFrame) => {
    val originalColumn = new Column(columnName)
    val castedColumnName = columnName + "_casted"
    val maybeCasted = Try(df.select(originalColumn, originalColumn.cast(convertedType).as(castedColumnName)))
    val maybeFailedCastsAndOriginalType = maybeCasted.map(casted => {
      val failedCastsCount = casted.filter(new Column(castedColumnName).isNull && originalColumn.isNotNull).count
      val originalType = df.schema.find(_.name == columnName).get.dataType
      (failedCastsCount, originalType)
    })
    TypeConversionConstraintResult(
      constraint = this,
      data = maybeFailedCastsAndOriginalType.toOption.map{ case (failedCastsCount, originalType) =>
        TypeConversionConstraintResultData(
          originalType = originalType,
          failedRows = failedCastsCount
        )
      },
      status = ConstraintUtil.tryToStatus[Long](maybeFailedCastsAndOriginalType.map{
        case (failedCastsCount, originalType) => failedCastsCount
      }, _ == 0)
    )
  }

}

case class TypeConversionConstraintResult(constraint: TypeConversionConstraint,
                                          data: Option[TypeConversionConstraintResultData],
                                          status: ConstraintStatus) extends ConstraintResult[TypeConversionConstraint] {

  val message: String = {
    val convertedType = constraint.convertedType
    val columnName = constraint.columnName
    val maybePluralSVerb = data.map(data => if (data.failedRows == 1) ("", "is") else ("s", "are"))
    (status, data, maybePluralSVerb) match {
      case (ConstraintSuccess, Some(TypeConversionConstraintResultData(originalType, 0)), _) =>
        s"Column $columnName can be converted from $originalType to $convertedType."
      case (ConstraintFailure, Some(TypeConversionConstraintResultData(originalType, failedRows)), Some((pluralS, verb))) =>
        s"Column $columnName cannot be converted from $originalType to $convertedType. " +
        s"$failedRows row$pluralS could not be converted."
      case (ConstraintError(throwable), None, None) =>
        s"Checking whether column $columnName can be converted to $convertedType failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class TypeConversionConstraintResultData(originalType: DataType, failedRows: Long) 
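
A short sketch with a hypothetical column: checking whether a string column can be cast to DoubleType.

import org.apache.spark.sql.types.DoubleType

val amountIsNumeric = TypeConversionConstraint("amount", DoubleType)
val result = amountIsNumeric.fun(paymentsDF)  // paymentsDF is an assumed DataFrame
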
Example 48
Source File: ConditionalColumnConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class ConditionalColumnConstraint(statement: Column, implication: Column) extends Constraint {

  val fun = (df: DataFrame) => {
    val maybeFailingRows = Try {
      val succeedingRows = df.filter(!statement || implication).count
      df.count - succeedingRows
    }
    ConditionalColumnConstraintResult(
      constraint = this,
      data = maybeFailingRows.toOption.map(ConditionalColumnConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeFailingRows, _ == 0)
    )
  }

}

case class ConditionalColumnConstraintResult(constraint: ConditionalColumnConstraint,
                                             data: Option[ConditionalColumnConstraintResultData],
                                             status: ConstraintStatus) extends ConstraintResult[ConditionalColumnConstraint] {

  val message: String = ColumnConstraintUtil.createColumnConstraintMessage(
    status = status,
    constraintResult = this,
    constraintString = s"${constraint.statement} -> ${constraint.implication}",
    maybeViolatingRows = data.map(_.failedRows)
  )

}

case class ConditionalColumnConstraintResultData(failedRows: Long) 
Example 49
Source File: ColumnColumnConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class ColumnColumnConstraint(constraintColumn: Column) extends Constraint {

  val fun = (df: DataFrame) => {
    val maybeFailingRows = Try {
      val succeedingRows = df.filter(constraintColumn).count
      df.count - succeedingRows
    }
    ColumnColumnConstraintResult(
      constraint = this,
      data = maybeFailingRows.toOption.map(ColumnColumnConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeFailingRows, _ == 0)
    )
  }

}

case class ColumnColumnConstraintResult(constraint: ColumnColumnConstraint,
                                        data: Option[ColumnColumnConstraintResultData],
                                        status: ConstraintStatus) extends ConstraintResult[ColumnColumnConstraint] {

  val message: String = ColumnConstraintUtil.createColumnConstraintMessage(
    status = status,
    constraintResult = this,
    constraintString = constraint.constraintColumn.toString,
    maybeViolatingRows = data.map(_.failedRows)
  )

}

case class ColumnColumnConstraintResultData(failedRows: Long) 
Example 50
Source File: JoinableConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class JoinableConstraint(columnNames: Seq[(String, String)], referenceTable: DataFrame) extends Constraint {

  val fun = (df: DataFrame) => {
    val columnsMap = columnNames.toMap
    val renamedColumns = columnNames.map{ case (baseColumn, refColumn) => ("b_" + baseColumn, "r_" + refColumn)}
    val (baseColumns, refColumns) = columnNames.unzip
    val (renamedBaseColumns, renamedRefColumns) = renamedColumns.unzip

    val maybeNonUniqueRows = Try(
      referenceTable.groupBy(refColumns.map(new Column(_)):_*).count.filter(new Column("count") > 1).count
    )

    // rename all columns to avoid ambiguous column references
    val maybeRenamedDfAndRef = maybeNonUniqueRows.map(_ => {
      val renamedDf = df.select(baseColumns.zip(renamedBaseColumns).map {
        case (original, renamed) => new Column(original).as(renamed)
      }: _*)
      val renamedRef = referenceTable.select(refColumns.zip(renamedRefColumns).map {
        case (original, renamed) => new Column(original).as(renamed)
      }: _*)
      (renamedDf, renamedRef)
    })

    // check if join yields some values
    val maybeDistinctBeforeAndMatchingRows = maybeRenamedDfAndRef.map { case (renamedDf, renamedRef) =>
      val renamedDfDistinct = renamedDf.distinct
      val distinctBefore = renamedDfDistinct.count
      val joinCondition = renamedColumns.map{
        case (baseColumn, refColumn) => new Column(baseColumn) === new Column(refColumn)
      }.reduce(_ && _)
      val join = renamedDfDistinct.join(renamedRef, joinCondition)
      val matchingRows = join.distinct.count
      (distinctBefore, matchingRows)
    }

    JoinableConstraintResult(
      constraint = this,
      data = maybeDistinctBeforeAndMatchingRows.toOption.map{ case (distinctBefore, matchingRows) =>
        JoinableConstraintResultData(
          distinctBefore = distinctBefore,
          matchingKeys = matchingRows
        )
      },
      status = ConstraintUtil.tryToStatus[Long](maybeDistinctBeforeAndMatchingRows.map{
        case (distinctBefore, matchingRows) => matchingRows
      }, _ > 0)
    )
  }

}

case class JoinableConstraintResult(constraint: JoinableConstraint,
                                    data: Option[JoinableConstraintResultData],
                                    status: ConstraintStatus) extends ConstraintResult[JoinableConstraint] {

  val maybeMatchRatio: Option[Double] = data.map(d => d.matchingKeys.toDouble / d.distinctBefore)

  val message: String = {
    val columnNames = constraint.columnNames
    val columnsString = columnNames.map{ case (baseCol, refCol) => baseCol + "->" + refCol }.mkString(", ")
    val maybeMatchPercentage = maybeMatchRatio.map(_ * 100.0)
    (status, data, maybeMatchPercentage) match {
      case (ConstraintSuccess, Some(JoinableConstraintResultData(distinctBefore, matchingKeys)), Some(matchPercentage)) =>
        s"Key $columnsString can be used for joining. " +
        s"Join columns cardinality in base table: $distinctBefore. " +
        s"Join columns cardinality after joining: $matchingKeys (${"%.2f".format(matchPercentage)}" + "%)."
      case (ConstraintFailure, Some(_), Some(_)) => s"Key $columnsString cannot be used for joining (no result)."
      case (ConstraintError(throwable), None, None) =>
        s"Checking whether $columnsString can be used for joining failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class JoinableConstraintResultData(distinctBefore: Long, matchingKeys: Long) 
Example 51
Source File: AnyOfConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class AnyOfConstraint(columnName: String, allowedValues: Set[Any]) extends Constraint {

  val fun = (df: DataFrame) => {
    val maybeError = Try(df.select(new Column(columnName))) // check if column is not ambiguous
    val maybeColumnIndex = maybeError.map(_ => df.columns.indexOf(columnName))
    val maybeNotAllowedCount = maybeColumnIndex.map(columnIndex => df.rdd.filter(row => !row.isNullAt(columnIndex) &&
      !allowedValues.contains(row.get(columnIndex))).count)
    AnyOfConstraintResult(
      constraint = this,
      data = maybeNotAllowedCount.toOption.map(AnyOfConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeNotAllowedCount, _ == 0)
    )
  }

}

case class AnyOfConstraintResult(constraint: AnyOfConstraint,
                                 data: Option[AnyOfConstraintResultData],
                                 status: ConstraintStatus) extends ConstraintResult[AnyOfConstraint] {
  val message: String = {
    val allowed = constraint.allowedValues
    val columnName = constraint.columnName
    val maybeFailedRows = data.map(_.failedRows)
    val maybePluralSAndVerb = maybeFailedRows.map(failedRows => if (failedRows == 1) ("", "is") else ("s", "are"))
    (status, maybeFailedRows, maybePluralSAndVerb) match {
      case (ConstraintSuccess, Some(0), Some((pluralS, verb))) =>
        s"Column $columnName contains only values in $allowed."
      case (ConstraintFailure, Some(failedRows), Some((pluralS, verb))) =>
        s"Column $columnName contains $failedRows row$pluralS that $verb not in $allowed."
      case (ConstraintError(throwable), None, None) =>
        s"Checking whether column $columnName contains only values in $allowed failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }
}

case class AnyOfConstraintResultData(failedRows: Long) 
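A minimal usage sketch (not part of the original project) that evaluates the constraint function above the same way the reporter tests further down do; the local session and the sample data are assumptions.

import org.apache.spark.sql.SparkSession
import de.frosner.ddq.constraints.AnyOfConstraint

object AnyOfConstraintSketch {
  def main(args: Array[String]): Unit = {
    // Local session only for the sketch; any existing SparkSession would do.
    val spark = SparkSession.builder().master("local[*]").appName("AnyOfConstraintSketch").getOrCreate()
    import spark.implicits._

    val colors = Seq("red", "green", "purple").toDF("color")

    // "purple" is the only value outside the allowed set, so the constraint fails with one row.
    val constraint = AnyOfConstraint("color", Set[Any]("red", "green", "blue"))
    println(constraint.fun(colors).message)

    spark.stop()
  }
}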
Example 52
Source File: ForeignKeyConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class ForeignKeyConstraint(columnNames: Seq[(String, String)], referenceTable: DataFrame) extends Constraint {

  val fun = (df: DataFrame) => {
    val renamedColumns = columnNames.map{ case (baseColumn, refColumn) => ("b_" + baseColumn, "r_" + refColumn)}
    val (baseColumns, refColumns) = columnNames.unzip
    val (renamedBaseColumns, renamedRefColumns) = renamedColumns.unzip

    // check if foreign key is a key in reference table
    val maybeNonUniqueRows = Try(
      referenceTable.groupBy(refColumns.map(new Column(_)):_*).count.filter(new Column("count") > 1).count
    )
    if (maybeNonUniqueRows.toOption.exists(_ > 0)) {
      ForeignKeyConstraintResult(
        constraint = this,
        data = Some(ForeignKeyConstraintResultData(numNonMatchingRefs = None)),
        status = ConstraintFailure
      )
    } else {
      // rename all columns to avoid ambiguous column references
      val maybeRenamedDfAndRef = maybeNonUniqueRows.map(_ => {
        val renamedDf = df.select(baseColumns.zip(renamedBaseColumns).map {
          case (original, renamed) => new Column(original).as(renamed)
        }: _*)
        val renamedRef = referenceTable.select(refColumns.zip(renamedRefColumns).map {
          case (original, renamed) => new Column(original).as(renamed)
        }: _*)
        (renamedDf, renamedRef)
      })

      // outer join against the reference and check for null (unmatched) reference values
      val maybeLeftOuterJoin = maybeRenamedDfAndRef.map { case (renamedDf, renamedRef) =>
        val joinCondition = renamedColumns.map {
          case (baseColumn, refColumn) => new Column(baseColumn) === new Column(refColumn)
        }.reduce(_ && _)
        renamedDf.distinct.join(renamedRef, joinCondition, "outer")
      }

      val maybeNotMatchingRefs = maybeLeftOuterJoin.map(_.filter(renamedRefColumns.map(new Column(_).isNull).reduce(_ && _)).count)

      ForeignKeyConstraintResult(
        constraint = this,
        data = maybeNotMatchingRefs.toOption.map(Some(_)).map(ForeignKeyConstraintResultData),
        status = ConstraintUtil.tryToStatus[Long](maybeNotMatchingRefs, _ == 0)
      )
    }
  }

}

case class ForeignKeyConstraintResult(constraint: ForeignKeyConstraint,
                                      data: Option[ForeignKeyConstraintResultData],
                                      status: ConstraintStatus) extends ConstraintResult[ForeignKeyConstraint] {

  val message: String = {
    val referenceTable = constraint.referenceTable
    val columnNames = constraint.columnNames
    val columnsString = columnNames.map { case (baseCol, refCol) => baseCol + "->" + refCol }.mkString(", ")
    val isPlural = columnNames.length > 1
    val (columnDo, columnDefine, columnIs, columnPluralS) =
      if (isPlural) ("do", "define", "are", "s") else ("does", "defines", "is", "")
    val columnNoun = "Column" + columnPluralS
    val maybeNumNonMatchingRefs = data.map(_.numNonMatchingRefs)
    (status, maybeNumNonMatchingRefs) match {
      case (ConstraintSuccess, Some(Some(0))) =>
        s"$columnNoun $columnsString $columnDefine a foreign key " +
        s"pointing to the reference table $referenceTable."
      case (ConstraintFailure, Some(None)) =>
        s"$columnNoun $columnsString $columnIs not a key in the reference table."
      case (ConstraintFailure, Some(Some(nonMatching))) =>
        val (rowsNoun, rowsDo) = if (nonMatching != 1) ("rows", "do") else ("row", "does")
        s"$columnNoun $columnsString $columnDo not define a foreign key " +
          s"pointing to $referenceTable. $nonMatching $rowsNoun $rowsDo not match."
      case (ConstraintError(throwable), None) =>
        s"Checking whether ${columnNoun.toLowerCase} $columnsString $columnDefine a foreign key failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class ForeignKeyConstraintResultData(numNonMatchingRefs: Option[Long]) 
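A hedged sketch of the foreign-key check in action; the two toy frames and the local session are assumptions, while the constructor and fun come from the code above.

import org.apache.spark.sql.SparkSession
import de.frosner.ddq.constraints.ForeignKeyConstraint

object ForeignKeyConstraintSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ForeignKeyConstraintSketch").getOrCreate()
    import spark.implicits._

    val orders    = Seq((1, "a"), (2, "b"), (3, "z")).toDF("id", "customer")
    val customers = Seq(("a", "Alice"), ("b", "Bob")).toDF("customer_id", "name")

    // The base column "customer" should reference the key column "customer_id" of the reference table.
    // Order (3, "z") has no matching customer, so the check fails with one non-matching row.
    val constraint = ForeignKeyConstraint(Seq("customer" -> "customer_id"), customers)
    println(constraint.fun(orders).message)

    spark.stop()
  }
}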
Example 53
Source File: ExactEqualityConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class ExactEqualityConstraint(other: DataFrame) extends Constraint {

  val fun = (df: DataFrame) => {
    val tryEquality = Try {
      if (df.schema != other.schema) {
        throw new IllegalArgumentException("Schemas do not match")
      }
      val dfGroupCount = df.groupBy(df.columns.map(new Column(_)):_*).count()
      val otherGroupCount = other.groupBy(df.columns.map(new Column(_)):_*).count()
      val diffCount1 = dfGroupCount.except(otherGroupCount).count()
      val diffCount2 = otherGroupCount.except(dfGroupCount).count()
      (diffCount1, diffCount2)
    }

    ExactEqualityConstraintResult(
      constraint = this,
      data = tryEquality.toOption.map {
        case (leftToRightCount, rightToLeftCount) => ExactEqualityConstraintData(leftToRightCount, rightToLeftCount)
      },
      status = ConstraintUtil.tryToStatus[(Long, Long)](tryEquality, {
        case (leftToRightCount, rightToLeftCount) => leftToRightCount + rightToLeftCount == 0
      })
    )
  }

}

case class ExactEqualityConstraintResult(constraint: ExactEqualityConstraint,
                                         data: Option[ExactEqualityConstraintData],
                                         status: ConstraintStatus) extends ConstraintResult[ExactEqualityConstraint] {
  val message: String = {
    val otherName = constraint.other.toString()
    val maybeNonMatchingRows = data.map(data => (data.numNonMatchingLeftToRight, data.numNonMatchingRightToLeft))
    val maybePluralS = maybeNonMatchingRows.map {
      case (leftToRightCount, rightToLeftCount) => (
        if (leftToRightCount == 1) "" else "s",
        if (rightToLeftCount == 1) "" else "s"
      )
    }
    val maybeVerb = maybeNonMatchingRows.map {
      case (leftToRightCount, rightToLeftCount) => (
        if (leftToRightCount == 1) "is" else "are",
        if (rightToLeftCount == 1) "is" else "are"
      )
    }
    (status, maybeNonMatchingRows, maybePluralS, maybeVerb) match {
      case (ConstraintSuccess, Some(_), Some(_), Some(_)) =>
        s"It is equal to $otherName."
      case (
        ConstraintFailure,
        Some((leftToRightRows, rightToLeftRows)),
        Some((leftToRightPluralS, rightToLeftPluralS)),
        Some((leftToRightVerb, rightToLeftVerb))
        ) =>
          s"It is not equal ($leftToRightRows distinct count row$leftToRightPluralS $leftToRightVerb " +
            s"present in the checked dataframe but not in the other " +
            s"and $rightToLeftRows distinct count row$rightToLeftPluralS $rightToLeftVerb " +
            s"present in the other dataframe but not in the checked one) to $otherName."
      case (ConstraintError(throwable), None, None, None) =>
        s"Checking equality with $otherName failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }
}

case class ExactEqualityConstraintData(numNonMatchingLeftToRight: Long, numNonMatchingRightToLeft: Long) 
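A short sketch, again assuming a local session and toy data, illustrating that the equality check compares grouped row counts, so row order does not matter.

import org.apache.spark.sql.SparkSession
import de.frosner.ddq.constraints.ExactEqualityConstraint

object ExactEqualityConstraintSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ExactEqualityConstraintSketch").getOrCreate()
    import spark.implicits._

    // Same rows in a different order: the grouped counts match, so the constraint succeeds.
    val left  = Seq((1, "a"), (2, "b")).toDF("id", "value")
    val right = Seq((2, "b"), (1, "a")).toDF("id", "value")
    println(ExactEqualityConstraint(right).fun(left).message)

    spark.stop()
  }
}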
Example 54
Source File: NeverNullConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class NeverNullConstraint(columnName: String) extends Constraint {

  val fun = (df: DataFrame) => {
    val tryNullCount = Try(df.filter(new Column(columnName).isNull).count)
    NeverNullConstraintResult(
      constraint = this,
      data = tryNullCount.toOption.map(NeverNullConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](tryNullCount, _ == 0)
    )
  }

}

case class NeverNullConstraintResult(constraint: NeverNullConstraint,
                                     data: Option[NeverNullConstraintResultData],
                                     status: ConstraintStatus) extends ConstraintResult[NeverNullConstraint] {
  val message: String = {
    val columnName = constraint.columnName
    val maybeNullRows = data.map(_.nullRows)
    val maybePluralS = maybeNullRows.map(nullRows => if (nullRows == 1) "" else "s")
    val maybeVerb = maybeNullRows.map(nullRows => if (nullRows == 1) "is" else "are")
    (status, maybeNullRows, maybePluralS, maybeVerb) match {
      case (ConstraintSuccess, Some(0), Some(pluralS), Some(verb)) =>
        s"Column $columnName is never null."
      case (ConstraintFailure, Some(nullRows), Some(pluralS), Some(verb)) =>
        s"Column $columnName contains $nullRows row$pluralS that $verb null (should never be null)."
      case (ConstraintError(throwable), None, None, None) =>
        s"Checking column $columnName for being never null failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }
}

case class NeverNullConstraintResultData(nullRows: Long) 
Example 55
Source File: UniqueKeyConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class UniqueKeyConstraint(columnNames: Seq[String]) extends Constraint {

  require(columnNames.nonEmpty)

  val fun = (df: DataFrame) => {
    val columns = columnNames.map(name => new Column(name))
    val maybeNonUniqueRows = Try(df.groupBy(columns: _*).count.filter(new Column("count") > 1).count)
    UniqueKeyConstraintResult(
      constraint = this,
      data = maybeNonUniqueRows.toOption.map(UniqueKeyConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeNonUniqueRows, _ == 0)
    )
  }

}

case class UniqueKeyConstraintResult(constraint: UniqueKeyConstraint,
                                     data: Option[UniqueKeyConstraintResultData],
                                     status: ConstraintStatus) extends ConstraintResult[UniqueKeyConstraint] {

  val message: String = {
    val columnNames = constraint.columnNames
    val columnsString = columnNames.mkString(", ")
    val isPlural = columnNames.length > 1
    val columnNoun = "Column" + (if (isPlural) "s" else "")
    val columnVerb = if (isPlural) "are" else "is"
    val maybeNumNonUniqueTuples = data.map(_.numNonUniqueTuples)
    val maybePluralS = maybeNumNonUniqueTuples.map(numNonUniqueTuples => if (numNonUniqueTuples != 1) "s" else "")
    (status, maybeNumNonUniqueTuples, maybePluralS) match {
      case (ConstraintSuccess, Some(0), _) =>
        s"$columnNoun $columnsString $columnVerb a key."
      case (ConstraintFailure, Some(numNonUniqueTuples), Some(pluralS)) =>
        s"$columnNoun $columnsString $columnVerb not a key ($numNonUniqueTuples non-unique tuple$pluralS)."
      case (ConstraintError(throwable), None, None) =>
        s"Checking whether ${columnNoun.toLowerCase()} $columnsString $columnVerb a key failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class UniqueKeyConstraintResultData(numNonUniqueTuples: Long) 
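A sketch under the same assumptions (local session, toy data) showing a composite key that fails the uniqueness check.

import org.apache.spark.sql.SparkSession
import de.frosner.ddq.constraints.UniqueKeyConstraint

object UniqueKeyConstraintSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("UniqueKeyConstraintSketch").getOrCreate()
    import spark.implicits._

    // The tuple (1, "a") occurs twice, so (k1, k2) is reported as a non-unique key.
    val df = Seq((1, "a"), (1, "a"), (2, "b")).toDF("k1", "k2")
    println(UniqueKeyConstraint(Seq("k1", "k2")).fun(df).message)

    spark.stop()
  }
}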
Example 56
Source File: RegexConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import java.util.regex.Pattern

import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class RegexConstraint(columnName: String, regex: String) extends Constraint {

  val fun = (df: DataFrame) => {
    val pattern = Pattern.compile(regex)
    val doesNotMatch = udf((column: String) => column != null && !pattern.matcher(column).find())
    val maybeDoesNotMatchCount = Try(df.filter(doesNotMatch(new Column(columnName))).count)
    RegexConstraintResult(
      constraint = this,
      data = maybeDoesNotMatchCount.toOption.map(RegexConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeDoesNotMatchCount, _ == 0)
    )
  }

}

case class RegexConstraintResult(constraint: RegexConstraint,
                                 data: Option[RegexConstraintResultData],
                                 status: ConstraintStatus) extends ConstraintResult[RegexConstraint] {

  val message: String = {
    val columnName = constraint.columnName
    val regex = constraint.regex
    val maybeFailedRows = data.map(_.failedRows)
    val maybePluralSAndVerb = maybeFailedRows.map(failedRows => if (failedRows == 1) ("", "does") else ("s", "do"))
    (status, maybeFailedRows, maybePluralSAndVerb) match {
      case (ConstraintSuccess, Some(0), _) =>
        s"Column $columnName matches $regex"
      case (ConstraintFailure, Some(failedRows), Some((pluralS, verb))) =>
        s"Column $columnName contains $failedRows row$pluralS that $verb not match $regex"
      case (ConstraintError(throwable), None, None) =>
        s"Checking whether column $columnName matches $regex failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class RegexConstraintResultData(failedRows: Long) 
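One more sketch with an assumed session and data, showing that null values are ignored by the UDF above while non-matching strings are counted.

import org.apache.spark.sql.SparkSession
import de.frosner.ddq.constraints.RegexConstraint

object RegexConstraintSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("RegexConstraintSketch").getOrCreate()
    import spark.implicits._

    // The null row is skipped by the UDF; only "not-a-date" fails the pattern.
    val df = Seq("2021-01-01", "not-a-date", null).toDF("day")
    println(RegexConstraint("day", """^\d{4}-\d{2}-\d{2}$""").fun(df).message)

    spark.stop()
  }
}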
Example 57
Source File: MarkdownReporterTest.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.reporters

import java.io.{ByteArrayOutputStream, PrintStream}

import de.frosner.ddq.constraints._
import de.frosner.ddq.core._
import de.frosner.ddq.testutils.{DummyConstraint, DummyConstraintResult}
import org.apache.spark.sql.DataFrame
import org.mockito.Mockito._
import org.scalatest.mock.MockitoSugar
import org.scalatest.{FlatSpec, Matchers}

class MarkdownReporterTest extends FlatSpec with Matchers with MockitoSugar {

  "A Markdown reporter" should "produce correct output for a check with constraints" in {
    val baos = new ByteArrayOutputStream()
    val markdownReporter = new MarkdownReporter(new PrintStream(baos))

    val df = mock[DataFrame]
    val dfName = "myDf"
    val dfColumns = Array("1", "2")
    val dfCount = 5
    when(df.columns).thenReturn(dfColumns)

    val header = s"Checking $dfName"
    val prologue = s"It has a total number of ${dfColumns.size} columns and $dfCount rows."
    val message1 = "1"
    val status1 = ConstraintSuccess
    val constraint1 = DummyConstraint(message1, status1)
    val result1 = constraint1.fun(df)

    val message2 = "2"
    val status2 = ConstraintFailure
    val constraint2 = DummyConstraint(message2, status2)
    val result2 = constraint2.fun(df)

    val message3 = "3"
    val status3 = ConstraintError(new IllegalArgumentException())
    val constraint3 = DummyConstraint(message3, status3)
    val result3 = DummyConstraintResult(constraint3, message3, status3)

    val constraints = Map[Constraint, ConstraintResult[Constraint]](
      constraint1 -> result1,
      constraint2 -> result2,
      constraint3 -> result3
    )

    val check = Check(df, Some(dfName), Option.empty, constraints.keys.toSeq)

    markdownReporter.report(CheckResult(constraints, check, dfCount))
    val expectedOutput = s"""**$header**

$prologue

- *SUCCESS*: ${result1.message}
- *FAILURE*: ${result2.message}
- *ERROR*: ${result3.message}

"""

    baos.toString shouldBe expectedOutput
  }

  it should "produce correct output for a check without constraint" in {
    val baos = new ByteArrayOutputStream()
    val markdownReporter = new MarkdownReporter(new PrintStream(baos))

    val df = mock[DataFrame]
    val dfName = "myDf"
    val dfColumns = Array("1", "2")
    val dfCount = 5
    when(df.columns).thenReturn(dfColumns)

    val header = s"Checking $dfName"
    val prologue = s"It has a total number of ${dfColumns.size} columns and $dfCount rows."
    val check = Check(df, Some(dfName), Option.empty, Seq.empty)

    markdownReporter.report(CheckResult(Map.empty, check, dfCount))
    val expectedOutput = s"""**$header**

$prologue

Nothing to check!

"""

    baos.toString shouldBe expectedOutput
  }

} 
Example 58
Source File: ConsoleReporterTest.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.reporters

import java.io.{ByteArrayOutputStream, PrintStream}

import de.frosner.ddq.constraints._
import de.frosner.ddq.core._
import de.frosner.ddq.testutils.{DummyConstraint, DummyConstraintResult}
import org.apache.spark.sql.DataFrame
import org.mockito.Mockito._
import org.scalatest.mock.MockitoSugar
import org.scalatest.{FlatSpec, Matchers}

class ConsoleReporterTest extends FlatSpec with Matchers with MockitoSugar {

  "A Console reporter" should "produce correct output for a check with constraints" in {
    val baos = new ByteArrayOutputStream()
    val consoleReporter = new ConsoleReporter(new PrintStream(baos))

    val df = mock[DataFrame]
    val displayName = "myDf"
    val dfColumns = Array("1", "2")
    val dfCount = 5
    when(df.columns).thenReturn(dfColumns)

    val header = s"Checking $displayName"
    val prologue = s"It has a total number of ${dfColumns.size} columns and $dfCount rows."

    val message1 = "1"
    val status1 = ConstraintSuccess
    val constraint1 = DummyConstraint(message1, status1)
    val result1 = constraint1.fun(df)

    val message2 = "2"
    val status2 = ConstraintFailure
    val constraint2 = DummyConstraint(message2, status2)
    val result2 = constraint2.fun(df)

    val message3 = "3"
    val status3 = ConstraintError(new IllegalArgumentException())
    val constraint3 = DummyConstraint(message3, status3)
    val result3 = DummyConstraintResult(constraint3, message3, status3)

    val constraints = Map[Constraint, ConstraintResult[Constraint]](
      constraint1 -> result1,
      constraint2 -> result2,
      constraint3 -> result3
    )
    val check = Check(df, Some(displayName), Option.empty, constraints.keys.toSeq)

    consoleReporter.report(CheckResult(constraints, check, dfCount))
    val expectedOutput = s"""${Console.BLUE}$header${Console.RESET}
${Console.BLUE}$prologue${Console.RESET}
${Console.GREEN}- ${result1.message}${Console.RESET}
${Console.RED}- ${result2.message}${Console.RESET}
${Console.YELLOW}- ${result3.message}${Console.RESET}

"""

    baos.toString shouldBe expectedOutput
  }

  it should "produce correct output for a check without constraint" in {
    val baos = new ByteArrayOutputStream()
    val consoleReporter = new ConsoleReporter(new PrintStream(baos))

    val df = mock[DataFrame]
    val displayName = "myDf"
    val dfColumns = Array("1", "2")
    val dfCount = 5
    when(df.columns).thenReturn(dfColumns)

    val header = s"Checking $displayName"
    val prologue = s"It has a total number of ${dfColumns.size} columns and $dfCount rows."
    val check = Check(df, Some(displayName), Option.empty, Seq.empty)

    consoleReporter.report(CheckResult(Map.empty, check, dfCount))
    val expectedOutput = s"""${Console.BLUE}$header${Console.RESET}
${Console.BLUE}$prologue${Console.RESET}
${Console.BLUE}Nothing to check!${Console.RESET}

"""

    baos.toString shouldBe expectedOutput
  }

} 
Example 59
Source File: TestData.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.testutils

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object TestData {

  def makeIntegerDf(spark: SparkSession, numbers: Seq[Int]): DataFrame =
    spark.createDataFrame(
      spark.sparkContext.makeRDD(numbers.map(Row(_))),
      StructType(List(StructField("column", IntegerType, nullable = false)))
    )

  def makeNullableStringDf(spark: SparkSession, strings: Seq[String]): DataFrame =
    spark.createDataFrame(spark.sparkContext.makeRDD(strings.map(Row(_))), StructType(List(StructField("column", StringType, nullable = true))))

  def makeIntegersDf(spark: SparkSession, row1: Seq[Int], rowN: Seq[Int]*): DataFrame = {
    val rows = row1 :: rowN.toList
    val numCols = row1.size
    val rdd = spark.sparkContext.makeRDD(rows.map(Row(_:_*)))
    val schema = StructType((1 to numCols).map(idx => StructField("column" + idx, IntegerType, nullable = false)))
    spark.createDataFrame(rdd, schema)
  }

} 
Example 60
Source File: RunnerTest.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.core


import de.frosner.ddq.constraints.{ConstraintFailure, ConstraintSuccess}
import de.frosner.ddq.reporters.Reporter
import de.frosner.ddq.testutils.DummyConstraint
import org.apache.spark.sql.DataFrame
import org.apache.spark.storage.StorageLevel
import org.mockito.Mockito._
import org.scalatest.mock.MockitoSugar
import org.scalatest.{FlatSpec, Matchers}

class RunnerTest extends FlatSpec with Matchers with MockitoSugar {

  "A runner" should "run with multiple checks" in {
    val df1 = mock[DataFrame]
    val df2 = mock[DataFrame]

    val message1 = "1"
    val status1 = ConstraintSuccess
    val constraint1 = DummyConstraint(message1, status1)
    val result1 = constraint1.fun(df1)

    val message2 = "2"
    val status2 = ConstraintFailure
    val constraint2 = DummyConstraint(message2, status2)
    val result2 = constraint2.fun(df2)

    val check1 = Check(df1, None, None, Seq(constraint1))
    val check2 = Check(df2, None, None, Seq(constraint2))

    val checkResults = Runner.run(List(check1, check2), List.empty)

    checkResults.size shouldBe 2

    val checkResult1 = checkResults(check1)
    val checkResult2 = checkResults(check2)

    checkResult1.check shouldBe check1
    checkResult1.constraintResults shouldBe Map((constraint1, result1))

    checkResult2.check shouldBe check2
    checkResult2.constraintResults shouldBe Map((constraint2, result2))
  }

  it should "persist and unpersist the data frame if a persist method is specified" in {
    val storageLevel = StorageLevel.MEMORY_AND_DISK

    val df = mock[DataFrame]
    when(df.persist(storageLevel)).thenReturn(df.asInstanceOf[df.type])

    val check = Check(df, None, Some(storageLevel), Seq(DummyConstraint("test", ConstraintSuccess)))
    val checkResult = Runner.run(List(check), List.empty)(check)

    verify(df).persist(storageLevel)
    verify(df).unpersist()
  }

  it should "not persist and unpersist the data frame if no persist method is specified" in {
    val df = mock[DataFrame]

    val check = Check(df, None, None, Seq(DummyConstraint("test", ConstraintSuccess)))
    val checkResult = Runner.run(List(check), List.empty)(check)

    verify(df, never()).persist()
    verify(df, never()).unpersist()
  }

  it should "report to all reporters what it returns" in {
    val df = mock[DataFrame]

    val check = Check(df, None, None, Seq(DummyConstraint("test", ConstraintSuccess)))
    val checkResult = Runner.run(List(check), List.empty)(check)

    val reporter1 = mock[Reporter]
    val reporter2 = mock[Reporter]

    Runner.run(List(check), List(reporter1, reporter2))
    verify(reporter1).report(checkResult)
    verify(reporter2).report(checkResult)
  }

} 
Example 61
Source File: QueryFunctions.scala    From azure-sqldb-spark   with MIT License 5 votes vote down vote up
package com.microsoft.azure.sqldb.spark.query

import java.sql.{Connection, SQLException}

import com.microsoft.azure.sqldb.spark.connect.ConnectionUtils._
import com.microsoft.azure.sqldb.spark.LoggingTrait
import com.microsoft.azure.sqldb.spark.config.{Config, SqlDBConfig}
import com.microsoft.azure.sqldb.spark.connect._
import org.apache.spark.sql.{DataFrame, SQLContext}


// Note: the enclosing declaration was stripped when this example was extracted. In the original
// project, sqlDBQuery lives in a wrapper around SQLContext; it is reconstructed here (as an
// assumption) so that the snippet compiles on its own.
private[spark] case class QueryFunctions(@transient sqlContext: SQLContext) extends LoggingTrait {

  def sqlDBQuery(config: Config): Either[DataFrame, Boolean] = {

    var connection: Connection = null

    val sql = config.get[String](SqlDBConfig.QueryCustom).getOrElse(
      throw new IllegalArgumentException("Query not found in QueryCustom in Config")
    )

    try {
      connection = getConnection(config)
      val statement = connection.createStatement()

      if (statement.execute(sql)) {
        Left(sqlContext.read.sqlDB(config))
      }
      else {
        Right(true)
      }
    }
    catch {
      case sqlException: SQLException => {
        sqlException.printStackTrace()
        Right(false)
      }
      case exception: Exception => {
        exception.printStackTrace()
        Right(false)
      }
    }
    finally {
      // Guard against an NPE when getConnection itself failed and connection is still null.
      if (connection != null) connection.close()
    }
  }
} 
Example 62
Source File: testData.scala    From sparkGLM   with Apache License 2.0 5 votes vote down vote up
package com.Alteryx.testUtils.data

import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.test._
import org.apache.spark.sql.test.TestSQLContext.implicits._


object testData {
  val numericDF: DataFrame = TestSQLContext.read.json(
    "./src/test/scala/com/Alteryx/testUtils/data/linear_reg_all_numeric.json")

  val mixedDF: DataFrame = TestSQLContext.read.json(
    "./src/test/scala/com/Alteryx/testUtils/data/linear_reg_mixed.json")

  case class testRow(intField: Int, strField: String, numField: Double)
  val dummyDF: DataFrame = {
    TestSQLContext.sparkContext.parallelize(
      testRow(1, "a", 1.0) ::
      testRow(2, "b", 2.0) ::
      testRow(3, "c", 3.0) :: Nil).toDF()
  }

  val oneLessCategoryDF: DataFrame = {
    TestSQLContext.sparkContext.parallelize(
      testRow(1, "a", 1.0) ::
      testRow(2, "b", 2.0) ::
      testRow(3, "a", 3.0) :: Nil).toDF()
  }

  val testRDD = TestSQLContext.sparkContext.parallelize(Seq(
      Array(1.0, 1.1, 21.4),
      Array(1.0, 2.2, 36.5),
      Array(1.0, 3.3, 15.0),
      Array(1.0, 4.4, 62.5),
      Array(1.0, 5.5, 36.1),
      Array(1.0, 6.6, 12.0),
      Array(1.0, 7.7, 37.0),
      Array(1.0, 8.8, 41.0),
      Array(1.0, 9.9, 36.6),
      Array(1.0, 11.0, 17.9),
      Array(1.0, 12.1, 53.1),
      Array(1.0, 13.2, 29.6),
      Array(1.0, 14.3, 8.3),
      Array(1.0, 15.4, -24.7),
      Array(1.0, 16.5, 41.0),
      Array(1.0, 17.6, 16.5),
      Array(1.0, 18.7, 16.0),
      Array(1.0, 19.8, 34.1),
      Array(1.0, 20.9, 30.5),
      Array(1.0, 22.0, 24.9),
      Array(1.0, 23.1, 30.3),
      Array(1.0, 24.2, 26.4),
      Array(1.0, 25.3, 11.2),
      Array(1.0, 26.4, -31.2),
      Array(1.0, 27.5, 19.9),
      Array(1.0, 28.6, 5.3),
      Array(1.0, 29.7, 2.2),
      Array(1.0, 30.8, -25.2),
      Array(1.0, 31.9, -6.5),
      Array(1.0, 33.0, 10.4),
      Array(1.0, 34.1, 28.1),
      Array(1.0, 35.2, -2.3),
      Array(1.0, 36.3, 6.5),
      Array(1.0, 37.4, -3.5),
      Array(1.0, 38.5, -31.0),
      Array(1.0, 39.6, -12.9),
      Array(1.0, 40.7, -13.6),
      Array(1.0, 41.8, -8.0),
      Array(1.0, 42.9, 14.1),
      Array(1.0, 44.0, 6.3),
      Array(1.0, 45.1, -13.4),
      Array(1.0, 46.2, -16.3),
      Array(1.0, 47.3, 1.6),
      Array(1.0, 48.4, -2.3),
      Array(1.0, 49.5, -28.3),
      Array(1.0, 50.6, -29.7),
      Array(1.0, 51.7, -9.4),
      Array(1.0, 52.8, -2.4),
      Array(1.0, 53.9, -21.1),
      Array(1.0, 55.0, -2.4)
  ), 4).map(x => Row(x(0), x(1), x(2)))

  val testSchema = StructType(
      StructField("intercept", DoubleType, true) ::
      StructField("x", DoubleType, true) ::
      StructField("y", DoubleType, true) :: Nil)

  val testDFSinglePart: DataFrame = {
    TestSQLContext.createDataFrame(testRDD, testSchema).coalesce(1)
  }

  val testDFMultiPart: DataFrame = {
    TestSQLContext.createDataFrame(testRDD, testSchema)
  }
} 
Example 63
Source File: DataFrameToMleap.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.converter

import com.truecar.mleap.runtime.types.StringArrayType
import com.truecar.mleap.spark
import com.truecar.mleap.spark.SparkDataset
import com.truecar.mleap.runtime.types
import com.truecar.mleap.spark.SparkLeapFrame
import org.apache.spark.ml.mleap
import org.apache.spark.mllib.linalg.VectorUDT
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.types._
import com.truecar.mleap.runtime.{Row => MleapRow}


case class DataFrameToMleap(dataset: DataFrame) {
  def toMleap: SparkLeapFrame = {
    val mleapFields = dataset.schema.fields.flatMap {
      field =>
        field.dataType match {
          case _: NumericType | BooleanType | StringType => Seq(types.StructField(field.name, types.DoubleType))
          case _: VectorUDT => Seq(types.StructField(field.name, types.VectorType))
          case _: StringType => Seq(types.StructField(field.name, types.StringType))
          case dataType: ArrayType =>
            dataType.elementType match {
              case StringType => Seq(types.StructField(field.name, StringArrayType))
              case _ => Seq()
            }
          case _ => Seq()
        }
    }

    toMleap(types.StructType(mleapFields))
  }

  def toMleap(schema: types.StructType): SparkLeapFrame = {
    val sparkSchema = dataset.schema

    // cast MLeap field numeric types to DoubleTypes
    val mleapCols = schema.fields.map {
      field =>
        field.dataType match {
          case types.DoubleType => dataset.col(field.name).cast(DoubleType).as(s"mleap.${field.name}")
          case types.StringType => dataset.col(field.name).cast(StringType).as(s"mleap.${field.name}")
          case types.VectorType => dataset.col(field.name).cast(new mleap.VectorUDT()).as(s"mleap.${field.name}")
          case types.StringArrayType => dataset.col(field.name).cast(new ArrayType(StringType, containsNull = false)).as(s"mleap.${field.name}")
        }
    }
    val cols = Seq(dataset.col("*")) ++ mleapCols
    val castDataset = dataset.select(cols: _*)

    val sparkIndices = sparkSchema.fields.indices
    val mleapIndices = (sparkSchema.fields.length until (sparkSchema.fields.length + schema.fields.length)).toArray

    val rdd = castDataset.rdd.map {
      row =>
        // finish converting Spark data structure to MLeap
        // TODO: make a Spark UDT for MleapVector and just
        // cast like we do for numeric types
        val mleapValues = mleapIndices.map(row.get)
        val mleapRow = MleapRow(mleapValues: _*)
        val sparkValues: IndexedSeq[Any] = sparkIndices.map(row.get)

        (mleapRow, sparkValues)
    }

    val mleapDataset = SparkDataset(rdd)
    SparkLeapFrame(schema,
      sparkSchema,
      mleapDataset)
  }
} 
Example 64
Source File: LeapFrameToSpark.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.converter

import com.truecar.mleap.core.linalg.Vector
import com.truecar.mleap.runtime.types.StructType
import com.truecar.mleap.spark.{SparkLeapFrame, MleapSparkSupport}
import org.apache.spark.sql.{types, Row, DataFrame, SQLContext}
import MleapSparkSupport._


trait LeapFrameToSpark[T] {
  def toSpark(t: T)(implicit sqlContext: SQLContext): DataFrame
}

case class LeapFrameToSparkWrapper[T: LeapFrameToSpark](t: T) {
  def toSpark(implicit sqlContext: SQLContext): DataFrame = {
    implicitly[LeapFrameToSpark[T]].toSpark(t)
  }
}

object LeapFrameToSpark {
  implicit object SparkLeapFrameToSpark extends LeapFrameToSpark[SparkLeapFrame] {
    override def toSpark(t: SparkLeapFrame)
                        (implicit sqlContext: SQLContext): DataFrame = {
      val outputNames = t.schema.fields.map(_.name).toSet -- t.sparkSchema.fields.map(_.name).toSet
      val outputs = outputNames.map {
        name => (t.schema(name), t.schema.indexOf(name))
      }.toArray.sortBy(_._2)
      val (outputFields, outputIndices) = outputs.unzip
      val outputMleapSchema = StructTypeToSpark(StructType(outputFields)).toSpark
      val outputSchema = types.StructType(t.sparkSchema.fields ++ outputMleapSchema.fields)

      val rows = t.dataset.rdd.map {
        case (mleapRow, sparkValues) =>
          val mleapData = outputIndices.map {
            index =>
              mleapRow.get(index) match {
                case value: Vector => value.toSpark
                case value => value
              }
          }

          Row(sparkValues ++ mleapData: _*)
      }

      sqlContext.createDataFrame(rows, outputSchema)
    }
  }
} 
Example 65
Source File: MleapSparkSupport.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package com.truecar.mleap.spark

import com.truecar.mleap.core.linalg
import com.truecar.mleap.runtime.transformer.{Transformer => MleapTransformer}
import com.truecar.mleap.runtime.{types, Row => MleapRow}
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.mleap.converter._
import org.apache.spark.ml.mleap.converter.runtime.{BaseTransformerConverter, TransformerToMleap}
import org.apache.spark.ml.mleap.converter.runtime.classification.DecisionTreeClassificationModelToMleap
import org.apache.spark.ml.mleap.converter.runtime.regression.DecisionTreeRegressionModelToMleap
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.tree._
import org.apache.spark.ml.Transformer
import org.apache.spark.mllib.linalg._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, SQLContext}


trait MleapSparkSupport extends BaseTransformerConverter {
  import scala.language.implicitConversions

  implicit def transformerToMleapLifted[T <: Transformer]
  (t: T)
  (implicit transformerToMleap: TransformerToMleap[T, _ <: MleapTransformer]): MleapTransformer = {
    transformerToMleap.toMleapLifted(t)
  }

  implicit def mleapTransformerWrapper[T <: MleapTransformer](t: T): MleapTransformerWrapper[T] = {
    MleapTransformerWrapper(t)
  }

  implicit def vectorToSpark(vector: linalg.Vector): VectorToSpark = VectorToSpark(vector)
  implicit def vectorToMleap(vector: Vector): VectorToMleap = VectorToMleap(vector)
  implicit def dataFrameToMleap(dataset: DataFrame): DataFrameToMleap = DataFrameToMleap(dataset)
  implicit def decisionTreeRegressionModelToMleap(tree: DecisionTreeRegressionModel): DecisionTreeRegressionModelToMleap = DecisionTreeRegressionModelToMleap(tree)
  implicit def decisionTreeClassificationModelToMleap(tree: DecisionTreeClassificationModel): DecisionTreeClassificationModelToMleap = DecisionTreeClassificationModelToMleap(tree)
  implicit def nodeToMleap(node: Node): NodeToMleap = NodeToMleap(node)
  implicit def splitToMleap(split: Split): SplitToMleap = SplitToMleap(split)
  implicit def structTypeToMleap(schema: StructType): StructTypeToMleap = StructTypeToMleap(schema)

  implicit def rowToSpark(row: MleapRow): RowToSpark = RowToSpark(row)
  implicit def structTypeToSpark(schema: types.StructType): StructTypeToSpark = StructTypeToSpark(schema)
  implicit def leapFrameToSpark[T: LeapFrameToSpark](frame: T): LeapFrameToSparkWrapper[T] = {
    LeapFrameToSparkWrapper(frame)
  }
  implicit def leapFrameToSparkConvert[T: LeapFrameToSpark](frame: T)
                                                           (implicit sqlContext: SQLContext): DataFrame = {
    implicitly[LeapFrameToSpark[T]].toSpark(frame)
  }
  implicit def dataFrameToLeapFrame(dataFrame: DataFrame): SparkLeapFrame = dataFrame.toMleap
}
object MleapSparkSupport extends MleapSparkSupport 
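A brief sketch of what the implicit conversions above enable: converting a DataFrame to a SparkLeapFrame and back. The round-trip helper and its name are assumptions, as is the in-scope SQLContext; the conversions themselves are the ones defined in the trait.

import com.truecar.mleap.spark.{MleapSparkSupport, SparkLeapFrame}
import MleapSparkSupport._
import org.apache.spark.sql.{DataFrame, SQLContext}

// Hypothetical helper that round-trips a DataFrame through the MLeap representation.
object MleapRoundTripSketch {
  def roundTrip(df: DataFrame)(implicit sqlContext: SQLContext): DataFrame = {
    val leapFrame: SparkLeapFrame = df.toMleap // via dataFrameToMleap / DataFrameToMleap.toMleap
    leapFrame.toSpark                          // via leapFrameToSpark / LeapFrameToSparkWrapper.toSpark
  }
}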
Example 66
Source File: DataPreprocess.scala    From xgbspark-text-classification   with Apache License 2.0 5 votes vote down vote up
package com.lenovo.ml

import org.apache.spark.sql.{SparkSession, DataFrame, Dataset}
import scala.collection.mutable
import scala.util.matching.Regex
import org.ansj.library.DicLibrary
import org.ansj.recognition.impl.StopRecognition
import org.ansj.splitWord.analysis.DicAnalysis


object DataPreprocess {
  def textCleaner(sparkSession: SparkSession, rawText: DataFrame): Dataset[String] = {
    // Filter out times/dates, URLs and email addresses from the text
    val regex1 = new Regex("""[-—0-9a-z]+[:]+[0-9a-z]+[:]?""")
    val regex2 = new Regex("""[0-9]+年|[0-9]+月|[0-9]+[日]|[0-9]+[天]|[0-9]+[号]|[0-9]+[次]""")
    val regex3 = new Regex("""http[s]?://[a-z0-9./?=_-]+""")
    val regex4 = new Regex("""[0-9_a-z]+([-+.][0-9_a-z]+)*@[0-9_a-z]+([-.][0-9_a-z]+)*\.[0-9_a-z]+([-.][0-9_a-z]+)*""")

    import sparkSession.implicits._
    rawText.map(x => x.toString).map(x => x.substring(1,x.length - 1).toLowerCase).map(x => regex1.replaceAllIn(x,""))
      .map(x => regex2.replaceAllIn(x,"")).map(x => regex3.replaceAllIn(x,"")).map(x => regex4.replaceAllIn(x,""))
  }

  def segWords(sparkSession: SparkSession, stopWordsPath: String, dictionaryPath: String, synonymWordsPath: String,
               singleWordsPath: String, rawText: DataFrame): DataFrame = {
    val filter = new StopRecognition()
    // Configure the stop-word parts of speech to filter out
    filter.insertStopNatures("w","ns","nr","t","r","u","e","y","o")
    // Load the stop-word list
    val stopWords = sparkSession.sparkContext.textFile(stopWordsPath).cache()
    stopWords.collect().foreach{line => filter.insertStopWords(line)}
    // Load the custom dictionary
    val dictionary = sparkSession.sparkContext.textFile(dictionaryPath).cache()
    dictionary.collect().foreach{line => DicLibrary.insert(DicLibrary.DEFAULT, line)}
    stopWords.collect().foreach{line => DicLibrary.insert(DicLibrary.DEFAULT, line)}
    // Build the synonym map
    val synonymWords = sparkSession.sparkContext.textFile(synonymWordsPath).cache()
    var synonymMap: Map[String, String] = Map()
    synonymWords.collect().foreach{line =>
      val data = line.split(" ",2)
      synonymMap = synonymMap + (data(0) -> data(1))
    }
    // Build the single-character whitelist
    val singleWords = sparkSession.sparkContext.textFile(singleWordsPath).cache()
    val singleWhiteList: mutable.Set[String] = mutable.Set()
    singleWords.collect().foreach{line => singleWhiteList.add(line)}

    // Broadcast the dictionaries and filters to the worker nodes
    val stop = sparkSession.sparkContext.broadcast(filter)
    val dic = sparkSession.sparkContext.broadcast(DicLibrary.get(DicLibrary.DEFAULT))
    val synonym = sparkSession.sparkContext.broadcast(synonymMap)
    val single = sparkSession.sparkContext.broadcast(singleWhiteList)

    // Read the text data, clean it, then segment it into words
    import sparkSession.implicits._
    textCleaner(sparkSession, rawText).map { x =>
      val parse = DicAnalysis.parse(x, dic.value).recognition(stop.value)
      // Extract the segmented words without part-of-speech tags
      val words = for(i<-Range(0,parse.size())) yield parse.get(i).getName
      val filterWords = words.map(_.trim).filter(x => x.length > 1 || single.value.contains(x))
      filterWords.map(x => if(synonym.value.contains(x)) synonym.value(x) else x).mkString(" ")
    }.toDF("words")
  }
} 
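A hedged sketch of wiring the two helpers above together; the input location and every dictionary path are hypothetical.

import org.apache.spark.sql.SparkSession
import com.lenovo.ml.DataPreprocess

object DataPreprocessSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DataPreprocessSketch").getOrCreate()

    // Hypothetical input: a single-column DataFrame of raw documents.
    val rawText = spark.read.textFile("hdfs:///tmp/raw_text").toDF()

    // Hypothetical resource paths for the dictionaries used during segmentation.
    val segmented = DataPreprocess.segWords(
      spark,
      "hdfs:///tmp/dicts/stop_words.txt",
      "hdfs:///tmp/dicts/user_dictionary.txt",
      "hdfs:///tmp/dicts/synonyms.txt",
      "hdfs:///tmp/dicts/single_word_whitelist.txt",
      rawText)

    segmented.show(5, truncate = false)
    spark.stop()
  }
}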
Example 67
Source File: RedshiftReaderM.scala    From SqlShift   with MIT License 5 votes vote down vote up
package com.databricks.spark.redshift

import com.amazonaws.auth.AWSCredentials
import com.amazonaws.services.s3.AmazonS3Client
import org.apache.spark.SparkContext
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.{DataFrame, SQLContext}

object RedshiftReaderM {

    val endpoint = "s3.ap-south-1.amazonaws.com"

    def getS3Client(provider: AWSCredentials): AmazonS3Client = {
        val client = new AmazonS3Client(provider)
        client.setEndpoint(endpoint)
        client
    }

    def getDataFrameForConfig(configs: Map[String, String], sparkContext: SparkContext, sqlContext: SQLContext): DataFrame = {
        val source: DefaultSource = new DefaultSource(new JDBCWrapper(), getS3Client)
        val br: BaseRelation = source.createRelation(sqlContext, configs)
        sqlContext.baseRelationToDataFrame(br)
    }
} 
Example 68
Source File: SparkNRedshiftUtil.scala    From SqlShift   with MIT License 5 votes vote down vote up
package com.goibibo.sqlshift

import java.sql.{Connection, DriverManager}
import java.util.Properties

import com.databricks.spark.redshift.RedshiftReaderM
import com.typesafe.config.Config
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfterAll, Suite}
import org.slf4j.{Logger, LoggerFactory}


trait SparkNRedshiftUtil extends BeforeAndAfterAll {
    self: Suite =>
    private val logger: Logger = LoggerFactory.getLogger(this.getClass)
    @transient private var _sc: SparkContext = _
    @transient private var _sqlContext: SQLContext = _

    def sc: SparkContext = _sc
    def sqlContext: SQLContext = _sqlContext

    private def getRedshiftConnection(config: Config): Connection = {
        val redshift = config.getConfig("redshift")
        val connectionProps = new Properties()
        connectionProps.put("user", redshift.getString("username"))
        connectionProps.put("password", redshift.getString("password"))
        val jdbcUrl = s"jdbc:redshift://${redshift.getString("hostname")}:${redshift.getInt("portno")}/${redshift.getString("database")}?useSSL=false"
        Class.forName("com.amazon.redshift.jdbc4.Driver")
        DriverManager.getConnection(jdbcUrl, connectionProps)
    }

    val getSparkContext: (SparkContext, SQLContext) = {
        val sparkConf: SparkConf = new SparkConf().setAppName("Full Dump Testing").setMaster("local")
        val sc: SparkContext = new SparkContext(sparkConf)
        val sqlContext: SQLContext = new SQLContext(sc)

        System.setProperty("com.amazonaws.services.s3.enableV4", "true")
        sc.hadoopConfiguration.set("fs.s3a.endpoint", "s3.ap-south-1.amazonaws.com")
        sc.hadoopConfiguration.set("fs.s3a.fast.upload", "true")
        (sc, sqlContext)
    }

    def readTableFromRedshift(config: Config, tableName: String): DataFrame = {
        val redshift: Config = config.getConfig("redshift")
        val options = Map("dbtable" -> tableName,
            "user" -> redshift.getString("username"),
            "password" -> redshift.getString("password"),
            "url" -> s"jdbc:redshift://${redshift.getString("hostname")}:${redshift.getInt("portno")}/${redshift.getString("database")}",
            "tempdir" -> config.getString("s3.location"),
            "aws_iam_role" -> config.getString("redshift.iamRole")
        )
        RedshiftReaderM.getDataFrameForConfig(options, sc, sqlContext)
    }

    def dropTableRedshift(config: Config, tables: String*): Unit = {
        logger.info("Droping table: {}", tables)
        val conn = getRedshiftConnection(config)
        val statement = conn.createStatement()
        try {
            val dropTableQuery = s"""DROP TABLE ${tables.mkString(",")}"""
            logger.info("Running query: {}", dropTableQuery)
            statement.executeUpdate(dropTableQuery)
        } finally {
            statement.close()
            conn.close()
        }
    }

    override protected def beforeAll(): Unit = {
        super.beforeAll()
        val (sc, sqlContext) = getSparkContext
        _sc = sc
        _sqlContext = sqlContext
    }

    override protected def afterAll(): Unit = {
        super.afterAll()
        _sc.stop()
    }
} 
Example 69
Source File: LogisticRegressionSuite.scala    From aardpfark   with Apache License 2.0 5 votes vote down vote up
package com.ibm.aardpfark.spark.ml.classification

import com.ibm.aardpfark.pfa.ProbClassifierResult

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.{DataFrame, Row}

class LogisticRegressionSuite extends SparkClassifierPFASuiteBase[ProbClassifierResult] {
  import spark.implicits._

  def getOutput(df: DataFrame) = {
    df.select(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).map
    {
      case Row(p: Double, raw: Vector, pr: Vector) => (p, raw.toArray, pr.toArray)
    }.toDF(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).toJSON.collect()
  }

  val binaryData = spark.read.format("libsvm").load("data/sample_libsvm_data.txt")
  val multiData = spark.read.format("libsvm").load("data/sample_multiclass_classification_data.txt")

  val clf = new LogisticRegression()

  override val sparkTransformer = clf.fit(binaryData)
  val result = sparkTransformer.transform(binaryData)
  override val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect()
  override val expectedOutput = getOutput(result)

  // Additional tests
  test("LogisticRegression w/o fitIntercept") {
    val sparkTransformer = clf.setFitIntercept(false).fit(binaryData)
    val result = sparkTransformer.transform(binaryData)
    val expectedOutput = getOutput(result)

    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("LogisticRegression w/ non-default threshold") {
    val sparkTransformer = clf.setThreshold(0.0).fit(binaryData)
    val result = sparkTransformer.transform(binaryData)
    val expectedOutput = getOutput(result)

    parityTest(sparkTransformer, input, expectedOutput)

    val sparkTransformer2 = clf.setThreshold(1.0).fit(binaryData)
    val result2 = sparkTransformer2.transform(binaryData)
    val expectedOutput2 = getOutput(result2)

    parityTest(sparkTransformer2, input, expectedOutput2)
  }

  test("MLOR w/ intercept") {
    val sparkTransformer = clf.fit(multiData)
    val result = sparkTransformer.transform(multiData)
    val input =  withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect()
    val expectedOutput = getOutput(result)

    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("MLOR w/o intercept") {
    val sparkTransformer = clf.setFitIntercept(false).fit(multiData)
    val result = sparkTransformer.transform(multiData)
    val input =  withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect()
    val expectedOutput = getOutput(result)

    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("MLOR w/ thresholds") {
    val sparkTransformer = clf.setThresholds(Array(0.1, 0.6, 0.3)).fit(multiData)
    val result = sparkTransformer.transform(multiData)
    val input =  withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect()
    val expectedOutput = getOutput(result)

    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("MLOR w/ thresholds - one zero") {
    val sparkTransformer = clf.setThresholds(Array(0.0, 0.6, 0.3)).fit(multiData)
    val result = sparkTransformer.transform(multiData)
    val input =  withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect()
    val expectedOutput = getOutput(result)

    parityTest(sparkTransformer, input, expectedOutput)
  }

} 
Example 70
Source File: DataCoder.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.hbase.examples

import org.apache.spark.sql.execution.datasources.hbase._
import org.apache.spark.sql.{DataFrame, SparkSession}

case class DCRecord(
    col00: String,
    col01: Int,
    col1: Boolean,
    col2: Double,
    col3: Float,
    col4: Int,
    col5: Long,
    col6: Short,
    col7: String,
    col8: Byte)

object DCRecord {
  def apply(i: Int): DCRecord = {
    DCRecord(s"row${"%03d".format(i)}",
      if (i % 2 == 0) {
        i
      } else {
        -i
      },
      i % 2 == 0,
      i.toDouble,
      i.toFloat,
      i,
      i.toLong,
      i.toShort,
      s"String$i extra",
      i.toByte)
  }
}

object DataCoder {
  def cat = s"""{
                |"table":{"namespace":"default", "name":"shcExampleDCTable", "tableCoder":"Phoenix", "version":"2.0"},
                |"rowkey":"key1:key2",
                |"columns":{
                |"col00":{"cf":"rowkey", "col":"key1", "type":"string"},
                |"col01":{"cf":"rowkey", "col":"key2", "type":"int"},
                |"col1":{"cf":"CF1", "col":"COL1", "type":"boolean"},
                |"col2":{"cf":"CF1", "col":"COL2", "type":"double"},
                |"col3":{"cf":"CF2", "col":"COL3", "type":"float"},
                |"col4":{"cf":"CF2", "col":"COL4", "type":"int"},
                |"col5":{"cf":"CF3", "col":"COL5", "type":"bigint"},
                |"col6":{"cf":"CF3", "col":"COL6", "type":"smallint"},
                |"col7":{"cf":"CF4", "col":"COL7", "type":"string"},
                |"col8":{"cf":"CF4", "col":"COL8", "type":"tinyint"}
                |}
                |}""".stripMargin

  def main(args: Array[String]){
    val spark = SparkSession.builder()
      .appName("DataCoderExample")
      .getOrCreate()

    val sc = spark.sparkContext
    val sqlContext = spark.sqlContext

    import sqlContext.implicits._

    def withCatalog(cat: String): DataFrame = {
      sqlContext
        .read
        .options(Map(HBaseTableCatalog.tableCatalog->cat))
        .format("org.apache.spark.sql.execution.datasources.hbase")
        .load()
    }

    // populate table with composite key
    val data = (0 to 255).map { i =>
      DCRecord(i)
    }

    sc.parallelize(data).toDF.write
      .options(Map(HBaseTableCatalog.tableCatalog -> cat, HBaseTableCatalog.newTable -> "5"))
      .format("org.apache.spark.sql.execution.datasources.hbase")
      .save()

    val df = withCatalog(cat)
    df.show
    df.filter($"col0" <= "row005")
      .select($"col0", $"col1").show
    df.filter($"col0" === "row005" || $"col0" <= "row005")
      .select($"col0", $"col1").show
    df.filter($"col0" > "row250")
      .select($"col0", $"col1").show
    df.registerTempTable("table1")
    val c = sqlContext.sql("select count(col1) from table1 where col0 < 'row050'")
    c.show()
  }
} 
Example 71
Source File: LRJobForDataSources.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.hbase.examples

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.execution.datasources.hbase.{HBaseRelation, HBaseTableCatalog}

case class LRRecord(
    key: Int,
    col1: Boolean,
    col2: Double,
    col3: Float)

object LRRecord {
  def apply(i: Int): LRRecord = {
    LRRecord(i,
      i % 2 == 0,
      i.toDouble,
      i.toFloat)
  }
}

// long running job for different data sources
object LRJobForDataSources {
  val cat = s"""{
            |"table":{"namespace":"default", "name":"shcExampleTable", "tableCoder":"PrimitiveType"},
            |"rowkey":"key",
            |"columns":{
              |"key":{"cf":"rowkey", "col":"key", "type":"int"},
              |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"},
              |"col2":{"cf":"cf2", "col":"col2", "type":"double"},
              |"col3":{"cf":"cf3", "col":"col3", "type":"float"}
            |}
          |}""".stripMargin

  def main(args: Array[String]) {
    if (args.length < 1) {
      System.err.println("Usage: LRJobAccessing2Clusters <hiveTableName> [sleepTime]")
      System.exit(1)
    }

    val hiveTableName = args(0)
    val sleepTime = if (args.length > 1) args(1).toLong else 2 * 60 * 1000 // sleep 2 min by default

    val spark = SparkSession.builder()
      .appName("LRJobForDataSources")
      .enableHiveSupport()
      .getOrCreate()

    val sc = spark.sparkContext
    val sqlContext = spark.sqlContext

    import sqlContext.implicits._
    import spark.sql

    def withCatalog(cat: String): DataFrame = {
      sqlContext
        .read
        .options(Map(HBaseTableCatalog.tableCatalog->cat))
        .format("org.apache.spark.sql.execution.datasources.hbase")
        .load()
    }

    val timeEnd = System.currentTimeMillis() + (25 * 60 * 60 * 1000) // 25h later
    while (System.currentTimeMillis() < timeEnd) {
      // Part 1: write data into Hive table and read data from it, which accesses HDFS
      sql(s"DROP TABLE IF EXISTS $hiveTableName")
      sql(s"CREATE TABLE $hiveTableName(key INT, col1 BOOLEAN, col2 DOUBLE, col3 FLOAT)")
      for (i <- 1 to 3) {
        sql(s"INSERT INTO $hiveTableName VALUES ($i, ${i % 2 == 0}, ${i.toDouble}, ${i.toFloat})")
      }
      val df1 = sql(s"SELECT * FROM $hiveTableName")
      df1.show()

      // Part 2: create HBase table, write data into it, read data from it
      val data = (0 to 40).map { i =>
        LRRecord(i)
      }
      sc.parallelize(data).toDF.write.options(
        Map(HBaseTableCatalog.tableCatalog -> cat, HBaseTableCatalog.newTable -> "5"))
        .format("org.apache.spark.sql.execution.datasources.hbase")
        .save()
      val df2 = withCatalog(cat)
      df2.show
      df2.filter($"key" <= "5").select($"key", $"col1").show

      // Part 3: join the two dataframes
      val s1 = df1.filter($"key" <= "40").select("key", "col1")
      val s2 = df2.filter($"key" <= "20" && $"key" >= "1").select("key", "col2")
      val result =  s1.join(s2, Seq("key"))
      result.show()

      Thread.sleep(sleepTime)
    }

    spark.stop()
  }
} 
Example 72
Source File: Kudu.scala    From kafka-examples   with Apache License 2.0 5 votes vote down vote up
package com.cloudera.streaming.refapp

import org.apache.kudu.spark.kudu._
import org.apache.spark.sql.streaming.DataStreamWriter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SparkSession}


class KuduSink(master: String, database: String, checkpointLocation: String => String) {

    def writeTable(sinkName: String, triggerSeconds: Int = 10) =
      new Sink {
        override def createDataStreamWriter(df: DataFrame): DataStreamWriter[Row] = {
          val fullTableName = s"impala::$database.$name"
          df
            .writeStream
            .format("kudu")
            .option("kudu.master", master)
            .option("kudu.table", fullTableName)
            .option("checkpointLocation", checkpointLocation(name))
            .option("retries", "3")
            .outputMode("update")
        }

        override val name: String = sinkName
      }

} 
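A hedged sketch of using the factory above to start a streaming query; the Kudu master address, database name, checkpoint directory, and the streaming DataFrame are all assumptions.

import org.apache.spark.sql.DataFrame
import com.cloudera.streaming.refapp.KuduSink

object KuduSinkUsageSketch {
  // `transactions` stands in for any streaming DataFrame produced elsewhere in the application.
  def startTransactionsQuery(transactions: DataFrame): Unit = {
    val kudu = new KuduSink(
      master = "kudu-master:7051",                      // assumed Kudu master address
      database = "streaming_ref",                       // assumed Impala/Kudu database name
      checkpointLocation = name => s"/tmp/checkpoints/$name")

    val sink = kudu.writeTable("transactions")
    sink.createDataStreamWriter(transactions).start()
  }
}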
Example 73
Source File: KuduSink.scala    From kafka-examples   with Apache License 2.0 5 votes vote down vote up
package com.cloudera.streaming.refapp.kudu

import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.slf4j.LoggerFactory

import scala.util.control.NonFatal

object KuduSink {
  def withDefaultContext(sqlContext: SQLContext, parameters: Map[String, String]) =
    new KuduSink(new KuduContext(parameters("kudu.master"), sqlContext.sparkContext), parameters)
}


class KuduSink(initKuduContext: => KuduContext, parameters: Map[String, String]) extends Sink {

  private val logger = LoggerFactory.getLogger(getClass)

  private var kuduContext = initKuduContext

  private val tablename = parameters("kudu.table")

  private val retries = parameters.getOrElse("retries", "1").toInt
  require(retries >= 0, "retries must be non-negative")

  logger.info(s"Created Kudu sink writing to table $tablename")

  override def addBatch(batchId: Long, data: DataFrame): Unit = {
    for (attempt <- 0 to retries) {
      try {
        kuduContext.upsertRows(data, tablename)
        return
      } catch {
        case NonFatal(e) =>
          if (attempt < retries) {
            logger.warn("Kudu upsert error, retrying...", e)
            kuduContext = initKuduContext
          }
          else {
            logger.error("Kudu upsert error, exhausted", e)
            throw e
          }
      }
    }
  }
} 
Example 74
Source File: Memory.scala    From kafka-examples   with Apache License 2.0 5 votes vote down vote up
package com.cloudera.streaming.refapp

import org.apache.spark.sql.streaming.{DataStreamWriter, OutputMode}
import org.apache.spark.sql.{DataFrame, Row}

object Memory {

  def memorySink(sinkName: String) = new Sink {
    override def createDataStreamWriter(df: DataFrame): DataStreamWriter[Row] = df
      .writeStream
      .outputMode(OutputMode.Append)
      .queryName(name)
      .format("memory")

    override val name: String = sinkName
  }

} 
Example 75
Source File: KuduSinkUnitTest.scala    From kafka-examples   with Apache License 2.0 5 votes vote down vote up
package com.cloudera.streaming.refapp.kudu

import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.sql.DataFrame
import org.mockito.Mockito._
import org.scalatest._
import org.scalatest.mockito.MockitoSugar

class KuduSinkUnitTest extends FunSuite with MockitoSugar {

  private val frame = mock[DataFrame]

  private def setupKuduContextMock(kuduContext: KuduContext, failTimes: Int): KuduContext = {
    if (failTimes > 0) {
      val stubber = doThrow(new RuntimeException)
      for (_ <- 2 to failTimes) {
        stubber.doThrow(new RuntimeException)
      }
      stubber.doCallRealMethod()
        .when(kuduContext).upsertRows(frame, "table")
    }
    kuduContext
  }

  test("kudu upsert fails, retries once") {
    val helper = new KuduSinkWithMockedContext(setupKuduContextMock(mock[KuduContext], failTimes = 1), 1)

    helper.sink.addBatch(0, frame)
    assert(helper.initialized == 1, "context should be initialized once")
  }

  test("kudu upsert fails twice, retries once, fails") {
    val helper = new KuduSinkWithMockedContext(setupKuduContextMock(mock[KuduContext], failTimes = 2), 1)

    intercept[RuntimeException] {
      helper.sink.addBatch(0, frame)
    }
    assert(helper.initialized == 1, "context should be initialized once")
  }

  test("kudu upsert fails 3 times, retries 3 times") {
    val helper = new KuduSinkWithMockedContext(setupKuduContextMock(mock[KuduContext], failTimes = 3), 3)
    helper.sink.addBatch(0, frame)
    assert(helper.initialized == 3, "context should be initialized three times")
  }

  test("kudu upsert fails 3 times, retries 4 times") {
    val helper = new KuduSinkWithMockedContext(setupKuduContextMock(mock[KuduContext], failTimes = 3), 4)
    helper.sink.addBatch(0, frame)
    assert(helper.initialized == 3, "context should be initialized only three times")
  }

}

class KuduSinkWithMockedContext(kuduContext: KuduContext, retries: Int) {

  // The KuduSink constructor forces one initialization, so start at -1 to count only re-initializations
  var initialized = -1

  private def initKuduContext: KuduContext = {
    initialized += 1
    kuduContext
  }

  val sink = new KuduSink(initKuduContext, Map(
    "kudu.table" -> "table",
    "kudu.master" -> "master",
    "retries" -> retries.toString))
} 
Example 77
Source File: TestSparkContext.scala    From spark-images   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.image

import org.apache.spark.sql.{DataFrame, Row, SQLContext, SparkSession}
import org.apache.spark.sql.types._
import org.apache.spark.{SparkConf, SparkContext}

import scala.reflect.runtime.universe._
import org.scalatest.{FunSuite, BeforeAndAfterAll}

// This context is used for all tests in this project
trait TestSparkContext extends BeforeAndAfterAll { self: FunSuite =>
  @transient var sc: SparkContext = _
  @transient var sqlContext: SQLContext = _
  @transient lazy val spark: SparkSession = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("Spark-Image-Test")
      .set("spark.ui.port", "4079")
      .set("spark.sql.shuffle.partitions", "4")  // makes small tests much faster

    val sess = SparkSession.builder().config(conf).getOrCreate()
    sess.sparkContext.setLogLevel("WARN")
    sess
  }

  override def beforeAll() {
    super.beforeAll()
    sc = spark.sparkContext
    sqlContext = spark.sqlContext
    import spark.implicits._
  }

  override def afterAll() {
    sqlContext = null
    if (sc != null) {
      sc.stop()
    }
    sc = null
    super.afterAll()
  }

  def makeDF[T: TypeTag](xs: Seq[T], col: String): DataFrame = {
    sqlContext.createDataFrame(xs.map(Tuple1.apply)).toDF(col)
  }

  def compareRows(r1: Array[Row], r2: Seq[Row]): Unit = {
    val a = r1.sortBy(_.toString())
    val b = r2.sortBy(_.toString())
    assert(a === b)
  }
} 
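A test suite can mix the trait in and use its helpers directly; a minimal sketch:

import org.apache.spark.sql.Row
import org.scalatest.FunSuite

class MakeDFSuite extends FunSuite with TestSparkContext {
  test("makeDF builds a single-column DataFrame") {
    val df = makeDF(Seq(1, 2, 3), "value")
    assert(df.columns === Array("value"))
    compareRows(df.collect(), Seq(Row(1), Row(2), Row(3)))
  }
}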
Example 78
Source File: SpreadsheetRelation.scala    From spark-google-spreadsheets   with Apache License 2.0 5 votes vote down vote up
package com.github.potix2.spark.google.spreadsheets

import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService.SparkSpreadsheetContext
import com.github.potix2.spark.google.spreadsheets.util._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

case class SpreadsheetRelation protected[spark] (
                                                  context:SparkSpreadsheetContext,
                                                  spreadsheetName: String,
                                                  worksheetName: String,
                                                  userSchema: Option[StructType] = None)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan with InsertableRelation {

  import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService._

  override def schema: StructType = userSchema.getOrElse(inferSchema())

  private lazy val aWorksheet: SparkWorksheet =
    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Right(aWorksheet) => aWorksheet
      case Left(e) => throw e
    }

  private lazy val rows: Seq[Map[String, String]] = aWorksheet.rows

  private[spreadsheets] def findWorksheet(spreadsheetName: String, worksheetName: String)(implicit ctx: SparkSpreadsheetContext): Either[Throwable, SparkWorksheet] =
    for {
      sheet <- findSpreadsheet(spreadsheetName).toRight(new RuntimeException(s"no such spreadsheet: $spreadsheetName")).right
      worksheet <- sheet.findWorksheet(worksheetName).toRight(new RuntimeException(s"no such worksheet: $worksheetName")).right
    } yield worksheet

  override def buildScan(): RDD[Row] = {
    val aSchema = schema
    sqlContext.sparkContext.makeRDD(rows).mapPartitions { iter =>
      iter.map { m =>
        var index = 0
        val rowArray = new Array[Any](aSchema.fields.length)
        while(index < aSchema.fields.length) {
          val field = aSchema.fields(index)
          rowArray(index) = if (m.contains(field.name)) {
            TypeCast.castTo(m(field.name), field.dataType, field.nullable)
          } else {
            null
          }
          index += 1
        }
        Row.fromSeq(rowArray)
      }
    }
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    if(!overwrite) {
      sys.error("Spreadsheet tables only support INSERT OVERWRITE for now.")
    }

    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Right(w) =>
        w.updateCells(data.schema, data.collect().toList, Util.toRowData)
      case Left(e) =>
        throw e
    }
  }

  private def inferSchema(): StructType =
    StructType(aWorksheet.headers.toList.map { fieldName =>
      StructField(fieldName, StringType, nullable = true)
    })

} 
Example 79
Source File: DefaultSource.scala    From spark-google-spreadsheets   with Apache License 2.0 5 votes vote down vote up
package com.github.potix2.spark.google.spreadsheets

import java.io.File

import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {
  final val DEFAULT_CREDENTIAL_PATH = "/etc/gdata/credential.p12"

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = {
    createRelation(sqlContext, parameters, null)
  }

  private[spreadsheets] def pathToSheetNames(parameters: Map[String, String]): (String, String) = {
    val path = parameters.getOrElse("path", sys.error("'path' must be specified for spreadsheets."))
    val elems = path.split('/')
    if (elems.length < 2)
      throw new Exception("'path' must be formed like '<spreadsheet>/<worksheet>'")

    (elems(0), elems(1))
  }

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType) = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    val context = createSpreadsheetContext(parameters)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, schema)
  }


  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    implicit val context = createSpreadsheetContext(parameters)
    val spreadsheet = SparkSpreadsheetService.findSpreadsheet(spreadsheetName)
    if (spreadsheet.isEmpty)
      throw new RuntimeException(s"no such spreadsheet: $spreadsheetName")

    spreadsheet.get.addWorksheet(worksheetName, data.schema, data.collect().toList, Util.toRowData)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, data.schema)
  }

  private[spreadsheets] def createSpreadsheetContext(parameters: Map[String, String]) = {
    val serviceAccountIdOption = parameters.get("serviceAccountId")
    val credentialPath = parameters.getOrElse("credentialPath", DEFAULT_CREDENTIAL_PATH)
    SparkSpreadsheetService(serviceAccountIdOption, new File(credentialPath))
  }

  private[spreadsheets] def createRelation(sqlContext: SQLContext,
                                           context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                           spreadsheetName: String,
                                           worksheetName: String,
                                           schema: StructType): SpreadsheetRelation =
    if (schema == null) {
      createRelation(sqlContext, context, spreadsheetName, worksheetName, None)
    }
    else {
      createRelation(sqlContext, context, spreadsheetName, worksheetName, Some(schema))
    }

  private[spreadsheets] def createRelation(sqlContext: SQLContext,
                                           context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                           spreadsheetName: String,
                                           worksheetName: String,
                                           schema: Option[StructType]): SpreadsheetRelation =
    SpreadsheetRelation(context, spreadsheetName, worksheetName, schema)(sqlContext)
} 
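Put together, a read and a write against this relation provider might look like the sketch below; `spark` is an assumed active SparkSession, the spreadsheet/worksheet names, service account, and credential path are placeholders, and the provider is addressed by its package name:

val df = spark.read
  .format("com.github.potix2.spark.google.spreadsheets")
  .option("serviceAccountId", "xxx@developer.gserviceaccount.com")  // placeholder account
  .option("credentialPath", "/path/to/credential.p12")              // placeholder key file
  .load("MySpreadsheet/Sheet1")                                     // "<spreadsheet>/<worksheet>"

df.write
  .format("com.github.potix2.spark.google.spreadsheets")
  .option("serviceAccountId", "xxx@developer.gserviceaccount.com")
  .option("credentialPath", "/path/to/credential.p12")
  .save("MySpreadsheet/Sheet1Copy")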
Example 80
Source File: DatasetUtil.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import org.apache.spark.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata}
import org.apache.spark.sql.{Column, DataFrame, Dataset}


object DatasetUtil {
  def withColumns[T](ds: Dataset[T],
                     colNames: Seq[String],
                     cols: Seq[Column],
                     metadata: Seq[Metadata]): DataFrame = {
    require(colNames.size == cols.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of columns: ${cols.size}")
    require(colNames.size == metadata.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of metadata elements: ${metadata.size}")

    val sparkSession = ds.sparkSession
    val queryExecution = ds.queryExecution
    val resolver = sparkSession.sessionState.analyzer.resolver
    val output = queryExecution.analyzed.output

    checkColumnNameDuplication(colNames,
      "in given column names",
      sparkSession.sessionState.conf.caseSensitiveAnalysis)

    val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) =>
      colName -> col.as(colName, metadata)
    }.toMap

    val replacedAndExistingColumns = output.map { field =>
      columnMap.find { case (colName, _) =>
        resolver(field.name, colName)
      } match {
        case Some((colName: String, col: Column)) => col.as(colName)
        case _ => new Column(field)
      }
    }

    val newColumns = columnMap.filter { case (colName, col) =>
      !output.exists(f => resolver(f.name, colName))
    }.map { case (colName, col) => col.as(colName) }

    ds.select(replacedAndExistingColumns ++ newColumns: _*)
  }

  def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = {
    withColumns(ds, Seq(colName), Seq(col), Seq(metadata))
  }

  private def checkColumnNameDuplication(columnNames: Seq[String], colType: String,
                                         caseSensitiveAnalysis: Boolean): Unit = {
    val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase)
    if (names.distinct.length != names.length) {
      val duplicateColumns = names.groupBy(identity).collect {
        case (x, ys) if ys.length > 1 => s"`$x`"
      }
      throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}")
    }
  }

  /**
    * Cast a column in a Dataset to Vector type.
    *
    * The supported data types of the input column are
    * - Vector
    * - float/double type Array.
    *
    * Note: The returned column does not have Metadata.
    *
    * @param dataset input DataFrame
    * @param colName column name.
    * @return Vector column
    */
  def columnToVector(dataset: Dataset[_], colName: String): Column = {
    val columnDataType = dataset.schema(colName).dataType
    columnDataType match {
      case _: VectorUDT => col(colName)
      case fdt: ArrayType =>
        val transferUDF = fdt.elementType match {
          case _: FloatType => udf(f = (vector: Seq[Float]) => {
            val inputArray = Array.fill[Double](vector.size)(0.0)
            vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble)
            Vectors.dense(inputArray)
          })
          case _: DoubleType => udf((vector: Seq[Double]) => {
            Vectors.dense(vector.toArray)
          })
          case other =>
            throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector")
        }
        transferUDF(col(colName))
      case other =>
        throw new IllegalArgumentException(s"$other column cannot be cast to Vector")
    }
  }

} 
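A brief sketch of the helpers above, assuming a DataFrame `df` that already has an `Array[Double]` column named "features":

import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.types.Metadata

// cast the double-array column to a Vector column (the result carries no metadata)
val vecCol = DatasetUtil.columnToVector(df, "features")

// add the cast column plus a constant column, attaching empty metadata to each
val withVectors = DatasetUtil.withColumns(
  df,
  colNames = Seq("featuresVec", "weight"),
  cols     = Seq(vecCol, lit(1.0)),
  metadata = Seq(Metadata.empty, Metadata.empty))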
Example 81
Source File: SQLTransformer.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.feature

import com.tencent.angel.sona.ml.Transformer
import com.tencent.angel.sona.ml.param.{Param, ParamMap}
import com.tencent.angel.sona.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType

/**
  * Implements the transformations which are defined by SQL statement.
  * Currently we only support SQL syntax like 'SELECT ... FROM __THIS__ ...'
  * where '__THIS__' represents the underlying table of the input dataset.
  * The select clause specifies the fields, constants, and expressions to display in
  * the output; it can be any select clause that Spark SQL supports. Users can also
  * use Spark SQL built-in functions and UDFs to operate on these selected columns.
  * For example, [[SQLTransformer]] supports statements like:
  * {{{
  *  SELECT a, a + b AS a_b FROM __THIS__
  *  SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5
  *  SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b
  * }}}
  */
class SQLTransformer(override val uid: String) extends Transformer
  with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("sql"))

  /**
    * SQL statement parameter. The statement is provided in string form.
    *
    * @group param
    */
  final val statement: Param[String] = new Param[String](this, "statement", "SQL statement")

  
  def setStatement(value: String): this.type = set(statement, value)

  
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val tableName = Identifiable.randomUID(uid)
    dataset.createOrReplaceTempView(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val result = dataset.sparkSession.sql(realStatement)
    // Call SessionCatalog.dropTempView to avoid unpersisting the possibly cached dataset.
    dataset.sparkSession.catalog.dropTempView(tableName)
    // Compatible.sessionstate.catalog.dropTempView(tableName)
    result
  }

  override def transformSchema(schema: StructType): StructType = {
    val spark = SparkSession.builder().getOrCreate()
    val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty))
    val dummyDF = spark.createDataFrame(dummyRDD, schema)
    val tableName = Identifiable.randomUID(uid)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    dummyDF.createOrReplaceTempView(tableName)
    val outputSchema = spark.sql(realStatement).schema
    spark.catalog.dropTempView(tableName)
    outputSchema
  }

  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}


object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {
  override def load(path: String): SQLTransformer = super.load(path)
} 
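A usage sketch, assuming an active SparkSession `spark` and an input DataFrame with numeric columns `a` and `b`:

val df = spark.createDataFrame(Seq((0, 1.0, 3.0), (2, 2.0, 5.0)))
  .toDF("id", "a", "b")

val sqlTrans = new SQLTransformer()
  .setStatement("SELECT *, (a + b) AS a_plus_b FROM __THIS__")

// appends an a_plus_b column computed by the SQL statement
sqlTrans.transform(df).show()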
Example 82
Source File: BinaryClassificationSummaryImpl.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.evaluation.evaluating
import com.tencent.angel.sona.ml.evaluation.BinaryClassMetrics.BinaryPredictedResult
import com.tencent.angel.sona.ml.evaluation.{BinaryClassMetrics, BinaryClassificationSummary}
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}


class BinaryClassificationSummaryImpl(df: DataFrame,
                                      probabilityCol: String,
                                      labelCol: String)
  extends BinaryClassificationSummary with Serializable with Logging {

  private lazy val data: RDD[BinaryPredictedResult] = df.select(probabilityCol, labelCol).rdd.map {
    case Row(probability: Double, label: Double) =>
      BinaryPredictedResult(probability, label.toInt)
  }

  lazy val binaryMetrics: BinaryClassMetrics = data.aggregate(new BinaryClassMetrics)(
    seqOp = (metrics: BinaryClassMetrics, pres: BinaryPredictedResult) => metrics.add(pres),
    combOp = (metrics1: BinaryClassMetrics, metrics2: BinaryClassMetrics) => metrics1.merge(metrics2)
  )

  protected lazy val (tp: Double, fp: Double, fn: Double, tn: Double) = (
    binaryMetrics.getTP, binaryMetrics.getFP, binaryMetrics.getFN, binaryMetrics.getTN)
} 
Example 83
Source File: RegressionSummaryImpl.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.evaluation.evaluating
import com.tencent.angel.sona.ml.evaluation.RegressionMetrics.RegressionPredictedResult
import com.tencent.angel.sona.ml.evaluation.{RegressionMetrics, RegressionSummary}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}

class RegressionSummaryImpl(df: DataFrame, predictionCol: String, labelCol: String) extends RegressionSummary with Serializable {
  private lazy val data: RDD[RegressionPredictedResult] = df.select(predictionCol, labelCol).rdd.map {
    case Row(probability: Double, label: Double) =>
      RegressionPredictedResult(probability, label.toInt)
  }

  override val regMetrics: RegressionMetrics = data.aggregate(new RegressionMetrics)(
    seqOp = (metrics: RegressionMetrics, pres: RegressionPredictedResult) => metrics.add(pres),
    combOp = (metrics1: RegressionMetrics, metrics2: RegressionMetrics) => metrics1.merge(metrics2)
  )
} 
Example 84
Source File: MultiClassificationSummaryImpl.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.evaluation.evaluating
import com.tencent.angel.sona.ml.evaluation.{MultiClassMetrics, MultiClassificationSummary}
import com.tencent.angel.sona.ml.evaluation.MultiClassMetrics.MultiPredictedResult
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}

class MultiClassificationSummaryImpl(df: DataFrame, predictionCol: String, labelCol: String)
  extends MultiClassificationSummary with Serializable with Logging {

  private lazy val data: RDD[MultiPredictedResult] = df.select(predictionCol, labelCol).rdd.map {
    case Row(prediction: Double, label: Double) =>
      MultiPredictedResult(prediction.toInt, label.toInt)
  }

  lazy val multiMetrics: MultiClassMetrics = data.aggregate(new MultiClassMetrics)(
    seqOp = (metrics: MultiClassMetrics, pres: MultiPredictedResult) => metrics.add(pres),
    combOp = (metrics1: MultiClassMetrics, metrics2: MultiClassMetrics) => metrics1.merge(metrics2)
  )
} 
Example 85
Source File: ChiSquareTest.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.stat

import com.tencent.angel.sona.ml.feature.LabeledPoint
import org.apache.spark.linalg
import org.apache.spark.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.util.SONASchemaUtils
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col


/**
 * :: Experimental ::
 *
 * Chi-square hypothesis testing for categorical data.
 *
 * See <a href="http://en.wikipedia.org/wiki/Chi-squared_test">Wikipedia</a> for more information
 * on the Chi-squared test.
 */
object ChiSquareTest {

  
  private case class ChiSquareResult(
                                      pValues: linalg.Vector,
                                      degreesOfFreedom: Array[Int],
                                      statistics: linalg.Vector)

  /**
   * Conduct Pearson's independence test for every feature against the label. For each feature, the
   * (feature, label) pairs are converted into a contingency matrix for which the Chi-squared
   * statistic is computed. All label and feature values must be categorical.
   *
   * The null hypothesis is that the occurrence of the outcomes is statistically independent.
   *
   * @param dataset  DataFrame of categorical labels and categorical features.
   *                 Real-valued features will be treated as categorical for each distinct value.
   * @param featuresCol  Name of features column in dataset, of type `Vector` (`VectorUDT`)
   * @param labelCol  Name of label column in dataset, of any numerical type
   * @return DataFrame containing the test result for every feature against the label.
   *         This DataFrame will contain a single Row with the following fields:
   *          - `pValues: Vector`
   *          - `degreesOfFreedom: Array[Int]`
   *          - `statistics: Vector`
   *         Each of these fields has one value per feature.
   */

  def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = {
    val spark = dataset.sparkSession
    import spark.implicits._

    SONASchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT)
    SONASchemaUtils.checkNumericType(dataset.schema, labelCol)
    val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol)).as[(Double, linalg.Vector)]
      .rdd.map { case (label, features) => LabeledPoint(label, features) }
    val testResults = Statistics.chiSqTest(rdd)
    val pValues: linalg.Vector = Vectors.dense(testResults.map(_.pValue))
    val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom)
    val statistics: linalg.Vector = Vectors.dense(testResults.map(_.statistic))
    spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics)))
  }
} 
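A small sketch of invoking the test, mirroring the spark.ml ChiSquareTest usage and assuming an active SparkSession `spark` and that the `linalg` VectorUDT is picked up by DataFrame reflection:

import org.apache.spark.linalg.Vectors

val data = Seq(
  (0.0, Vectors.dense(0.5, 10.0)),
  (0.0, Vectors.dense(1.5, 20.0)),
  (1.0, Vectors.dense(1.5, 30.0)),
  (0.0, Vectors.dense(3.5, 30.0)),
  (1.0, Vectors.dense(3.5, 40.0)))

val df = spark.createDataFrame(data).toDF("label", "features")
// single-row result with pValues, degreesOfFreedom and statistics, one value per feature
ChiSquareTest.test(df, "features", "label").show(truncate = false)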
Example 86
Source File: Correlation.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.stat

import org.apache.spark.linalg.{SQLDataTypes, Vector}

import scala.collection.JavaConverters._
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.types.{StructField, StructType}

/**
 * API for correlation functions in MLlib, compatible with DataFrames and Datasets.
 *
 * The functions in this package generalize the functions in [[org.apache.spark.sql.Dataset#stat]]
 * to spark.ml's Vector types.
 */
object Correlation {

  /**
   * :: Experimental ::
   * Compute the correlation matrix for the input Dataset of Vectors using the specified method.
   * Methods currently supported: `pearson` (default), `spearman`.
   *
   * @param dataset A dataset or a dataframe
   * @param column The name of the column of vectors for which the correlation coefficient needs
   *               to be computed. This must be a column of the dataset, and it must contain
   *               Vector objects.
   * @param method String specifying the method to use for computing correlation.
   *               Supported: `pearson` (default), `spearman`
   * @return A dataframe that contains the correlation matrix of the column of vectors. This
   *         dataframe contains a single row and a single column of name
   *         '$METHODNAME($COLUMN)'.
   * @throws IllegalArgumentException if the column is not a valid column in the dataset, or if
   *                                  the content of this column is not of type Vector.
   *
   *  Here is how to access the correlation coefficient:
   *  {{{
   *    val data: Dataset[Vector] = ...
   *    val Row(coeff: Matrix) = Correlation.corr(data, "value").head
   *    // coeff now contains the Pearson correlation matrix.
   *  }}}
   *
   * @note For Spearman, a rank correlation, we need to create an RDD[Double] for each column
   * and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector],
   * which is fairly costly. Cache the input Dataset before calling corr with `method = "spearman"`
   * to avoid recomputing the common lineage.
   */

  def corr(dataset: Dataset[_], column: String, method: String): DataFrame = {
    val rdd = dataset.select(column).rdd.map {
      case Row(v: Vector) => v
    }
    val oldM = Statistics.corr(rdd, method)
    val name = s"$method($column)"
    val schema = StructType(Array(StructField(name, SQLDataTypes.MatrixType, nullable = false)))
    dataset.sparkSession.createDataFrame(Seq(Row(oldM)).asJava, schema)
  }

  /**
   * Compute the Pearson correlation matrix for the input Dataset of Vectors.
   */

  def corr(dataset: Dataset[_], column: String): DataFrame = {
    corr(dataset, column, "pearson")
  }
} 
Example 87
Source File: GraphIO.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.graph.utils
import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

object GraphIO {

  private val DELIMITER = "delimiter"
  private val HEADER = "header"

  private val int2Long = udf[Long, Int](_.toLong)
  private val string2Long = udf[Long, String](_.toLong)
  private val int2Float = udf[Float, Int](_.toFloat)
  private val long2Float = udf[Float, Long](_.toFloat)
  private val double2Float = udf[Float, Double](_.toFloat)
  private val string2Float = udf[Float, String](_.toFloat)

  def convert2Float(df: DataFrame, structField: StructField, tmpSuffix: String): DataFrame = {
    val tmpName = structField.name + tmpSuffix
    structField.dataType match {
      case _: LongType =>
        df.withColumn(tmpName, long2Float(df(structField.name)))
          .drop(structField.name)
          .withColumnRenamed(tmpName, structField.name)
      case _: IntegerType =>
        df.withColumn(tmpName, int2Float(df(structField.name)))
          .drop(structField.name)
          .withColumnRenamed(tmpName, structField.name)
      case _: DoubleType =>
        df.withColumn(tmpName, double2Float(df(structField.name)))
          .drop(structField.name)
          .withColumnRenamed(tmpName, structField.name)
      case _: StringType =>
        df.withColumn(tmpName, string2Float(df(structField.name)))
          .drop(structField.name)
          .withColumnRenamed(tmpName, structField.name)
      case _: FloatType => df
      case t => throw new Exception(s"$t can't convert to Float")
    }
  }

  def convert2Long(df: DataFrame, structField: StructField, tmpSuffix: String): DataFrame = {
    val tmpName = structField.name + tmpSuffix
    structField.dataType match {
      case _: LongType => df
      case _: IntegerType =>
        df.withColumn(tmpName, int2Long(df(structField.name)))
          .drop(structField.name)
          .withColumnRenamed(tmpName, structField.name)
      case _: StringType =>
        df.withColumn(tmpName, string2Long(df(structField.name)))
          .drop(structField.name)
          .withColumnRenamed(tmpName, structField.name)
      case t => throw new Exception(s"$t can't convert to Long")
    }
  }

  def load(input: String, isWeighted: Boolean,
           srcIndex: Int = 0, dstIndex: Int = 1, weightIndex: Int = 2,
           sep: String = " "): DataFrame = {
    val ss = SparkSession.builder().getOrCreate()

    val schema = if (isWeighted) {
      StructType(Seq(
        StructField("src", LongType, nullable = false),
        StructField("dst", LongType, nullable = false),
        StructField("weight", FloatType, nullable = false)
      ))
    } else {
      StructType(Seq(
        StructField("src", LongType, nullable = false),
        StructField("dst", LongType, nullable = false)
      ))
    }
    ss.read
      .option("sep", sep)
      .option("header", "false")
      .schema(schema)
      .csv(input)
  }

  def save(df: DataFrame, output: String, seq: String = "\t"): Unit = {
    df.printSchema()
    df.write
      .mode(SaveMode.Overwrite)
      .option(HEADER, "false")
      .option(DELIMITER, seq)
      .csv(output)
  }

  def defaultCheckpointDir: Option[String] = {
    val sparkContext = SparkContext.getOrCreate()
    sparkContext.getConf.getOption("spark.yarn.stagingDir")
      .map { base =>
        new Path(base, s".sparkStaging/${sparkContext.getConf.getAppId}").toString
      }
  }
} 
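A sketch of round-tripping an edge list with these helpers (the paths are placeholders):

// load an unweighted, space-separated edge list into columns src and dst
val edges = GraphIO.load("hdfs:///data/edges.txt", isWeighted = false)

// ... run some graph computation that yields a result DataFrame ...

// write a DataFrame back out as tab-separated text without a header
GraphIO.save(edges, "hdfs:///data/edges_out")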
Example 88
Source File: KCore.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.graph.kcore
import com.tencent.angel.sona.context.PSContext
import org.apache.spark.SparkContext
import com.tencent.angel.sona.graph.params._
import com.tencent.angel.sona.ml.Transformer
import com.tencent.angel.sona.ml.param.ParamMap
import com.tencent.angel.sona.ml.util.Identifiable
import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.storage.StorageLevel

class KCore(override val uid: String) extends Transformer
  with HasSrcNodeIdCol with HasDstNodeIdCol with HasOutputNodeIdCol with HasOutputCoreIdCol
  with HasStorageLevel with HasPartitionNum with HasPSPartitionNum with HasUseBalancePartition {

  def this() = this(Identifiable.randomUID("KCore"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val edges = dataset.select($(srcNodeIdCol), $(dstNodeIdCol)).rdd
      .map(row => (row.getLong(0), row.getLong(1)))
      .filter(e => e._1 != e._2)

    edges.persist(StorageLevel.DISK_ONLY)

    val maxId = edges.map(e => math.max(e._1, e._2)).max() + 1
    val minId = edges.map(e => math.min(e._1, e._2)).min()
    val nodes = edges.flatMap(e => Iterator(e._1, e._2))
    val numEdges = edges.count()

    println(s"minId=$minId maxId=$maxId numEdges=$numEdges level=${$(storageLevel)}")

    // Start PS and init the model
    println("start to run ps")
    PSContext.getOrCreate(SparkContext.getOrCreate())

    val model = KCorePSModel.fromMinMax(minId, maxId, nodes, $(psPartitionNum), $(useBalancePartition))
    var graph = edges.flatMap(e => Iterator((e._1, e._2), (e._2, e._1)))
      .groupByKey($(partitionNum))
      .mapPartitionsWithIndex((index, edgeIter) =>
        Iterator(KCoreGraphPartition.apply(index, edgeIter)))

    graph.persist($(storageLevel))
    graph.foreachPartition(_ => Unit)
    graph.foreach(_.initMsgs(model))

    var curIteration = 0
    var numMsgs = model.numMsgs()
    var prev = graph
    println(s"numMsgs=$numMsgs")

    do {
      curIteration += 1
      graph = prev.map(_.process(model, numMsgs, curIteration == 1))
      graph.persist($(storageLevel))
      graph.count()
      prev.unpersist(true)
      prev = graph
      model.resetMsgs()
      numMsgs = model.numMsgs()
      println(s"curIteration=$curIteration numMsgs=$numMsgs")
    } while (numMsgs > 0)

    val retRDD = graph.map(_.save()).flatMap{case (nodes,cores) => nodes.zip(cores)}
      .map(r => Row.fromSeq(Seq[Any](r._1, r._2)))

    dataset.sparkSession.createDataFrame(retRDD, transformSchema(dataset.schema))
  }

  override def transformSchema(schema: StructType): StructType = {
    StructType(Seq(
      StructField(s"${$(outputNodeIdCol)}", LongType, nullable = false),
      StructField(s"${$(outputCoreIdCol)}", IntegerType, nullable = false)
    ))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

} 
Example 89
Source File: TokenizerSuite.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.feature

import com.tencent.angel.sona.ml.util.{DefaultReadWriteTest, MLTest}

import scala.beans.BeanInfo
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class TokenizerSuite extends MLTest with DefaultReadWriteTest {

  test("read/write") {
    val t = new Tokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }
}

class RegexTokenizerSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  def testRegexTokenizer(t: RegexTokenizer, dataframe: DataFrame): Unit = {
    testTransformer[(String, Seq[String])](dataframe, t, "tokens", "wantedTokens") {
      case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
    }
  }

  test("RegexTokenizer") {
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset0 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer0, dataset0)

    val dataset1 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ).toDF()
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")),
      TokenizerTestData("Te,st.  punct", Array("te,st.", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer2, dataset2)
  }

  test("RegexTokenizer with toLowercase false") {
    val tokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setToLowercase(false)
    val dataset = Seq(
      TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")),
      TokenizerTestData("java scala", Array("java", "scala"))
    ).toDF()
    testRegexTokenizer(tokenizer, dataset)
  }

  test("read/write") {
    val t = new RegexTokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMinTokenLength(2)
      .setGaps(false)
      .setPattern("hi")
      .setToLowercase(false)
    testDefaultReadWrite(t)
  }
} 
Example 90
Source File: NormalizerSuite.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.feature

import org.apache.spark.linalg
import org.apache.spark.linalg.{DenseVector, IntSparseVector, LongSparseVector, Vectors}
import com.tencent.angel.sona.ml.util.{DefaultReadWriteTest, MLTest}
import com.tencent.angel.sona.ml.util.TestingUtils._
import org.apache.spark.sql.{DataFrame, Row}


class NormalizerSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  @transient var data: Array[linalg.Vector] = _
  @transient var l1Normalized: Array[linalg.Vector] = _
  @transient var l2Normalized: Array[linalg.Vector] = _

  override def beforeAll(): Unit = {
    super.beforeAll()

    data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))),
      Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))),
      Vectors.sparse(3, Seq[(Int, Double)]()),
      Vectors.sparse(3L, Seq[(Long, Double)]())
    )
    l1Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.12765957, -0.23404255, -0.63829787),
      Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))),
      Vectors.dense(0.625, 0.07894737, 0.29605263),
      Vectors.sparse(3, Seq[(Int, Double)]()),
      Vectors.sparse(3L, Seq[(Long, Double)]())
    )
    l2Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.184549876, -0.3383414, -0.922749378),
      Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))),
      Vectors.dense(0.897906166, 0.113419726, 0.42532397),
      Vectors.sparse(3, Seq[(Int, Double)]()),
      Vectors.sparse(3L, Seq[(Long, Double)]())
    )
  }

  def assertTypeOfVector(lhs: linalg.Vector, rhs: linalg.Vector): Unit = {
    assert((lhs, rhs) match {
      case (v1: DenseVector, v2: DenseVector) => true
      case (v1: IntSparseVector, v2: IntSparseVector) => true
      case (v1: LongSparseVector, v2: LongSparseVector) => true
      case _ => false
    }, "The vector type should be preserved after normalization.")
  }

  def assertValues(lhs: linalg.Vector, rhs: linalg.Vector): Unit = {
    assert(lhs ~== rhs absTol 1E-5, "The vector value is not correct after normalization.")
  }

  test("Normalization with default parameter") {
    val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized")
    val dataFrame: DataFrame = data.zip(l2Normalized).seq.toDF("features", "expected")

    testTransformer[(linalg.Vector, linalg.Vector)](dataFrame, normalizer, "features", "normalized", "expected") {
      case Row(features: linalg.Vector, normalized: linalg.Vector, expected: linalg.Vector) =>
        assertTypeOfVector(normalized, features)
        assertValues(normalized, expected)
    }
  }

  test("Normalization with setter") {
    val dataFrame: DataFrame = data.zip(l1Normalized).seq.toDF("features", "expected")
    val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized").setP(1)

    testTransformer[(linalg.Vector, linalg.Vector)](dataFrame, normalizer, "features", "normalized", "expected") {
      case Row(features: linalg.Vector, normalized: linalg.Vector, expected: linalg.Vector) =>
        assertTypeOfVector(normalized, features)
        assertValues(normalized, expected)
    }
  }

  test("read/write") {
    val t = new Normalizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setP(3.0)
    testDefaultReadWrite(t)
  }
} 
Example 91
Source File: NGramSuite.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.feature

import com.tencent.angel.sona.ml.util.{DefaultReadWriteTest, MLTest}

import scala.beans.BeanInfo
import org.apache.spark.sql.{DataFrame, Row}


@BeanInfo
case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])

class NGramSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  test("default behavior yields bigram features") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
    val dataset = Seq(NGramTestData(
      Array("Test", "for", "ngram", "."),
      Array("Test for", "for ngram", "ngram .")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("NGramLength=4 yields length 4 n-grams") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array("a b c d", "b c d e")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("empty input yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(Array(), Array())).toDF()
    testNGram(nGram, dataset)
  }

  test("input array < n yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(6)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array()
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("read/write") {
    val t = new NGram()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setN(3)
    testDefaultReadWrite(t)
  }

  def testNGram(t: NGram, dataFrame: DataFrame): Unit = {
    testTransformer[(Seq[String], Seq[String])](dataFrame, t, "nGrams", "wantedNGrams") {
      case Row(actualNGrams : Seq[_], wantedNGrams: Seq[_]) =>
        assert(actualNGrams === wantedNGrams)
    }
  }
} 
Example 92
Source File: Glow.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow

import java.util.ServiceLoader

import scala.collection.JavaConverters._

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.apache.spark.sql.{DataFrame, SQLUtils, SparkSession}

import io.projectglow.common.Named
import io.projectglow.sql.{GlowSQLExtensions, SqlExtensionProvider}
import io.projectglow.transformers.util.{SnakeCaseMap, StringUtils}


class GlowBase {

  // Enclosing class and mapper reconstructed so this snippet compiles; the mapper serializes non-string option values below
  private val mapper = new ObjectMapper().registerModule(DefaultScalaModule)

  def transform(operationName: String, df: DataFrame, options: Map[String, Any]): DataFrame = {
    val stringValuedMap = options.mapValues {
      case s: String => s
      case v => mapper.writeValueAsString(v)
    }.map(identity) // output of mapValues is not serializable: https://github.com/scala/bug/issues/7005
    lookupTransformer(operationName) match {
      case Some(transformer) => transformer.transform(df, new SnakeCaseMap(stringValuedMap))
      case None =>
        throw new IllegalArgumentException(s"No transformer with name $operationName")
    }
  }

  def transform(operationName: String, df: DataFrame, options: (String, Any)*): DataFrame = {
    transform(operationName, df, options.toMap)
  }

  def transform(
      operationName: String,
      df: DataFrame,
      options: java.util.Map[String, String]): DataFrame = {
    transform(operationName, df, options.asScala.toMap)
  }

  private def lookupTransformer(name: String): Option[DataFrameTransformer] = synchronized {
    transformerLoader.reload()
    transformerLoader
      .iterator()
      .asScala
      .find(n => StringUtils.toSnakeCase(n.name) == StringUtils.toSnakeCase(name))
  }

  private val transformerLoader = ServiceLoader
    .load(classOf[DataFrameTransformer])
}

object Glow extends GlowBase

trait DataFrameTransformer extends Named {
  def transform(df: DataFrame, options: Map[String, String]): DataFrame
} 
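Callers go through the `Glow` object; for example, piping a DataFrame through a shell command (the pipe transformer is exercised in a later example on this page) might look like this sketch, where the operation name, options, and `inputDf` are assumed to match a registered DataFrameTransformer:

val piped = Glow.transform(
  "pipe",                                   // assumed name of a registered transformer
  inputDf,                                  // assumed input DataFrame
  Map(
    "inputFormatter"  -> "text",
    "outputFormatter" -> "text",
    "cmd"             -> """["cat", "-"]"""))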
Example 93
Source File: VCFInputFormatter.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.vcf

import java.io.OutputStream

import scala.collection.JavaConverters._

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.InternalRow

import io.projectglow.common.GlowLogging
import io.projectglow.transformers.pipe.{InputFormatter, InputFormatterFactory}


class VCFInputFormatter(converter: InternalRowToVariantContextConverter, sampleIdInfo: SampleIdInfo)
    extends InputFormatter
    with GlowLogging {

  private var writer: VCFStreamWriter = _
  private var stream: OutputStream = _

  override def init(stream: OutputStream): Unit = {
    this.stream = stream
    this.writer = new VCFStreamWriter(
      stream,
      converter.vcfHeader.getMetaDataInInputOrder.asScala.toSet,
      sampleIdInfo,
      writeHeader = true)
  }

  override def write(record: InternalRow): Unit = {
    converter.convert(record).foreach(writer.write)
  }

  override def close(): Unit = {
    logger.info("Closing VCF input formatter")
    writer.close()
  }
}

class VCFInputFormatterFactory extends InputFormatterFactory {
  override def name: String = "vcf"

  override def makeInputFormatter(df: DataFrame, options: Map[String, String]): InputFormatter = {
    val (headerLineSet, sampleIdInfo) =
      VCFHeaderUtils.parseHeaderLinesAndSamples(
        options,
        None,
        df.schema,
        df.sparkSession.sparkContext.hadoopConfiguration)
    val rowConverter = new InternalRowToVariantContextConverter(
      df.schema,
      headerLineSet,
      VCFOptionParser.getValidationStringency(options)
    )
    rowConverter.validate()

    new VCFInputFormatter(rowConverter, sampleIdInfo)
  }
} 
Example 94
Source File: VCFWriterUtils.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.vcf

import htsjdk.variant.variantcontext.{VariantContext, VariantContextBuilder}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{ArrayType, StructType}

import io.projectglow.common.GlowLogging

object VCFWriterUtils extends GlowLogging {

  def throwMixedSamplesFailure(): Unit = {
    throw new IllegalArgumentException("Cannot mix missing and non-missing sample IDs.")
  }

  def throwSampleInferenceFailure(): Unit = {
    throw new IllegalArgumentException(
      "Cannot infer sample ids because they are not the same in every row.")
  }

  
  def inferSampleIdsIfPresent(data: DataFrame): SampleIdInfo = {
    val genotypeSchemaOpt = data
      .schema
      .find(_.name == "genotypes")
      .map(_.dataType.asInstanceOf[ArrayType].elementType.asInstanceOf[StructType])
    if (genotypeSchemaOpt.isEmpty) {
      logger.info("No genotypes column, no sample IDs will be inferred.")
      return SampleIds(Seq.empty)
    }
    val genotypeSchema = genotypeSchemaOpt.get

    import data.sparkSession.implicits._
    val hasSampleIdsColumn = genotypeSchema.exists(_.name == "sampleId")

    if (hasSampleIdsColumn) {
      val distinctSampleIds = data
        .selectExpr("explode(genotypes.sampleId)")
        .distinct()
        .as[String]
        .collect
      val numPresentSampleIds = distinctSampleIds.count(!sampleIsMissing(_))

      if (numPresentSampleIds > 0) {
        if (numPresentSampleIds < distinctSampleIds.length) {
          throwMixedSamplesFailure()
        }
        return SampleIds(distinctSampleIds)
      }
    }

    val numGenotypesPerRow = data
      .selectExpr("size(genotypes)")
      .distinct()
      .as[Int]
      .collect

    if (numGenotypesPerRow.length > 1) {
      throw new IllegalArgumentException(
        "Rows contain varying number of missing samples; cannot infer sample IDs.")
    }

    logger.warn("Detected missing sample IDs, inferring sample IDs.")
    InferSampleIds
  }

  def sampleIsMissing(s: String): Boolean = {
    s == null || s.isEmpty
  }

  def convertVcAttributesToStrings(vc: VariantContext): VariantContextBuilder = {
    val vcBuilder = new VariantContextBuilder(vc)
    val iterator = vc.getAttributes.entrySet().iterator()
    while (iterator.hasNext) {
      // parse to string, then write, as the VCF encoder messes up double precisions
      val entry = iterator.next()
      vcBuilder.attribute(
        entry.getKey,
        VariantContextToInternalRowConverter.parseObjectAsString(entry.getValue))
    }
    vcBuilder
  }
}

case class SampleIds(unsortedSampleIds: Seq[String]) extends SampleIdInfo {
  val sortedSampleIds: Seq[String] = unsortedSampleIds.sorted
}
case object InferSampleIds extends SampleIdInfo {
  def fromNumberMissing(numMissingSamples: Int): Seq[String] = {
    (1 to numMissingSamples).map { idx =>
      "sample_" + idx
    }
  }
}

sealed trait SampleIdInfo 
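A short sketch of using the inference helper on a VCF-style DataFrame, assuming `df` has a `genotypes` array column:

VCFWriterUtils.inferSampleIdsIfPresent(df) match {
  case SampleIds(ids) => println(s"Found ${ids.size} explicit sample IDs")
  case InferSampleIds => println("Sample IDs are missing and will be generated on write")
}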
Example 95
Source File: BlockVariantsAndSamplesTransformer.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.transformers.blockvariantsandsamples

import io.projectglow.DataFrameTransformer
import io.projectglow.common.logging.HlsUsageLogging

import org.apache.spark.sql.DataFrame


class BlockVariantsAndSamplesTransformer extends DataFrameTransformer with HlsUsageLogging {

  import BlockVariantsAndSamplesTransformer._

  override def name: String = TRANSFORMER_NAME

  override def transform(df: DataFrame, options: Map[String, String]): DataFrame = {

    val variantsPerBlock = validateIntegerOption(options, VARIANTS_PER_BLOCK)
    val sampleBlockCount = validateIntegerOption(options, SAMPLE_BLOCK_COUNT)

    VariantSampleBlockMaker.makeVariantAndSampleBlocks(df, variantsPerBlock, sampleBlockCount)
  }
}

object BlockVariantsAndSamplesTransformer {
  val TRANSFORMER_NAME = "block_variants_and_samples"
  val VARIANTS_PER_BLOCK = "variants_per_block"
  val SAMPLE_BLOCK_COUNT = "sample_block_count"

  def validateIntegerOption(options: Map[String, String], optionName: String): Int = {
    try {
      (options.get(optionName).get.toInt)
    } catch {
      case _: Throwable =>
        throw new IllegalArgumentException(
          s"$optionName is not provided or cannot be cast as an integer!"
        )
    }
  }
} 
Example 96
Source File: CSVInputFormatter.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.transformers.pipe

import java.io.{OutputStream, PrintWriter}

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.csv.SGUnivocityGenerator
import org.apache.spark.sql.types.StructType

import io.projectglow.SparkShim.CSVOptions

class CSVInputFormatter(schema: StructType, parsedOptions: CSVOptions) extends InputFormatter {

  private var writer: PrintWriter = _
  private var univocityGenerator: SGUnivocityGenerator = _

  override def init(stream: OutputStream): Unit = {
    writer = new PrintWriter(stream)
    univocityGenerator = new SGUnivocityGenerator(schema, writer, parsedOptions)
    if (parsedOptions.headerFlag) {
      univocityGenerator.writeHeaders()
    }
  }

  override def write(record: InternalRow): Unit = {
    univocityGenerator.write(record)
  }

  override def close(): Unit = {
    writer.close()
    univocityGenerator.close()
  }
}

class CSVInputFormatterFactory extends InputFormatterFactory {
  override def name: String = "csv"

  override def makeInputFormatter(
      df: DataFrame,
      options: Map[String, String]
  ): InputFormatter = {
    val sqlConf = df.sparkSession.sessionState.conf
    val parsedOptions =
      new CSVOptions(
        options,
        sqlConf.csvColumnPruning,
        sqlConf.sessionLocalTimeZone
      )
    new CSVInputFormatter(df.schema, parsedOptions)
  }
} 
Example 97
Source File: UTF8TextInputFormatter.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.transformers.pipe

import java.io.{OutputStream, PrintWriter}

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLUtils.dataTypesEqualExceptNullability
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.StringType


class UTF8TextInputFormatter() extends InputFormatter {

  private var writer: PrintWriter = _

  override def init(stream: OutputStream): Unit = {
    writer = new PrintWriter(stream)
  }

  override def write(record: InternalRow): Unit = {
    if (!record.isNullAt(0)) {
      writer.println(record.getUTF8String(0)) // scalastyle:ignore
    }
  }

  override def close(): Unit = {
    writer.close()
  }
}

class UTF8TextInputFormatterFactory extends InputFormatterFactory {
  override def name: String = "text"

  override def makeInputFormatter(df: DataFrame, options: Map[String, String]): InputFormatter = {
    require(df.schema.length == 1, "Input dataframe must have one column.")
    require(
      dataTypesEqualExceptNullability(df.schema.head.dataType, StringType),
      "Input dataframe must have one string column.")
    new UTF8TextInputFormatter
  }
} 
Example 98
Source File: BigFileDatasource.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.sql

import java.net.URI
import java.util.ServiceLoader

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

import io.projectglow.common.{GlowLogging, WithUtils}


  def write(rdd: RDD[Array[Byte]], path: String) {
    val uri = new URI(path)
    uploaders.find(_.canUpload(rdd.sparkContext.hadoopConfiguration, path)) match {
      case Some(uploader) => uploader.upload(rdd, path)
      case None =>
        logger.info(s"Could not find a parallel uploader for $path, uploading from the driver")
        writeFileFromDriver(new Path(uri), rdd)
    }
  }

  private def writeFileFromDriver(path: Path, byteRdd: RDD[Array[Byte]]): Unit = {
    val sc = byteRdd.sparkContext
    val fs = path.getFileSystem(sc.hadoopConfiguration)
    WithUtils.withCloseable(fs.create(path)) { stream =>
      WithUtils.withCachedRDD(byteRdd) { cachedRdd =>
        cachedRdd.count()
        cachedRdd.toLocalIterator.foreach { chunk =>
          stream.write(chunk)
        }
      }
    }
  }
} 
Example 99
Source File: BigBgenDatasource.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.bgen

import java.io.ByteArrayOutputStream

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLUtils}
import org.apache.spark.sql.sources.DataSourceRegister

import io.projectglow.common.logging.{HlsEventRecorder, HlsTagValues}
import io.projectglow.sql.BigFileDatasource
import io.projectglow.sql.util.ComDatabricksDataSource

class BigBgenDatasource extends BigFileDatasource with DataSourceRegister {

  override def shortName(): String = "bigbgen"

  override def serializeDataFrame(
      options: Map[String, String],
      data: DataFrame): RDD[Array[Byte]] = {
    BigBgenDatasource.serializeDataFrame(options, data)
  }

}

class ComDatabricksBigBgenDatasource extends BigBgenDatasource with ComDatabricksDataSource

object BigBgenDatasource extends HlsEventRecorder {

  import io.projectglow.common.BgenOptions._

  private def parseOptions(options: Map[String, String]): BigBgenOptions = {
    val bitsPerProb = options.getOrElse(BITS_PER_PROB_KEY, BITS_PER_PROB_DEFAULT_VALUE).toInt
    val maxPloidy = options.getOrElse(MAX_PLOIDY_KEY, MAX_PLOIDY_VALUE).toInt
    val defaultPloidy = options.getOrElse(DEFAULT_PLOIDY_KEY, DEFAULT_PLOIDY_VALUE).toInt
    val defaultPhasing = options.getOrElse(DEFAULT_PHASING_KEY, DEFAULT_PHASING_VALUE).toBoolean
    BigBgenOptions(bitsPerProb, maxPloidy, defaultPloidy, defaultPhasing)
  }

  private def logBgenWrite(parsedOptions: BigBgenOptions): Unit = {
    val logOptions = Map(
      BITS_PER_PROB_KEY -> parsedOptions.bitsPerProb,
      MAX_PLOIDY_KEY -> parsedOptions.maxPloidy,
      DEFAULT_PLOIDY_KEY -> parsedOptions.defaultPloidy,
      DEFAULT_PHASING_KEY -> parsedOptions.defaultPhasing
    )
    recordHlsEvent(HlsTagValues.EVENT_BGEN_WRITE, logOptions)
  }

  def serializeDataFrame(options: Map[String, String], data: DataFrame): RDD[Array[Byte]] = {

    val parsedOptions = parseOptions(options)
    logBgenWrite(parsedOptions)

    val dSchema = data.schema
    val numVariants = data.count
    val rawRdd = data.queryExecution.toRdd

    val inputRdd = if (rawRdd.getNumPartitions == 0) {
      logger.warn("Writing BGEN header only as the input DataFrame has zero partitions.")
      SQLUtils.createEmptyRDD(data.sparkSession)
    } else {
      rawRdd
    }

    inputRdd.mapPartitionsWithIndex {
      case (idx, it) =>
        val baos = new ByteArrayOutputStream()

        val writeHeader = idx == 0
        val writer = new BgenRecordWriter(
          baos,
          dSchema,
          writeHeader,
          numVariants,
          parsedOptions.bitsPerProb,
          parsedOptions.maxPloidy,
          parsedOptions.defaultPloidy,
          parsedOptions.defaultPhasing
        )

        it.foreach { row =>
          writer.write(row)
        }

        writer.close()
        Iterator(baos.toByteArray)
    }
  }
}

case class BigBgenOptions(
    bitsPerProb: Int,
    maxPloidy: Int,
    defaultPloidy: Int,
    defaultPhasing: Boolean) 
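Since BigBgenDatasource registers the short name "bigbgen" and funnels the whole DataFrame through serializeDataFrame, it is normally driven via the DataFrame writer API rather than called directly. The sketch below is an illustration only: the option key "bitsPerProb" and the output path are assumptions standing in for the constants defined in BgenOptions.

import org.apache.spark.sql.DataFrame

object BigBgenWriteSketch {
  // Write BGEN-formatted rows to a single file via the registered "bigbgen" short name.
  // "bitsPerProb" is an assumed option key; the real constant lives in BgenOptions.
  def writeBigBgen(df: DataFrame, path: String): Unit =
    df.write
      .format("bigbgen")
      .option("bitsPerProb", "8")
      .save(path)
}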
Example 100
Source File: TextPiperSuite.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.transformers.pipe

import scala.collection.JavaConverters._

import org.apache.spark.SparkException
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import io.projectglow.Glow
import io.projectglow.sql.GlowBaseTest

class TextPiperSuite extends GlowBaseTest {
  override def afterEach(): Unit = {
    Glow.transform("pipe_cleanup", spark.emptyDataFrame)
    super.afterEach()
  }

  def pipeText(df: DataFrame): DataFrame = {
    val options =
      Map("inputFormatter" -> "text", "outputFormatter" -> "text", "cmd" -> """["cat", "-"]""")
    new PipeTransformer().transform(df, options)
  }

  test("text input and output") {
    val sess = spark
    import sess.implicits._

    val output = pipeText(Seq("hello", "world").toDF())
    assert(output.count() == 2)
    assert(output.schema == StructType(Seq(StructField("text", StringType))))
    assert(output.orderBy("text").as[String].collect.toSeq == Seq("hello", "world"))
  }

  test("text input requires one column") {
    val sess = spark
    import sess.implicits._

    val df = Seq(Seq("hello", "world"), Seq("foo", "bar")).toDF()
    assertThrows[IllegalArgumentException](pipeText(df))
  }

  test("text input requires string column") {
    val sess = spark
    import sess.implicits._

    val df = Seq(Seq(5), Seq(6)).toDF()
    assertThrows[IllegalArgumentException](pipeText(df))
  }

  test("does not break on null row") {
    val sess = spark
    import sess.implicits._

    val df = Seq("hello", null, "hello").toDF()
    val output = pipeText(df)
    assert(output.count() == 2)
    assert(output.filter("text = 'hello'").count == 2)
  }

  test("command fails") {
    val sess = spark
    import sess.implicits._

    val df = Seq("hello", "world").toDF()
    val options =
      Map(
        "inputFormatter" -> "text",
        "outputFormatter" -> "text",
        "cmd" -> """["bash", "-c", "exit 1"]""")

    val ex = intercept[SparkException] {
      new PipeTransformer().transform(df, options)
    }
    assert(ex.getMessage.contains("Subprocess exited with status 1"))

    // threads should still be cleaned up
    eventually {
      assert(
        !Thread
          .getAllStackTraces
          .asScala
          .keySet
          .exists(_.getName.startsWith(ProcessHelper.STDIN_WRITER_THREAD_PREFIX)))
      assert(
        !Thread
          .getAllStackTraces
          .asScala
          .keySet
          .exists(_.getName.startsWith(ProcessHelper.STDERR_READER_THREAD_PREFIX)))
    }
  }
} 
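Outside a test, the same options can be passed through Glow.transform. The sketch below assumes the transformer is registered under the name "pipe" (the suite above only references "pipe_cleanup" by name) and reuses the `cat -` command, which simply echoes each input line back.

import io.projectglow.Glow
import org.apache.spark.sql.SparkSession

object PipeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pipe-sketch").getOrCreate()
    import spark.implicits._

    val piped = Glow.transform(
      "pipe", // assumed registered name of PipeTransformer
      Seq("hello", "world").toDF(),
      Map("inputFormatter" -> "text", "outputFormatter" -> "text", "cmd" -> """["cat", "-"]"""))
    piped.show()

    // Release the piper's resources, mirroring afterEach in the suite above.
    Glow.transform("pipe_cleanup", spark.emptyDataFrame)
  }
}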
Example 101
Source File: GlowSuite.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow

import org.apache.spark.sql.DataFrame

import io.projectglow.sql.GlowBaseTest

class GlowSuite extends GlowBaseTest {
  def checkTransform(df: DataFrame): Unit = {
    val sess = spark
    import sess.implicits._
    assert(df.count() == 2)
    assert(df.as[String].collect.toSeq == Seq("camel", "snake"))
  }

  test("uses service provider") {
    val df =
      Glow.transform(
        "dummy_transformer",
        spark.emptyDataFrame,
        Map("camel_animal" -> "camel", "snake_animal" -> "snake"))
    checkTransform(df)
  }

  test("transformer names are converted to snake case") {
    val df =
      Glow.transform(
        "dummyTransformer",
        spark.emptyDataFrame,
        Map("camel_animal" -> "camel", "snake_animal" -> "snake"))
    checkTransform(df)
  }

  test("options are converted to snake case") {
    val df =
      Glow.transform(
        "dummyTransformer",
        spark.emptyDataFrame,
        Map("camelAnimal" -> "camel", "snake_animal" -> "snake"))
    checkTransform(df)
  }

  test("java map options") {
    val javaMap = new java.util.HashMap[String, String]
    javaMap.put("camelAnimal", "camel")
    javaMap.put("snake_animal", "snake")
    val df = Glow.transform("dummyTransformer", spark.emptyDataFrame, javaMap)
    checkTransform(df)
  }

  test("tuple options") {
    val df =
      Glow.transform(
        "dummyTransformer",
        spark.emptyDataFrame,
        ("camelAnimal", "camel"),
        ("snake_animal", "snake"))
    checkTransform(df)
  }

  test("accept non-string values") {
    intercept[IllegalArgumentException] {
      Glow.transform("dummyTransformer", spark.emptyDataFrame, Map("must_be_true" -> false))
    }
    Glow.transform("dummyTransformer", spark.emptyDataFrame, Map("must_be_true" -> true))
  }

  test("float arguments") {
    intercept[IllegalArgumentException] {
      Glow.transform("dummyTransformer", spark.emptyDataFrame, Map("pi" -> 15.48))
    }
    Glow.transform("dummyTransformer", spark.emptyDataFrame, Map("pi" -> 3.14159))
    Glow.transform("dummyTransformer", spark.emptyDataFrame, Map("pi" -> "3.14159"))
  }
}

class DummyTransformer extends DataFrameTransformer {
  override def name: String = "dummy_transformer"

  override def transform(df: DataFrame, options: Map[String, String]): DataFrame = {
    val animals = Seq(options.get("camel_animal"), options.get("snake_animal")).flatten
    if (!options.get("must_be_true").forall(_.toBoolean)) {
      throw new IllegalArgumentException("if provided, this arg must be true")
    }

    options.get("pi").foreach { pi =>
      require(Math.abs(pi.toDouble - Math.PI) < Math.PI * 0.0001)
    }

    df.sparkSession.createDataFrame(animals.map(StringWrapper)).sort("s") // sort by the single "s" column for a deterministic order
  }
}

case class StringWrapper(s: String) 
Example 102
Source File: BigFileDatasourceSuite.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.sql

import java.nio.file.{Files, Paths}

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SaveMode}

class BigFileDatasourceSuite extends GlowBaseTest {
  test("save mode: append") {
    val outFile = Files.createTempFile("tmp", ".tmp").toString
    val e = intercept[RuntimeException] {
      spark
        .emptyDataFrame
        .write
        .mode(SaveMode.Append)
        .format("io.projectglow.sql.DummyBigFileDatasource")
        .save(outFile)
    }
    assert(
      e.getMessage
        .contains("Append mode is not supported by io.projectglow.sql.DummyBigFileDatasource"))
  }

  test("save mode: overwrite") {
    val outDir = Files.createTempDirectory("tmp").toString
    spark
      .emptyDataFrame
      .write
      .mode(SaveMode.Overwrite)
      .format("io.projectglow.sql.DummyBigFileDatasource")
      .save(outDir)

    val filePath = Paths.get(outDir)
    assert(Files.isRegularFile(filePath))
    val writtenBytes = Files.readAllBytes(filePath)
    assert(writtenBytes.toSeq == Seq(0, 1, 2).map(_.toByte))
  }

  test("save mode: error if exists") {
    val outFile = Files.createTempFile("tmp", ".tmp").toString
    val e = intercept[RuntimeException] {
      spark
        .emptyDataFrame
        .write
        .mode(SaveMode.ErrorIfExists)
        .format("io.projectglow.sql.DummyBigFileDatasource")
        .save(outFile)
    }
    assert(e.getMessage.contains(s"Path $outFile already exists"))
  }

  test("save mode: ignore") {
    val outDir = Files.createTempDirectory("tmp").toString
    spark
      .emptyDataFrame
      .write
      .mode(SaveMode.Ignore)
      .format("io.projectglow.sql.DummyBigFileDatasource")
      .save(outDir)

    val dirPath = Paths.get(outDir)
    assert(Files.isDirectory(dirPath))
  }
}

class DummyBigFileDatasource extends BigFileDatasource {
  override def serializeDataFrame(
      options: Map[String, String],
      data: DataFrame): RDD[Array[Byte]] = {
    data.sqlContext.sparkContext.parallelize(Seq(Array(0, 1, 2).map(_.toByte)))
  }
} 
Example 103
Source File: SparkOperationTestPimpers.scala    From sparkplug   with MIT License 5 votes vote down vote up
package springnz.sparkplug.testkit

import com.typesafe.scalalogging.{ LazyLogging, Logger }
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{ DataFrame, SQLContext }
import springnz.sparkplug.core.SparkOperation
import springnz.sparkplug.util.Logging

import scala.reflect.ClassTag

object SparkOperationTestPimpers extends LazyLogging {

  private def persistTestResource[A: ClassTag](rdd: RDD[A], rddName: String, overwrite: Boolean = false)(
    implicit projectName: ProjectName): RDD[A] = {
    val path = RDDPersister.getPath(projectName.name, rddName)
    if (overwrite || (!overwrite && !path.exists)) {
      if (path.exists) {
        logger.info(s"deleting existing RDD at ${path.pathAsString}")
        path.delete()
      }
      RDDPersister.persistRDD(path.pathAsString, rdd)
    } else { // (!overwrite && path.exists)
      logger.info(s"Not persisting RDD that already exists at path [${path.pathAsString}]")
      rdd
    }
  }

  class RDDExtensions[A: ClassTag](operation: SparkOperation[RDD[A]]) {
    import RDDSamplers._

    def saveTo(rddName: String, sampler: RDD[A] ⇒ RDD[A] = identitySampler)(
      implicit projectName: ProjectName): SparkOperation[RDD[A]] =
      operation.map {
        rdd ⇒
          val sampled = sampler(rdd)
          persistTestResource(sampled, rddName, overwrite = false)
          sampled
      }

    def sourceFrom(rddName: String, sampler: RDD[A] ⇒ RDD[A] = identitySampler)(
      implicit projectName: ProjectName): SparkOperation[RDD[A]] =
      SparkOperation { ctx ⇒
        val path = RDDPersister.getPath(projectName.name, rddName)
        if (path.exists)
          ctx.objectFile[A](path.pathAsString)
        else {
          val rdd = operation.run(ctx)
          val sampled = sampler(rdd)
          persistTestResource(sampled, rddName, overwrite = false)
          sampled
        }
      }
  }

  class DataFrameExtensions(operation: SparkOperation[DataFrame]) {
    import RDDSamplers._

    def saveTo(rddName: String,
      overwrite: Boolean = false,
      sampler: RDD[String] ⇒ RDD[String] = identitySampler)(
        implicit projectName: ProjectName): SparkOperation[DataFrame] =
      operation.map {
        df ⇒
          val rdd: RDD[String] = df.toJSON
          val sampled = sampler(rdd)
          persistTestResource(sampled, rddName, overwrite)
          val sqlContext = new SQLContext(sampled.sparkContext)
          sqlContext.read.json(sampled)
      }

    def sourceFrom(dataFrameName: String,
      overwrite: Boolean = false,
      sampler: RDD[String] ⇒ RDD[String] = rdd ⇒ rdd)(
        implicit projectName: ProjectName, log: Logger): SparkOperation[DataFrame] =
      SparkOperation { ctx ⇒
        val path = RDDPersister.getPath(projectName.name, dataFrameName)
        val sampledRDD = if (path.exists)
          ctx.objectFile[String](path.pathAsString)
        else {
          val df = operation.run(ctx)
          val rdd: RDD[String] = df.toJSON
          val sampled = sampler(rdd)
          persistTestResource(sampled, dataFrameName, overwrite)
          sampled
        }
        val sqlContext = new SQLContext(ctx)
        sqlContext.read.json(sampledRDD)
      }

  }
} 
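The extension classes above are plain wrappers, so they can be constructed explicitly (the surrounding testkit presumably also offers implicit conversions, which are not shown here). The sketch below assumes ProjectName lives in the same springnz.sparkplug.testkit package; it persists a sample of a DataFrame-producing operation's output as a JSON-backed test resource.

import org.apache.spark.sql.DataFrame
import springnz.sparkplug.core.SparkOperation
import springnz.sparkplug.testkit.ProjectName // assumed location of ProjectName
import springnz.sparkplug.testkit.SparkOperationTestPimpers.DataFrameExtensions

object TestResourceSketch {
  // Persist (a sample of) the operation's result under "people-sample" as a reusable
  // test resource; if the resource already exists on disk it is left untouched.
  def cachedPeople(peopleOp: SparkOperation[DataFrame])(
    implicit projectName: ProjectName): SparkOperation[DataFrame] =
    new DataFrameExtensions(peopleOp).saveTo("people-sample")
}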
Example 104
Source File: QueryPeopleTest.scala    From apache-spark-test   with Apache License 2.0 5 votes vote down vote up
package com.github.dnvriend.spark.dataset

import com.github.dnvriend.TestSpec
import org.apache.spark.sql.{ Column, DataFrame }

class QueryPeopleTest extends TestSpec {

  it should "query using DSL" in withSparkSession { spark =>
    import spark.implicits._
    import org.apache.spark.sql.functions._

    val people: DataFrame =
      spark.read.parquet(TestSpec.PeopleParquet).cache() // name, age

    people.select('name).limit(1).as[String].head() shouldBe "foo"
    people.select($"name").limit(1).as[String].head() shouldBe "foo"
    people.select("name").limit(1).as[String].head() shouldBe "foo"

    people.select('age).limit(1).as[Int].head() shouldBe 30
    people.select($"age").limit(1).as[Int].head() shouldBe 30
    people.select("age").limit(1).as[Int].head() shouldBe 30

    // select a column from the Dataset
    val col1: Column = people("name")
    val col2: Column = people.col("name")

    val departments: DataFrame =
      Seq((1, "sales"), (2, "administration"), (3, "human resources"))
        .toDF("department_id", "department_name").cache()

    people
      .withColumn("department_id", lit(1))
      .withColumn("age_plus_ten", people("age") + 10)
      .as[(String, Int, Int, Int)].limit(1).head() shouldBe ("foo", 30, 1, 40)

    people
      .withColumn("department_id", lit(1))
      .withColumn("age_plus_ten", people("age") + 10)
      .as('people_dep_age)
      .join(departments, col("people_dep_age.department_id").equalTo(departments.col("department_id")))
      .select($"people_dep_age.name", col("people_dep_age.age"), departments.col("department_name"))
      .as[(String, Int, String)].limit(1).head() shouldBe ("foo", 30, "sales")

    val peopleDepAge: DataFrame =
      people
        .withColumn("department_id", lit(1))
        .withColumn("age_plus_ten", people("age") + 10)

    peopleDepAge
      .join(departments, peopleDepAge("department_id") === departments("department_id"))
      .select(peopleDepAge("name"), peopleDepAge("age"), departments("department_name"))
      .as[(String, Int, String)].limit(1).head() shouldBe ("foo", 30, "sales")

    peopleDepAge.filter($"age" > 30)
      .join(departments, peopleDepAge("department_id") === departments("department_id"))
      .agg(avg($"age"), max($"age")).limit(1)
      .as[(Double, Int)].head() shouldBe (45.0, 50)
  }
} 
Example 105
Source File: DataFrameWordCountTest.scala    From apache-spark-test   with Apache License 2.0 5 votes vote down vote up
package com.github.dnvriend.spark.dataframe

import com.github.dnvriend.TestSpec
import org.apache.spark.sql.{ DataFrame, Dataset }

class DataFrameWordCountTest extends TestSpec {
  it should "wordcount alice in wonderland" in withSparkSession { spark =>
    import org.apache.spark.sql.functions._
    import spark.implicits._
    val lines: Dataset[String] = spark.read.text(TestSpec.AliceInWonderlandText).as[String]
    lines.count shouldBe 3599 // alice in wonderland contains 3599 lines
    val words: DataFrame = lines.flatMap((line: String) => line.split(" ")).map(_.trim).filter(_.nonEmpty).toDF("word")
    words.count() shouldBe 26467 // there are 26,467 words in the book, excluding spaces
    val wordCount: Dataset[(String, Long)] =
      words.groupBy('word).agg(count('word).as("count")).orderBy('count.desc).as[(String, Long)].cache

    wordCount.take(1).head shouldBe ("the", 1505) // the word 'the' is used 1505 times
    wordCount.filter(lower('word) === "alice").take(1).head shouldBe ("Alice", 221)
    wordCount.filter(lower('word) === "queen").take(1).head shouldBe ("Queen", 34)
    wordCount.filter(lower('word) === "rabbit").take(1).head shouldBe ("Rabbit", 29)
    wordCount.filter(lower('word) === "cheshire").take(1).head shouldBe ("Cheshire", 6)
  }
} 
Example 106
Source File: JdbcDatasourceTest.scala    From apache-spark-test   with Apache License 2.0 5 votes vote down vote up
package com.github.dnvriend.spark.datasources

import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark._
import com.github.dnvriend.spark.datasources.SparkImplicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrame

object JdbcDatasourceTest {
  implicit val H2Options: Map[String, String] = Map(
    "url" -> "jdbc:h2:mem:test;INIT=runscript from 'src/test/resources/create.sql'\\;runscript from 'src/test/resources/init.sql'",
    "dbtable" -> "customer",
    "driver" -> "org.h2.Driver",
    "user" -> "root",
    "password" -> "root"
  )
  implicit val PostgresOptions: Map[String, String] = Map(
    "url" -> "jdbc:postgresql://localhost:5432/docker?reWriteBatchedInserts=true",
    "driver" -> "org.postgresql.Driver",
    "user" -> "postgres",
    "password" -> ""
  )
}

class JdbcDatasourceTest extends TestSpec {

  ignore should "join JDBC and parquet" in withSparkSession { spark =>
    import spark.implicits._
    //    implicit val jdbcOptions = JdbcDatasourceTest.PostgresOptions
    implicit val jdbcOptions = JdbcDatasourceTest.H2Options
    val orders = spark.read.parquet(TestSpec.OrdersParquet).as[Order].cache()
    val customers = spark.read.jdbc("customer").cache()
    customers.count() shouldBe 7

    val orderCustomer = orders
      .join(customers, orders("customer_id") === customers("customer_id"))
      .select(orders("order_id"), 'customer_name, 'customer_age)

    orderCustomer.as[(Int, String, Int)].collect() shouldBe Seq(
      (10308, "Ollie Olson", 34),
      (10309, "Craig Hahn", 21)
    )

    orderCustomer.write.append.jdbc("order_customer")
    val order_cust: DataFrame = spark.read.jdbc("order_customer")
    order_cust.printSchema()
    order_cust.show()
  }

  // http://stackoverflow.com/questions/2901453/sql-standard-to-escape-column-names
  //
  // The SQL-99 standard specifies that the double quote (") is used to delimit identifiers.
  //
  // Oracle, PostgreSQL, MySQL, MSSQL and SQLite all support " as the identifier delimiter,
  // though not all of them use it as the default: MySQL must be running in ANSI mode,
  // and SQL Server only supports it when QUOTED_IDENTIFIER is ON.
} 
Example 107
Source File: PersonDataSourceTest.scala    From apache-spark-test   with Apache License 2.0 5 votes vote down vote up
package com.github.dnvriend.spark.datasources

import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark.datasources.SparkImplicits._
import org.apache.spark.sql.DataFrame

class PersonDataSourceTest extends TestSpec {
  it should "read a simple person xml file using a custom data source" in withSparkSession { spark =>
    import spark.implicits._
    val result: DataFrame = spark.read
      .format("person")
      .load("src/test/resources/people.xml")

    result.as[(Long, String, Int)].collect shouldBe Seq(
      (1, "Jonathan Archer", 41),
      (2, "Reginald Barclay", 45),
      (3, "Julian Bashir", 28),
      (4, "Pavel Chekov", 52),
      (5, "Beverly Crusher", 32),
      (6, "Jadzia Dax", 21),
      (7, "Geordi La Forge", 35)
    )
  }

  it should "read a simple person xml file using implicit conversion" in withSparkSession { spark =>
    import spark.implicits._
    val result: DataFrame = spark.read.person("src/test/resources/people.xml")

    result.as[(Long, String, Int)].collect shouldBe Seq(
      (1, "Jonathan Archer", 41),
      (2, "Reginald Barclay", 45),
      (3, "Julian Bashir", 28),
      (4, "Pavel Chekov", 52),
      (5, "Beverly Crusher", 32),
      (6, "Jadzia Dax", 21),
      (7, "Geordi La Forge", 35)
    )
  }
} 
Example 108
Source File: StructuredIdentity.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up
package com.intel.hibench.sparkbench.structuredstreaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

class StructuredIdentity() extends StructuredBenchBase {

  override def process(ds: DataFrame, config: SparkBenchConfig) = {

    // Get the singleton instance of SparkSession
    val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate()
    import spark.implicits._

    val query = ds.writeStream
      .foreach(new ForeachWriter[Row] {
        var reporter: KafkaReporter = _

        def open(partitionId: Long, version: Long): Boolean = {
          val reportTopic = config.reporterTopic
          val brokerList = config.brokerList
          reporter = new KafkaReporter(reportTopic, brokerList)
          true
        }

        def close(errorOrNull: Throwable): Unit = {}

        def process(record: Row): Unit = {
          val inTime = record(0).asInstanceOf[String].toLong
          val outTime = System.currentTimeMillis()
          reporter.report(inTime, outTime)
        }
      })
      .start()

    query.awaitTermination()
  }
} 
Example 109
Source File: StructuredRepartition.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up
package com.intel.hibench.sparkbench.structuredstreaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

class StructuredRepartition() extends StructuredBenchBase {

  override def process(ds: DataFrame, config: SparkBenchConfig) = {

    // Get the singleton instance of SparkSession
    val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate()
    import spark.implicits._

    val results = ds.repartition(config.coreNumber)
    
    val query = results.writeStream
      .foreach(new ForeachWriter[Row] {
        var reporter: KafkaReporter = _

        def open(partitionId: Long, version: Long): Boolean = {
          val reportTopic = config.reporterTopic
          val brokerList = config.brokerList
          reporter = new KafkaReporter(reportTopic, brokerList)
          true
        }

        def close(errorOrNull: Throwable): Unit = {}

        def process(record: Row): Unit = {
          val inTime = record(0).asInstanceOf[String].toLong
          val outTime = System.currentTimeMillis()
          reporter.report(inTime, outTime)
        }
      })
      .start()

    query.awaitTermination()
  }
} 
Example 110
Source File: IntermediateYaml.scala    From sope   with Apache License 2.0 5 votes vote down vote up
package com.sope.etl.yaml

import com.sope.etl._
import com.sope.etl.transform.Transformer
import com.sope.etl.transform.exception.YamlDataTransformException
import com.sope.etl.transform.model.TransformModelWithoutSourceTarget
import org.apache.spark.sql.DataFrame


  def getTransformedDFs(dataFrames: DataFrame*): Seq[(String, DataFrame)] = {
    val sources = model.sources.data
    if (sources.size != dataFrames.size)
      throw new YamlDataTransformException("Invalid Dataframes provided or incorrect yaml config")
    val sqlContext = dataFrames.headOption.getOrElse {
      throw new YamlDataTransformException("Empty Dataframe List")
    }.sqlContext
    performRegistrations(sqlContext)
    val sourceDFMap = sources.zip(dataFrames).map {
      case (source, df) => (source, {
        df.createOrReplaceTempView(source)
        df.alias(source)
      })
    }
    new Transformer(getYamlFileName, sourceDFMap.toMap, model).transform
  }
} 
Example 111
Source File: BigQueryReader.scala    From sope   with Apache License 2.0 5 votes vote down vote up
package com.sope.spark.utils.google

import com.google.cloud.hadoop.io.bigquery.{BigQueryConfiguration, GsonBigQueryInputFormat}
import com.google.gson.JsonObject
import com.sope.utils.Logging
import org.apache.hadoop.io.LongWritable
import org.apache.spark.sql.{DataFrame, SQLContext}


  def load(): DataFrame = {
    import sqlContext.implicits._
    // Load data from BigQuery.
    val tableData = sc.newAPIHadoopRDD(
      conf,
      classOf[GsonBigQueryInputFormat],
      classOf[LongWritable],
      classOf[JsonObject])
      .map(_._2.toString)
    sqlContext.read.json(tableData.toDS)
  }
} 
Example 112
Source File: BigQueryWriter.scala    From sope   with Apache License 2.0 5 votes vote down vote up
package com.sope.spark.utils.google

import com.google.cloud.hadoop.io.bigquery.output.{BigQueryOutputConfiguration, BigQueryTableFieldSchema, BigQueryTableSchema, IndirectBigQueryOutputFormat}
import com.google.cloud.hadoop.io.bigquery.{BigQueryConfiguration, BigQueryFileFormat}
import com.sope.utils.Logging
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

import scala.collection.JavaConversions._


  def save(): Unit = {
    val projectId = hadoopConf.get("fs.gs.project.id")
    val bucket = hadoopConf.get("fs.gs.system.bucket")
    log.info("GCP Project ID: {}", projectId)
    log.info("GCP Bucket for temporary storage: {} ", bucket)
    val outputGcsPath = s"gs://$bucket/hadoop/tmp/bigquery/$targetBQTable"
    log.info("GCP Path for temporary storage: {} ", outputGcsPath)
    hadoopConf.set(BigQueryConfiguration.PROJECT_ID_KEY, projectId)
    hadoopConf.set(BigQueryConfiguration.GCS_BUCKET_KEY, bucket)
    hadoopConf.set("mapreduce.job.outputformat.class", classOf[IndirectBigQueryOutputFormat[_, _]].getName)
    if (overwriteTable)
      hadoopConf.set(BigQueryConfiguration.OUTPUT_TABLE_WRITE_DISPOSITION_KEY, "WRITE_TRUNCATE")

    BigQueryOutputConfiguration.configure(
      hadoopConf,
      targetBQTable,
      getBQSchema,
      outputGcsPath,
      BigQueryFileFormat.NEWLINE_DELIMITED_JSON,
      classOf[TextOutputFormat[_, _]])

    val jsonDF = sourceDF.withColumn("json_data",
      to_json(struct(sourceColumns.map(col): _*))).select(JsonColumn)

    jsonDF.rdd
      .map(row => (null, row.getAs[String](0)))
      .saveAsNewAPIHadoopDataset(hadoopConf)
  }
}

object BigQueryWriter {
  private val JsonColumn = "json_data"
} 
Example 113
Source File: FunctionTest.scala    From sope   with Apache License 2.0 5 votes vote down vote up
package com.sope

import com.sope.model.{Class, Person, Student}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import com.sope.spark.sql._
import com.sope.TestContext.getSQlContext
import org.apache.spark.sql.types.{StringType, IntegerType}
import org.scalatest.{FlatSpec, Matchers}


class FunctionTest extends FlatSpec with Matchers {

  private val sqlContext = getSQlContext

  import sqlContext.implicits._

  private val testSData = Seq(
    Person("Sherlock", "Holmes", "baker street", "[email protected]", "999999"),
    Person("John", "Watson", "east street", "[email protected]", "55555")
  ).toDF

  private val studentDF = Seq(
    Student("A", "B", 1, 10),
    Student("B", "C", 2, 10),
    Student("C", "E", 4, 9),
    Student("E", "F", 5, 9),
    Student("F", "G", 6, 10),
    Student("G", "H", 7, 10),
    Student("H", "I", 9, 8),
    Student("H", "I", 9, 7)
  ).toDF

  private val classDF = Seq(
    Class(1, 10, "Tenth"),
    Class(2, 9, "Ninth"),
    Class(3, 8, "Eighth")
  ).toDF

  "Dataframe Function transformations" should "generate the transformations correctly" in {
    val nameUpperFunc = (df: DataFrame) => df.withColumn("first_name", upper(col("first_name")))
    val nameConcatFunc = (df: DataFrame) => df.withColumn("name", concat(col("first_name"), col("last_name")))
    val addressUpperFunc = (df: DataFrame) => df.withColumn("address", upper(col("address")))
    val transformed = testSData.applyDFTransformations(Seq(nameUpperFunc, nameConcatFunc, addressUpperFunc))
    transformed.show(false)
    transformed.schema.fields.map(_.name) should contain("name")
  }

  "Group by as list Function Transformation" should  "generate the transformations correctly" in {
    val grouped = studentDF.groupByAsList(Seq("cls"))
        .withColumn("grouped_data", explode($"grouped_data"))
        .unstruct("grouped_data", keepStructColumn = false)
    grouped.show(false)
    grouped.filter("cls = 10").head.getAs[Long]("grouped_count") should be(4)
  }

  "Cast Transformation" should  "generate the transformations correctly" in {
    val casted = studentDF.castColumns(IntegerType, StringType)
    casted.dtypes.count(_._2 == "StringType") should be(4)
  }


  "Update Keys Transformation" should  "generate the transformations correctly" in {
    val updatedWithKey = studentDF
      .updateKeys(Seq("cls"), classDF.renameColumns(Map("cls" -> "class")), "class", "key")
      .dropColumns(Seq("last_name", "roll_no"))
    updatedWithKey.show(false)
    updatedWithKey.filter("first_name = 'A'").head.getAs[Long]("cls_key") should be(1)
  }

} 
Example 114
Source File: ConstructVector.scala    From spark-vector   with Apache License 2.0 5 votes vote down vote up
package com.actian.spark_vector.loader.command

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import com.actian.spark_vector.vector.VectorOps._
import com.actian.spark_vector.vector.VectorJDBC
import com.actian.spark_vector.vector.VectorConnectionProperties
import com.actian.spark_vector.vector.TableSchemaGenerator
import com.actian.spark_vector.loader.options.{ UserOptions, VectorOptions }
import com.actian.spark_vector.loader.parsers.Args

import resource.managed


object ConstructVector {
  
      val jdbc = new VectorJDBC(conn)
      jdbc.createTable(config.vector.targetTable, source.schema)
    }
    val mapping = getFieldMapping(source.schema, config.general.colsToLoad.getOrElse(Seq[String]()), conn, config.vector.targetTable)
    val df = checkSchemaDefaults(source, mapping, conn, config.vector.targetTable)
    df.rdd.loadVector(df.schema, conn, config.vector.targetTable, config.vector.preSQL, config.vector.postSQL, Option(mapping))
  }
  
  private def checkSchemaDefaults(source: DataFrame, fieldMapping: Map[String, String], conn: VectorConnectionProperties, table: String): DataFrame = {
    val jdbc = new VectorJDBC(conn)
    val defaults = collection.mutable.Map(jdbc.columnDefaults(table).toSeq: _*)
    jdbc.columnMetadata(table).foreach(c => if(c.nullable) defaults.remove(c.name))
    val sourceDefaults = defaults.map(f => (fieldMapping.find(_._2 == f._1).get._1 -> f._2))
    source.na.fill(sourceDefaults.toMap)
  }
  
  private def getFieldMapping(sourceSchema: StructType, colsToLoad: Seq[String], conn: VectorConnectionProperties, table: String): Map[String, String] = {
    val jdbc = new VectorJDBC(conn)
    val tableSchema = jdbc.columnMetadata(table)
    
    require(colsToLoad.size == tableSchema.size || sourceSchema.size == tableSchema.size, "Number of source columns to load does not match number of target columns in table")
    val fieldMapping = if (!colsToLoad.isEmpty) {
      require(colsToLoad.size == tableSchema.size, "Number of columns to load does not match number of target columns in table")
      (for (i <- 0 until colsToLoad.size) yield (colsToLoad(i) -> tableSchema(i).name)).toMap
    } else {
      require(sourceSchema.size == tableSchema.size, "Number of source columns do not match number of target columns in table")
      (for (i <- 0 until sourceSchema.size) yield (sourceSchema(i).name -> tableSchema(i).name)).toMap
    }
    
    fieldMapping
  }
} 
Example 115
Source File: SparkSqlTable.scala    From spark-vector   with Apache License 2.0 5 votes vote down vote up
package com.actian.spark_vector.sql

import java.util.concurrent.atomic.AtomicLong
import org.apache.spark.sql.DataFrame

sealed trait SparkSqlTable {
  def tableName: String
  def quotedName: String = sparkQuote(tableName)
  def close(): Unit
}

case class HiveTable(override val tableName: String) extends SparkSqlTable {
  override def close(): Unit = {}
}

class TempTable private (override val tableName: String, df: DataFrame) extends SparkSqlTable {
  private def register(): Unit = df.createOrReplaceTempView(tableName)
  override def close(): Unit = df.sqlContext.dropTempTable(tableName)
}

object TempTable {
  private val id = new AtomicLong(0L)

  def apply(tableNameBase: String, df: DataFrame): TempTable = {
    val tableName = s"${tableNameBase}_${id.incrementAndGet}"
    val tt = new TempTable(tableName, df)
    tt.register()
    tt
  }
} 
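TempTable registers the DataFrame under a uniquely suffixed view name and exposes a quoted form for use in SQL text, so the usual pattern is register, query through quotedName, then close. A minimal sketch, assuming only the classes shown above:

import com.actian.spark_vector.sql.TempTable
import org.apache.spark.sql.{DataFrame, SparkSession}

object TempTableSketch {
  // Register df under a unique temp view name, run a query against it, then drop the view.
  def countRows(spark: SparkSession, df: DataFrame): Long = {
    val tmp = TempTable("loader_input", df) // registers e.g. loader_input_1
    try spark.sql(s"SELECT COUNT(*) FROM ${tmp.quotedName}").head().getLong(0)
    finally tmp.close()
  }
}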
Example 116
Source File: DefaultSource.scala    From spark-vector   with Apache License 2.0 5 votes vote down vote up
package com.actian.spark_vector.sql

import org.apache.spark.sql.{ DataFrame, SQLContext, SaveMode }
import org.apache.spark.sql.sources.{ BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider, SchemaRelationProvider }
import org.apache.spark.sql.types.StructType

import com.actian.spark_vector.util.Logging
import com.actian.spark_vector.vector.VectorJDBC

class DefaultSource extends DataSourceRegister with RelationProvider with SchemaRelationProvider with CreatableRelationProvider with Logging {
  override def shortName(): String = "vector"

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation =
    VectorRelation(TableRef(parameters), sqlContext, parameters)

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation =
    VectorRelation(TableRef(parameters), Some(schema), sqlContext, parameters)

  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val tableRef = TableRef(parameters)
    val table = VectorRelation(tableRef, sqlContext, parameters)

    mode match {
      case SaveMode.Overwrite =>
        table.insert(data, true)
      case SaveMode.ErrorIfExists =>
        val isEmpty = VectorJDBC.withJDBC(tableRef.toConnectionProps) { _.isTableEmpty(tableRef.table) }
        if (isEmpty) {
          table.insert(data, false)
        } else {
          throw new UnsupportedOperationException("Writing to a non-empty Vector table is not allowed with mode ErrorIfExists.")
        }
      case SaveMode.Append =>
        table.insert(data, false)
      case SaveMode.Ignore =>
        val isEmpty = VectorJDBC.withJDBC(tableRef.toConnectionProps) { _.isTableEmpty(tableRef.table) }
        if (isEmpty) {
          table.insert(data, false)
        }
    }

    table
  }
} 
Example 117
Source File: package.scala    From sparkpipe-core   with Apache License 2.0 5 votes vote down vote up
package software.uncharted.sparkpipe.ops.core.dataframe

import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.types.{StructType, StructField}


  // Can't test because DataFrameWriter is currently marked final
  // $COVERAGE-OFF$
  def write(
    path: String,
    format: String = "parquet",
    options: Map[String, String] = Map[String, String]()
  )(input: DataFrame): DataFrame = {
    if (path.length > 0) {
      input.write.format(format).options(options).save(path)
    } else {
      input.write.format(format).options(options).save()
    }
    input
  }
  // $COVERAGE-ON$
} 
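Because write is curried over the DataFrame, it can be applied directly or composed into a sparkpipe pipeline; it saves the frame and hands it back unchanged. A sketch, assuming the op is reachable through the package object shown above and using a hypothetical output path:

import org.apache.spark.sql.DataFrame
import software.uncharted.sparkpipe.ops.core.{dataframe => dfo}

object WriteOpSketch {
  // Save df as parquet and return it unchanged so further ops can be chained after it.
  def saveAndContinue(df: DataFrame): DataFrame =
    dfo.write("/tmp/events.parquet", "parquet")(df)
}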
Example 118
Source File: Extractors.scala    From streamliner-starter   with Apache License 2.0 5 votes vote down vote up
package com.memsql.streamliner.starter

import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.types._
import org.apache.spark.streaming.StreamingContext
import com.memsql.spark.etl.api.{Extractor, PhaseConfig}
import com.memsql.spark.etl.utils.PhaseLogger

// This extractor just returns a constant sequence of 5 integers each batch interval
class BasicExtractor extends Extractor {
  override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long,
   logger: PhaseLogger): Option[DataFrame] = {
    logger.info("extracting a constant sequence DataFrame")

    val schema = StructType(StructField("number", IntegerType, false) :: Nil)

    val sampleData = List(1,2,3,4,5)
    val rowRDD = sqlContext.sparkContext.parallelize(sampleData).map(Row(_))

    val df = sqlContext.createDataFrame(rowRDD, schema)
    Some(df)
  }
} 
Example 119
Source File: Transformers.scala    From streamliner-starter   with Apache License 2.0 5 votes vote down vote up
package com.memsql.streamliner.starter

import org.apache.spark.sql.{Row, DataFrame, SQLContext}
import org.apache.spark.sql.types._
import com.memsql.spark.etl.api.{Transformer, PhaseConfig}
import com.memsql.spark.etl.utils.PhaseLogger

// A helper object to extract the first column of a schema
object ExtractFirstStructField {
  def unapply(schema: StructType): Option[(String, DataType, Boolean, Metadata)] = schema.fields match {
    case Array(first: StructField, _*) => Some((first.name, first.dataType, first.nullable, first.metadata))
  }
}

// This transformer expects an input DataFrame and returns it
class BasicTransformer extends Transformer {
  def transform(sqlContext: SQLContext, df: DataFrame, config: PhaseConfig, logger: PhaseLogger): DataFrame = {
    logger.info("transforming the DataFrame")

    // check that the first column is of type IntegerType and return its name
    val column = df.schema match {
      case ExtractFirstStructField(name: String, dataType: IntegerType, _, _) => name
      case _ => throw new IllegalArgumentException("The first column of the input DataFrame should be IntegerType")
    }

    // filter the dataframe, returning only even numbers
    df.filter(s"$column % 2 = 0")
  }
} 
Example 120
Source File: SparkSqlRunner.scala    From amaterasu   with Apache License 2.0 5 votes vote down vote up
package org.apache.amaterasu.executor.execution.actions.runners.spark.SparkSql

import java.io.File

import org.apache.amaterasu.common.execution.actions.Notifier
import org.apache.amaterasu.common.logging.Logging
import org.apache.amaterasu.common.runtime.Environment
import org.apache.commons.io.FilenameUtils
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession}



  def findFileType(folderName: File): Array[String] = {
    // get all the files from a directory
    val files: Array[File] = folderName.listFiles()
    val extensions: Array[String] = files.map(file => FilenameUtils.getExtension(file.toString))
    extensions
  }

}

object SparkSqlRunner {

  def apply(env: Environment,
            jobId: String,
            actionName: String,
            notifier: Notifier,
            sc: SparkContext): SparkSqlRunner = {

    val sparkSqlRunnerObj = new SparkSqlRunner

    sparkSqlRunnerObj.env = env
    sparkSqlRunnerObj.jobId = jobId
    sparkSqlRunnerObj.actionName = actionName
    sparkSqlRunnerObj.notifier = notifier
    sparkSqlRunnerObj.sc = sc
    sparkSqlRunnerObj.spark = SparkSession.builder().config(sc.getConf).enableHiveSupport().getOrCreate()
    sparkSqlRunnerObj
  }
} 
Example 121
Source File: SparkConsoleEgress.scala    From pipelines-examples   with Apache License 2.0 5 votes vote down vote up
package pipelines.example

import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic, StreamletQueryExecution }
import pipelines.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrame

class SparkConsoleEgress extends SparkStreamlet {
  val in1 = AvroInlet[Data]("in1")
  val in2 = AvroInlet[Data]("in2")
  val shape = StreamletShape.withInlets(in1, in2)

  def asTimestamp = udf((t: Long) ⇒ new java.sql.Timestamp(t))
  def elapsedTime = udf((t1: Long, t0: Long) ⇒ t1 - t0)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val stream1 = readStream(in1).withColumn("source", lit("spark")).withColumn("elapsed", elapsedTime($"t2", $"t1"))
      val stream2 = readStream(in2).withColumn("source", lit("akka")).withColumn("elapsed", elapsedTime($"t2", $"t1"))

      // commented-out process: simple stats to compute min/max/mean on a window
      // val dataCount = stream1.union(stream2).withColumn("ts", asTimestamp($"timestamp"))
      //      val stats = dataCount
      //        .withWatermark("ts", "1 second")
      //        .groupBy(window($"ts", "5 minutes", "1 minute"), $"source")
      //        //.agg(max($"elapsed"), min($"elapsed"), avg($"elapsed"), count($"source"))

      val quantiles: (String ⇒ Long ⇒ (DataFrame, Long) ⇒ Unit) = { name ⇒ period ⇒ (df, time) ⇒
        df.cache()
        val count = df.count()
        val cps = count.toDouble / period
        val quans = df.stat.approxQuantile("elapsed", Array(0.1, 0.5, 0.9, 0.99), 0.01)
        println(s"$time, $name, $count, $cps, " + quans.mkString(", "))
      }

      val period = 60 * 5 // seconds

      val q1 = stream1.writeStream.foreachBatch(quantiles("spark")(period))
        .trigger(Trigger.ProcessingTime(s"$period seconds"))
        .option("checkpointLocation", context.checkpointDir("console-egress-q1"))
        .start()
      val q2 = stream2.writeStream.foreachBatch(quantiles("akka")(period))
        .trigger(Trigger.ProcessingTime(s"$period seconds"))
        .option("checkpointLocation", context.checkpointDir("console-egress-q2"))
        .start()

      new Thread() {
        override def run(): Unit = {
          while (true) {
            val progress = q1.lastProgress
            if (progress != null) {
              println("***************** [PROGRESS] *********************")
              println(progress.toString())
              println("**************************************************")
            }
            Thread.sleep(60 * 1000)
          }
        }
      } //.start  // uncomment to enable the query progress

      StreamletQueryExecution(q1, q2)
    }
  }
} 
Example 122
Source File: Checkers.scala    From spark3D   with Apache License 2.0 5 votes vote down vote up
package com.astrolabsoftware.spark3d

import org.apache.spark.sql.Row
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.spark_partition_id

object Checkers {

  def returnFracSize(part: Iterator[Row], numberOfElements: Long): Iterator[Double] = {
    // Number of elements in the partition
    val sizePartition = part.size

    // Use Double
    val frac : Double = sizePartition.toDouble / numberOfElements.toDouble * 100

    // Return an Iterator
    Iterator(frac)
  }

  def returnSize(part: Iterator[Row]): Iterator[Double] = {
    // Return an Iterator
    Iterator(part.size)
  }

  
  def checkLoadBalancing(df: DataFrame, kind: String = "frac", numberOfElements: Long = -1L) : DataFrame = {

    // Need to import implicits to use toDF method
    val spark2 = SparkSession.getActiveSession.get
    import spark2.implicits._

    // Total number of elements in the DF.
    val numberOfElementsPriv: Long = numberOfElements match {
      case -1 => {
        kind match {
          case "frac" => df.count()
          // If not kind="frac", we do not need to total number of rows.
          case _ => -1L
        }
      }
      case x if x > 0 => numberOfElements
      case _ => throw new AssertionError("""
        Total number of elements in the DataFrame must be Long greater than 0!
        If you do not know it, set it to -1, and we will compute it for you.
      """)
    }

    // Output a DataFrame containing detail of the load balancing.
    val dfout = kind match {
      case "frac" => df.rdd.mapPartitions(part => returnFracSize(part, numberOfElementsPriv)).toDF("Load (%)")
      case "size" => df.rdd.mapPartitions(returnSize).toDF("Load (#Rows)")
      case _ => throw new AssertionError("""
        Wrong value for `kind`! You must choose between
          - "frac": Output a DataFrame containing the size of each partition
            relative to the total size of the dataset (in percent).
          - "size": Output a DataFrame containing the size of each partition
            in terms of number of rows.
        """)
    }

    dfout.withColumn("partition_id", spark_partition_id())
  }
} 
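checkLoadBalancing is purely diagnostic: it returns one row per partition plus a partition_id column, so a typical call just displays the result. A minimal sketch using only the API shown above:

import com.astrolabsoftware.spark3d.Checkers
import org.apache.spark.sql.DataFrame

object LoadBalanceSketch {
  // Show the per-partition load both as a percentage of all rows and as absolute row counts.
  def reportBalance(df: DataFrame): Unit = {
    Checkers.checkLoadBalancing(df, kind = "frac").orderBy("partition_id").show()
    Checkers.checkLoadBalancing(df, kind = "size").show()
  }
}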
Example 123
Source File: package.scala    From spark-athena   with Apache License 2.0 5 votes vote down vote up
package io.github.tmheo.spark

import java.util.Properties

import com.amazonaws.athena.jdbc.shaded.com.amazonaws.regions.Regions
import org.apache.spark.sql.{DataFrame, DataFrameReader}

import scala.collection.JavaConverters._

package object athena {

  implicit class AthenaDataFrameReader(reader: DataFrameReader) {

    def athena(table: String): DataFrame = {
      reader.format("io.github.tmheo.spark.athena")
        .option(JDBCOptions.JDBC_TABLE_NAME, table)
        .load
    }

    def athena(table: String, region: String, s3StagingDir: String): DataFrame = {
      reader.format("io.github.tmheo.spark.athena")
        .option(JDBCOptions.JDBC_TABLE_NAME, table)
        .option("region", region)
        .option("s3_staging_dir", s3StagingDir)
        .load
    }

    def athena(table: String, s3StagingDir: String): DataFrame = {
      athena(table, Regions.getCurrentRegion.getName, s3StagingDir)
    }

    def athena(table: String, properties: Properties): DataFrame = {
      val options = properties.asScala
      options += (JDBCOptions.JDBC_TABLE_NAME -> table)
      reader.format("io.github.tmheo.spark.athena").options(options).load
    }

  }

} 
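Importing the package object brings the implicit AthenaDataFrameReader into scope, after which Athena tables read like any other source. In the sketch below the table name, region and staging directory are hypothetical placeholders:

import io.github.tmheo.spark.athena._
import org.apache.spark.sql.{DataFrame, SparkSession}

object AthenaReadSketch {
  // Read an Athena table, naming the region and the S3 staging directory explicitly.
  def readEvents(spark: SparkSession): DataFrame =
    spark.read.athena("events", "us-east-1", "s3://my-bucket/athena-staging/")
}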
Example 124
Source File: RddToDataFrame.scala    From spark-sframe   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package org.apache.spark.turi

import org.graphlab.create.GraphLabUtil
import org.apache.spark.sql.{SQLContext, Row, DataFrame}
import org.apache.spark.rdd.RDD
import scala.collection.JavaConversions._
import org.apache.spark.sql.types._
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.ArrayBuffer
import scala.collection.immutable.Map
import java.util.HashMap
import java.util.ArrayList
import java.util.{Date,GregorianCalendar}
import java.sql.Date

object EvaluateRDD {
  
  def inferSchema(obj: Any): DataType = {
    if(obj.isInstanceOf[Int]) { 
      IntegerType
    } else if(obj.isInstanceOf[String]) { 
      StringType
    } else if(obj.isInstanceOf[Double]) { 
      DoubleType
    } else if(obj.isInstanceOf[Long]) { 
      LongType
    } else if(obj.isInstanceOf[Float]) { 
      FloatType
    } else if(obj.isInstanceOf[Map[_,_]]) {
      MapType(inferSchema(obj.asInstanceOf[Map[_,_]].head._1),inferSchema(obj.asInstanceOf[Map[_,_]].head._2))
    } else if(obj.isInstanceOf[java.util.HashMap[_,_]]) {
      MapType(inferSchema(obj.asInstanceOf[java.util.HashMap[_,_]].head._1),inferSchema(obj.asInstanceOf[java.util.HashMap[_,_]].head._2))
    } else if(obj.isInstanceOf[Array[_]]) {
      ArrayType(inferSchema(obj.asInstanceOf[Array[_]](0)))
    } else if(obj.isInstanceOf[java.util.ArrayList[_]]) {
      ArrayType(inferSchema(obj.asInstanceOf[java.util.ArrayList[_]](0)))
    } else if(obj.isInstanceOf[java.util.GregorianCalendar]) {
      TimestampType
    } else if(obj.isInstanceOf[java.util.Date] || obj.isInstanceOf[java.sql.Date]) {
      DateType
    } else { 
      StringType
    }
  }

  def toScala(obj: Any): Any = {
    if (obj.isInstanceOf[java.util.HashMap[_,_]]) {
      val jmap = obj.asInstanceOf[java.util.HashMap[_,_]]
      jmap.map { case (k,v) => toScala(k) -> toScala(v) }.toMap
    }
    else if(obj.isInstanceOf[java.util.ArrayList[_]]) {
      val buf = ArrayBuffer[Any]()
      val jArray = obj.asInstanceOf[java.util.ArrayList[_]]
      for(item <- jArray) {
        buf += toScala(item)
      }
      buf.toArray
    } else if(obj.isInstanceOf[java.util.GregorianCalendar]) {
      new java.sql.Timestamp(obj.asInstanceOf[java.util.GregorianCalendar].getTime().getTime())
    } else {
      obj
    }
  }
  def toSparkDataFrame(sqlContext: SQLContext, rdd: RDD[java.util.HashMap[String,_]]): DataFrame = { 
    val scalaRDD = rdd.map(l => toScala(l))
    val rowRDD = scalaRDD.map(l => Row.fromSeq(l.asInstanceOf[Map[_,_]].values.toList))
    
    val sample_data: java.util.HashMap[String,_] = rdd.take(1)(0)

    val schema_list: ListBuffer[StructField] = new ListBuffer[StructField]()
    for ((name,v) <- sample_data) { 
      schema_list.append(StructField(name,inferSchema(v)))
    }
    sqlContext.createDataFrame(rowRDD,StructType(schema_list))
  }
} 
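toSparkDataFrame expects an RDD of java.util.HashMap records (the shape the GraphLab/Turi side produces) and infers the schema from the first record. A thin wrapper sketch, keeping the parameter type exactly as declared above:

import java.util.{HashMap => JHashMap}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.turi.EvaluateRDD

object SFrameConversionSketch {
  // Convert HashMap-shaped records (e.g. from an SFrame) into a DataFrame,
  // letting inferSchema derive the column types from the first row.
  def toDF(sqlContext: SQLContext, rows: RDD[JHashMap[String, _]]): DataFrame =
    EvaluateRDD.toSparkDataFrame(sqlContext, rows)
}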
Example 125
Source File: DataframeToDriverCsvFileWriter.scala    From seahorse-workflow-executor   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.csv

import java.io.PrintWriter

import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types._

import io.deepsense.sparkutils.readwritedataframe.ManagedResource

object DataframeToDriverCsvFileWriter {

  def write(
       dataFrame: DataFrame,
       options: Map[String, String],
       dataSchema: StructType,
       pathWithoutScheme: String): Unit = {
    val data = dataFrame.rdd.collect()
    val params = new CSVOptions(options)
    ManagedResource(
      new LocalCsvOutputWriter(dataSchema, params, pathWithoutScheme)
    ) { writer =>
      data.foreach(row => {
        writer.write(row.toSeq.map(_.asInstanceOf[String]))
      })
    }
  }

}


class LocalCsvOutputWriter(
      dataSchema: StructType,
      params: CSVOptions,
      driverPath: String) {

  private val driverFileWriter = new PrintWriter(driverPath)

  private val FLUSH_BATCH_SIZE = 1024L
  private var records: Long = 0L
  private val csvWriter = new LineCsvWriter(params, dataSchema.fieldNames.toSeq)

  def write(row: Seq[String]): Unit = {
    csvWriter.writeRow(row, records == 0L && params.headerFlag)
    records += 1
    if (records % FLUSH_BATCH_SIZE == 0) {
      flush()
    }
  }

  private def flush(): Unit = {
    val lines = csvWriter.flush()
    if (lines.nonEmpty) {
      driverFileWriter.write(lines)
    }
  }

  def close(): Unit = {
    flush()
    driverFileWriter.close()
  }
} 
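The writer collects the whole DataFrame to the driver, so it is only suitable for small frames written to a local path, and the options map feeds Spark's CSVOptions (keys such as "header" or "delimiter"). A minimal sketch; note that the rows are cast to String, so df is expected to contain only string columns:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.execution.datasources.csv.DataframeToDriverCsvFileWriter

object DriverCsvSketch {
  // Collect a small, all-string DataFrame and write it as one CSV file on the driver's filesystem.
  def writeLocalCsv(df: DataFrame, localPath: String): Unit =
    DataframeToDriverCsvFileWriter.write(df, Map("header" -> "true"), df.schema, localPath)
}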
Example 126
Source File: DataframeToRawCsvRDD.scala    From seahorse-workflow-executor   with Apache License 2.0 5 votes vote down vote up
package io.deepsense.sparkutils.readwritedataframe

import org.apache.commons.csv.QuoteMode
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame


object DataframeToRawCsvRDD {

  val defaultCsvFormat = com.databricks.spark.csv.defaultCsvFormat

  def apply(dataFrame: DataFrame, parameters: Map[String, String] = Map())
           (implicit sparkContext: SparkContext): RDD[String] = {
    val delimiter = parameters.getOrElse("delimiter", ",")
    val delimiterChar = if (delimiter.length == 1) {
      delimiter.charAt(0)
    } else {
      throw new Exception("Delimiter cannot be more than one character.")
    }

    val escape = parameters.getOrElse("escape", null)
    val escapeChar: Character = if (escape == null) {
      null
    } else if (escape.length == 1) {
      escape.charAt(0)
    } else {
      throw new Exception("Escape character cannot be more than one character.")
    }

    val quote = parameters.getOrElse("quote", "\"")
    val quoteChar: Character = if (quote == null) {
      null
    } else if (quote.length == 1) {
      quote.charAt(0)
    } else {
      throw new Exception("Quotation cannot be more than one character.")
    }

    val quoteModeString = parameters.getOrElse("quoteMode", "MINIMAL")
    val quoteMode: QuoteMode = if (quoteModeString == null) {
      null
    } else {
      QuoteMode.valueOf(quoteModeString.toUpperCase)
    }

    val nullValue = parameters.getOrElse("nullValue", "null")

    val csvFormat = defaultCsvFormat
      .withDelimiter(delimiterChar)
      .withQuote(quoteChar)
      .withEscape(escapeChar)
      .withQuoteMode(quoteMode)
      .withSkipHeaderRecord(false)
      .withNullString(nullValue)

    val generateHeader = parameters.getOrElse("header", "false").toBoolean
    val headerRdd = if (generateHeader) {
      sparkContext.parallelize(Seq(
        csvFormat.format(dataFrame.columns.map(_.asInstanceOf[AnyRef]): _*)
      ))
    } else {
      sparkContext.emptyRDD[String]
    }

    val rowsRdd = dataFrame.rdd.map(row => {
      csvFormat.format(row.toSeq.map(_.asInstanceOf[AnyRef]): _*)
    })

    headerRdd union rowsRdd
  }
} 
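The object reads its formatting choices from a plain options map ("delimiter", "quote", "escape", "quoteMode", "nullValue", "header"), so invoking it only requires the DataFrame, the map and an implicit SparkContext:

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import io.deepsense.sparkutils.readwritedataframe.DataframeToRawCsvRDD

object RawCsvSketch {
  // Render df as raw CSV lines, semicolon-delimited and with a header row prepended.
  def toCsvLines(df: DataFrame)(implicit sc: SparkContext): RDD[String] =
    DataframeToRawCsvRDD(df, Map("delimiter" -> ";", "header" -> "true"))
}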
Example 127
Source File: DataframeToDriverCsvFileWriter.scala    From seahorse-workflow-executor   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.csv

import java.io.PrintWriter

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._

import io.deepsense.sparkutils.readwritedataframe.{DataframeToRawCsvRDD, ManagedResource}

object DataframeToDriverCsvFileWriter {

  def write(
      dataFrame: DataFrame,
      options: Map[String, String],
      dataSchema: StructType,
      pathWithoutScheme: String): Unit = {
    val rawCsvLines = DataframeToRawCsvRDD(dataFrame, options)(dataFrame.sqlContext.sparkContext)
    writeRddToDriverFile(pathWithoutScheme, rawCsvLines)
  }

  // TODO extract to commons from DriverFiles
  private def writeRddToDriverFile(driverPath: String, lines: RDD[String]): Unit = {
    val recordSeparator = System.getProperty("line.separator", "\n")
    ManagedResource(new PrintWriter(driverPath)) { writer =>
      lines.collect().foreach(line => writer.write(line + recordSeparator))
    }
  }

} 
Example 128
Source File: SerializableSparkModel.scala    From seahorse-workflow-executor   with Apache License 2.0 5 votes vote down vote up
package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkModel[M <: Model[M]](val sparkModel: M)
  extends ML.Model[SerializableSparkModel[M]]
  with MLWritable {

  override def copy(extra: ParamMap): SerializableSparkModel[M] =
    new SerializableSparkModel(sparkModel.copy(extra))

  override def write: MLWriter = {
    sparkModel match {
      case w: MLWritable => w.write
      case _ => new DefaultMLWriter(this)
    }
  }

  override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset)

  override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema)

  override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae"
}

// This class may seem unused, but it is used reflectively by the Spark deserialization mechanism
object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] {
  override def read: MLReader[SerializableSparkModel[_]] = {
    new DefaultMLReader[SerializableSparkModel[_]]()
  }
} 
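The wrapper delegates to the underlying model's own MLWriter when it has one and falls back to DefaultMLWriter otherwise, so saving a fitted model is a one-liner. A sketch, using LogisticRegressionModel purely as an illustrative model type:

import org.apache.spark.ml.classification.LogisticRegressionModel
import io.deepsense.deeplang.doperables.serialization.SerializableSparkModel

object ModelSaveSketch {
  // Wrap a fitted Spark ML model and persist it through the wrapper's MLWriter.
  def saveModel(model: LogisticRegressionModel, path: String): Unit =
    new SerializableSparkModel(model).write.save(path)
}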
Example 129
Source File: DistributionCalculator.scala    From seahorse-workflow-executor   with Apache License 2.0 5 votes vote down vote up
package io.deepsense.deeplang.doperables.dataframe.report.distribution

import org.apache.spark.mllib.stat.MultivariateStatisticalSummary
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._

import io.deepsense.deeplang.doperables.dataframe.report.distribution.continuous.ContinuousDistributionBuilderFactory
import io.deepsense.deeplang.doperables.dataframe.report.distribution.discrete.DiscreteDistributionBuilderFactory
import io.deepsense.deeplang.utils.aggregators.AggregatorBatch
import io.deepsense.reportlib.model._

object DistributionCalculator {

  def distributionByColumn(
    sparkDataFrame: org.apache.spark.sql.DataFrame,
    multivarStats: MultivariateStatisticalSummary): Map[String, Distribution] = {
    val dataFrameEmpty = multivarStats.count == 0

    if (dataFrameEmpty) {
      noDistributionBecauseOfNoData(sparkDataFrame.schema)
    } else {
      distributionForNonEmptyDataFrame(sparkDataFrame, multivarStats)
    }
  }

  private def noDistributionBecauseOfNoData(schema: StructType): Map[String, Distribution] = {
    for (columnName <- schema.fieldNames) yield {
      columnName -> NoDistribution(
        columnName,
        NoDistributionReasons.NoData
      )
    }
  }.toMap

  
  private def distributionForNonEmptyDataFrame(
    sparkDataFrame: DataFrame,
    multivarStats: MultivariateStatisticalSummary): Map[String, Distribution] = {
    val schema = sparkDataFrame.schema

    val distributionBuilders = for {
      (structField, columnIndex) <- sparkDataFrame.schema.zipWithIndex
    } yield {
      DistributionType.forStructField(structField) match {
        case DistributionType.Discrete =>
          DiscreteDistributionBuilderFactory.prepareBuilder(columnIndex, structField)
        case DistributionType.Continuous =>
          ContinuousDistributionBuilderFactory.prepareBuilder(
            columnIndex, structField, multivarStats)
        case DistributionType.NotApplicable => NoDistributionBuilder(
          structField.name,
          NoDistributionReasons.NotApplicableForType(structField.dataType))
      }
    }
    val results = {
      val aggregators = distributionBuilders.flatMap(_.allAggregators)
      AggregatorBatch.executeInBatch(sparkDataFrame.rdd, aggregators)
    }
    val distributions = distributionBuilders.map(_.build(results))
    distributions.map(d => d.name -> d).toMap
  }
} 
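A rough usage sketch. The MultivariateStatisticalSummary can be produced with mllib's Statistics.colStats; the sketch assumes one summary entry per DataFrame column (non-numeric and null values mapped to 0.0 as a placeholder) so that the column indices used by the distribution builders line up with the schema.

import io.deepsense.deeplang.doperables.dataframe.report.distribution.DistributionCalculator
import io.deepsense.reportlib.model.Distribution
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.NumericType

def columnDistributions(df: DataFrame): Map[String, Distribution] = {
  val fields = df.schema.fields
  // One vector entry per column, so the summary's indices match the schema's indices.
  val summary: MultivariateStatisticalSummary = Statistics.colStats(df.rdd.map { row =>
    Vectors.dense(fields.indices.map { i =>
      fields(i).dataType match {
        case _: NumericType if !row.isNullAt(i) => row.getAs[Number](i).doubleValue()
        case _ => 0.0 // placeholder for non-numeric or null values
      }
    }.toArray)
  })
  DistributionCalculator.distributionByColumn(df, summary)
}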
Example 130
Source File: EstimatorModelWrapperFixtures.scala    From seahorse-workflow-executor   with Apache License 2.0 5 votes vote down vote up
package io.deepsense.deeplang.doperables.spark.wrappers.estimators

import scala.language.reflectiveCalls

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml
import org.apache.spark.ml.param.{ParamMap, Param => SparkParam}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.report.Report
import io.deepsense.deeplang.doperables.serialization.SerializableSparkModel
import io.deepsense.deeplang.doperables.{SparkEstimatorWrapper, SparkModelWrapper}
import io.deepsense.deeplang.params.wrappers.spark.SingleColumnCreatorParamWrapper
import io.deepsense.deeplang.params.{Param, Params}
import io.deepsense.sparkutils.ML

object EstimatorModelWrapperFixtures {

  class SimpleSparkModel private[EstimatorModelWrapperFixtures]()
    extends ML.Model[SimpleSparkModel] {

    def this(x: String) = this()

    override val uid: String = "modelId"

    val predictionCol = new SparkParam[String](uid, "name", "description")

    def setPredictionCol(value: String): this.type = set(predictionCol, value)

    override def copy(extra: ParamMap): this.type = defaultCopy(extra)

    override def transformDF(dataset: DataFrame): DataFrame = {
      dataset.selectExpr("*", "1 as " + $(predictionCol))
    }

    @DeveloperApi
    override def transformSchema(schema: StructType): StructType = ???
  }

  class SimpleSparkEstimator extends ML.Estimator[SimpleSparkModel] {

    def this(x: String) = this()

    override val uid: String = "estimatorId"

    val predictionCol = new SparkParam[String](uid, "name", "description")

    override def fitDF(dataset: DataFrame): SimpleSparkModel =
      new SimpleSparkModel().setPredictionCol($(predictionCol))

    override def copy(extra: ParamMap): ML.Estimator[SimpleSparkModel] = defaultCopy(extra)

    @DeveloperApi
    override def transformSchema(schema: StructType): StructType = {
      schema.add(StructField($(predictionCol), IntegerType, nullable = false))
    }
  }

  trait HasPredictionColumn extends Params {
    val predictionColumn = new SingleColumnCreatorParamWrapper[
        ml.param.Params { val predictionCol: SparkParam[String] }](
      "prediction column",
      None,
      _.predictionCol)
    setDefault(predictionColumn, "abcdefg")

    def getPredictionColumn(): String = $(predictionColumn)
    def setPredictionColumn(value: String): this.type = set(predictionColumn, value)
  }

  class SimpleSparkModelWrapper
    extends SparkModelWrapper[SimpleSparkModel, SimpleSparkEstimator]
    with HasPredictionColumn {

    override val params: Array[Param[_]] = Array(predictionColumn)
    override def report: Report = ???

    override protected def loadModel(
      ctx: ExecutionContext,
      path: String): SerializableSparkModel[SimpleSparkModel] = ???
  }

  class SimpleSparkEstimatorWrapper
    extends SparkEstimatorWrapper[SimpleSparkModel, SimpleSparkEstimator, SimpleSparkModelWrapper]
    with HasPredictionColumn {

    override val params: Array[Param[_]] = Array(predictionColumn)
    override def report: Report = ???
  }
} 
Example 131
Source File: CustomCodeEntryPoint.scala    From seahorse-workflow-executor   with Apache License 2.0 5 votes vote down vote up
package io.deepsense.workflowexecutor.customcode

import java.util.concurrent.TimeoutException
import java.util.concurrent.atomic.AtomicReference

import scala.annotation.tailrec
import scala.concurrent.duration._
import scala.concurrent.{Await, Promise}

import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.{SparkConf, SparkContext}

import io.deepsense.commons.utils.Logging
import io.deepsense.deeplang._
import io.deepsense.sparkutils.SparkSQLSession


class CustomCodeEntryPoint(
    val sparkContext: SparkContext,
    val sparkSQLSession: SparkSQLSession,
    val dataFrameStorage: DataFrameStorage,
    val operationExecutionDispatcher: OperationExecutionDispatcher)
  extends Logging {
  import io.deepsense.workflowexecutor.customcode.CustomCodeEntryPoint._
  def getSparkContext: JavaSparkContext = sparkContext

  def getSparkSQLSession: SparkSQLSession = sparkSQLSession

  def getNewSparkSQLSession: SparkSQLSession = sparkSQLSession.newSession()

  def getSparkConf: SparkConf = sparkContext.getConf

  private val codeExecutor: AtomicReference[Promise[CustomCodeExecutor]] =
    new AtomicReference(Promise())

  private val pythonPort: AtomicReference[Promise[Int]] =
    new AtomicReference(Promise())

  def getCodeExecutor(timeout: Duration): CustomCodeExecutor =
    getFromPromise(codeExecutor.get, timeout)

  def getPythonPort(timeout: Duration): Int =
    getFromPromise(pythonPort.get, timeout)

  def registerCodeExecutor(newCodeExecutor: CustomCodeExecutor): Unit =
    replacePromise(codeExecutor, newCodeExecutor)

  def registerCallbackServerPort(newPort: Int): Unit =
    replacePromise(pythonPort, newPort)

  def retrieveInputDataFrame(workflowId: String, nodeId: String, portNumber: Int): DataFrame =
    dataFrameStorage.getInputDataFrame(workflowId, nodeId, portNumber).get

  def retrieveOutputDataFrame(workflowId: String, nodeId: String, portNumber: Int): DataFrame =
    dataFrameStorage.getOutputDataFrame(workflowId, nodeId, portNumber).get

  def registerOutputDataFrame(
      workflowId: String, nodeId: String, portNumber: Int, dataFrame: DataFrame): Unit =
    dataFrameStorage.setOutputDataFrame(workflowId, nodeId, portNumber, dataFrame)

  def executionCompleted(workflowId: String, nodeId: String): Unit =
    operationExecutionDispatcher.executionEnded(workflowId, nodeId, Right(()))

  def executionFailed(workflowId: String, nodeId: String, error: String): Unit =
    operationExecutionDispatcher.executionEnded(workflowId, nodeId, Left(error))
}

object CustomCodeEntryPoint {
  private case class PromiseReplacedException() extends Exception

  @tailrec
  private def getFromPromise[T](promise: => Promise[T], timeout: Duration): T = {
    try {
      Await.result(promise.future, timeout)
    } catch {
      case e: TimeoutException => throw e
      case e: PromiseReplacedException => getFromPromise(promise, timeout)
    }
  }

  private def replacePromise[T](promise: AtomicReference[Promise[T]], newValue: T): Unit = {
    val oldPromise = promise.getAndSet(Promise.successful(newValue))
    try {
      oldPromise.failure(new PromiseReplacedException)
    } catch {
      // Except for the very first swap, the old promise will already have been completed,
      // so the IllegalStateException here is expected. We still attempt to fail it, because
      // someone might be blocked waiting on it.
      case e: IllegalStateException => ()
    }
  }

  case class CustomCodeEntryPointConfig(
    pyExecutorSetupTimeout: Duration = 5.seconds)
} 
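The promise-swapping trick used by replacePromise and getFromPromise is worth an isolated sketch: a waiter blocked on the old promise is failed with a marker exception and transparently retries against the freshly installed one. All names below are illustrative, not part of the original class.

import java.util.concurrent.atomic.AtomicReference

import scala.annotation.tailrec
import scala.concurrent.duration._
import scala.concurrent.{Await, Promise}

object PromiseSwapSketch {
  private case class Replaced() extends Exception

  // Starts out unfulfilled; publish() atomically swaps in an already-completed promise.
  private val slot = new AtomicReference[Promise[String]](Promise())

  def publish(value: String): Unit = {
    val old = slot.getAndSet(Promise.successful(value))
    // Wake up anyone waiting on the old promise; it may already be completed.
    try old.failure(new Replaced) catch { case _: IllegalStateException => () }
  }

  @tailrec
  def await(timeout: Duration): String =
    try Await.result(slot.get.future, timeout)
    catch { case _: Replaced => await(timeout) } // retry against the new promise

  def main(args: Array[String]): Unit = {
    publish("executor-ready")
    println(await(1.second)) // prints "executor-ready"
  }
}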
Example 132
Source File: LinearRegressionDataGen.scala    From spark-bench   with Apache License 2.0 5 votes vote down vote up
package com.ibm.sparktc.sparkbench.datageneration.mlgenerator

import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import com.ibm.sparktc.sparkbench.utils.{SaveModes, SparkBenchException}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions.{getOrDefault, getOrThrow, time}
import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object LinearRegressionDataGen extends WorkloadDefaults {
  val name = "data-generation-lr"
  // Application parameters: 1 million points correspond to roughly 200M of data.
  val numOfExamples: Int = 40000
  val numOfFeatures: Int = 4
  val eps: Double = 0.5
  val intercepts: Double = 0.1
  val numOfPartitions: Int = 10
  val maxIteration: Int = 3
  override def apply(m: Map[String, Any]) = new LinearRegressionDataGen(
    numRows = getOrThrow(m, "rows").asInstanceOf[Int],
    numCols = getOrThrow(m, "cols").asInstanceOf[Int],
    output = Some(getOrThrow(m, "output").asInstanceOf[String]),
    saveMode = getOrDefault[String](m, "save-mode", SaveModes.error),
    eps = getOrDefault[Double](m, "eps", eps),
    intercepts = getOrDefault[Double](m, "intercepts", intercepts),
    numPartitions = getOrDefault[Int](m, "partitions", numOfPartitions)
  )
}

case class LinearRegressionDataGen (
                                      numRows: Int,
                                      numCols: Int,
                                      input: Option[String] = None,
                                      output: Option[String],
                                      saveMode: String,
                                      eps: Double,
                                      intercepts: Double,
                                      numPartitions: Int
                                   ) extends Workload {

  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {

    val timestamp = System.currentTimeMillis()

    val (generateTime, data): (Long, RDD[LabeledPoint]) = time {
      LinearDataGenerator.generateLinearRDD(
        spark.sparkContext,
        numRows,
        numCols,
        eps,
        numPartitions,
        intercepts
      )
    }

    import spark.implicits._
    val (convertTime, dataDF) = time {
      data.toDF
    }

    val (saveTime, _) = time {
      val outputStr = output.get
      // LabeledPoints cannot be written as CSV; Parquet (or another supported format) works fine.
      if (outputStr.endsWith(".csv")) {
        throw SparkBenchException(
          "LabeledPoints cannot be saved to CSV. Please try outputting to Parquet instead.")
      }
      writeToDisk(outputStr, saveMode, dataDF, spark)
    }

    val timeResultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("generate", LongType, nullable = true),
        StructField("convert", LongType, nullable = true),
        StructField("save", LongType, nullable = true),
        StructField("total_runtime", LongType, nullable = false)
      )
    )

    val total = generateTime + convertTime + saveTime

    // Label the timing row with this generator's workload name ("data-generation-lr").
    val timeList = spark.sparkContext.parallelize(
      Seq(Row(LinearRegressionDataGen.name, timestamp, generateTime, convertTime, saveTime, total)))

    spark.createDataFrame(timeList, timeResultSchema)

  }
} 
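A hedged sketch of driving the generator directly from a configuration map. spark-bench normally builds workloads from its configuration file, so the literal map, sizes and output path below are purely illustrative.

import com.ibm.sparktc.sparkbench.datageneration.mlgenerator.LinearRegressionDataGen
import org.apache.spark.sql.SparkSession

object LinearRegressionDataGenExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("lr-datagen").getOrCreate()

    val workload = LinearRegressionDataGen(Map(
      "name" -> "data-generation-lr",
      "rows" -> 10000,
      "cols" -> 10,
      "output" -> "/tmp/lr-data.parquet" // CSV is rejected for LabeledPoints
    ))

    // Returns a one-row DataFrame with the timing breakdown (generate/convert/save).
    val timings = workload.doWorkload(None, spark)
    timings.show()

    spark.stop()
  }
}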
Example 133
Source File: KMeansDataGen.scala    From spark-bench   with Apache License 2.0 5 votes vote down vote up
package com.ibm.sparktc.sparkbench.datageneration.mlgenerator

import com.ibm.sparktc.sparkbench.workload.ml.KMeansWorkload
import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions._
import com.ibm.sparktc.sparkbench.utils.SaveModes
import org.apache.spark.mllib.util.KMeansDataGenerator
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object KMeansDataGen extends WorkloadDefaults {
  val name = "data-generation-kmeans"
  override def apply(m: Map[String, Any]) = new KMeansDataGen(
    numRows = getOrThrow(m, "rows").asInstanceOf[Int],
    numCols = getOrThrow(m, "cols").asInstanceOf[Int],
    output = Some(getOrThrow(m, "output").asInstanceOf[String]),
    saveMode = getOrDefault[String](m, "save-mode", SaveModes.error),
    k = getOrDefault[Int](m, "k", KMeansWorkload.numOfClusters),
    scaling = getOrDefault[Double](m, "scaling", KMeansWorkload.scaling),
    numPartitions = getOrDefault[Int](m, "partitions", KMeansWorkload.numOfPartitions)
  )
}

case class KMeansDataGen(
                          numRows: Int,
                          numCols: Int,
                          input: Option[String] = None,
                          output: Option[String],
                          saveMode: String,
                          k: Int,
                          scaling: Double,
                          numPartitions: Int
                        ) extends Workload {

  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {
    val timestamp = System.currentTimeMillis()

    val (generateTime, data): (Long, RDD[Array[Double]]) = time {
      KMeansDataGenerator.generateKMeansRDD(
        spark.sparkContext,
        numRows,
        k,
        numCols,
        scaling,
        numPartitions
      )
    }

    val (convertTime, dataDF) = time {
      val schemaString = data.first().indices.map(i => "c" + i.toString).mkString(" ")
      val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, DoubleType, nullable = false))
      val schema = StructType(fields)
      val rowRDD = data.map(arr => Row(arr:_*))
      spark.createDataFrame(rowRDD, schema)
    }

    val (saveTime, _) = time { writeToDisk(output.get, saveMode, dataDF, spark) }

    val timeResultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("generate", LongType, nullable = true),
        StructField("convert", LongType, nullable = true),
        StructField("save", LongType, nullable = true),
        StructField("total_runtime", LongType, nullable = false)
      )
    )

    val total = generateTime + convertTime + saveTime

    val timeList = spark.sparkContext.parallelize(Seq(Row("kmeans", timestamp, generateTime, convertTime, saveTime, total)))

    spark.createDataFrame(timeList, timeResultSchema)
  }
} 
Example 134
Source File: KMeansWorkloadTest.scala    From spark-bench   with Apache License 2.0 5 votes vote down vote up
package com.ibm.sparktc.sparkbench.workload.ml

import java.io.File

import com.holdenkarau.spark.testing.Utils
import com.ibm.sparktc.sparkbench.testfixtures.SparkSessionProvider
import com.ibm.sparktc.sparkbench.utils.SaveModes
import com.ibm.sparktc.sparkbench.utils.SparkFuncs.{load, writeToDisk}
import org.apache.spark.mllib.util.KMeansDataGenerator
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers}

class KMeansWorkloadTest extends FlatSpec with Matchers with BeforeAndAfterEach {
  private val spark = SparkSessionProvider.spark
  private val fileName = s"/tmp/spark-bench-scalatest/kmeans-${java.util.UUID.randomUUID.toString}.csv"

  override def afterEach() {
    Utils.deleteRecursively(new File(fileName))
  }

  def makeDataFrame(): DataFrame = {
    val data: RDD[Array[Double]] = KMeansDataGenerator.generateKMeansRDD(
      spark.sparkContext, 1, 1, 1, KMeansWorkload.scaling, KMeansWorkload.numOfPartitions
    )
    val schemaString = data.first().indices.map(_.toString).mkString(" ")
    val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, DoubleType, nullable = false))
    val schema = StructType(fields)
    val rowRDD = data.map(arr => Row(arr: _*))
    spark.createDataFrame(rowRDD, schema)
  }

  "reconcileSchema" should "handle a StringType schema and turn it into a DoubleType Schema" in {
    val df2Disk = makeDataFrame()
    writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv"))
    val conf = Map("name" -> "kmeans", "input" -> fileName)
    val work = KMeansWorkload(conf)
    val df = load(spark, fileName)
    val ddf = work.reconcileSchema(df)
    ddf.schema.head.dataType shouldBe DoubleType
  }

  "The load function" should "parse the DataFrame it's given into an RDD[Vector]" in {
    val df = makeDataFrame()
    val conf = Map("name" -> "kmeans", "input" -> "")
    val work = KMeansWorkload(conf)
    val ddf = work.reconcileSchema(df)
    val (_, rdd) = work.loadToCache(ddf, spark)
    rdd.first()
  }

  it should "work even when we've pulled the data from disk" in {
    val df2Disk = makeDataFrame()
    writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv"))
    val conf = Map("name" -> "kmeans", "input" -> fileName)
    val work = KMeansWorkload(conf)
    val df = load(spark, fileName)
    val ddf = work.reconcileSchema(df)
    val (_, rdd) = work.loadToCache(ddf, spark)
    rdd.first()
  }

  "doWorkload" should "work" in {
    val df2Disk = makeDataFrame()
    writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv"))
    val conf = Map("name" -> "kmeans", "input" -> fileName)
    val work = KMeansWorkload(conf)
    val df = load(spark, fileName)
    val ddf = work.reconcileSchema(df)
    work.doWorkload(Some(ddf), spark)
  }
} 
Example 135
Source File: DataFrameExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.ml

import java.io.File

import scopt.OptionParser

import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.util.Utils


object DataFrameExample {

  case class Params(input: String = "data/mllib/sample_libsvm_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DataFrameExample") {
      head("DataFrameExample: an example app using DataFrame for ML.")
      opt[String]("input")
        .text(s"input path to dataframe")
        .action((x, c) => c.copy(input = x))
      checkConfig { params =>
        success
      }
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val spark = SparkSession
      .builder
      .appName(s"DataFrameExample with $params")
      .getOrCreate()

    // Load input data
    println(s"Loading LIBSVM file with UDT from ${params.input}.")
    val df: DataFrame = spark.read.format("libsvm").load(params.input).cache()
    println("Schema from LIBSVM:")
    df.printSchema()
    println(s"Loaded training data as a DataFrame with ${df.count()} records.")

    // Show statistical summary of labels.
    val labelSummary = df.describe("label")
    labelSummary.show()

    // Convert features column to an RDD of vectors.
    val features = df.select("features").rdd.map { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(Vectors.fromML(feat)),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    // Save the records in a parquet file.
    val tmpDir = Utils.createTempDir()
    val outputDir = new File(tmpDir, "dataframe").toString
    println(s"Saving to $outputDir as Parquet file.")
    df.write.parquet(outputDir)

    // Load the records back.
    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDF = spark.read.parquet(outputDir)
    println(s"Schema from Parquet:")
    newDF.printSchema()

    spark.stop()
  }
}
// scalastyle:on println 
Example 136
Source File: HashingTF.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
} 
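Typical usage of the transformer above, as a minimal sketch over an in-memory tokenized DataFrame; the column names and feature count are arbitrary.

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

object HashingTFExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("hashing-tf").getOrCreate()
    import spark.implicits._

    val sentences = Seq(
      (0, "spark makes dataframes easy"),
      (1, "hashing tf maps terms to indices")
    ).toDF("id", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val hashingTF = new HashingTF()
      .setInputCol("words")
      .setOutputCol("features")
      .setNumFeatures(32)
      .setBinary(false)

    val featurized = hashingTF.transform(tokenizer.transform(sentences))
    featurized.select("words", "features").show(truncate = false)

    spark.stop()
  }
}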
Example 137
Source File: SQLTransformer.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType


  @Since("1.6.0")
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val tableName = Identifiable.randomUID(uid)
    dataset.createOrReplaceTempView(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val result = dataset.sparkSession.sql(realStatement)
    dataset.sparkSession.catalog.dropTempView(tableName)
    result
  }

  @Since("1.6.0")
  override def transformSchema(schema: StructType): StructType = {
    val spark = SparkSession.builder().getOrCreate()
    val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty))
    val dummyDF = spark.createDataFrame(dummyRDD, schema)
    val tableName = Identifiable.randomUID(uid)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    dummyDF.createOrReplaceTempView(tableName)
    val outputSchema = spark.sql(realStatement).schema
    spark.catalog.dropTempView(tableName)
    outputSchema
  }

  @Since("1.6.0")
  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
} 
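A short usage sketch of the __THIS__ placeholder; the input columns v1/v2 are made up for illustration.

import org.apache.spark.ml.feature.SQLTransformer
import org.apache.spark.sql.SparkSession

object SQLTransformerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sql-transformer").getOrCreate()
    import spark.implicits._

    val df = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")

    // __THIS__ is replaced with a temporary view created over the input Dataset.
    val sqlTrans = new SQLTransformer()
      .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

    sqlTrans.transform(df).show()
    spark.stop()
  }
}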
Example 138
Source File: MultilayerPerceptronClassifierWrapper.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel
  ) extends MLWritable {

  import MultilayerPerceptronClassifierWrapper._

  val mlpModel: MultilayerPerceptronClassificationModel =
    pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]

  val weights: Array[Double] = mlpModel.weights.toArray
  val layers: Array[Int] = mlpModel.layers

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
      .drop(mlpModel.getFeaturesCol)
      .drop(mlpModel.getLabelCol)
      .drop(PREDICTED_LABEL_INDEX_COL)
  }

  
  // NOTE: this excerpt omits the class's `write` method and the opening of the companion
  // object (MLReadable); `read`, `load` and the reader/writer classes below are members
  // of that companion object, which also defines PREDICTED_LABEL_INDEX_COL.
  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper]{

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val pipelinePath = new Path(path, "pipeline").toString

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = "class" -> instance.getClass.getName
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
} 
Example 139
Source File: Transformer.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._


  // NOTE: the declaration of the abstract UnaryTransformer class (with its abstract
  // createTransformFunc and outputDataType members) is omitted from this excerpt;
  // the members below belong to that class.
  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val transformUDF = udf(this.createTransformFunc, outputDataType)
    dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
} 
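The members above belong to the abstract UnaryTransformer; a minimal concrete subclass (hypothetical, for illustration) only needs to supply the transformation function and the output type.

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types.{DataType, StringType}

// Upper-cases a string column; the class name and uid prefix are illustrative.
class UpperCaseTransformer(override val uid: String)
  extends UnaryTransformer[String, String, UpperCaseTransformer] with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("upperCase"))

  override protected def createTransformFunc: String => String = _.toUpperCase

  override protected def validateInputType(inputType: DataType): Unit =
    require(inputType == StringType, s"Input type must be StringType but got $inputType.")

  override protected def outputDataType: DataType = StringType
}

// Usage: new UpperCaseTransformer().setInputCol("name").setOutputCol("nameUpper").transform(df)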
Example 140
Source File: VectorSlicerSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{StructField, StructType}

class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    val slicer = new VectorSlicer().setInputCol("feature")
    ParamsSuite.checkParams(slicer)
    assert(slicer.getIndices.length === 0)
    assert(slicer.getNames.length === 0)
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {
        slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true))))
      }
    }
  }

  test("feature validity checks") {
    import VectorSlicer._
    assert(validIndices(Array(0, 1, 8, 2)))
    assert(validIndices(Array.empty[Int]))
    assert(!validIndices(Array(-1)))
    assert(!validIndices(Array(1, 2, 1)))

    assert(validNames(Array("a", "b")))
    assert(validNames(Array.empty[String]))
    assert(!validNames(Array("", "b")))
    assert(!validNames(Array("a", "b", "a")))
  }

  test("Test vector slicer") {
    val data = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())
    )

    // Expected after selecting indices 1, 4
    val expected = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]])

    val resultAttrs = Array("f1", "f4").map(defaultAttr.withName)
    val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]])

    val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) }
    val df = spark.createDataFrame(rdd,
      StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField())))

    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    def validateResults(df: DataFrame): Unit = {
      df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 === vec2)
      }
      val resultMetadata = AttributeGroup.fromStructField(df.schema("result"))
      val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected"))
      assert(resultMetadata.numAttributes === expectedMetadata.numAttributes)
      resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) =>
        assert(a === b)
      }
    }

    vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty)
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array(1)).setNames(Array("f4"))
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4"))
    validateResults(vectorSlicer.transform(df))
  }

  test("read/write") {
    val t = new VectorSlicer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setIndices(Array(1, 3))
      .setNames(Array("a", "d"))
    testDefaultReadWrite(t)
  }
} 
Example 141
Source File: BinarizerSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  @transient var data: Array[Double] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4)
  }

  test("params") {
    ParamsSuite.checkParams(new Binarizer)
  }

  test("Binarize continuous features with default parameter") {
    val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0)
    val dataFrame: DataFrame = data.zip(defaultBinarized).toSeq.toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Double, y: Double) =>
        assert(x === y, "The feature value is not correct after binarization.")
    }
  }

  test("Binarize continuous features with setter") {
    val threshold: Double = 0.2
    val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0)
    val dataFrame: DataFrame = data.zip(thresholdBinarized).toSeq.toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(threshold)

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Double, y: Double) =>
        assert(x === y, "The feature value is not correct after binarization.")
    }
  }

  test("Binarize vector of continuous features with default parameter") {
    val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0)
    val dataFrame: DataFrame = Seq(
      (Vectors.dense(data), Vectors.dense(defaultBinarized))
    ).toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x == y, "The feature value is not correct after binarization.")
    }
  }

  test("Binarize vector of continuous features with setter") {
    val threshold: Double = 0.2
    val defaultBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0)
    val dataFrame: DataFrame = Seq(
      (Vectors.dense(data), Vectors.dense(defaultBinarized))
    ).toDF("feature", "expected")

    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(threshold)

    binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach {
      case Row(x: Vector, y: Vector) =>
        assert(x == y, "The feature value is not correct after binarization.")
    }
  }


  test("read/write") {
    val t = new Binarizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setThreshold(0.1)
    testDefaultReadWrite(t)
  }
} 
Example 142
Source File: SQLBuilderTest.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst

import scala.util.control.NonFatal

import org.apache.spark.sql.{DataFrame, Dataset, QueryTest}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.hive.test.TestHiveSingleton


abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton {
  protected def checkSQL(e: Expression, expectedSQL: String): Unit = {
    val actualSQL = e.sql
    try {
      assert(actualSQL === expectedSQL)
    } catch {
      case cause: Throwable =>
        fail(
          s"""Wrong SQL generated for the following expression:
             |
             |${e.prettyName}
             |
             |$cause
           """.stripMargin)
    }
  }

  protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = {
    val generatedSQL = try new SQLBuilder(plan).toSQL catch { case NonFatal(e) =>
      fail(
        s"""Cannot convert the following logical query plan to SQL:
           |
           |${plan.treeString}
         """.stripMargin)
    }

    try {
      assert(generatedSQL === expectedSQL)
    } catch {
      case cause: Throwable =>
        fail(
          s"""Wrong SQL generated for the following logical query plan:
             |
             |${plan.treeString}
             |
             |$cause
           """.stripMargin)
    }

    checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan))
  }

  protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = {
    checkSQL(df.queryExecution.analyzed, expectedSQL)
  }
} 
Example 143
Source File: JdbcRelationProvider.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val url = jdbcOptions.url
    val table = jdbcOptions.table
    val createTableOptions = jdbcOptions.createTableOptions
    val isTruncate = jdbcOptions.isTruncate

    val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, url, table)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              saveTable(df, url, table, jdbcOptions)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, table)
              createTable(df.schema, url, table, createTableOptions, conn)
              saveTable(df, url, table, jdbcOptions)
            }

          case SaveMode.Append =>
            saveTable(df, url, table, jdbcOptions)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, jdbcOptions)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
} 
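This provider backs the public DataFrameReader/DataFrameWriter jdbc paths; a hedged round-trip sketch (the in-memory H2 URL, credentials and table name are placeholders, and the H2 driver is assumed to be on the classpath).

import java.util.Properties

import org.apache.spark.sql.{SaveMode, SparkSession}

object JdbcRoundTrip {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("jdbc-roundtrip").getOrCreate()
    import spark.implicits._

    val url = "jdbc:h2:mem:demo;DB_CLOSE_DELAY=-1"
    val props = new Properties()
    props.setProperty("user", "sa")
    props.setProperty("password", "")

    val df = Seq((1, "alice"), (2, "bob")).toDF("id", "name")

    // With SaveMode.Overwrite the table is either truncated or dropped and recreated,
    // following the logic in createRelation above.
    df.write.mode(SaveMode.Overwrite).jdbc(url, "people", props)

    val readBack = spark.read.jdbc(url, "people", props)
    readBack.show()

    spark.stop()
  }
}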
Example 144
Source File: FrequentItems.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.stat

import scala.collection.mutable.{Map => MutableMap}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._

object FrequentItems extends Logging {

  
  // NOTE: the FreqItemCounter helper class (providing the `add`, `merge` and `baseMap`
  // members used below) is omitted from this excerpt.
  def singlePassFreqItems(
      df: DataFrame,
      cols: Seq[String],
      support: Double): DataFrame = {
    require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.")
    val numCols = cols.length
    // number of max items to keep counts for
    val sizeOfMap = (1 / support).toInt
    val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
    val originalSchema = df.schema
    val colInfo: Array[(String, DataType)] = cols.map { name =>
      val index = originalSchema.fieldIndex(name)
      (name, originalSchema.fields(index).dataType)
    }.toArray

    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)(
      seqOp = (counts, row) => {
        var i = 0
        while (i < numCols) {
          val thisMap = counts(i)
          val key = row.get(i)
          thisMap.add(key, 1L)
          i += 1
        }
        counts
      },
      combOp = (baseCounts, counts) => {
        var i = 0
        while (i < numCols) {
          baseCounts(i).merge(counts(i))
          i += 1
        }
        baseCounts
      }
    )
    val justItems = freqItems.map(m => m.baseMap.keys.toArray)
    val resultRow = Row(justItems : _*)
    // append frequent Items to the column name for easy debugging
    val outputCols = colInfo.map { v =>
      StructField(v._1 + "_freqItems", ArrayType(v._2, false))
    }
    val schema = StructType(outputCols).toAttributes
    Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow)))
  }
} 
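End users normally reach singlePassFreqItems through df.stat.freqItems; a minimal sketch:

import org.apache.spark.sql.SparkSession

object FreqItemsExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("freq-items").getOrCreate()
    import spark.implicits._

    val df = spark.sparkContext.parallelize(Seq.tabulate(100) { i =>
      if (i % 2 == 0) (1, -1.0) else (i, i * -1.0)
    }).toDF("a", "b")

    // Values that appear in at least 40% of rows for each column (approximate, single pass).
    val freq = df.stat.freqItems(Seq("a", "b"), 0.4)
    freq.show(truncate = false)

    spark.stop()
  }
}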
Example 145
Source File: FileStreamSink.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.streaming

import org.apache.hadoop.fs.Path

import org.apache.spark.internal.Logging
import org.apache.spark.internal.io.FileCommitProtocol
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.datasources.{FileFormat, FileFormatWriter}

object FileStreamSink {
  // The name of the subdirectory that is used to store metadata about which files are valid.
  val metadataDir = "_spark_metadata"
}


class FileStreamSink(
    sparkSession: SparkSession,
    path: String,
    fileFormat: FileFormat,
    partitionColumnNames: Seq[String],
    options: Map[String, String]) extends Sink with Logging {

  private val basePath = new Path(path)
  private val logPath = new Path(basePath, FileStreamSink.metadataDir)
  private val fileLog =
    new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, logPath.toUri.toString)
  private val hadoopConf = sparkSession.sessionState.newHadoopConf()

  override def addBatch(batchId: Long, data: DataFrame): Unit = {
    if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) {
      logInfo(s"Skipping already committed batch $batchId")
    } else {
      val committer = FileCommitProtocol.instantiate(
        className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass,
        jobId = batchId.toString,
        outputPath = path,
        isAppend = false)

      committer match {
        case manifestCommitter: ManifestFileCommitProtocol =>
          manifestCommitter.setupManifestOptions(fileLog, batchId)
        case _ =>  // Do nothing
      }

      // Get the actual partition columns as attributes after matching them by name with
      // the given columns names.
      val partitionColumns: Seq[Attribute] = partitionColumnNames.map { col =>
        val nameEquality = data.sparkSession.sessionState.conf.resolver
        data.logicalPlan.output.find(f => nameEquality(f.name, col)).getOrElse {
          throw new RuntimeException(s"Partition column $col not found in schema ${data.schema}")
        }
      }

      FileFormatWriter.write(
        sparkSession = sparkSession,
        queryExecution = data.queryExecution,
        fileFormat = fileFormat,
        committer = committer,
        outputSpec = FileFormatWriter.OutputSpec(path, Map.empty),
        hadoopConf = hadoopConf,
        partitionColumns = partitionColumns,
        bucketSpec = None,
        refreshFunction = _ => (),
        options = options)
    }
  }

  override def toString: String = s"FileSink[$path]"
} 
Example 146
Source File: console.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.streaming

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
import org.apache.spark.sql.streaming.OutputMode

class ConsoleSink(options: Map[String, String]) extends Sink with Logging {
  // Number of rows to display, by default 20 rows
  private val numRowsToShow = options.get("numRows").map(_.toInt).getOrElse(20)

  // Truncate the displayed data if it is too long, by default it is true
  private val isTruncated = options.get("truncate").map(_.toBoolean).getOrElse(true)

  // Track the batch id
  private var lastBatchId = -1L

  override def addBatch(batchId: Long, data: DataFrame): Unit = synchronized {
    val batchIdStr = if (batchId <= lastBatchId) {
      s"Rerun batch: $batchId"
    } else {
      lastBatchId = batchId
      s"Batch: $batchId"
    }

    // scalastyle:off println
    println("-------------------------------------------")
    println(batchIdStr)
    println("-------------------------------------------")
    // scalastyle:on println
    data.sparkSession.createDataFrame(
      data.sparkSession.sparkContext.parallelize(data.collect()), data.schema)
      .show(numRowsToShow, isTruncated)
  }
}

class ConsoleSinkProvider extends StreamSinkProvider with DataSourceRegister {
  def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new ConsoleSink(parameters)
  }

  def shortName(): String = "console"
} 
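The options read by ConsoleSink map directly onto writeStream options; a hedged streaming sketch using the socket source (assumes something is writing lines to localhost:9999, e.g. `nc -lk 9999`).

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.OutputMode

object ConsoleSinkExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("console-sink").getOrCreate()

    val lines = spark.readStream
      .format("socket")
      .option("host", "localhost")
      .option("port", 9999)
      .load()

    val query = lines.writeStream
      .format("console")         // resolved to ConsoleSinkProvider via DataSourceRegister
      .option("numRows", 5)      // read by ConsoleSink above
      .option("truncate", false) // likewise
      .outputMode(OutputMode.Append())
      .start()

    query.awaitTermination()
  }
}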
Example 147
Source File: RowDataSourceStrategySuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources

import java.sql.DriverManager
import java.util.Properties

import org.scalatest.BeforeAndAfter

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types._
import org.apache.spark.util.Utils

class RowDataSourceStrategySuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext {
  import testImplicits._

  val url = "jdbc:h2:mem:testdb0"
  val urlWithUserAndPass = "jdbc:h2:mem:testdb0;user=testUser;password=testPass"
  var conn: java.sql.Connection = null

  before {
    Utils.classForName("org.h2.Driver")
    // Extra properties that will be specified for our database. We need these to test
    // usage of parameters from OPTIONS clause in queries.
    val properties = new Properties()
    properties.setProperty("user", "testUser")
    properties.setProperty("password", "testPass")
    properties.setProperty("rowId", "false")

    conn = DriverManager.getConnection(url, properties)
    conn.prepareStatement("create schema test").executeUpdate()
    conn.prepareStatement("create table test.inttypes (a INT, b INT, c INT)").executeUpdate()
    conn.prepareStatement("insert into test.inttypes values (1, 2, 3)").executeUpdate()
    conn.commit()
    sql(
      s"""
        |CREATE TEMPORARY TABLE inttypes
        |USING org.apache.spark.sql.jdbc
        |OPTIONS (url '$url', dbtable 'TEST.INTTYPES', user 'testUser', password 'testPass')
      """.stripMargin.replaceAll("\n", " "))
  }

  after {
    conn.close()
  }

  test("SPARK-17673: Exchange reuse respects differences in output schema") {
    val df = sql("SELECT * FROM inttypes")
    val df1 = df.groupBy("a").agg("b" -> "min")
    val df2 = df.groupBy("a").agg("c" -> "min")
    val res = df1.union(df2)
    assert(res.distinct().count() == 2)  // would be 1 if the exchange was incorrectly reused
  }
} 
Example 148
Source File: TakeOrderedAndProjectSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import scala.util.Random

import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types._


class TakeOrderedAndProjectSuite extends SparkPlanTest with SharedSQLContext {

  private var rand: Random = _
  private var seed: Long = 0

  protected override def beforeAll(): Unit = {
    super.beforeAll()
    seed = System.currentTimeMillis()
    rand = new Random(seed)
  }

  private def generateRandomInputData(): DataFrame = {
    val schema = new StructType()
      .add("a", IntegerType, nullable = false)
      .add("b", IntegerType, nullable = false)
    val inputData = Seq.fill(10000)(Row(rand.nextInt(), rand.nextInt()))
    spark.createDataFrame(sparkContext.parallelize(Random.shuffle(inputData), 10), schema)
  }

  
  private def noOpFilter(plan: SparkPlan): SparkPlan = FilterExec(Literal(true), plan)

  val limit = 250
  val sortOrder = 'a.desc :: 'b.desc :: Nil

  test("TakeOrderedAndProject.doExecute without project") {
    withClue(s"seed = $seed") {
      checkThatPlansAgree(
        generateRandomInputData(),
        input =>
          noOpFilter(TakeOrderedAndProjectExec(limit, sortOrder, input.output, input)),
        input =>
          GlobalLimitExec(limit,
            LocalLimitExec(limit,
              SortExec(sortOrder, true, input))),
        sortAnswers = false)
    }
  }

  test("TakeOrderedAndProject.doExecute with project") {
    withClue(s"seed = $seed") {
      checkThatPlansAgree(
        generateRandomInputData(),
        input =>
          noOpFilter(
            TakeOrderedAndProjectExec(limit, sortOrder, Seq(input.output.last), input)),
        input =>
          GlobalLimitExec(limit,
            LocalLimitExec(limit,
              ProjectExec(Seq(input.output.last),
                SortExec(sortOrder, true, input)))),
        sortAnswers = false)
    }
  }
} 
Example 149
Source File: XGBoost.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import eleflow.uberdata.models.UberXGBOOSTModel
import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{ArrayType, DoubleType, StructField, StructType}

import scala.reflect.ClassTag


class XGBoost[I](override val uid: String,
                 val models: RDD[(I, (UberXGBOOSTModel,
                   Seq[(ModelParamEvaluation[I])]))])(
  implicit kt: ClassTag[I],
  ord: Ordering[I] = null)
    extends ForecastBaseModel[XGBoostSmallModel[I]]
    with HasInputCol
    with HasOutputCol
    with DefaultParamsWritable
    with HasFeaturesCol
    with HasNFutures
    with HasGroupByCol {

  def this(
    models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))]
  )(implicit kt: ClassTag[I], ord: Ordering[I] ) =
    this(Identifiable.randomUID("xgboost"), models)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val schema = dataSet.schema
    val predSchema = transformSchema(schema)
    val joined = models.join(dataSet.rdd.map{case (r: Row) => (r.getAs[I]($(groupByCol).get), r)})

    val predictions = joined.map {
      case (id, ((bestModel, metrics), row)) =>
        val features = row.getAs[Array[org.apache.spark.ml.linalg.Vector]](
          IUberdataForecastUtil.FEATURES_COL_NAME
        )
        val label = DataTransformer.toFloat(row.getAs($(featuresCol)))
        val labelPoint = features.map { vec =>
          val array = vec.toArray.map(_.toFloat)
          LabeledPoint(label, null, array)
        }
        val matrix = new DMatrix(labelPoint.toIterator)
        val (ownFeaturesPrediction, forecast) = bestModel.boosterInstance
          .predict(matrix)
          .flatMap(_.map(_.toDouble))
          .splitAt(features.length)
        Row(
          row.toSeq :+ Vectors
            .dense(forecast) :+ SupportedAlgorithm.XGBoostAlgorithm.toString :+ bestModel.params
            .map(f => f._1 -> f._2.toString) :+ Vectors.dense(ownFeaturesPrediction): _*
        )
    }
    dataSet.sqlContext.createDataFrame(predictions, predSchema)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    schema.add(StructField($(outputCol), ArrayType(DoubleType)))
  }

  override def copy(extra: ParamMap): XGBoostSmallModel[I] = defaultCopy(extra)
} 
Example 150
Source File: TimeSeriesGenerator.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasGroupByCol
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, StructType}

import scala.reflect.ClassTag


  // NOTE: the TimeSeriesGenerator[L] class declaration, its timeCol/groupByCol/featuresCol
  // params and the convertColumnToDouble helper used below are omitted from this excerpt.
  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val rdd = dataSet.rdd

    val sparkContext = dataSet.sqlContext.sparkContext
    val index = sparkContext.broadcast(dataSet.schema.fieldIndex($(timeCol).get))
    val labelColIndex =
      sparkContext.broadcast(dataSet.schema.fieldIndex($(groupByCol).get))
    val featuresColIndex =
      sparkContext.broadcast(dataSet.schema.fieldIndex($(featuresCol)))
    val grouped = rdd.map { case (row: Row) =>
      val timeColRow =
        IUberdataForecastUtil.convertColumnToLong(row, index.value)
      convertColumnToDouble(timeColRow, featuresColIndex)
    }.groupBy { row =>
      row.getAs[L](labelColIndex.value)
    }.map {
      case (key, values) =>
        val toBeUsed =
          values.toArray.sortBy(row => row.getAs[Long](index.value))
        (key, toBeUsed)
    }

    val toBeTrained = grouped.map {
      case (key, values) =>
        org.apache.spark.sql.Row(
          key,
          Vectors.dense(values.map(_.getAs[Double](featuresColIndex.value)))
        )
    }

    val trainSchema = transformSchema(dataSet.schema)
    dataSet.sqlContext.createDataFrame(toBeTrained, trainSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    val labelIndex = schema.fieldIndex($(groupByCol).get)
    StructType(
      Seq(
        schema.fields(labelIndex),
        StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT)
      )
    )
  }

  override def copy(extra: ParamMap): TimeSeriesGenerator[L] =
    defaultCopy(extra)

}

object TimeSeriesGenerator extends DefaultParamsReadable[TimeSeriesGenerator[_]] {

  override def load(path: String): TimeSeriesGenerator[_] = super.load(path)
} 
Example 151
Source File: XGBoostBigModel.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml


import com.cloudera.sparkts.models.UberXGBoostModel
import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.feature.{LabeledPoint => SparkLabeledPoint}
import org.apache.spark.ml.param.shared.{HasIdCol, HasLabelCol}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}


class XGBoostBigModel[I](val uid: String, val models: Seq[(ParamMap, XGBoostModel)])
    extends ForecastBaseModel[XGBoostBigModel[I]]
    with HasLabelCol
    with HasIdCol {

  def setLabelcol(label: String): this.type = set(labelCol, label)

  def setIdcol(id: String): this.type = set(idCol, id)

  override def copy(extra: ParamMap): XGBoostBigModel[I] = new XGBoostBigModel[I](uid, models)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
            row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
            )
      }
      .join(prediction)
      .map {
        case (id, (features, predictValue)) =>
          Row(id, features, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }

  protected def predict(dataSet: Dataset[_]) = {
    val features = dataSet.rdd.map { case (row: Row) =>
      val features = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
      val id = row.getAs[I]($(idCol))
      SparkLabeledPoint(DataTransformer.toFloat(id), features)
    }.cache
    val (_, model) = models.head
    UberXGBoostModel.labelPredict(features.map(_.features.toDense), booster = model)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(getPredictionSchema)

  protected def getPredictionSchema: Array[StructField] = {
    Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    )
  }
} 
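A minimal usage sketch for the model above. Everything outside the listing is an assumption: `trainedBooster` stands for an already-fitted xgboost4j-spark model, and `inputDf` is assumed to carry an "id" column plus the "features" vector column (IUberdataForecastUtil.FEATURES_COL_NAME) that predict() reads.

import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import org.apache.spark.ml.XGBoostBigModel
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.{DataFrame, Dataset}

// hypothetical wiring: the booster and the input Dataset are provided by the caller
def scoreWithBigModel(trainedBooster: XGBoostModel, inputDf: Dataset[_]): DataFrame = {
  new XGBoostBigModel[Float]("xgboost-big", Seq((ParamMap.empty, trainedBooster)))
    .setIdcol("id")
    .setLabelcol("label")
    .transform(inputDf) // yields columns: id, features, algorithm, prediction
}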
Example 152
Source File: ArimaBestModel.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import com.cloudera.sparkts.models.TimeSeriesModel
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType


class ArimaBestModel[L, M <: TimeSeriesModel](
  override val uid: String,
  val bestPrediction: RDD[(L, M)],
  val validationMetrics: RDD[(L, Seq[ModelParamEvaluation[L]])]
) extends Model[ArimaBestModel[L, M]]
    with TimeSeriesBestModelFinderParam[L] {

  //TODO assess whether this method is still needed
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)

    dataset.toDF()
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): ArimaBestModel[L, M] = {
    val copied =
      new ArimaBestModel[L, M](uid, bestPrediction, validationMetrics)
    copyValues(copied, extra)
  }
} 
Example 153
Source File: MovingAverage.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types._


  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(windowSize -> 3)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataSet.schema)
    val sparkContext = dataSet.sqlContext.sparkContext
    val inputType = outputSchema($(inputCol)).dataType
    val inputTypeBr = sparkContext.broadcast(inputType)
    val dataSetRdd = dataSet.rdd
    val inputColName = sparkContext.broadcast($(inputCol))
    val inputColIndex = dataSet.columns.indexOf($(inputCol))
    val inputColIndexBr = sparkContext.broadcast(inputColIndex)
    val windowSizeBr = sparkContext.broadcast($(windowSize))
    val maRdd = dataSetRdd.map { case (row: Row) =>
      val (array, rawValue) = if (inputTypeBr.value.isInstanceOf[VectorUDT]) {
        val vector =
          row.getAs[org.apache.spark.ml.linalg.Vector](inputColName.value)
        (vector.toArray, Vectors.dense(vector.toArray.drop(windowSizeBr.value - 1)))
      } else {
        val iterable = row.getAs[Iterable[Double]](inputColName.value)
        (iterable.toArray, Vectors.dense(iterable.toArray.drop(windowSizeBr.value - 1)))
      }
      val (before, after) = row.toSeq.splitAt(inputColIndexBr.value)
      Row(
        (before :+ rawValue) ++ after.tail :+
          MovingAverageCalc.simpleMovingAverageArray(array, windowSizeBr.value): _*
      )
    }
    dataSet.sqlContext.createDataFrame(maRdd, outputSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema.add(StructField($(outputCol), ArrayType(DoubleType)))
  }

  override def copy(extra: ParamMap): MovingAverage[T] = defaultCopy(extra)
}

object MovingAverageCalc {
  private[ml] def simpleMovingAverageArray(values: Array[Double], period: Int): Array[Double] = {
    (for (i <- 1 to values.length)
      yield
      //TODO restore this branch with the right feature size so the moving average returns
      // the original feature values for the first entries of the calculation
      if (i < period) 0d //values(i)
      else values.slice(i - period, i).sum / period).toArray.dropWhile(_ == 0d)
  }
}

object MovingAverage extends DefaultParamsReadable[MovingAverage[_]] {

  override def load(path: String): MovingAverage[_] = super.load(path)
} 
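A quick worked illustration of simpleMovingAverageArray as defined above (it is private[ml], so the call has to live inside the org.apache.spark.ml package): with a window of 3, the leading placeholder zeros are dropped and each remaining entry is the mean of the trailing window.

// inside package org.apache.spark.ml, since simpleMovingAverageArray is private[ml]
val values = Array(1.0, 2.0, 3.0, 4.0, 5.0)
val ma = MovingAverageCalc.simpleMovingAverageArray(values, period = 3)
// ma == Array(2.0, 3.0, 4.0)  -> means of (1,2,3), (2,3,4), (3,4,5)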
Example 154
Source File: VectorizeEncoder.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.core.data.DataTransformer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, StructType}


class VectorizeEncoder(override val uid: String)
    extends Transformer
    with HasIdCol
    with HasTimeCol
    with HasInputCols
    with HasLabelCol
    with HasGroupByCol
    with HasOutputCol
    with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("vectorizer"))

  def setIdCol(input: String) = set(idCol, input)

  def setLabelCol(input: String) = set(labelCol, input)

  def setGroupByCol(toGroupBy: String) = set(groupByCol, Some(toGroupBy))

  def setInputCol(input: Array[String]) = set(inputCols, input)

  def setTimeCol(time: String) = set(timeCol, Some(time))

  def setOutputCol(output: String) = set(outputCol, output)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val context = dataSet.sqlContext.sparkContext
    val input = context.broadcast($(inputCols))
    val allColumnNames = dataSet.schema.map(_.name)

    val nonInputColumnIndexes = context.broadcast(
      allColumnNames.zipWithIndex.filter(
        f => !$(inputCols).contains(f._1) || f._1 == $(groupByCol).get || f._1 == $(idCol)
          || f._1 == $(timeCol).getOrElse("")))
    val result = dataSet.rdd.map { case (row: Row) =>
      val rowSeq = row.toSeq
      val nonInputColumns = nonInputColumnIndexes.value.map {
        case (_, index) => rowSeq(index)
      }
      val size = input.value.length
      val (values, indices) = input.value
        .filter(col => row.getAs(col) != null)
        .map { column =>
          DataTransformer.toDouble(row.getAs(column))
        }
        .zipWithIndex
        .filter(f => f._1 != 0d)
        .unzip
      Row(
        nonInputColumns :+ org.apache.spark.ml.linalg.Vectors
          .sparse(size, indices.toArray, values.toArray): _*
      )
    }
    val newSchema = transformSchema(dataSet.schema)
    dataSet.sqlContext.createDataFrame(result, newSchema)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(
      schema.filter(
        col =>
          !$(inputCols).contains(col.name) || col.name == $(groupByCol).getOrElse("") || col.name == $(idCol)
            || col.name == $(labelCol) || col.name == $(timeCol).getOrElse("")
      )
    ).add(StructField($(outputCol), new VectorUDT))
} 
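A hedged usage sketch for the encoder above: `salesDf` and its column names are hypothetical, the setters are the ones defined in the listing, and transform replaces the input columns with a single sparse vector in the output column.

import org.apache.spark.ml.VectorizeEncoder
import org.apache.spark.sql.DataFrame

// hypothetical input: salesDf(id, store, date, label, f1, f2, f3)
def vectorize(salesDf: DataFrame): DataFrame =
  new VectorizeEncoder()
    .setIdCol("id")
    .setGroupByCol("store")
    .setTimeCol("date")
    .setLabelCol("label")
    .setInputCol(Array("f1", "f2", "f3")) // columns folded into one sparse vector
    .setOutputCol("features")
    .transform(salesDf)                   // resulting columns: id, store, date, label, features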
Example 155
Source File: AllColumnsTimeSeriesGenerator.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}

import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset

import scala.reflect.ClassTag


  def setOutputCol(value: String): this.type = set(outputCol, value)

//  override def transform(dataSet: DataFrame): DataFrame = {
  override def transform(dataSet: Dataset[_] ): DataFrame = {
    val rdd = dataSet.rdd
    val sparkContext = dataSet.sqlContext.sparkContext
    val labelColIndex =
      sparkContext.broadcast(dataSet.schema.fieldIndex($(labelCol)))
    val keyValueDataSet = rdd.map { case (row: Row) =>
      Row(
        row.getAs[T](labelColIndex.value),
        row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol))
      )
    }
    val trainSchema = transformSchema(dataSet.schema)

    dataSet.sqlContext.createDataFrame(keyValueDataSet, trainSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    StructType(
      schema.filter(_.name == $(labelCol)).head +: Seq(
        StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT)
      )
    )
  }

  override def copy(extra: ParamMap): AllColumnsTimeSeriesGenerator[T, U] =
    defaultCopy(extra)
}

object AllColumnsTimeSeriesGenerator
    extends DefaultParamsReadable[AllColumnsTimeSeriesGenerator[_, _]] {

  override def load(path: String): AllColumnsTimeSeriesGenerator[_, _] =
    super.load(path)
} 
Example 156
Source File: HoltWintersEstimator.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import com.cloudera.sparkts.models.TimeSeriesModel

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.Dataset


class HoltWintersBestModel[T, M <: TimeSeriesModel](
  override val uid: String,
  val bestPrediction: RDD[(T, M)],
  val validationMetrics: RDD[(T, ModelParamEvaluation[T])]
) extends Model[HoltWintersBestModel[T, M]]
    with TimeSeriesBestModelFinderParam[T] {

  //TODO look for this method usage to see if it can be removed
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    dataset.toDF()
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): HoltWintersBestModel[T, M] = {
    val copied =
      new HoltWintersBestModel[T, M](uid, bestPrediction, validationMetrics)
    copyValues(copied, extra)
  }
} 
Example 157
Source File: XGBoostBigModelTimeSeries.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import java.sql.Timestamp

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasTimeCol
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}


class XGBoostBigModelTimeSeries[I](override val uid: String,
                                   override val models: Seq[(ParamMap, XGBoostModel)])
                                  extends XGBoostBigModel[I](uid, models) with HasTimeCol{

  def setTimecol(time: String): this.type = set(timeCol, Some(time))

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
            (row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME),
              row.getAs[java.sql.Timestamp]($(timeCol).get)))
      }
      .join(prediction)
      .map {
        case (id, ((features, time), predictValue)) =>
          Row(id, features, time, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }


  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField($(timeCol).get, TimestampType),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    ) )
} 
Example 158
Source File: HoltWintersBestModelFinder.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import com.cloudera.sparkts.models.UberHoltWintersModel
import org.apache.spark.ml.evaluation.TimeSeriesEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasGroupByCol
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset

import scala.reflect.ClassTag


class HoltWintersBestModelFinder[G](
  override val uid: String
)(implicit kt: ClassTag[G])
    extends HoltWintersBestModelEvaluation[G, HoltWintersModel[G]]
    with DefaultParamsWritable
    with HasGroupByCol
    with TimeSeriesBestModelFinder {

  def setTimeSeriesEvaluator(eval: TimeSeriesEvaluator[G]): this.type =
    set(timeSeriesEvaluator, eval)

  def setEstimatorParamMaps(value: Array[ParamMap]): this.type =
    set(estimatorParamMaps, value)

  def setNFutures(value: Int): this.type = set(nFutures, value)

  override def setValidationCol(value: String): this.type = set(validationCol, value)

  def setLabelCol(label: String): this.type = set(labelCol, label)

  def setGroupByCol(groupBy: String): this.type = set(groupByCol, Some(groupBy))

  def this()(implicit kt: ClassTag[G]) = this(Identifiable.randomUID("arima"))

  def modelEvaluation(
    idModels: RDD[(G, Row, Option[UberHoltWintersModel])]
  ): RDD[(G, (UberHoltWintersModel, ModelParamEvaluation[G]))] = {
    val eval = $(timeSeriesEvaluator)
    val broadcastEvaluator = idModels.context.broadcast(eval)
    idModels.filter(_._3.isDefined).map {
      case (id, row, models) =>
        val evaluatedModels = models.map { model =>
          holtWintersEvaluation(row, model, broadcastEvaluator, id)
        }.head
        log.warn(s"best model reach ${evaluatedModels._2.metricResult}")
        (id, evaluatedModels)
    }
  }

  override protected def train(dataSet: Dataset[_]): HoltWintersModel[G] = {
    val splitDs = split(dataSet, $(nFutures))
    val idModels = splitDs.rdd.map(train)
    new HoltWintersModel[G](uid, modelEvaluation(idModels))
      .setValidationCol($(validationCol))
      .asInstanceOf[HoltWintersModel[G]]
  }

  def train(row: Row): (G, Row, Option[UberHoltWintersModel]) = {
    val id = row.getAs[G]($(groupByCol).get)

    val result = try {
      val dense = row.getAs[org.apache.spark.ml.linalg.DenseVector]($(featuresCol))
      val ts: org.apache.spark.mllib.linalg.Vector = org.apache.spark.mllib.linalg.Vectors.dense(dense.toArray)
      Some(
        UberHoltWintersModel.fitModelWithBOBYQA(ts, $(nFutures))
      )
    } catch {
      case e: Exception =>
        log.error(
          s"Got the following Exception ${e.getLocalizedMessage} in id $id"
        )
        None
    }
    (id, row, result)
  }
}

object HoltWintersBestModelFinder extends DefaultParamsReadable[HoltWintersBestModelFinder[_]] {

  override def load(path: String): HoltWintersBestModelFinder[_] =
    super.load(path)
} 
Example 159
Source File: IUberdataForecastUtil.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package eleflow.uberdata

import eleflow.uberdata.core.IUberdataContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.lit


object IUberdataForecastUtil {

  lazy val FEATURES_PREDICTION_COL_NAME = "featuresPrediction"
  lazy val FEATURES_COL_NAME = "features"
  lazy val ALGORITHM = "algorithm"
  lazy val PARAMS = "parameters"
  lazy val METRIC_COL_NAME = "metric"

  def convertColumnToLong(row: Row, columnIndex: Int): Row = {
    row.get(columnIndex) match {
      case s: java.sql.Timestamp =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ s.getTime) ++ after.tail :+ s
        Row(result: _*)
      case d: Double =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ d.toLong) ++ after.tail :+ d
        Row(result: _*)
      case i: Int =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ i.toLong) ++ after.tail :+ i
        Row(result: _*)
      case s: Short =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ s.toLong) ++ after.tail :+ s
        Row(result: _*)
      case _ => row
    }
  }

  def convertColumnToLongAddAtEnd(row: Row, columnIndex: Int): Row = {
    val result = row.get(columnIndex) match {
      case s: java.sql.Timestamp =>
        val result = row.toSeq :+ s.getTime
        Row(result: _*)
      case d: Double =>
        val result = row.toSeq :+ d.toLong
        Row(result: _*)
      case i: Int =>
        val result = row.toSeq :+ i.toLong
        Row(result: _*)
      case s: Short =>
        val result = row.toSeq :+ s.toLong
        Row(result: _*)
      case _ => row
    }
    result
  }

  def createIdColColumn(dataFrame : DataFrame, context : IUberdataContext) : DataFrame = {
    val arrId = dataFrame.rdd.zipWithIndex.map(
      x => x._1.toSeq :+ x._2
    ).map(
      x => Row.fromSeq(x))
    context.sqlContext.createDataFrame(arrId,
      dataFrame.withColumn("idCol", lit(1L : Long)).schema)
  }

} 
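A short illustration of convertColumnToLong as implemented above, using a hypothetical row: the value at the given index is replaced by its Long representation and the original value is appended at the end of the row.

import java.sql.Timestamp
import org.apache.spark.sql.Row

val ts = Timestamp.valueOf("2017-01-01 00:00:00")
val row = Row("store-1", ts, 42.5)

val converted = IUberdataForecastUtil.convertColumnToLong(row, columnIndex = 1)
// converted == Row("store-1", ts.getTime, 42.5, ts)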
Example 160
Source File: TnViewCreator.scala    From TopNotch   with Apache License 2.0 5 votes vote down vote up
package com.bfm.topnotch.tnview

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SparkSession


/**
 * The class for combining multiple data sets into one that can be used as an input to the diff and assertion commands.
 * This one data set is a "view" of the many used to create it.
 * @param spark The SparkSession to use for creating views
 */
class TnViewCreator(spark: SparkSession) {

  /**
   * Create a view from multiple data sets using a SQL statement
   * @param inputs The inputs to create views from
   * @param params The HiveQL statement used to create the new view and the input tables' names in the statement
   * @return The new view in the form of a dataframe
   */
  def createView(inputs: Seq[DataFrame], params: TnViewParams): DataFrame = {
    // register the views as temporary tables accessible from sql queries
    inputs.zip(params.tableAliases).foreach{
      case (view, name) => view.createOrReplaceTempView(name)
    }
    spark.sql(params.query)
  }
} 
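A hedged sketch of how createView could be called. The input DataFrames and alias names are illustrative, and the construction of TnViewParams is an assumption based on the two fields the method reads (tableAliases and query); its real definition lives elsewhere in the project.

import com.bfm.topnotch.tnview.{TnViewCreator, TnViewParams}
import org.apache.spark.sql.{DataFrame, SparkSession}

// hypothetical inputs: ordersDf and customersDf are existing DataFrames
def joinOrdersWithCustomers(spark: SparkSession, ordersDf: DataFrame, customersDf: DataFrame): DataFrame =
  new TnViewCreator(spark).createView(
    Seq(ordersDf, customersDf),
    TnViewParams( // assumed shape: table aliases plus the HiveQL query that references them
      tableAliases = Seq("orders", "customers"),
      query = "SELECT o.*, c.region FROM orders o JOIN customers c ON o.customerId = c.id"
    )
  )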
Example 161
Source File: TnTestHelper.scala    From TopNotch   with Apache License 2.0 5 votes vote down vote up
package com.bfm.topnotch

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.scalatest.Matchers
import scala.io.Source
import org.json4s._
import org.json4s.native.JsonMethods._

/**
 * This class handles some of the TopNotch reusable test code
 */
object TnTestHelper extends Matchers {
  val INDEX_COL_NAME = "__INDEX_COL__"
  /**
   * Read a file from the resources/src/test/scala/com/bfm/topnotch folder
   * @param fileName The path to the file relative to the path resources/src/test/scala/com/bfm/topnotch
   * @return The contents of the file as one string
   */
  def readResourceFileToJson[T](fileName: String, classType: Class[_]): JValue = {
    parse(Source.fromFile(classType.getResource(fileName).getFile).getLines().mkString("\n"))
  }

  /**
   * Attach an index to the rows of a dataframe so we can track them throughout a series of operations
   * @param df The dataframe to index
   * @return A dataframe equal to df but with an index column
   */
  def attachIdx(df: DataFrame): DataFrame = df.withColumn(INDEX_COL_NAME, monotonicallyIncreasingId()).cache

  /**
   * Get the largest number less than or equal to num that is divisible by denominator
   */
  def numDivisibleBy(num: Int, denominator: Int) = num / denominator * denominator

  /**
   * Grow a data frame to a desired size by duplicating rows.
   */
  def growDataFrame(initDF: DataFrame, newSize: Int): DataFrame = {
    val initCount = initDF.count
    if (initCount < 1) throw new IllegalArgumentException("initDF's size must be greater than 0")
    List.fill((newSize / initCount + 1).toInt)(initDF).reduce(_.unionAll(_)).limit(newSize)
  }

  /**
   * Compares two dataframes and ensures that they have the same schema (ignoring nullability) and the same values
   * @param actualDF The DF we want to check for correctness
   * @param correctDF The correct DF we use for comparison
   * @param onlySchema only compare the schemas of the dataframes
   */
  def dfEquals(actualDF: DataFrame, correctDF: DataFrame, onlySchema: Boolean = false): Unit = {
    actualDF.schema.map(f => (f.name, f.dataType)).toSet shouldBe correctDF.schema.map(f => (f.name, f.dataType)).toSet
    if (!onlySchema) {
      actualDF.collect.map(_.toSeq.toSet).toSet shouldBe correctDF.collect.map(_.toSeq.toSet).toSet
    }
  }
} 
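A brief sketch of these helpers in use inside a test, assuming `smallDf` is some existing non-empty DataFrame supplied by the caller; the target size of 1000 is arbitrary.

import com.bfm.topnotch.TnTestHelper._
import org.apache.spark.sql.DataFrame

// hypothetical check: grow a frame by duplicating rows, then tag each row with an index
def checkGrow(smallDf: DataFrame): Unit = {
  val bigDf = attachIdx(growDataFrame(smallDf, 1000))
  // growing only duplicates rows, so the schema (minus the index column) is unchanged
  dfEquals(bigDf.drop(INDEX_COL_NAME), smallDf, onlySchema = true)
  assert(bigDf.count == 1000)
}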
Example 162
Source File: TreeUtils.scala    From spark-sql-perf   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute}
import org.apache.spark.sql.DataFrame

object TreeUtils {
  
  def setMetadata(
      data: DataFrame,
      featuresColName: String,
      featureArity: Array[Int]): DataFrame = {
    val featuresAttributes = featureArity.zipWithIndex.map { case (arity: Int, feature: Int) =>
      if (arity > 0) {
        NominalAttribute.defaultAttr.withIndex(feature).withNumValues(arity)
      } else {
        NumericAttribute.defaultAttr.withIndex(feature)
      }
    }
    val featuresMetadata = new AttributeGroup("features", featuresAttributes).toMetadata()
    data.select(data(featuresColName).as(featuresColName, featuresMetadata))
  }
} 
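A small usage sketch for setMetadata, assuming `featuresDf` already has an ML vector column named "features" with three entries per row; the arity array marks the first two features as categorical (3 and 2 values) and the last as continuous.

import org.apache.spark.ml.TreeUtils
import org.apache.spark.sql.DataFrame

// hypothetical: featuresDf carries a vector column "features" of size 3
def tagFeatureTypes(featuresDf: DataFrame): DataFrame =
  TreeUtils.setMetadata(
    featuresDf,
    featuresColName = "features",
    featureArity = Array(3, 2, 0)) // >0 = categorical with that many values, 0 = continuous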
Example 163
Source File: Word2Vec.scala    From spark-sql-perf   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.sql.perf.mllib.feature

import scala.util.Random

import org.apache.spark.ml
import org.apache.spark.ml.{PipelineStage, Transformer}
import org.apache.spark.ml.feature.Word2VecModel
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, split}

import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator


object Word2Vec extends BenchmarkAlgorithm with TestFromTraining {

  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._

    val df = DataGenerator.generateDoc(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      vocabSize,
      docLength,
      "text"
    )
    df.select(split(col("text"), " ").as("text"))
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    new ml.feature.Word2Vec().setInputCol("text")
  }

  override def testAdditionalMethods(
      ctx: MLBenchContext,
      model: Transformer): Map[String, () => _] = {
    import ctx.params._

    val rng = new Random(ctx.seed())
    val word2vecModel = model.asInstanceOf[Word2VecModel]
    val testWord = Vectors.dense(Array.fill(word2vecModel.getVectorSize)(rng.nextGaussian()))

    Map("findSynonyms" -> (() => {
      word2vecModel.findSynonyms(testWord, numSynonymsToFind)
    }))
  }

} 
Example 164
Source File: GaussianMixture.scala    From spark-sql-perf   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.sql.perf.mllib.clustering

import org.apache.spark.ml
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql.DataFrame

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
import com.databricks.spark.sql.perf.mllib.data.DataGenerator

object GaussianMixture extends BenchmarkAlgorithm with TestFromTraining {

  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    DataGenerator.generateGaussianMixtureData(ctx.sqlContext, numCenters = k,
      numExamples = numExamples, seed = ctx.seed(), numPartitions = numPartitions,
      numFeatures = numFeatures)
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new ml.clustering.GaussianMixture()
      .setK(k)
      .setSeed(randomSeed.toLong)
      .setMaxIter(maxIter)
      .setTol(tol)
  }

  // TODO(?) add a scoring method here.
} 
Example 165
Source File: FPGrowth.scala    From spark-sql-perf   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.sql.perf.mllib.fpm

import org.apache.spark.ml
import org.apache.spark.ml.{PipelineStage, Transformer}
import org.apache.spark.ml.fpm.FPGrowthModel
import org.apache.spark.sql.DataFrame

import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator



object FPGrowth extends BenchmarkAlgorithm with TestFromTraining {

  def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._

    DataGenerator.generateItemSet(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      numItems,
      itemSetSize)
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    new ml.fpm.FPGrowth()
      .setItemsCol("items")
  }

  override def testAdditionalMethods(
      ctx: MLBenchContext,
      model: Transformer): Map[String, () => _] = {

    val fpModel = model.asInstanceOf[FPGrowthModel]
    Map("associationRules" -> (() => {
      fpModel.associationRules.count()
    }))
  }
} 
Example 166
Source File: MLLib.scala    From spark-sql-perf   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.sql.perf.mllib


import scala.io.Source
import scala.language.implicitConversions

import org.slf4j.LoggerFactory

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

import com.databricks.spark.sql.perf._


class MLLib(sqlContext: SQLContext)
  extends Benchmark(sqlContext) with Serializable {

  def this() = this(SQLContext.getOrCreate(SparkContext.getOrCreate()))
}

object MLLib {

  
  def run(yamlFile: String = null, yamlConfig: String = null): DataFrame = {
    logger.info("Starting run")
    val conf = getConf(yamlFile, yamlConfig)
    val sparkConf = new SparkConf().setAppName("MLlib QA").setMaster("local[2]")
    val sc = SparkContext.getOrCreate(sparkConf)
    sc.setLogLevel("INFO")
    val b = new com.databricks.spark.sql.perf.mllib.MLLib()
    val benchmarks = getBenchmarks(conf)
    println(s"${benchmarks.size} benchmarks identified:")
    val str = benchmarks.map(_.prettyPrint).mkString("\n")
    println(str)
    logger.info("Starting experiments")
    val e = b.runExperiment(
      executionsToRun = benchmarks,
      iterations = 1, // If you want to increase the number of iterations, add more seeds
      resultLocation = conf.output,
      forkThread = false)
    e.waitForFinish(conf.timeout.toSeconds.toInt)
    logger.info("Run finished")
    e.getCurrentResults()
  }
} 
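A hedged example of kicking off the benchmark runner defined above: the YAML path is hypothetical, and the keys the file must contain are whatever getConf (not shown in this excerpt) expects.

import com.databricks.spark.sql.perf.mllib.MLLib

// run the MLlib benchmarks described in a YAML config and collect the results DataFrame
val results = MLLib.run(yamlFile = "/path/to/mllib-benchmarks.yaml")
results.show()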
Example 167
Source File: CarbonLoadParams.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.command.management

import java.text.SimpleDateFormat
import java.util

import scala.collection.mutable

import org.apache.hadoop.conf.Configuration
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.command.UpdateTableModel
import org.apache.spark.sql.execution.datasources.LogicalRelation

import org.apache.carbondata.core.indexstore.PartitionSpec
import org.apache.carbondata.core.statusmanager.SegmentStatus
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.events.OperationContext
import org.apache.carbondata.processing.loading.model.CarbonLoadModel


case class CarbonLoadParams(
    sparkSession: SparkSession,
    tableName: String,
    sizeInBytes: Long,
    isOverwriteTable: Boolean,
    carbonLoadModel: CarbonLoadModel,
    hadoopConf: Configuration,
    logicalPartitionRelation: LogicalRelation,
    dateFormat : SimpleDateFormat,
    timeStampFormat : SimpleDateFormat,
    optionsOriginal: Map[String, String],
    finalPartition : Map[String, Option[String]],
    currPartitions: util.List[PartitionSpec],
    partitionStatus : SegmentStatus,
    var dataFrame: Option[DataFrame],
    scanResultRDD : Option[RDD[InternalRow]],
    updateModel: Option[UpdateTableModel],
    operationContext: OperationContext) {
} 
Example 168
Source File: DoubleDataTypeTestCase.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.integration.spark.testsuite.primitiveTypes

import java.util.Random

import org.apache.spark.sql.test.util.QueryTest
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.scalatest.BeforeAndAfterAll


class DoubleDataTypeTestCase extends QueryTest with BeforeAndAfterAll {

  lazy val df: DataFrame = generateDataFrame

  private def generateDataFrame(): DataFrame = {
    val r = new Random()
    val rdd = sqlContext.sparkContext
      .parallelize(1 to 10, 2)
      .map { x =>
        Row(x, "London" + (x % 2), x.toDouble / 13, x.toDouble / 11)
      }

    val schema = StructType(
      Seq(
        StructField("id", IntegerType, nullable = false),
        StructField("city", StringType, nullable = false),
        StructField("m1", DoubleType, nullable = false),
        StructField("m2", DoubleType, nullable = false)
      )
    )

    sqlContext.createDataFrame(rdd, schema)
  }

  override def beforeAll {
    sql("drop table if exists uniq_carbon")
    sql("drop table if exists uniq_hive")
    sql("drop table if exists doubleTypeCarbonTable")
    sql("drop table if exists doubleTypeHiveTable")

    df.write
      .format("carbondata")
      .option("tableName", "doubleTypeCarbonTable")
      .option("tempCSV", "false")
      .option("table_blocksize", "32")
      .mode(SaveMode.Overwrite)
      .save()

    df.write
      .mode(SaveMode.Overwrite)
      .saveAsTable("doubleTypeHiveTable")

  }

  test("detail query") {
    checkAnswer(sql("select * from doubleTypeCarbonTable order by id"),
      sql("select * from doubleTypeHiveTable order by id"))

  }

  test("duplicate values") {
    sql("create table uniq_carbon(name string, double_column double) STORED AS carbondata ")
    sql(s"load data inpath '$resourcesPath/uniq.csv' into table uniq_carbon")
    sql("create table uniq_hive(name string, double_column double) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','")
    sql(s"load data local inpath '$resourcesPath/uniqwithoutheader.csv' into table uniq_hive")
    checkAnswer(sql("select * from uniq_carbon where double_column>=11"),
      sql("select * from uniq_hive where double_column>=11"))
  }

//  test("agg query") {
//    checkAnswer(sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeCarbonTable group by city"),
//      sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeHiveTable group by city"))
//
//    checkAnswer(sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeCarbonTable group by city"),
//      sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeHiveTable group by city"))
//  }

  override def afterAll {
    sql("drop table if exists uniq_carbon")
    sql("drop table if exists uniq_hive")
    sql("drop table if exists doubleTypeCarbonTable")
    sql("drop table if exists doubleTypeHiveTable")
  }
} 
Example 169
Source File: TestUpdateAndDeleteWithLargeData.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.spark.testsuite.iud

import java.text.SimpleDateFormat

import org.apache.spark.sql.test.util.QueryTest
import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.scalatest.BeforeAndAfterAll

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.util.CarbonProperties

class TestUpdateAndDeleteWithLargeData extends QueryTest with BeforeAndAfterAll {
  var df: DataFrame = _

  override def beforeAll {
    dropTable()
    buildTestData()
  }

  private def buildTestData(): Unit = {

    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy-MM-dd")

    // Simulate data and write to table orders
    import sqlContext.implicits._

    val sdf = new SimpleDateFormat("yyyy-MM-dd")
    df = sqlContext.sparkSession.sparkContext.parallelize(1 to 1500000)
      .map(value => (value, new java.sql.Date(sdf.parse("2015-07-" + (value % 10 + 10)).getTime),
        "china", "aaa" + value, "phone" + 555 * value, "ASD" + (60000 + value), 14999 + value,
        "ordersTable" + value))
      .toDF("o_id", "o_date", "o_country", "o_name",
        "o_phonetype", "o_serialname", "o_salary", "o_comment")
    createTable()

  }

  private def createTable(): Unit = {
    df.write
      .format("carbondata")
      .option("tableName", "orders")
      .option("tempCSV", "true")
      .option("compress", "true")
      .mode(SaveMode.Overwrite)
      .save()
  }

  private def dropTable() = {
    sql("DROP TABLE IF EXISTS orders")

  }

  test("test the update and delete delete functionality for large data") {

    sql(
      """
            update ORDERS set (o_comment) = ('yyy')""").show()
    checkAnswer(sql(
      """select o_comment from orders limit 2 """), Seq(Row("yyy"), Row("yyy")))

    sql("delete from orders where exists (select 1 from orders)")

    checkAnswer(sql(
      """
           SELECT count(*) FROM orders
           """), Row(0))
  }

} 
Example 170
Source File: BloomCoarseGrainIndexTestUtil.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.index.bloom

import java.io.{File, PrintWriter}
import java.util.UUID

import scala.util.Random

import org.apache.spark.sql.test.util.QueryTest
import org.apache.spark.sql.DataFrame

object BloomCoarseGrainIndexTestUtil extends QueryTest {

  def createFile(fileName: String, line: Int = 10000, start: Int = 0): Unit = {
    if (!new File(fileName).exists()) {
      val write = new PrintWriter(new File(fileName))
      for (i <- start until (start + line)) {
        write.println(
          s"$i,n$i,city_$i,${ Random.nextInt(80) }," +
          s"${ UUID.randomUUID().toString },${ UUID.randomUUID().toString }," +
          s"${ UUID.randomUUID().toString },${ UUID.randomUUID().toString }," +
          s"${ UUID.randomUUID().toString },${ UUID.randomUUID().toString }," +
          s"${ UUID.randomUUID().toString },${ UUID.randomUUID().toString }")
      }
      write.close()
    }
  }

  def deleteFile(fileName: String): Unit = {
    val file = new File(fileName)
    if (file.exists()) {
      file.delete()
    }
  }

  private def checkSqlHitIndex(sqlText: String, indexName: String, shouldHit: Boolean): DataFrame = {
    // we will not check whether the query hits the index, because the index may be skipped
    // if a former index already pruned all the blocklets
    sql(sqlText)
  }

  def checkBasicQuery(indexName: String, bloomDMSampleTable: String, normalTable: String, shouldHit: Boolean = true): Unit = {
    checkAnswer(
      checkSqlHitIndex(s"select * from $bloomDMSampleTable where id = 1", indexName, shouldHit),
      sql(s"select * from $normalTable where id = 1"))
    checkAnswer(
      checkSqlHitIndex(s"select * from $bloomDMSampleTable where id = 999", indexName, shouldHit),
      sql(s"select * from $normalTable where id = 999"))
    checkAnswer(
      checkSqlHitIndex(s"select * from $bloomDMSampleTable where city = 'city_1'", indexName, shouldHit),
      sql(s"select * from $normalTable where city = 'city_1'"))
    checkAnswer(
      checkSqlHitIndex(s"select * from $bloomDMSampleTable where city = 'city_999'", indexName, shouldHit),
      sql(s"select * from $normalTable where city = 'city_999'"))
    checkAnswer(
      sql(s"select min(id), max(id), min(name), max(name), min(city), max(city)" +
          s" from $bloomDMSampleTable"),
      sql(s"select min(id), max(id), min(name), max(name), min(city), max(city)" +
          s" from $normalTable"))
  }
} 
Example 171
Source File: CaseClassDataFrameAPIExample.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.examples

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

import org.apache.carbondata.examples.util.ExampleUtils

case class People(name: String, occupation: String, id: Int)

object CaseClassDataFrameAPIExample {

  def main(args: Array[String]) {
    val spark = ExampleUtils.createSparkSession("CaseClassDataFrameAPIExample")
    exampleBody(spark)
    spark.close()
  }

  def exampleBody(spark : SparkSession): Unit = {
    val people = List(People("sangeeta", "engineer", 1), People("pallavi", "consultant", 2))
    val peopleRDD: RDD[People] = spark.sparkContext.parallelize(people)
    import spark.implicits._
    val peopleDF: DataFrame = peopleRDD.toDF("name", "occupation", "id")

    // writing data to carbon table
    peopleDF.write
      .format("carbondata")
      .option("tableName", "caseclass_table")
      .option("compress", "true")
      .mode(SaveMode.Overwrite)
      .save()

    spark.sql("SELECT * FROM caseclass_table").show()

    spark.sql("DROP TABLE IF EXISTS caseclass_table")
  }
} 
Example 172
Source File: TestLikeQueryWithIndex.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.spark.testsuite.secondaryindex

import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, DataFrame, Row}
import org.apache.spark.sql.test.util.QueryTest
import org.scalatest.BeforeAndAfterAll


class TestLikeQueryWithIndex extends QueryTest with BeforeAndAfterAll {

  override def beforeAll {
    sql("drop table if exists TCarbon")

    sql("CREATE TABLE IF NOT EXISTS TCarbon(ID Int, country String, "+
          "name String, phonetype String, serialname String) "+
        "STORED AS carbondata"
    )
    var csvFilePath = s"$resourcesPath/secindex/secondaryIndexLikeTest.csv"

    sql(
      s"LOAD DATA LOCAL INPATH '" + csvFilePath + "' INTO TABLE " +
      s"TCarbon " +
      s"OPTIONS('DELIMITER'= ',')"

    )

    sql("create index insert_index on table TCarbon (name) AS 'carbondata'"
    )
  }

  test("select secondary index like query Contains") {
    val df = sql("select * from TCarbon where name like '%aaa1%'")
    secondaryIndexTableCheck(df,_.equalsIgnoreCase("TCarbon"))

    checkAnswer(
      sql("select * from TCarbon where name like '%aaa1%'"),
      Seq(Row(1, "china", "aaa1", "phone197", "A234"),
        Row(9, "china", "aaa1", "phone756", "A455"))
    )
  }

    test("select secondary index like query ends with") {
      val df = sql("select * from TCarbon where name like '%aaa1'")
      secondaryIndexTableCheck(df,_.equalsIgnoreCase("TCarbon"))

      checkAnswer(
        sql("select * from TCarbon where name like '%aaa1'"),
        Seq(Row(1, "china", "aaa1", "phone197", "A234"),
          Row(9, "china", "aaa1", "phone756", "A455"))
      )
    }

      test("select secondary index like query starts with") {
        val df = sql("select * from TCarbon where name like 'aaa1%'")
        secondaryIndexTableCheck(df, Set("insert_index","TCarbon").contains(_))

        checkAnswer(
          sql("select * from TCarbon where name like 'aaa1%'"),
          Seq(Row(1, "china", "aaa1", "phone197", "A234"),
            Row(9, "china", "aaa1", "phone756", "A455"))
        )
      }

  def secondaryIndexTableCheck(dataFrame: DataFrame,
      tableNameMatchCondition: (String) => Boolean): Unit = {
    dataFrame.queryExecution.sparkPlan.collect {
      case bcf: CarbonDatasourceHadoopRelation =>
        // every Carbon relation picked by the plan must satisfy the expected table-name condition
        assert(tableNameMatchCondition(bcf.carbonTable.getTableUniqueName))
    }
  }

  override def afterAll {
    sql("DROP INDEX if exists insert_index ON TCarbon")
    sql("drop table if exists TCarbon")
  }
} 
Example 173
Source File: ITSelectorSuite.scala    From spark-infotheoretic-feature-selection   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.junit.runner.RunWith
import org.scalatest.{BeforeAndAfterAll, FunSuite}
import org.scalatest.junit.JUnitRunner
import TestHelper._



  test("Run ITFS on nci data (nPart = 10, nfeat = 10)") {

    val df = readCSVData(sqlContext, "test_nci9_s3.csv")
    val cols = df.columns
    val pad = 2
    val allVectorsDense = true
    val model = getSelectorModel(sqlContext, df, cols.drop(1), cols.head, 
        10, 10, allVectorsDense, pad)

    assertResult("443, 755, 1369, 1699, 3483, 5641, 6290, 7674, 9399, 9576") {
      model.selectedFeatures.mkString(", ")
    }
  }
} 
Example 174
Source File: SavingStream.scala    From cuesheet   with Apache License 2.0 5 votes vote down vote up
package com.kakao.cuesheet.convert

import com.kakao.mango.concurrent.{NamedExecutors, RichExecutorService}
import com.kakao.mango.text.ThreadSafeDateFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

import java.util.concurrent.{Future => JFuture}
import scala.reflect.runtime.universe.TypeTag

object SavingStream {
  val yyyyMMdd = ThreadSafeDateFormat("yyyy-MM-dd")
  val hh = ThreadSafeDateFormat("HH")
  val mm = ThreadSafeDateFormat("mm")
  val m0 = (ms: Long) => mm(ms).charAt(0) + "0"
}


  @transient var executor: RichExecutorService = _

  def ex: RichExecutorService = {
    if (executor == null) {
      this.synchronized {
        if (executor == null) {
          executor = new RichExecutorService(es.get())
        }
      }
    }
    executor
  }

  def saveAsPartitionedTable(table: String, path: String, format: String = "orc")(toPartition: Time => Seq[(String, String)]): Unit = {
    stream.foreachRDD { (rdd, time) =>
      ex.submit {
        toDF(rdd).appendToExternalTablePartition(table, path, format, toPartition(time): _*)
      }
    }
  }

  def saveAsDailyPartitionedTable(table: String, path: String, dateColumn: String = "date", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms))
    }
  }

  def saveAsHourlyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms))
    }
  }

  def saveAsTenMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> m0(ms))
    }
  }

  def saveAsMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> mm(ms))
    }
  }

}

class ProductStream[T <: Product : TypeTag](stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[T](stream) {
  override def toDF(rdd: RDD[T]) = ctx.createDataFrame(rdd)
}

class JsonStream(stream: DStream[String])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[String](stream) {
  override def toDF(rdd: RDD[String]) = ctx.read.json(rdd)
}

class MapStream[T](stream: DStream[Map[String, T]])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[Map[String, T]](stream) {
  import com.kakao.mango.json._

  override def toDF(rdd: RDD[Map[String, T]]) = ctx.read.json(rdd.map(toJson))
}

class RowStream(stream: DStream[Row])(implicit ctx: HiveContext, es: ExecutorSupplier, schema: StructType) extends SavingStream[Row](stream) {
  override def toDF(rdd: RDD[Row]): DataFrame = ctx.createDataFrame(rdd, schema)
} 
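A hedged sketch of how these wrappers might be wired up, assuming a hypothetical event case class, an input DStream, and in-scope implicits for HiveContext and ExecutorSupplier (the latter is defined elsewhere in the project and only referenced here).

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.dstream.DStream

// hypothetical event type; the stream and the two implicits are supplied by the caller
case class Click(userId: String, url: String, ts: Long)

def saveClicks(clicks: DStream[Click])
              (implicit ctx: HiveContext, es: ExecutorSupplier): Unit =
  new ProductStream(clicks)
    .saveAsHourlyPartitionedTable("web.clicks", "/warehouse/web/clicks")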
Example 175
Source File: DefaultSource.scala    From memsql-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark

import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.metrics.source.MetricsHandler
import org.apache.spark.sql.sources.{
  BaseRelation,
  CreatableRelationProvider,
  DataSourceRegister,
  RelationProvider
}
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

object DefaultSource {
  val MEMSQL_SOURCE_NAME          = "com.memsql.spark"
  val MEMSQL_SOURCE_NAME_SHORT    = "memsql"
  val MEMSQL_GLOBAL_OPTION_PREFIX = "spark.datasource.memsql."
}

class DefaultSource
    extends RelationProvider
    with DataSourceRegister
    with CreatableRelationProvider
    with LazyLogging {

  override def shortName: String = DefaultSource.MEMSQL_SOURCE_NAME_SHORT

  private def includeGlobalParams(sqlContext: SQLContext,
                                  params: Map[String, String]): Map[String, String] =
    sqlContext.getAllConfs.foldLeft(params)({
      case (params, (k, v)) if k.startsWith(DefaultSource.MEMSQL_GLOBAL_OPTION_PREFIX) =>
        params + (k.stripPrefix(DefaultSource.MEMSQL_GLOBAL_OPTION_PREFIX) -> v)
      case (params, _) => params
    })

  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String]): BaseRelation = {
    val params  = CaseInsensitiveMap(includeGlobalParams(sqlContext, parameters))
    val options = MemsqlOptions(params)
    if (options.disablePushdown) {
      SQLPushdownRule.ensureRemoved(sqlContext.sparkSession)
      MemsqlReaderNoPushdown(MemsqlOptions.getQuery(params), options, sqlContext)
    } else {
      SQLPushdownRule.ensureInjected(sqlContext.sparkSession)
      MemsqlReader(MemsqlOptions.getQuery(params), Nil, options, sqlContext)
    }
  }

  override def createRelation(sqlContext: SQLContext,
                              mode: SaveMode,
                              parameters: Map[String, String],
                              data: DataFrame): BaseRelation = {
    val opts = CaseInsensitiveMap(includeGlobalParams(sqlContext, parameters))
    val conf = MemsqlOptions(opts)

    val table = MemsqlOptions
      .getTable(opts)
      .getOrElse(
        throw new IllegalArgumentException(
          s"To write a dataframe to MemSQL you must specify a table name via the '${MemsqlOptions.TABLE_NAME}' parameter"
        )
      )
    JdbcHelpers.prepareTableForWrite(conf, table, mode, data.schema)
    val isReferenceTable = JdbcHelpers.isReferenceTable(conf, table)
    val partitionWriterFactory =
      if (conf.onDuplicateKeySQL.isEmpty) {
        new LoadDataWriterFactory(table, conf)
      } else {
        new BatchInsertWriterFactory(table, conf)
      }

    val schema        = data.schema
    var totalRowCount = 0L
    data.foreachPartition(partition => {
      val writer = partitionWriterFactory.createDataWriter(schema,
                                                           TaskContext.getPartitionId(),
                                                           0,
                                                           isReferenceTable,
                                                           mode)
      try {
        partition.foreach(record => {
          writer.write(record)
          totalRowCount += 1
        })
        writer.commit()
        MetricsHandler.setRecordsWritten(totalRowCount)
      } catch {
        case e: Exception => {
          writer.abort()
          throw e
        }
      }
    })

    createRelation(sqlContext, parameters)
  }
} 
Example 176
Source File: ReferenceTableTest.scala    From memsql-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark

import com.github.mrpowers.spark.daria.sql.SparkSessionExt._
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.{DataFrame, SaveMode}

import scala.util.Try

class ReferenceTableTest extends IntegrationSuiteBase {

  val childAggregatorHost = "localhost"
  val childAggregatorPort = "5508"

  val dbName                  = "testdb"
  val commonCollectionName    = "test_table"
  val referenceCollectionName = "reference_table"

  override def beforeEach(): Unit = {
    super.beforeEach()

    // Set child aggregator as a dmlEndpoint
    spark.conf
      .set("spark.datasource.memsql.dmlEndpoints", s"${childAggregatorHost}:${childAggregatorPort}")
  }

  def writeToTable(tableName: String): Unit = {
    val df = spark.createDF(
      List(4, 5, 6),
      List(("id", IntegerType, true))
    )
    df.write
      .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT)
      .mode(SaveMode.Append)
      .save(s"${dbName}.${tableName}")
  }

  def readFromTable(tableName: String): DataFrame = {
    spark.read
      .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT)
      .load(s"${dbName}.${tableName}")
  }

  def writeAndReadFromTable(tableName: String): Unit = {
    writeToTable(tableName)
    val dataFrame = readFromTable(tableName)
    val sqlRows   = dataFrame.collect();
    assert(sqlRows.length == 3)
  }

  def dropTable(tableName: String): Unit = executeQuery(s"drop table if exists $dbName.$tableName")

  describe("Success during write operations") {

    it("to common table") {
      dropTable(commonCollectionName)
      executeQuery(
        s"create table if not exists $dbName.$commonCollectionName (id INT NOT NULL, PRIMARY KEY (id))")
      writeAndReadFromTable(commonCollectionName)
    }

    it("to reference table") {
      dropTable(referenceCollectionName)
      executeQuery(
        s"create reference table if not exists $dbName.$referenceCollectionName (id INT NOT NULL, PRIMARY KEY (id))")
      writeAndReadFromTable(referenceCollectionName)
    }
  }

  describe("Success during creating") {

    it("common table") {
      dropTable(commonCollectionName)
      writeAndReadFromTable(commonCollectionName)
    }
  }

  describe("Failure because of") {

    it("database name not specified") {
      spark.conf.set("spark.datasource.memsql.database", "")
      val df = spark.createDF(
        List(4, 5, 6),
        List(("id", IntegerType, true))
      )
      val result = Try {
        df.write
          .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT)
          .mode(SaveMode.Append)
          .save(s"${commonCollectionName}")
      }
      
      assert(SQLHelper.isSQLExceptionWithCode(result.failed.get, List(1046)))
    }
  }
} 
Example 177
Source File: DeltaLoad.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.algo

import com.adidas.analytics.algo.DeltaLoad._
import com.adidas.analytics.algo.core.Algorithm
import com.adidas.analytics.algo.shared.DateComponentDerivation
import com.adidas.analytics.config.DeltaLoadConfiguration.PartitionedDeltaLoadConfiguration
import com.adidas.analytics.util.DataFrameUtils._
import com.adidas.analytics.util._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.slf4j.{Logger, LoggerFactory}


  private def getUpsertRecords(deltaRecords: Dataset[Row], resultColumns: Seq[String]): Dataset[Row] = {
    // Create partition window - Partitioning by delta records logical key (i.e. technical key of active records)
    val partitionWindow = Window
      .partitionBy(businessKey.map(col): _*)
      .orderBy(technicalKey.map(component => col(component).desc): _*)

    // Ranking & projection
    val rankedDeltaRecords = deltaRecords
      .withColumn(rankingColumnName, row_number().over(partitionWindow))
      .filter(upsertRecordsModesFilterFunction)

    rankedDeltaRecords
      .filter(rankedDeltaRecords(rankingColumnName) === 1)
      .selectExpr(resultColumns: _*)
  }

  protected def withDatePartitions(spark: SparkSession, dfs: DFSWrapper, dataFrames: Vector[DataFrame]): Vector[DataFrame] = {
    logger.info("Adding partitioning information if needed")
    try {
      dataFrames.map { df =>
        if (df.columns.toSeq.intersect(targetPartitions) != targetPartitions){
          df.transform(withDateComponents(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions))
        }
        else df
      }
    } catch {
      case e: Throwable =>
        logger.error("Cannot add partitioning information for data frames.", e)
        //TODO: Handle failure case properly
        throw new RuntimeException("Unable to transform data frames.", e)
    }
  }
}


object DeltaLoad {

  private val logger: Logger = LoggerFactory.getLogger(getClass)

  def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): DeltaLoad = {
    new DeltaLoad(spark, dfs, configLocation)
  }
} 
Example 178
Source File: PartitionHelpers.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.algo.core

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}


trait PartitionHelpers {

  protected def getDistinctPartitions(outputDataFrame: DataFrame, targetPartitions: Seq[String]): Dataset[Row] = {
    val targetPartitionsColumns: Seq[Column] = targetPartitions.map(partitionString => col(partitionString))

    outputDataFrame.select(targetPartitionsColumns: _*).distinct
  }

  protected def getParameterValue(row: Row, partitionString: String): String =
    createParameterValue(row.get(row.fieldIndex(partitionString)))

  protected def createParameterValue(partitionRawValue: Any): String =
    partitionRawValue match {
      case value: java.lang.Short => value.toString
      case value: java.lang.Integer => value.toString
      case value: scala.Predef.String => "'" + value + "'"
      case null => throw new Exception("Partition Value is null. No support for null partitions!")
      case value => throw new Exception("Unsupported partition DataType: " + value.getClass)
    }
} 
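A minimal usage sketch (not part of the original project, assuming the m3d-engine classes above are on the classpath): mixing PartitionHelpers into a small demo object to list the distinct partition values of a DataFrame. The column names and the demo object are illustrative only.

import com.adidas.analytics.algo.core.PartitionHelpers
import org.apache.spark.sql.{DataFrame, SparkSession}

object PartitionHelpersDemo extends PartitionHelpers {

  // Builds Hive-style partition fragments such as "year=2019/month=2" from the distinct partition rows.
  def describePartitions(df: DataFrame): Seq[String] =
    getDistinctPartitions(df, Seq("year", "month")).collect().toSeq.map { row =>
      s"year=${getParameterValue(row, "year")}/month=${getParameterValue(row, "month")}"
    }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("PartitionHelpersDemo").getOrCreate()
    import spark.implicits._
    val df = Seq((1, 2019, 1), (2, 2019, 2), (3, 2019, 2)).toDF("id", "year", "month")
    describePartitions(df).foreach(println)   // year=2019/month=1, year=2019/month=2 (order not guaranteed)
    spark.stop()
  }
}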
Example 179
Source File: DataFrameUtils.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.util

import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, functions}
import org.slf4j.{Logger, LoggerFactory}


object DataFrameUtils {

  private val logger: Logger = LoggerFactory.getLogger(getClass)

  type FilterFunction = Row => Boolean

  type PartitionCriteria = Seq[(String, String)]

  def mapPartitionsToDirectories(partitionCriteria: PartitionCriteria): Seq[String] = {
    partitionCriteria.map {
      case (columnName, columnValue) => s"$columnName=$columnValue"
    }
  }

  def buildPartitionsCriteriaMatcherFunc(multiplePartitionsCriteria: Seq[PartitionCriteria], schema: StructType): FilterFunction = {
    val targetPartitions = multiplePartitionsCriteria.flatten.map(_._1).toSet
    val fieldNameToMatchFunctionMapping = schema.fields.filter {
      case StructField(name, _, _, _) => targetPartitions.contains(name)
    }.map {
      case StructField(name, _: ByteType, _, _)    => name -> ((r: Row, value: String) => r.getAs[Byte](name)    == value.toByte)
      case StructField(name, _: ShortType, _, _)   => name -> ((r: Row, value: String) => r.getAs[Short](name)   == value.toShort)
      case StructField(name, _: IntegerType, _, _) => name -> ((r: Row, value: String) => r.getAs[Int](name)     == value.toInt)
      case StructField(name, _: LongType, _, _)    => name -> ((r: Row, value: String) => r.getAs[Long](name)    == value.toLong)
      case StructField(name, _: FloatType, _, _)   => name -> ((r: Row, value: String) => r.getAs[Float](name)   == value.toFloat)
      case StructField(name, _: DoubleType, _, _)  => name -> ((r: Row, value: String) => r.getAs[Double](name)  == value.toDouble)
      case StructField(name, _: BooleanType, _, _) => name -> ((r: Row, value: String) => r.getAs[Boolean](name) == value.toBoolean)
      case StructField(name, _: StringType, _, _)  => name -> ((r: Row, value: String) => r.getAs[String](name)  == value)
    }.toMap

    def convertPartitionCriteriaToFilterFunctions(partitionCriteria: PartitionCriteria): Seq[FilterFunction] = partitionCriteria.map {
      case (name, value) => (row: Row) => fieldNameToMatchFunctionMapping(name)(row, value)
    }

    def joinSinglePartitionFilterFunctionsWithAnd(partitionFilterFunctions: Seq[FilterFunction]): FilterFunction =
      partitionFilterFunctions
        .reduceOption((predicate1, predicate2) => (row: Row) => predicate1(row) && predicate2(row))
        .getOrElse((_: Row) => false)

    multiplePartitionsCriteria
      .map(convertPartitionCriteriaToFilterFunctions)
      .map(joinSinglePartitionFilterFunctionsWithAnd)
      .reduceOption((predicate1, predicate2) => (row: Row) => predicate1(row) || predicate2(row))
      .getOrElse((_: Row) => false)
  }


  implicit class DataFrameHelper(df: DataFrame) {

    def collectPartitions(targetPartitions: Seq[String]): Seq[PartitionCriteria] = {
      logger.info(s"Collecting unique partitions for partitions columns (${targetPartitions.mkString(", ")})")
      val partitions = df.selectExpr(targetPartitions: _*).distinct().collect()

      partitions.map { row =>
        targetPartitions.map { columnName =>
          Option(row.getAs[Any](columnName)) match {
            case Some(columnValue) => columnName -> columnValue.toString
            case None => throw new RuntimeException(s"Partition column '$columnName' contains null value")
          }
        }
      }
    }

    def addMissingColumns(targetSchema: StructType): DataFrame = {
      val dataFieldsSet = df.schema.fieldNames.toSet
      val selectColumns = targetSchema.fields.map { field =>
        if (dataFieldsSet.contains(field.name)) {
          functions.col(field.name)
        } else {
          functions.lit(null).cast(field.dataType).as(field.name)
        }
      }
      df.select(selectColumns: _*)
    }

    def isEmpty: Boolean = df.head(1).isEmpty

    def nonEmpty: Boolean = df.head(1).nonEmpty
  }
} 
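A minimal usage sketch (not part of the original project): collecting the distinct partitions of a DataFrame, mapping them to Hive-style directory names, and building a row-level matcher function from the collected criteria.

import com.adidas.analytics.util.DataFrameUtils._
import org.apache.spark.sql.SparkSession

object DataFrameUtilsDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("DataFrameUtilsDemo").getOrCreate()
    import spark.implicits._

    val df = Seq((1, 2019, 1), (2, 2019, 2), (3, 2020, 1)).toDF("id", "year", "month")

    // Collect the distinct partition values as (column -> value) criteria.
    val criteria = df.collectPartitions(Seq("year", "month"))

    // Turn each criteria set into a directory fragment, e.g. "year=2019/month=1".
    criteria.map(c => mapPartitionsToDirectories(c).mkString("/")).foreach(println)

    // Build a Row => Boolean filter that matches any of the collected partitions.
    val matches = buildPartitionsCriteriaMatcherFunc(criteria, df.schema)
    println(df.collect().count(matches))   // 3, every row belongs to one of its own partitions

    spark.stop()
  }
}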
Example 180
Source File: InputReader.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.util

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.slf4j.{Logger, LoggerFactory}


  def newTableLocationReader(table: String, format: DataFormat, options: Map[String, String] = Map.empty): TableLocationReader = {
    TableLocationReader(table, format, options)
  }

  case class TableReader(table: String, options: Map[String, String]) extends InputReader {
    override def read(sparkSession: SparkSession): DataFrame = {
      logger.info(s"Reading data from table $table")
      sparkSession.read.options(options).table(table)
    }
  }

  case class FileSystemReader(location: String, format: DataFormat, options: Map[String, String]) extends InputReader {
    override def read(sparkSession: SparkSession): DataFrame = {
      logger.info(s"Reading data from location $location")
      format.read(sparkSession.read.options(options), location)
    }
  }

  case class TableLocationReader(table: String, format: DataFormat, options: Map[String, String]) extends InputReader {
    override def read(sparkSession: SparkSession): DataFrame = {
      val location = HiveTableAttributeReader(table, sparkSession).getTableLocation
      logger.info(s"Reading data from location $location")
      format.read(sparkSession.read.options(options), location)
    }
  }
} 
Example 181
Source File: TestUtils.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.utils

import org.apache.spark.sql.functions.{col, count, lit}
import org.apache.spark.sql.{DataFrame, Row}

object TestUtils {

  implicit class ExtendedDataFrame(df: DataFrame) {

    def hasDiff(anotherDf: DataFrame): Boolean = {
      def printDiff(incoming: Boolean)(row: Row): Unit = {
        if (incoming) print("+ ") else print("- ")
        println(row)
      }

      val groupedDf = df.groupBy(df.columns.map(col): _*).agg(count(lit(1))).collect().toSet
      val groupedAnotherDf = anotherDf.groupBy(anotherDf.columns.map(col): _*).agg(count(lit(1))).collect().toSet

      groupedDf.diff(groupedAnotherDf).foreach(printDiff(incoming = true))
      groupedAnotherDf.diff(groupedDf).foreach(printDiff(incoming = false))

      groupedDf.diff(groupedAnotherDf).nonEmpty || groupedAnotherDf.diff(groupedDf).nonEmpty
    }
  }
} 
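A minimal usage sketch (not part of the original project): comparing two small DataFrames with the hasDiff helper. The frames are treated as multisets of rows, so row order is irrelevant, and differing rows are printed with +/- markers.

import com.adidas.utils.TestUtils._
import org.apache.spark.sql.SparkSession

object HasDiffDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("HasDiffDemo").getOrCreate()
    import spark.implicits._

    val expected = Seq((1, "a"), (2, "b")).toDF("id", "value")
    val same     = Seq((2, "b"), (1, "a")).toDF("id", "value")
    val changed  = Seq((1, "a"), (2, "c")).toDF("id", "value")

    println(expected.hasDiff(same))     // false: same rows, only the order differs
    println(expected.hasDiff(changed))  // true: the differing rows are printed with +/- markers

    spark.stop()
  }
}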
Example 182
Source File: FileReader.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.utils

import com.adidas.analytics.util.DataFormat
import com.adidas.analytics.util.DataFormat.{DSVFormat, JSONFormat, ParquetFormat}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SparkSession}


class FileReader(format: DataFormat, options: Map[String, String]) {

  def read(spark: SparkSession, location: String, fillNulls: Boolean = false): DataFrame = {
    val df = format.read(spark.read.options(options), location)
    if (fillNulls) {
      df.na.fill("")
    } else {
      df
    }
  }
}


object FileReader {

  def newDSVFileReader(optionalSchema: Option[StructType] = None, delimiter: Char = '|', header: Boolean = false): FileReader = {
    val options = Map("delimiter" -> delimiter.toString, "header" -> header.toString)
    if (optionalSchema.isEmpty) {
      new FileReader(DSVFormat(optionalSchema), options + ("inferSchema" -> "true"))
    } else {
      new FileReader(DSVFormat(optionalSchema), options)
    }
  }

  def newParquetFileReader(): FileReader = {
    new FileReader(ParquetFormat(), Map.empty[String, String])
  }

  def newJsonFileReader(optionalSchema: Option[StructType] = None): FileReader = {
      new FileReader(JSONFormat(optionalSchema), Map.empty[String, String])
  }

  def apply(format: DataFormat, options: (String, String)*): FileReader = {
    new FileReader(format, options.toMap)
  }

  def apply(format: DataFormat, options: Map[String, String]): FileReader = {
    new FileReader(format, options)
  }
} 
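A minimal usage sketch (not part of the original project; the input path is a placeholder): reading a pipe-delimited file with an inferred schema and replacing nulls with empty strings.

import com.adidas.utils.FileReader
import org.apache.spark.sql.SparkSession

object FileReaderDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("FileReaderDemo").getOrCreate()

    // No schema is passed, so the DSV reader is created with "inferSchema" set to true.
    val dsvReader = FileReader.newDSVFileReader(delimiter = '|', header = true)
    val df = dsvReader.read(spark, "/tmp/input/data.dsv", fillNulls = true)
    df.show()

    spark.stop()
  }
}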
Example 183
Source File: AggregateImpressionLog.scala    From spark-hyperloglog   with MIT License 5 votes vote down vote up
package com.collective.analytics

import com.collective.analytics.schema.{ActivityLog, SegmentLog, ImpressionLog}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hyperloglog.functions
import org.slf4j.LoggerFactory

class AggregateImpressionLog(impressionLog: DataFrame) extends Serializable {
  private val log = LoggerFactory.getLogger(classOf[AggregateImpressionLog])

  
  def segmentLog(): DataFrame = {
    log.info(s"Compute segment log")

    import org.apache.spark.sql.functions._
    import functions._

    impressionLog.select(
      col(ImpressionLog.ad_id),
      col(ImpressionLog.site_id),
      col(ImpressionLog.cookie_id),
      col(ImpressionLog.impressions),
      col(ImpressionLog.clicks),
      explode(col(ImpressionLog.segments))   as SegmentLog.segment
    ).groupBy(
        col(SegmentLog.segment)
      ).agg(
        hyperLogLog(ImpressionLog.cookie_id) as SegmentLog.cookies_hll,
        sum(ImpressionLog.impressions)       as SegmentLog.impressions,
        sum(ImpressionLog.clicks)            as SegmentLog.clicks
      )
  }

} 
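A minimal usage sketch (not part of the original project; the rows are made up, and the column names are taken from the ImpressionLog schema object so they line up with what segmentLog() selects):

import com.collective.analytics.AggregateImpressionLog
import com.collective.analytics.schema.ImpressionLog
import org.apache.spark.sql.SparkSession

object SegmentLogDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("SegmentLogDemo").getOrCreate()
    import spark.implicits._

    val impressions = Seq(
      ("ad-1", "site-1", "cookie-1", 10L, 1L, Seq("sports", "news")),
      ("ad-2", "site-2", "cookie-2", 5L, 0L, Seq("sports"))
    ).toDF(ImpressionLog.ad_id, ImpressionLog.site_id, ImpressionLog.cookie_id,
      ImpressionLog.impressions, ImpressionLog.clicks, ImpressionLog.segments)

    // One row per segment with a HyperLogLog cookie sketch plus impression and click totals.
    new AggregateImpressionLog(impressions).segmentLog().show(truncate = false)

    spark.stop()
  }
}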
Example 184
Source File: SparkEsDataFrameFunctions.scala    From Spark2Elasticsearch   with Apache License 2.0 5 votes vote down vote up
package com.github.jparkie.spark.elasticsearch.sql

import com.github.jparkie.spark.elasticsearch.SparkEsBulkWriter
import com.github.jparkie.spark.elasticsearch.conf.{ SparkEsMapperConf, SparkEsTransportClientConf, SparkEsWriteConf }
import com.github.jparkie.spark.elasticsearch.transport.SparkEsTransportClientManager
import org.apache.spark.sql.{ DataFrame, Row }


  def bulkLoadToEs(
    esIndex:                    String,
    esType:                     String,
    sparkEsTransportClientConf: SparkEsTransportClientConf = SparkEsTransportClientConf.fromSparkConf(sparkContext.getConf),
    sparkEsMapperConf:          SparkEsMapperConf          = SparkEsMapperConf.fromSparkConf(sparkContext.getConf),
    sparkEsWriteConf:           SparkEsWriteConf           = SparkEsWriteConf.fromSparkConf(sparkContext.getConf)
  )(implicit sparkEsTransportClientManager: SparkEsTransportClientManager = sparkEsTransportClientManager): Unit = {
    val sparkEsWriter = new SparkEsBulkWriter[Row](
      esIndex = esIndex,
      esType = esType,
      esClient = () => sparkEsTransportClientManager.getTransportClient(sparkEsTransportClientConf),
      sparkEsSerializer = new SparkEsDataFrameSerializer(dataFrame.schema),
      sparkEsMapper = new SparkEsDataFrameMapper(sparkEsMapperConf),
      sparkEsWriteConf = sparkEsWriteConf
    )

    sparkContext.runJob(dataFrame.rdd, sparkEsWriter.write _)
  }
} 
Example 185
Source File: BigQueryDataFrame.scala    From spark-bigquery   with Apache License 2.0 5 votes vote down vote up
package com.samelamin.spark.bigquery

import com.google.api.services.bigquery.model.{TableReference, TableSchema}
import com.google.cloud.hadoop.io.bigquery._
import com.google.gson._
import com.samelamin.spark.bigquery.converters.{BigQueryAdapter, SchemaConverters}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{LongWritable, NullWritable}
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import org.apache.spark.sql.DataFrame
import org.slf4j.LoggerFactory

import scala.util.Random

  def saveAsBigQueryTable(fullyQualifiedOutputTableId: String,
                          isPartitionedByDay: Boolean = false,
                          timePartitionExpiration: Long = 0,
                          writeDisposition: WriteDisposition.Value = null,
                          createDisposition: CreateDisposition.Value = null): Unit = {
    val destinationTable = BigQueryStrings.parseTableReference(fullyQualifiedOutputTableId)
    val bigQuerySchema = SchemaConverters.SqlToBQSchema(adaptedDf)
    val gcsPath = writeDFToGoogleStorage(adaptedDf,destinationTable,bigQuerySchema)
    bq.load(destinationTable,
      bigQuerySchema,
      gcsPath,
      isPartitionedByDay,
      timePartitionExpiration,
      writeDisposition,
      createDisposition)
    delete(new Path(gcsPath))
  }

  def writeDFToGoogleStorage(adaptedDf: DataFrame,
                             destinationTable: TableReference,
                             bqSchema: TableSchema): String = {
    val tableName = BigQueryStrings.toString(destinationTable)

    BigQueryConfiguration.configureBigQueryOutput(hadoopConf, tableName, bqSchema.toPrettyString())
    hadoopConf.set("mapreduce.job.outputformat.class", classOf[BigQueryOutputFormat[_, _]].getName)
    val bucket = self.sparkSession.conf.get(BigQueryConfiguration.GCS_BUCKET_KEY)
    val temp = s"spark-bigquery-${System.currentTimeMillis()}=${Random.nextInt(Int.MaxValue)}"
    val gcsPath = s"gs://$bucket/hadoop/tmp/spark-bigquery/$temp"
    if(hadoopConf.get(BigQueryConfiguration.TEMP_GCS_PATH_KEY) == null) {
      hadoopConf.set(BigQueryConfiguration.TEMP_GCS_PATH_KEY, gcsPath)
    }

    logger.info(s"Loading $gcsPath into $tableName")
    adaptedDf
      .toJSON
      .rdd
      .map(json => (null, jsonParser.parse(json)))
      .saveAsNewAPIHadoopFile(gcsPath,
        classOf[GsonBigQueryInputFormat],
        classOf[LongWritable],
        classOf[TextOutputFormat[NullWritable, JsonObject]],
        hadoopConf)
    gcsPath
  }



  private def delete(path: Path): Unit = {
    val fs = FileSystem.get(path.toUri, hadoopConf)
    fs.delete(path, true)
  }
} 
Example 186
Source File: BigQueryAdapter.scala    From spark-bigquery   with Apache License 2.0 5 votes vote down vote up
package com.samelamin.spark.bigquery.converters

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.functions.current_timestamp
import org.apache.spark.sql.types._


object BigQueryAdapter {

  private def adaptName(name: String, siblings: Array[String]): String = {
    var newName = name.replaceAll("\\W", "_")
    if (!newName.equals(name)) {
      // Avoid duplicates:
      var counter = 1
      while (siblings.contains(newName)) {
        newName = newName + "_" + counter
        counter = counter + 1
      }
    }
    newName
  }

  private def adaptField(structField: StructField, parentType: StructType): StructField = {
    new StructField(adaptName(structField.name, parentType.fieldNames), adaptType(structField.dataType), structField.nullable)
  }

  private def adaptType(dataType: DataType): DataType = {
    dataType match {
      case structType: StructType =>
        new StructType(structType.fields.map(adaptField(_, structType)))
      case arrayType: ArrayType =>
        new ArrayType(adaptType(arrayType.elementType), arrayType.containsNull)
      case mapType: MapType =>
        new MapType(adaptType(mapType.keyType), adaptType(mapType.valueType), mapType.valueContainsNull)
      case other => other
    }
  }

  def apply(df: DataFrame): DataFrame = {
    val sqlContext = df.sparkSession.sqlContext
    val sparkContext = df.sparkSession.sparkContext
    val timestampColumn = sparkContext
      .hadoopConfiguration.get("timestamp_column","bq_load_timestamp")
    val newSchema = adaptType(df.schema).asInstanceOf[StructType]
    val encoder = RowEncoder.apply(newSchema).resolveAndBind()
    val encodedDF = df
      .queryExecution
      .toRdd.map(x=>encoder.fromRow(x))
   sqlContext.createDataFrame(encodedDF,newSchema).withColumn(timestampColumn,current_timestamp())
  }
} 
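A minimal usage sketch (not part of the original project): sanitizing column names that BigQuery would reject and appending the load-timestamp column.

import com.samelamin.spark.bigquery.converters.BigQueryAdapter
import org.apache.spark.sql.SparkSession

object BigQueryAdapterDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("BigQueryAdapterDemo").getOrCreate()
    import spark.implicits._

    // "user-id" and "some value" contain characters BigQuery rejects; both are rewritten to use "_".
    val df = Seq((1, "a"), (2, "b")).toDF("user-id", "some value")
    val adapted = BigQueryAdapter(df)
    adapted.printSchema()   // columns: user_id, some_value, bq_load_timestamp

    spark.stop()
  }
}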
Example 187
Source File: BigQuerySource.scala    From spark-bigquery   with Apache License 2.0 5 votes vote down vote up
package com.samelamin.spark.bigquery.streaming

import java.math.BigInteger
import com.google.cloud.hadoop.io.bigquery.BigQueryStrings
import com.samelamin.spark.bigquery.BigQueryClient
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.execution.streaming.{Offset, _}
import org.apache.spark.sql.types.{BinaryType, StringType, StructField, StructType}
import com.samelamin.spark.bigquery._
import com.samelamin.spark.bigquery.converters.SchemaConverters
import org.joda.time.DateTime
import org.slf4j.LoggerFactory


  override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
    val startIndex = start.getOrElse(LongOffset(0L)).asInstanceOf[LongOffset].offset.toLong
    val endIndex = end.asInstanceOf[LongOffset].offset.toLong
    val startPartitionTime = new DateTime(startIndex).toLocalDate
    val endPartitionTime = new DateTime(endIndex).toLocalDate.toString
    logger.info(s"Fetching data between $startIndex and $endIndex")
    val query =
      s"""
         |SELECT
         |  *
         |FROM
         |  `${fullyQualifiedOutputTableId.replace(':','.')}`
         |WHERE
         |  $timestampColumn BETWEEN TIMESTAMP_MILLIS($startIndex) AND TIMESTAMP_MILLIS($endIndex)
         |  AND _PARTITIONTIME BETWEEN TIMESTAMP('$startPartitionTime') AND TIMESTAMP('$endPartitionTime')
         |  """.stripMargin
    val bigQuerySQLContext = new BigQuerySQLContext(sqlContext)
    val df = bigQuerySQLContext.bigQuerySelect(query)
    df
  }

  override def stop(): Unit = {}
  def getConvertedSchema(sqlContext: SQLContext): StructType = {
    val bigqueryClient = BigQueryClient.getInstance(sqlContext)
    val tableReference = BigQueryStrings.parseTableReference(fullyQualifiedOutputTableId)
    SchemaConverters.BQToSQLSchema(bigqueryClient.getTableSchema(tableReference))
  }
}

object BigQuerySource {
  val DEFAULT_SCHEMA = StructType(
    StructField("Sample Column", StringType) ::
      StructField("value", BinaryType) :: Nil
  )
} 
Example 188
Source File: BigQuerySink.scala    From spark-bigquery   with Apache License 2.0 5 votes vote down vote up
package com.samelamin.spark.bigquery.streaming

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.execution.streaming.Sink
import com.samelamin.spark.bigquery._
import org.slf4j.LoggerFactory
import scala.util.Try
import org.apache.hadoop.fs.Path


class BigQuerySink(sparkSession: SparkSession, path: String, options: Map[String, String]) extends Sink {
  private val logger = LoggerFactory.getLogger(classOf[BigQuerySink])
  private val basePath = new Path(path)
  private val logPath = new Path(basePath, new Path(BigQuerySink.metadataDir,"transaction.json"))

  private val fileLog = new BigQuerySinkLog(sparkSession, logPath.toUri.toString)
  override def addBatch(batchId: Long, data: DataFrame): Unit = {
    if (batchId <= fileLog.getLatest().getOrElse(-1L)) {
      logger.info(s"Skipping already committed batch $batchId")
    } else {
      val fullyQualifiedOutputTableId = options.get("tableReferenceSink").get
      val isPartitionByDay = Try(options.get("partitionByDay").get.toBoolean).getOrElse(true)

      val bqDF = new BigQueryDataFrame(data)
      bqDF.saveAsBigQueryTable(fullyQualifiedOutputTableId, isPartitionByDay)
      fileLog.writeBatch(batchId)
    }
  }
}

object BigQuerySink {
  // The name of the subdirectory that is used to store metadata about which files are valid.
  val metadataDir = "_spark_metadata"
} 
Example 189
Source File: DataFrameReaderFunctions.scala    From spark-bigquery   with Apache License 2.0 5 votes vote down vote up
package com.samelamin.spark.bigquery

import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, DataFrameReader}


  private def buildFrame(options: Map[String, String] = null, schema: StructType = null): DataFrame = {
    val builder = dfr
      .format(source)
      .schema(schema)

    if (options != null) {
      builder.options(options)
    }

    builder.load()
  }
} 
Example 190
Source File: SqsSource.scala    From bahir   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.streaming.sqs

import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.execution.datasources.{DataSource, LogicalRelation}
import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.execution.streaming.FileStreamSource._
import org.apache.spark.sql.types.StructType


class SqsSource(sparkSession: SparkSession,
                metadataPath: String,
                options: Map[String, String],
                override val schema: StructType) extends Source with Logging {

  private val sourceOptions = new SqsSourceOptions(options)

  private val hadoopConf = sparkSession.sessionState.newHadoopConf()

  private val metadataLog =
    new FileStreamSourceLog(FileStreamSourceLog.VERSION, sparkSession, metadataPath)
  private var metadataLogCurrentOffset = metadataLog.getLatest().map(_._1).getOrElse(-1L)

  private val maxFilesPerTrigger = sourceOptions.maxFilesPerTrigger

  private val maxFileAgeMs: Long = sourceOptions.maxFileAgeMs

  private val fileFormatClassName = sourceOptions.fileFormatClassName

  private val shouldSortFiles = sourceOptions.shouldSortFiles

  private val sqsClient = new SqsClient(sourceOptions, hadoopConf)

  metadataLog.allFiles().foreach { entry =>
    sqsClient.sqsFileCache.add(entry.path, MessageDescription(entry.timestamp, true, ""))
  }
  sqsClient.sqsFileCache.purge()

  logInfo(s"maxFilesPerBatch = $maxFilesPerTrigger, maxFileAgeMs = $maxFileAgeMs")

  // Computes the next batch offset from newly discovered SQS-notified files. The declaration is not
  // shown in the original listing; the signature below is inferred from the getOffset call further down.
  private def fetchMaxOffset(): FileStreamSourceOffset = {
    val batchFiles = sqsClient.sqsFileCache.getUncommittedFiles(maxFilesPerTrigger, shouldSortFiles)

    if (batchFiles.nonEmpty) {
      metadataLogCurrentOffset += 1
      metadataLog.add(metadataLogCurrentOffset, batchFiles.map {
        case (path, timestamp, receiptHandle) =>
          FileEntry(path = path, timestamp = timestamp, batchId = metadataLogCurrentOffset)
      }.toArray)
      logInfo(s"Log offset set to $metadataLogCurrentOffset with ${batchFiles.size} new files")
      val messageReceiptHandles = batchFiles.map {
        case (path, timestamp, receiptHandle) =>
          sqsClient.sqsFileCache.markCommitted(path)
          logDebug(s"New file: $path")
          receiptHandle
      }.toList
      sqsClient.addToDeleteMessageQueue(messageReceiptHandles)
    }

    val numPurged = sqsClient.sqsFileCache.purge()

    if (!sqsClient.deleteMessageQueue.isEmpty) {
      sqsClient.deleteMessagesFromQueue()
    }

    logTrace(
      s"""
         |Number of files selected for batch = ${batchFiles.size}
         |Number of files purged from tracking map = $numPurged
       """.stripMargin)

    FileStreamSourceOffset(metadataLogCurrentOffset)
  }

  override def getOffset: Option[Offset] = Some(fetchMaxOffset()).filterNot(_.logOffset == -1)

  override def commit(end: Offset): Unit = {
    // No-op for now; SqsSource currently garbage-collects files based on timestamp
    // and the value of the maxFileAge parameter.
  }

  override def stop(): Unit = {
    if (!sqsClient.sqsScheduler.isTerminated) {
      sqsClient.sqsScheduler.shutdownNow()
    }
  }

  override def toString: String = s"SqsSource[${sqsClient.sqsUrl}]"

} 
Example 191
Source File: MQTTStreamSink.scala    From bahir   with Apache License 2.0 5 votes vote down vote up
package org.apache.bahir.sql.streaming.mqtt

import scala.collection.JavaConverters._
import scala.collection.mutable

import org.eclipse.paho.client.mqttv3.MqttException

import org.apache.spark.SparkEnv
import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister}
import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport}
import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage}
import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType

import org.apache.bahir.utils.Logging
import org.apache.bahir.utils.Retry


class MQTTStreamWriter (schema: StructType, parameters: DataSourceOptions)
    extends StreamWriter with Logging {
  override def createWriterFactory(): DataWriterFactory[InternalRow] = {
    // Skipping client identifier as single batch can be distributed to multiple
    // Spark worker process. MQTT server does not support two connections
    // declaring same client ID at given point in time.
    val params = parameters.asMap().asScala.filterNot(
      _._1.equalsIgnoreCase("clientId")
    )
    MQTTDataWriterFactory(params)
  }

  override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {}

  override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {}
}

case class MQTTDataWriterFactory(config: mutable.Map[String, String])
    extends DataWriterFactory[InternalRow] {
  override def createDataWriter(
    partitionId: Int, taskId: Long, epochId: Long
  ): DataWriter[InternalRow] = new MQTTDataWriter(config)
}

case object MQTTWriterCommitMessage extends WriterCommitMessage

class MQTTDataWriter(config: mutable.Map[String, String]) extends DataWriter[InternalRow] {
  private lazy val publishAttempts: Int =
    SparkEnv.get.conf.getInt("spark.mqtt.client.publish.attempts", -1)
  private lazy val publishBackoff: Long =
    SparkEnv.get.conf.getTimeAsMs("spark.mqtt.client.publish.backoff", "5s")

  private lazy val (_, _, topic, _, _, qos, _, _, _) = MQTTUtils.parseConfigParams(config.toMap)

  override def write(record: InternalRow): Unit = {
    val client = CachedMQTTClient.getOrCreate(config.toMap)
    val message = record.getBinary(0)
    Retry(publishAttempts, publishBackoff, classOf[MqttException]) {
      // In case of errors, retry sending the message.
      client.publish(topic, message, qos, false)
    }
  }

  override def commit(): WriterCommitMessage = MQTTWriterCommitMessage

  override def abort(): Unit = {}
}

case class MQTTRelation(override val sqlContext: SQLContext, data: DataFrame)
    extends BaseRelation {
  override def schema: StructType = data.schema
}

class MQTTStreamSinkProvider extends DataSourceV2 with StreamWriteSupport
    with DataSourceRegister with CreatableRelationProvider {
  override def createStreamWriter(queryId: String, schema: StructType,
      mode: OutputMode, options: DataSourceOptions): StreamWriter = {
    new MQTTStreamWriter(schema, options)
  }

  override def createRelation(sqlContext: SQLContext, mode: SaveMode,
      parameters: Map[String, String], data: DataFrame): BaseRelation = {
    MQTTRelation(sqlContext, data)
  }

  override def shortName(): String = "mqtt"
} 
Example 192
Source File: StarsAnalysisDemo.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package applications.analysis

import java.io.{BufferedWriter, FileOutputStream, OutputStreamWriter}

import functions.segment.Segmenter
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}


object StarsAnalysisDemo {
  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("Stars Analysis Demo")
      .getOrCreate()

    val filePath = "E:/data/chinaNews/entertainment.txt"


    // Load the data, keep only the year and content fields, and filter the content field
    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) {
        var year: String = tokens(2).split("-")(0)
        if (tokens(2).contains("年")) year = tokens(2).split("年")(0)

        var content = tokens(3)
        if (content.length > 22 && content.substring(0, 20).contains("日电")) {
          content = content.substring(content.indexOf("日电") + 2, content.length).trim
        }

        if (content.startsWith("(")) content = content.substring(content.indexOf(")") + 1, content.length)
        if (content.length > 20 && content.substring(content.length - 20, content.length).contains("记者")) {
          content = content.substring(0, content.lastIndexOf("记者")).trim
        }

        Some(year, content)
      } else None
    }.toDF("year", "content")

    // Segment the text, drop terms of length 1, and keep the part-of-speech tag of each term
    val segmenter = new Segmenter()
      .isAddNature(true)
      .isDelEn(true)
      .isDelNum(true)
      .setMinTermLen(2)
      .setMinTermNum(5)
      .setSegType("StandardSegment")
      .setInputCol("content")
      .setOutputCol("segmented")
    val segDF: DataFrame = segmenter.transform(data)
    segDF.cache()

    val segRDD: RDD[(Int, Seq[String])] = segDF.select("year", "segmented").rdd.map { case Row(year: String, terms: Seq[String]) =>
      (Integer.parseInt(year), terms)
    }

    val result: Array[String] = segRDD.map(line => line._1.toString + "\u00ef" + line._2.mkString(",")).collect()
    val writer: BufferedWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("E:/entertainment_seg.txt")))
    result.foreach(line => writer.write(line + "\n"))
    writer.close()

    // Count the stars that appear most often in 2016 news articles
    val stars2016 = segRDD.filter(_._1 == 2016)
      .flatMap { case (year: Int, termStr: Seq[String]) =>
        val person = termStr
          .map(term => (term.split("/")(0), term.split("/")(1)))
          .filter(_._2.equalsIgnoreCase("nr"))
          .map(term => (term._1, 1L))

        person
      }
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)

    segDF.unpersist()

    stars2016.take(100).foreach(println)

    spark.stop()
  }
} 
Example 193
Source File: Preprocessor.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package functions

import config.paramconf.PreprocessParams
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF, StopWordsRemover, StringIndexer}
import org.apache.spark.sql.DataFrame


  def preprocess(data: DataFrame): Pipeline = {
    val spark = data.sparkSession
    val params = new PreprocessParams

    val indexModel = new StringIndexer()
      .setHandleInvalid(params.handleInvalid)
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(data)

    val cleaner = new Cleaner()
      .setFanJian(params.fanjian)
      .setQuanBan(params.quanban)
      .setMinLineLen(params.minLineLen)
      .setInputCol("content")
      .setOutputCol("cleand")

    val segmenter = new Segmenter()
      .isAddNature(params.addNature)
      .isDelEn(params.delEn)
      .isDelNum(params.delNum)
      .isNatureFilter(params.natureFilter)
      .setMinTermLen(params.minTermLen)
      .setMinTermNum(params.minTermNum)
      .setSegType(params.segmentType)
      .setInputCol(cleaner.getOutputCol)
      .setOutputCol("segmented")

    val stopwords = spark.sparkContext.textFile(params.stopwordFilePath).collect()
    val remover = new StopWordsRemover()
      .setStopWords(stopwords)
      .setInputCol(segmenter.getOutputCol)
      .setOutputCol("removed")

    val vectorizer = new CountVectorizer()
      .setMinTF(params.minTF)
      .setVocabSize(params.vocabSize)
      .setInputCol(remover.getOutputCol)
      .setOutputCol("vectorized")

    val idf = new IDF()
      .setMinDocFreq(params.minDocFreq)
      .setInputCol(vectorizer.getOutputCol)
      .setOutputCol("features")

    val stages = Array(cleaner, indexModel, segmenter, remover, vectorizer, idf)
    new Pipeline().setStages(stages)
  }
} 
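A minimal usage sketch (not part of the original project). The object that declares the preprocess method is not shown in this listing, so the sketch takes the method as a parameter; the pipeline expects a "label" column for the StringIndexer and a "content" column for the Cleaner.

import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.{DataFrame, SparkSession}

object PreprocessDemo {
  def run(spark: SparkSession, preprocess: DataFrame => Pipeline): DataFrame = {
    import spark.implicits._
    val data = Seq(("sports", "这是一条体育新闻"), ("finance", "这是一条财经新闻")).toDF("label", "content")
    // Fit the cleaning/segmentation/TF-IDF pipeline and apply it to the same data.
    val model = preprocess(data).fit(data)
    model.transform(data)
  }
}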
Example 194
Source File: Cleaner.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package functions.clean

import com.hankcs.hanlp.HanLP
import config.paramconf.{HasOutputCol, HasInputCol}
import functions.MySchemaUtils
import functions.clean.chinese.BCConvert
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset}



  setDefault(fanjian -> "f2j", quanban -> "q2b", minLineLen -> 1)

  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema, logging = true)

    val cleanFunc = udf {line: String =>
      var cleaned = ""
      getFanJian match {
        case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line)
        case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line)
        case _ => cleaned = line
      }

      getQuanBan match {
        case "q2b" => cleaned = BCConvert.qj2bj(cleaned)
        case "b2q" => cleaned = BCConvert.bj2qj(cleaned)
        case _ => cleaned = cleaned
      }

      cleaned
    }

    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record =>
      val outputIndex = record.fieldIndex($(outputCol))
      record.getString(outputIndex).length >= getMinLineLen
    }
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.typeName.equals(StringType.typeName),
      s"Input type must be StringType but got $inputType.")
    MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable)
  }
}


object Cleaner extends DefaultParamsReadable[Cleaner] {
  override def load(path: String): Cleaner = super.load(path)
} 
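A minimal usage sketch (not part of the original project): converting traditional to simplified Chinese, normalizing full-width characters, and dropping lines shorter than two characters.

import functions.clean.Cleaner
import org.apache.spark.sql.SparkSession

object CleanerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("CleanerDemo").getOrCreate()
    import spark.implicits._

    val df = Seq("漢語文本,ＡＢＣ１２３", "短").toDF("content")

    val cleaner = new Cleaner()
      .setFanJian("f2j")      // traditional -> simplified Chinese
      .setQuanBan("q2b")      // full-width -> half-width characters
      .setMinLineLen(2)       // rows whose cleaned text is shorter than 2 characters are dropped
      .setInputCol("content")
      .setOutputCol("cleaned")

    cleaner.transform(df).show(truncate = false)

    spark.stop()
  }
}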
Example 195
Source File: QueryTest.scala    From spark-netezza   with Apache License 2.0 5 votes vote down vote up
package com.ibm.spark.netezza.integration

import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.{DataFrame, Row}
import org.scalatest.FunSuite

  def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row]): Option[String] = {
    val isSorted = df.queryExecution.logical.collect { case s: logical.Sort => s }.nonEmpty

    val sparkAnswer = try df.collect().toSeq catch {
      case e: Exception =>
        val errorMessage =
          s"""
             |Exception thrown while executing query:
             |${df.queryExecution}
             |== Exception ==
             |$e
             |${org.apache.spark.sql.catalyst.util.stackTraceToString(e)}
          """.stripMargin
        return Some(errorMessage)
    }

    sameRows(expectedAnswer, sparkAnswer, isSorted).map { results =>
      s"""
         |Results do not match for query:
         |${df.queryExecution}
         |== Results ==
         |$results
       """.stripMargin
    }
  }

  def prepareAnswer(answer: Seq[Row], isSorted: Boolean): Seq[Row] = {
    // Converts data to types that we can do equality comparison using Scala collections.
    // For BigDecimal type, the Scala type has a better definition of equality test (similar to
    // Java's java.math.BigDecimal.compareTo).
    // For binary arrays, we convert it to Seq to avoid of calling java.util.Arrays.equals for
    // equality test.
    val converted: Seq[Row] = answer.map(prepareRow)
    if (!isSorted) converted.sortBy(_.toString()) else converted
  }

  // We need to call prepareRow recursively to handle schemas with struct types.
  def prepareRow(row: Row): Row = {
    Row.fromSeq(row.toSeq.map {
      case null => null
      case d: java.math.BigDecimal => BigDecimal(d)
      // Convert array to Seq for easy equality check.
      case b: Array[_] => b.toSeq
      case r: Row => prepareRow(r)
      case o => o
    })
  }

  def sameRows(
                expectedAnswer: Seq[Row],
                sparkAnswer: Seq[Row],
                isSorted: Boolean = false): Option[String] = {
    if (prepareAnswer(expectedAnswer, isSorted) != prepareAnswer(sparkAnswer, isSorted)) {
      val errorMessage =
        s"""
           |== Results ==
           |${sideBySide(
          s"== Correct Answer - ${expectedAnswer.size} ==" +:
            prepareAnswer(expectedAnswer, isSorted).map(_.toString()),
          s"== Spark Answer - ${sparkAnswer.size} ==" +:
            prepareAnswer(sparkAnswer, isSorted).map(_.toString())).mkString("\n")}
        """.stripMargin
      return Some(errorMessage)
    }
    None
  }

  def sideBySide(left: Seq[String], right: Seq[String]): Seq[String] = {
    val maxLeftSize = left.map(_.size).max
    val leftPadded = left ++ Seq.fill(math.max(right.size - left.size, 0))("")
    val rightPadded = right ++ Seq.fill(math.max(left.size - right.size, 0))("")

    leftPadded.zip(rightPadded).map {
      case (l, r) => (if (l == r) " " else "!") + l + (" " * ((maxLeftSize - l.size) + 3)) + r
    }
  }
} 
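A minimal usage sketch (not part of the original project, assuming the QueryTest helper above can be mixed into a ScalaTest suite; its trait declaration is not shown in this listing):

import com.ibm.spark.netezza.integration.QueryTest
import org.apache.spark.sql.{Row, SparkSession}
import org.scalatest.FunSuite

class CheckAnswerDemoSuite extends FunSuite with QueryTest {
  test("checkAnswer returns None when the result matches") {
    val spark = SparkSession.builder().master("local[*]").appName("CheckAnswerDemo").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "John Doe"), (2, "Jeff Smith")).toDF("id", "name")
    val expected = Seq(Row(1, "John Doe"), Row(2, "Jeff Smith"))

    // checkAnswer returns Some(errorMessage) on a mismatch and None when the rows are equal.
    assert(checkAnswer(df, expected).isEmpty)
    spark.stop()
  }
}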
Example 196
Source File: TablePartitionColIntegrationTestSuite.scala    From spark-netezza   with Apache License 2.0 5 votes vote down vote up
package com.ibm.spark.netezza.integration

import org.apache.spark.sql.{DataFrame, Row}
import org.netezza.error.NzSQLException


class TablePartitionColIntegrationTestSuite extends IntegrationSuiteBase with QueryTest {
  val tabName = "staff"
  val expected = Seq(
    Row(1, "John Doe"),
    Row(2, "Jeff Smith"),
    Row(3, "Kathy Saunders"),
    Row(4, null))

  val expectedFiltered = Seq(Row(1, "John Doe"), Row(2, "Jeff Smith"))


  override def beforeAll(): Unit = {
    super.beforeAll()
    try {executeJdbcStmt(s"drop table $tabName")} catch { case e: NzSQLException => }
    executeJdbcStmt(s"create table $tabName(id int , name varchar(20))")
    executeJdbcStmt(s"insert into $tabName values(1 , 'John Doe')")
    executeJdbcStmt(s"insert into $tabName values(2 , 'Jeff Smith')")
    executeJdbcStmt(s"insert into $tabName values(3 , 'Kathy Saunders')")
    executeJdbcStmt(s"insert into $tabName values(4 , null)")
  }

  override def afterAll(): Unit = {
    try {
      executeJdbcStmt(s"DROP TABLE $tabName")
    } finally {
      super.afterAll()
    }
  }

  private def defaultOpts() = {
    Map("url" -> testURL,
      "user" -> user,
      "password" -> password,
      "numPartitions" -> Integer.toString(1))
  }


  test("Test table read with column partitions") {
    val opts = defaultOpts +
      ("dbtable" -> s"$tabName") +
      ("partitioncol" -> "ID") +
      ("numPartitions" -> Integer.toString(4)) +
      ("lowerbound" -> "1") +
      ("upperbound" -> "100")

    val testDf = sqlContext.read.format("com.ibm.spark.netezza").options(opts).load()
    verifyAnswer(testDf, expected)
    verifyAnswer(testDf.filter("ID < 3"), expectedFiltered)
  }

  test("Test table read specifying lower or upper boundary") {
    var opts = defaultOpts +
      ("dbtable" -> s"$tabName") +
      ("partitioncol" -> "ID") +
      ("numPartitions" -> Integer.toString(4))

    val testOpts = Seq(opts, opts + ("lowerbound" -> "1"), opts + ("upperbound" -> "10"))
    for (opts <- testOpts) {
      val testDf = sqlContext.read.format("com.ibm.spark.netezza").options(opts).load()
      verifyAnswer(testDf, expected)
      verifyAnswer(testDf.filter("ID < 3"), expectedFiltered)
    }
  }

  test("Test table read with single partition") {
    val opts = defaultOpts +
      ("dbtable" -> s"$tabName") +
      ("partitioncol" -> "ID") +
      ("numPartitions" -> Integer.toString(1))

    val testDf = sqlContext.read.format("com.ibm.spark.netezza").options(opts).load()
    verifyAnswer(testDf, expected)
    verifyAnswer(testDf.filter("ID < 3"), expectedFiltered)
  }

  test("Test table with number of partitions set to zero.") {
    val opts = defaultOpts +
      ("dbtable" -> s"$tabName") +
      ("partitioncol" -> "ID") +
      ("numPartitions" -> Integer.toString(0))

    val testDf = sqlContext.read.format("com.ibm.spark.netezza").options(opts).load()
    verifyAnswer(testDf, expected)
  }
} 
Example 197
Source File: IntegrationSuiteBase.scala    From spark-netezza   with Apache License 2.0 5 votes vote down vote up
package com.ibm.spark.netezza.integration

import java.sql.Connection

import com.ibm.spark.netezza.NetezzaJdbcUtils
import com.typesafe.config.ConfigFactory
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, DataFrame, SQLContext}
import org.scalatest.{BeforeAndAfterAll, FunSuite}
import org.slf4j.LoggerFactory

trait IntegrationSuiteBase extends FunSuite with BeforeAndAfterAll with QueryTest{
  private val log = LoggerFactory.getLogger(getClass)

  protected var sc: SparkContext = _
  protected var sqlContext: SQLContext = _
  protected var conn: Connection = _
  protected val prop = new java.util.Properties

  // Configurable vals
  protected var configFile = "application"
  protected var testURL: String = _
  protected var testTable: String = _
  protected var user: String = _
  protected var password: String = _
  protected var numPartitions: Int = _
  protected var sampleDbmaxNumTables: Int = _

  override def beforeAll(): Unit = {
    super.beforeAll()

    sc = new SparkContext("local[*]", "IntegrationTest", new SparkConf())
    sqlContext = new SQLContext(sc)

    val conf = ConfigFactory.load(configFile)
    testURL = conf.getString("test.integration.dbURL")
    testTable = conf.getString("test.integration.table")
    user = conf.getString("test.integration.user")
    password = conf.getString("test.integration.password")
    numPartitions = conf.getInt("test.integration.partition.number")
    sampleDbmaxNumTables = conf.getInt("test.integration.max.numtables")
    prop.setProperty("user", user)
    prop.setProperty("password", password)
    log.info("Attempting to get connection from" + testURL)
    conn = NetezzaJdbcUtils.getConnector(testURL, prop)()
    log.info("got connection.")
  }

  override def afterAll(): Unit = {
    try {
      sc.stop()
    }
    finally {
      conn.close()
      super.afterAll()
    }
  }

  
  def withTable(tableNames: String*)(f: => Unit): Unit = {
    try f finally {
      tableNames.foreach { name =>
        executeJdbcStmt(s"DROP TABLE $name")
      }
    }
  }
} 
Example 198
Source File: DataFrameExtensions.scala    From spark-powerbi-connector   with Apache License 2.0 5 votes vote down vote up
package com.microsoft.azure.powerbi.extensions

import java.sql.Timestamp
import java.util.Date

import scala.collection.mutable.ListBuffer

import com.microsoft.azure.powerbi.authentication.PowerBIAuthentication
import com.microsoft.azure.powerbi.common.PowerBIUtils
import com.microsoft.azure.powerbi.models.{table, PowerBIDatasetDetails}

import org.apache.spark.sql.DataFrame

object DataFrameExtensions {

  implicit def PowerBIDataFrame(dataFrame: DataFrame): PowerBIDataFrame =
    new PowerBIDataFrame(dataFrame: DataFrame)

  class PowerBIDataFrame(dataFrame: DataFrame) extends Serializable{

    def toPowerBI(powerbiDatasetDetails: PowerBIDatasetDetails, powerbiTable: table,
                  powerBIAuthentication: PowerBIAuthentication): Unit = {

      var authenticationToken: String = powerBIAuthentication.getAccessToken

      dataFrame.foreachPartition { partition =>

        // PowerBI row limit in single request is 10,000. We limit it to 1000.

        partition.grouped(1000).foreach {
          group => {
            val powerbiRowListBuffer: ListBuffer[Map[String, Any]] = ListBuffer[Map[String, Any]]()
            group.foreach {
              record => {
                var powerbiRow: Map[String, Any] = Map[String, Any]()

                for (i <- 0 until record.length) {
                  powerbiRow += (powerbiTable.columns(i).name -> record(i))
                }

                powerbiRowListBuffer += powerbiRow
              }

              var attemptCount = 0
              var pushSuccessful = false

              while (!pushSuccessful && attemptCount < this.retryCount) {
                try {

                    PowerBIUtils.addMultipleRows(powerbiDatasetDetails, powerbiTable,
                      powerbiRowListBuffer, authenticationToken)
                    pushSuccessful = true
                }
                catch {
                  case e: Exception =>
                    println(f"Exception inserting multiple rows: ${e.getMessage}")
                    Thread.sleep(secondsBetweenRetry * 1000)
                    attemptCount += 1

                    authenticationToken = powerBIAuthentication.refreshAccessToken
                }
              }
            }
          }
        }
      }
    }

    def countTimelineToPowerBI(powerbiDatasetDetails: PowerBIDatasetDetails, powerbiTable: table,
                               powerBIAuthentication: PowerBIAuthentication): Unit = {

      var authenticationToken: String = powerBIAuthentication.getAccessToken
      val currentTimestamp = new Timestamp(new Date().getTime)

      val powerbiRow = Map(powerbiTable.columns.head.name -> currentTimestamp,
        powerbiTable.columns(1).name -> dataFrame.count())

      var attemptCount = 0
      var pushSuccessful = false

      while (!pushSuccessful && attemptCount < this.retryCount) {
        try {
          PowerBIUtils.addRow(powerbiDatasetDetails, powerbiTable, powerbiRow, authenticationToken)
          pushSuccessful = true
        }
        catch {
          case e: Exception => println("Exception inserting row: " + e.getMessage)
            Thread.sleep(secondsBetweenRetry * 1000)
            attemptCount += 1

            authenticationToken = powerBIAuthentication.refreshAccessToken
        }
      }
    }

    private val retryCount: Int = 3
    private val secondsBetweenRetry: Int = 1
  }
} 
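A minimal usage sketch (not part of the original project). The dataset, table and authentication objects are obtained elsewhere through the connector's own APIs, so the sketch simply takes them as parameters.

import com.microsoft.azure.powerbi.authentication.PowerBIAuthentication
import com.microsoft.azure.powerbi.extensions.DataFrameExtensions._
import com.microsoft.azure.powerbi.models.{table, PowerBIDatasetDetails}
import org.apache.spark.sql.DataFrame

object PowerBIPushDemo {
  def push(df: DataFrame,
           dataset: PowerBIDatasetDetails,
           targetTable: table,
           auth: PowerBIAuthentication): Unit = {
    // Streams the DataFrame to Power BI, 1000 rows per request, retrying up to 3 times per group.
    df.toPowerBI(dataset, targetTable, auth)
  }
}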
Example 199
Source File: ClassifierDatasetEncoder.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.ml.tensorflow

import com.johnsnowlabs.nlp.Annotation
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{size, explode, col}

import scala.collection.mutable

class ClassifierDatasetEncoder(val params: ClassifierDatasetEncoderParams) extends Serializable {

  val tags2Id: Map[String, Int] = params.tags.zipWithIndex
    .map(p => (p._1, p._2))
    .toMap

  val tags: Array[String] = tags2Id
    .map(p => (p._2, p._1))
    .toArray
    .sortBy(p => p._1)
    .map(p => p._2)

  def encodeTags(labels: Array[String]): Array[Array[Int]] = {
    labels.map { t =>
      val labelIDsArray = Array.fill(tags.length)(0)
      labelIDsArray(tags2Id(t)) = 1
      labelIDsArray
    }
  }

  
  def decodeOutputData(tagIds: Array[Array[Float]]): Array[Array[(String, Float)]] = {
    val scoresMetadata = tagIds.map { scores =>
      scores.zipWithIndex.flatMap {
        case (score, idx) =>
          val tag = tags2Id.find(_._2 == idx).map(_._1).getOrElse("NA")
          Map(tag -> score)
      }
    }

    scoresMetadata
  }
}

case class ClassifierDatasetEncoderParams(tags: Array[String]) 
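A minimal usage sketch (not part of the original project): one-hot encoding string labels and mapping raw per-class scores back to (tag, score) pairs with the encoder above.

import com.johnsnowlabs.ml.tensorflow.{ClassifierDatasetEncoder, ClassifierDatasetEncoderParams}

object EncoderDemo {
  def main(args: Array[String]): Unit = {
    val encoder = new ClassifierDatasetEncoder(ClassifierDatasetEncoderParams(Array("neg", "pos")))

    // One-hot encode string labels: "pos" -> Array(0, 1), "neg" -> Array(1, 0)
    val oneHot = encoder.encodeTags(Array("pos", "neg"))
    println(oneHot.map(_.mkString(",")).mkString(" | "))   // 0,1 | 1,0

    // Attach the tag names to raw per-class scores
    val decoded = encoder.decodeOutputData(Array(Array(0.1f, 0.9f)))
    println(decoded.head.mkString(", "))                   // (neg,0.1), (pos,0.9)
  }
}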
Example 200
Source File: LightPipeline.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp

import org.apache.spark.ml.{PipelineModel, Transformer}
import org.apache.spark.sql.{DataFrame, Dataset}

import scala.collection.JavaConverters._

class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddingsVectors: Boolean = false) {

  private var ignoreUnsupported = false

  def setIgnoreUnsupported(v: Boolean): Unit = ignoreUnsupported = v
  def getIgnoreUnsupported: Boolean = ignoreUnsupported

  def getStages: Array[Transformer] = pipelineModel.stages

  def transform(dataFrame: Dataset[_]): DataFrame = pipelineModel.transform(dataFrame)

  def fullAnnotate(target: String, startWith: Map[String, Seq[Annotation]] = Map.empty[String, Seq[Annotation]]): Map[String, Seq[Annotation]] = {
    getStages.foldLeft(startWith)((annotations, transformer) => {
      transformer match {
        case documentAssembler: DocumentAssembler =>
          annotations.updated(documentAssembler.getOutputCol, documentAssembler.assemble(target, Map.empty[String, String]))
        case lazyAnnotator: AnnotatorModel[_] if lazyAnnotator.getLazyAnnotator => annotations
        case recursiveAnnotator: HasRecursiveTransform[_] with AnnotatorModel[_] =>
          val combinedAnnotations =
            recursiveAnnotator.getInputCols.foldLeft(Seq.empty[Annotation])((inputs, name) => inputs ++ annotations.getOrElse(name, Nil))
          annotations.updated(recursiveAnnotator.getOutputCol, recursiveAnnotator.annotate(combinedAnnotations, pipelineModel))
        case annotator: AnnotatorModel[_] =>
          val combinedAnnotations =
            annotator.getInputCols.foldLeft(Seq.empty[Annotation])((inputs, name) => inputs ++ annotations.getOrElse(name, Nil))
          annotations.updated(annotator.getOutputCol, annotator.annotate(combinedAnnotations))
        case finisher: Finisher =>
          annotations.filterKeys(finisher.getInputCols.contains)
        case rawModel: RawAnnotator[_] =>
          if (ignoreUnsupported) annotations
          else throw new IllegalArgumentException(s"model ${rawModel.uid} does not support LightPipeline." +
            s" Call setIgnoreUnsupported(boolean) on LightPipeline to ignore")
        case pipeline: PipelineModel =>
          new LightPipeline(pipeline, parseEmbeddingsVectors).fullAnnotate(target, annotations)
        case _ => annotations
      }
    })
  }

  def fullAnnotate(targets: Array[String]): Array[Map[String, Seq[Annotation]]] = {
    targets.par.map(target => {
      fullAnnotate(target)
    }).toArray
  }

  def fullAnnotateJava(target: String): java.util.Map[String, java.util.List[JavaAnnotation]] = {
    fullAnnotate(target).mapValues(_.map(aa =>
      JavaAnnotation(aa.annotatorType, aa.begin, aa.end, aa.result, aa.metadata.asJava)).asJava).asJava
  }

  def fullAnnotateJava(targets: java.util.ArrayList[String]): java.util.List[java.util.Map[String, java.util.List[JavaAnnotation]]] = {
    targets.asScala.par.map(target => {
      fullAnnotateJava(target)
    }).toList.asJava
  }

  def annotate(target: String): Map[String, Seq[String]] = {
    fullAnnotate(target).mapValues(_.map(a => {
      a.annotatorType match {
        case (AnnotatorType.WORD_EMBEDDINGS |
             AnnotatorType.SENTENCE_EMBEDDINGS) if (parseEmbeddingsVectors) =>  a.embeddings.mkString(" ")
        case _ => a.result
      }
    }))
  }

  def annotate(targets: Array[String]): Array[Map[String, Seq[String]]] = {
    targets.par.map(target => {
      annotate(target)
    }).toArray
  }

  def annotateJava(target: String): java.util.Map[String, java.util.List[String]] = {
    annotate(target).mapValues(_.asJava).asJava
  }

  def annotateJava(targets: java.util.ArrayList[String]): java.util.List[java.util.Map[String, java.util.List[String]]] = {
    targets.asScala.par.map(target => {
      annotateJava(target)
    }).toList.asJava
  }

}
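A minimal usage sketch (not part of the original project): wrapping an already fitted PipelineModel in a LightPipeline to annotate a single string on the driver, without building a DataFrame.

import com.johnsnowlabs.nlp.LightPipeline
import org.apache.spark.ml.PipelineModel

object LightPipelineDemo {
  // `model` is assumed to be a fitted Spark NLP pipeline, e.g. loaded with PipelineModel.load(...).
  def annotateOne(model: PipelineModel, text: String): Map[String, Seq[String]] = {
    val light = new LightPipeline(model)
    light.annotate(text)   // output column name -> annotation results
  }
}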