org.apache.spark.sql.DataFrame Scala Examples
The following examples show how to use org.apache.spark.sql.DataFrame.
Each example notes the project and source file it was taken from.
Example 1
Source File: StreamingConsumer.scala from Scala-Programming-Projects with MIT License

package coinyser

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.functions._

object StreamingConsumer {
  def fromJson(df: DataFrame): Dataset[Transaction] = {
    import df.sparkSession.implicits._
    val schema = Seq.empty[Transaction].toDS().schema
    df.select(from_json(col("value").cast("string"), schema).alias("v"))
      .select("v.*").as[Transaction]
  }

  def transactionStream(implicit spark: SparkSession, config: KafkaConfig): Dataset[Transaction] =
    fromJson(spark.readStream.format("kafka")
      .option("kafka.bootstrap.servers", config.bootStrapServers)
      .option("startingoffsets", "earliest")
      .option("subscribe", config.transactionsTopic)
      .load()
    )
}
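A minimal wiring sketch for the consumer above. KafkaConfig's constructor shape is an assumption (only the two fields referenced in the snippet are used), and the bootstrap server and topic values are placeholders.

// Hedged sketch: assumes KafkaConfig is a case class exposing these two fields,
// and that the project's Transaction case class is on the classpath.
implicit val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
implicit val config: KafkaConfig = KafkaConfig(bootStrapServers = "localhost:9092", transactionsTopic = "transactions")
val transactions: Dataset[Transaction] = StreamingConsumer.transactionStream
transactions.writeStream.format("console").start().awaitTermination()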
Example 2
Source File: MultilayerPerceptronClassifierWrapper.scala from drizzle-spark with Apache License 2.0

package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel,
    val labelCount: Long,
    val layers: Array[Int],
    val weights: Array[Double]
  ) extends MLWritable {

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
  }

  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val labelCount = (rMetadata \ "labelCount").extract[Long]
      val layers = (rMetadata \ "layers").extract[Array[Int]]
      val weights = (rMetadata \ "weights").extract[Array[Double]]

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline, labelCount, layers, weights)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("labelCount" -> instance.labelCount) ~
        ("layers" -> instance.layers.toSeq) ~
        ("weights" -> instance.weights.toArray.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 3
Source File: DefaultSource.scala from spark-snowflake with Apache License 2.0

package net.snowflake.spark.snowflake

import net.snowflake.spark.snowflake.streaming.SnowflakeSink
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_SHORT_NAME
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.slf4j.LoggerFactory

override def createRelation(sqlContext: SQLContext,
                            saveMode: SaveMode,
                            parameters: Map[String, String],
                            data: DataFrame): BaseRelation = {
  val params = Parameters.mergeParameters(parameters)
  // check spark version for push down
  if (params.autoPushdown) {
    SnowflakeConnectorUtils.checkVersionAndEnablePushdown(
      sqlContext.sparkSession
    )
  }
  // pass parameters to pushdown functions
  pushdowns.setGlobalParameter(params)
  val table = params.table.getOrElse {
    throw new IllegalArgumentException(
      "For save operations you must specify a Snowflake table name with the 'dbtable' parameter"
    )
  }

  def tableExists: Boolean = {
    val conn = jdbcWrapper.getConnector(params)
    try {
      jdbcWrapper.tableExists(conn, table.toString)
    } finally {
      conn.close()
    }
  }

  val (doSave, dropExisting) = saveMode match {
    case SaveMode.Append => (true, false)
    case SaveMode.Overwrite => (true, true)
    case SaveMode.ErrorIfExists =>
      if (tableExists) {
        sys.error(
          s"Table $table already exists! (SaveMode is set to ErrorIfExists)"
        )
      } else {
        (true, false)
      }
    case SaveMode.Ignore =>
      if (tableExists) {
        log.info(s"Table $table already exists -- ignoring save request.")
        (false, false)
      } else {
        (true, false)
      }
  }

  if (doSave) {
    val updatedParams = parameters.updated("overwrite", dropExisting.toString)
    new SnowflakeWriter(jdbcWrapper)
      .save(
        sqlContext,
        data,
        saveMode,
        Parameters.mergeParameters(updatedParams)
      )
  }

  createRelation(sqlContext, parameters)
}

override def createSink(sqlContext: SQLContext,
                        parameters: Map[String, String],
                        partitionColumns: Seq[String],
                        outputMode: OutputMode): Sink =
  new SnowflakeSink(sqlContext, parameters, partitionColumns, outputMode)
Example 4
Source File: DataFrameExample.scala from drizzle-spark with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.ml

import java.io.File

import scopt.OptionParser

import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.util.Utils

object DataFrameExample {

  case class Params(input: String = "data/mllib/sample_libsvm_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DataFrameExample") {
      head("DataFrameExample: an example app using DataFrame for ML.")
      opt[String]("input")
        .text(s"input path to dataframe")
        .action((x, c) => c.copy(input = x))
      checkConfig { params =>
        success
      }
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val spark = SparkSession
      .builder
      .appName(s"DataFrameExample with $params")
      .getOrCreate()

    // Load input data
    println(s"Loading LIBSVM file with UDT from ${params.input}.")
    val df: DataFrame = spark.read.format("libsvm").load(params.input).cache()
    println("Schema from LIBSVM:")
    df.printSchema()
    println(s"Loaded training data as a DataFrame with ${df.count()} records.")

    // Show statistical summary of labels.
    val labelSummary = df.describe("label")
    labelSummary.show()

    // Convert features column to an RDD of vectors.
    val features = df.select("features").rdd.map { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(Vectors.fromML(feat)),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    // Save the records in a parquet file.
    val tmpDir = Utils.createTempDir()
    val outputDir = new File(tmpDir, "dataframe").toString
    println(s"Saving to $outputDir as Parquet file.")
    df.write.parquet(outputDir)

    // Load the records back.
    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDF = spark.read.parquet(outputDir)
    println(s"Schema from Parquet:")
    newDF.printSchema()

    spark.stop()
  }
}
// scalastyle:on println
Example 5
Source File: JdbcRelationProvider.scala from drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val url = jdbcOptions.url
    val table = jdbcOptions.table
    val createTableOptions = jdbcOptions.createTableOptions
    val isTruncate = jdbcOptions.isTruncate

    val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, url, table)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              saveTable(df, url, table, jdbcOptions)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, table)
              createTable(df.schema, url, table, createTableOptions, conn)
              saveTable(df, url, table, jdbcOptions)
            }

          case SaveMode.Append =>
            saveTable(df, url, table, jdbcOptions)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, jdbcOptions)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
}
Example 6
Source File: Cleaner.scala from cleanframes with Apache License 2.0

package cleanframes

import org.apache.spark.sql.{Column, DataFrame, functions}
import shapeless.labelled.FieldType
import shapeless.{::, HList, HNil, LabelledGeneric, Lazy, Witness}

trait Cleaner[A] {
  def clean(frame: DataFrame, name: Option[String], alias: Option[String]): List[Column]
}

object Cleaner {
  def apply[A](frame: DataFrame, name: Option[String], alias: Option[String])(implicit env: Cleaner[A]): DataFrame = {
    frame.select(
      env.clean(frame, name, alias): _*
    )
  }

  def materialize[A](func: (DataFrame, Option[String], Option[String]) => List[Column]): Cleaner[A] =
    new Cleaner[A] {
      override def clean(frame: DataFrame, name: Option[String], alias: Option[String]): List[Column] =
        func(frame, name, alias)
    }

  implicit val hnilCleaner: Cleaner[HNil] = materialize((_, _, _) => Nil)

  implicit def genericObjectCleaner[A, H <: HList](implicit
                                                   gen: LabelledGeneric.Aux[A, H],
                                                   hCleaner: Lazy[Cleaner[H]]): Cleaner[A] =
    materialize((frame, name, alias) => {
      val structColumn = functions.struct(
        hCleaner.value.clean(frame, name, alias): _*
      )

      List(
        alias
          .map(structColumn.as)
          .getOrElse(structColumn)
      )
    })

  implicit def hlistObjectCleaner[K <: Symbol, H, T <: HList](implicit
                                                              witness: Witness.Aux[K],
                                                              hCleaner: Lazy[Cleaner[H]],
                                                              tCleaner: Cleaner[T]): Cleaner[FieldType[K, H] :: T] = {
    val fieldName: String = witness.value.name

    materialize { (frame, name, alias) =>
      val columnName = alias match {
        case None | Some(`reserved_root_level_alias`) => fieldName
        case Some(alias) => s"$alias.$fieldName"
      }

      val hColumns = hCleaner.value.clean(frame, Some(columnName), alias = Some(fieldName))
      val tColumns = tCleaner.clean(frame, name, alias)

      hColumns ::: tColumns
    }
  }
}
Example 7
Source File: OnErrorSuite.scala from spark-snowflake with Apache License 2.0

package net.snowflake.spark.snowflake

import net.snowflake.client.jdbc.SnowflakeSQLException
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME
import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class OnErrorSuite extends IntegrationSuiteBase {
  lazy val table = s"spark_test_table_$randomSuffix"

  lazy val schema = new StructType(
    Array(StructField("var", StringType, nullable = false))
  )

  lazy val df: DataFrame = sparkSession.createDataFrame(
    sc.parallelize(
      Seq(Row("{\"dsadas\nadsa\":12311}"), Row("{\"abc\":334}")) // invalid json key
    ),
    schema
  )

  override def beforeAll(): Unit = {
    super.beforeAll()
    jdbcUpdate(s"create or replace table $table(var variant)")
  }

  override def afterAll(): Unit = {
    jdbcUpdate(s"drop table $table")
    super.afterAll()
  }

  test("continue_on_error off") {
    assertThrows[SnowflakeSQLException] {
      df.write
        .format(SNOWFLAKE_SOURCE_NAME)
        .options(connectorOptionsNoTable)
        .option("dbtable", table)
        .mode(SaveMode.Append)
        .save()
    }
  }

  test("continue_on_error on") {
    df.write
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .option("continue_on_error", "on")
      .option("dbtable", table)
      .mode(SaveMode.Append)
      .save()

    val result = sparkSession.read
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .option("dbtable", table)
      .load()

    assert(result.collect().length == 1)
  }
}
Example 8
Source File: MNISTBenchmark.scala from spark-knn with Apache License 2.0

package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable

object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val ns = if (args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if (args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if (args.length >= 3) args(2).toInt else 10
    val models = if (args.length >= 4) args(3).split(',') else Array("tree", "naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    // read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset = MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() // force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if (models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if (models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }
    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
}
Example 9
Source File: RandomForestPipeline.scala from Machine-Learning-with-Spark-Second-Edition with MIT License

package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 10
Source File: GradientBoostedTreePipeline.scala from Machine-Learning-with-Spark-Second-Edition with MIT License

package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object GradientBoostedTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val gbt = new GBTClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxIter(10)

    stages += vectorAssembler
    stages += gbt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
                      regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 11
Source File: SparkPFASuiteBase.scala from aardpfark with Apache License 2.0

package com.ibm.aardpfark.pfa

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.SparkConf
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.scalactic.Equality
import org.scalatest.FunSuite

abstract class SparkPFASuiteBase extends FunSuite with DataFrameSuiteBase with PFATestUtils {

  val sparkTransformer: Transformer
  val input: Array[String]
  val expectedOutput: Array[String]

  val sparkConf = new SparkConf().
    setMaster("local[*]").
    setAppName("test").
    set("spark.ui.enabled", "false").
    set("spark.app.id", appID).
    set("spark.driver.host", "localhost")
  override lazy val spark = SparkSession.builder().config(sparkConf).getOrCreate()
  override val reuseContextIfPossible = true

  // Converts column containing a vector to an array
  def withColumnAsArray(df: DataFrame, colName: String) = {
    val vecToArray = udf { v: Vector => v.toArray }
    df.withColumn(colName, vecToArray(df(colName)))
  }

  def withColumnAsArray(df: DataFrame, first: String, others: String*) = {
    val vecToArray = udf { v: Vector => v.toArray }
    var result = df.withColumn(first, vecToArray(df(first)))
    others.foreach(c => result = result.withColumn(c, vecToArray(df(c))))
    result
  }

  // Converts column containing a vector to a sparse vector represented as a map
  def getColumnAsSparseVectorMap(df: DataFrame, colName: String) = {
    val vecToMap = udf { v: Vector => v.toSparse.indices.map(i => (i.toString, v(i))).toMap }
    df.withColumn(colName, vecToMap(df(colName)))
  }
}

abstract class Result

object ApproxEquality extends ApproxEquality

trait ApproxEquality {

  import org.scalactic.Tolerance._
  import org.scalactic.TripleEquals._

  implicit val seqApproxEq: Equality[Seq[Double]] = new Equality[Seq[Double]] {
    override def areEqual(a: Seq[Double], b: Any): Boolean = {
      b match {
        case d: Seq[Double] =>
          a.zip(d).forall { case (l, r) => l === r +- 0.001 }
        case _ =>
          false
      }
    }
  }

  implicit val vectorApproxEq: Equality[Vector] = new Equality[Vector] {
    override def areEqual(a: Vector, b: Any): Boolean = {
      b match {
        case v: Vector =>
          a.toArray.zip(v.toArray).forall { case (l, r) => l === r +- 0.001 }
        case _ =>
          false
      }
    }
  }
}
Example 12
Source File: TopTransformer.scala from modelmatrix with Apache License 2.0

package com.collective.modelmatrix.transform

import com.collective.modelmatrix.CategoricalColumn.AllOther
import com.collective.modelmatrix.transform.FeatureTransformationError.{FeatureColumnNotFound, UnsupportedTransformDataType}
import com.collective.modelmatrix.{CategoricalColumn, ModelFeature}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._
import org.slf4j.LoggerFactory

import scalaz.{@@, \/}
import scalaz.syntax.either._

class TopTransformer(input: DataFrame @@ Transformer.Features) extends CategoricalTransformer(input) {

  private val log = LoggerFactory.getLogger(classOf[TopTransformer])

  private val supportedDataTypes = Seq(ShortType, IntegerType, LongType, DoubleType, StringType)

  def validate: PartialFunction[ModelFeature, FeatureTransformationError \/ TypedModelFeature] = {
    case f@ModelFeature(_, _, _, _, Top(_, _))
      if featureDataType(f.feature).isEmpty =>
      FeatureColumnNotFound(f.feature).left

    case f@ModelFeature(_, _, _, _, Top(_, _))
      if featureDataType(f.feature).isDefined && supportedDataTypes.contains(featureDataType(f.feature).get) =>
      TypedModelFeature(f, featureDataType(f.feature).get).right

    case f@ModelFeature(_, _, _, _, t@Top(_, _)) =>
      UnsupportedTransformDataType(f.feature, featureDataType(f.feature).get, t).left
  }

  def transform(feature: TypedModelFeature): Seq[CategoricalColumn] = {
    require(feature.feature.transform.isInstanceOf[Top],
      s"Illegal transform type: ${feature.feature.transform}")

    val ModelFeature(_, _, f, _, Top(cover, allOther)) = feature.feature

    log.info(s"Calculate top transformation for feature: ${feature.feature.feature}. " +
      s"Cover: $cover. " +
      s"All other: $allOther. " +
      s"Extract type: ${feature.extractType}")

    // Group and count by extract value
    val df = scalaz.Tag.unwrap(input)
    val grouped: DataFrame = df.filter(df(f).isNotNull).groupBy(f).count()

    val featureValues: Seq[Value] = grouped.collect().toSeq.map { row =>
      val value = row.get(0)
      val cnt = row.getLong(1)
      Value(value, cnt)
    }
    log.debug(s"Collected '$f' values: ${featureValues.size}")

    val topValues = featureValues.sortBy(_.count)(implicitly[Ordering[Long]].reverse)

    // Get number of columns below cover threshold
    val threshold = (cover / 100) * topValues.map(_.count).sum
    val columnsBelowThreshold =
      topValues.map(_.count).scanLeft(0L)((cum, cnt) => cum + cnt).takeWhile(_ < threshold).size

    // Transform categorical values
    val valueColumns = topValues.take(columnsBelowThreshold).foldLeft(Scan()) {
      case (state@Scan(columnId, cumulativeCnt, columns), value) =>
        val column = valueColumn(feature.extractType)(columnId, cumulativeCnt, value)
        Scan(column.columnId, column.cumulativeCount, columns :+ column)
    }

    // Get all other columns if required
    val allOtherColumns = if (allOther) {
      val allOtherCnt = topValues.drop(columnsBelowThreshold).map(_.count).sum
      Seq(AllOther(valueColumns.columnId + 1, allOtherCnt, valueColumns.cumulativeCnt + allOtherCnt))
    } else Seq.empty

    // Add them together
    valueColumns.columns ++ allOtherColumns.filter(_.count > 0)
  }
}
Example 13
Source File: IndexTransformer.scala from modelmatrix with Apache License 2.0

package com.collective.modelmatrix.transform

import com.collective.modelmatrix.CategoricalColumn.AllOther
import com.collective.modelmatrix.{CategoricalColumn, ModelFeature}
import com.collective.modelmatrix.transform.FeatureTransformationError.{UnsupportedTransformDataType, FeatureColumnNotFound}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._
import org.slf4j.LoggerFactory

import scalaz.{@@, \/}
import scalaz.syntax.either._

class IndexTransformer(input: DataFrame @@ Transformer.Features) extends CategoricalTransformer(input) {

  private val log = LoggerFactory.getLogger(classOf[IndexTransformer])

  private val supportedDataTypes = Seq(ShortType, IntegerType, LongType, DoubleType, StringType)

  def validate: PartialFunction[ModelFeature, FeatureTransformationError \/ TypedModelFeature] = {
    case f@ModelFeature(_, _, _, _, Index(_, _))
      if featureDataType(f.feature).isEmpty =>
      FeatureColumnNotFound(f.feature).left

    case f@ModelFeature(_, _, _, _, Index(_, _))
      if featureDataType(f.feature).isDefined && supportedDataTypes.contains(featureDataType(f.feature).get) =>
      TypedModelFeature(f, featureDataType(f.feature).get).right

    case f@ModelFeature(_, _, _, _, t@Index(_, _)) =>
      UnsupportedTransformDataType(f.feature, featureDataType(f.feature).get, t).left
  }

  def transform(feature: TypedModelFeature): Seq[CategoricalColumn] = {
    require(feature.feature.transform.isInstanceOf[Index],
      s"Illegal transform type: ${feature.feature.transform}")

    val ModelFeature(_, _, f, _, Index(support, allOther)) = feature.feature

    log.info(s"Calculate index transformation for feature: ${feature.feature.feature}. " +
      s"Support: $support. " +
      s"All other: $allOther. " +
      s"Extract type: ${feature.extractType}")

    val df = scalaz.Tag.unwrap(input)

    import org.apache.spark.sql.functions._

    // Group and count by extract value
    val grouped: DataFrame = df.filter(df(f).isNotNull).groupBy(f).count()

    // Get support threshold
    val totalCount = grouped.sumOf("count")
    val threshold = (support / 100) * totalCount

    // Collect only support values
    val supportValues: Seq[Value] = grouped.filter(col("count") > threshold).collect().toSeq.map { row =>
      val value = row.get(0)
      val cnt = row.getLong(1)
      Value(value, cnt)
    }
    val topSupportValues = supportValues.sortBy(_.count)(implicitly[Ordering[Long]].reverse)
    log.debug(s"Collected '$f' support values: ${supportValues.size}")

    // Transform categorical values
    val valueColumns = topSupportValues.foldLeft(Scan()) {
      case (state@Scan(columnId, cumulativeCnt, columns), value) =>
        val column = valueColumn(feature.extractType)(columnId, cumulativeCnt, value)
        Scan(column.columnId, column.cumulativeCount, columns :+ column)
    }

    // Get all other column if required
    val allOtherColumns = if (allOther && support < 100.0) {
      // Count for values that are not in support set
      val allOtherCnt = grouped.filter(col("count") <= threshold).sumOf("count")
      Seq(AllOther(valueColumns.columnId + 1, allOtherCnt, valueColumns.cumulativeCnt + allOtherCnt))
    } else Seq.empty

    // Add them together
    valueColumns.columns ++ allOtherColumns.filter(_.count > 0)
  }
}
Example 14
Source File: Transformers.scala from modelmatrix with Apache License 2.0

package com.collective.modelmatrix.transform

import com.collective.modelmatrix.ModelFeature
import org.apache.spark.sql.{SQLContext, DataFrame}

import scalaz._

trait Transformers {

  protected class Transformers(input: DataFrame @@ Transformer.Features)(implicit sqlContext: SQLContext) {

    val identity = new IdentityTransformer(input)
    val top = new TopTransformer(input)
    val index = new IndexTransformer(input)
    val bins = new BinsTransformer(input)

    private val unknownFeature: PartialFunction[ModelFeature, FeatureTransformationError \/ TypedModelFeature] = {
      case feature => sys.error(s"Feature can't be validated by any of transformers: $feature")
    }

    def validate(feature: ModelFeature): FeatureTransformationError \/ TypedModelFeature =
      (identity.validate orElse
        top.validate orElse
        index.validate orElse
        bins.validate orElse
        unknownFeature
      )(feature)
  }
}
Example 15
Source File: BinsTransformer.scala from modelmatrix with Apache License 2.0

package com.collective.modelmatrix.transform

import com.collective.modelmatrix.BinColumn.BinValue
import com.collective.modelmatrix.transform.FeatureTransformationError.{FeatureColumnNotFound, UnsupportedTransformDataType}
import com.collective.modelmatrix.{BinColumn, ModelFeature}
import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._
import org.slf4j.LoggerFactory

import scalaz._
import scalaz.syntax.either._

class BinsTransformer(input: DataFrame @@ Transformer.Features) extends Transformer(input) with Binner {

  private val log = LoggerFactory.getLogger(classOf[BinsTransformer])

  private val config = ConfigFactory.load()
  private val sampleSize = config.getLong("modelmatrix.transform.bins.sample-size")

  private val supportedDataTypes = Seq(ShortType, IntegerType, LongType, DoubleType)

  protected case class Scan(columnId: Int = 0, columns: Seq[BinValue] = Seq.empty)

  def validate: PartialFunction[ModelFeature, FeatureTransformationError \/ TypedModelFeature] = {
    case f@ModelFeature(_, _, _, _, Bins(_, _, _))
      if featureDataType(f.feature).isEmpty =>
      FeatureColumnNotFound(f.feature).left

    case f@ModelFeature(_, _, _, _, Bins(_, _, _))
      if featureDataType(f.feature).isDefined && supportedDataTypes.contains(featureDataType(f.feature).get) =>
      TypedModelFeature(f, featureDataType(f.feature).get).right

    case f@ModelFeature(_, _, _, _, b@Bins(_, _, _)) =>
      UnsupportedTransformDataType(f.feature, featureDataType(f.feature).get, b).left
  }

  def transform(feature: TypedModelFeature): Seq[BinColumn] = {
    require(feature.feature.transform.isInstanceOf[Bins],
      s"Illegal transform type: ${feature.feature.transform}")

    val ModelFeature(_, _, f, _, Bins(nbins, minPoints, minPct)) = feature.feature

    log.info(s"Calculate bins transformation for feature: ${feature.feature.feature}. " +
      s"Bins: $nbins. " +
      s"Min points: $minPoints. " +
      s"Min percentage: $minPct. " +
      s"Extract type: ${feature.extractType}")

    val df = scalaz.Tag.unwrap(input)

    val inputSize = df.count()
    val fraction = if (sampleSize >= inputSize) 1.0D else sampleSize.toDouble / inputSize
    val sample = df.select(f).filter(df(f).isNotNull).sample(withReplacement = false, fraction)

    // Collect sample values
    val x = sample.collect().map {
      case row if feature.extractType == ShortType => row.getShort(0).toDouble
      case row if feature.extractType == IntegerType => row.getInt(0).toDouble
      case row if feature.extractType == LongType => row.getLong(0).toDouble
      case row if feature.extractType == DoubleType => row.getDouble(0)
    }

    log.debug(s"Collected sample size of: ${x.length}")

    // Doesn't make any sense to do binning if not enough sample points are available
    require(x.length > nbins * 10, s"Number of sample points for binning is too small")

    // Find optimal split
    val bins = optimalSplit(x, nbins, minPoints, minPct)

    log.debug(s"Calculated optimal split: ${bins.size}. " +
      s"Bins: ${bins.map(bin => s"${bin.count} in [${bin.low}, ${bin.high})").mkString(", ")}")

    require(bins.size >= 2, s"Got less than 2 bins, probably sample size is too small or data is too skewed")

    // Transform bins to Bin columns
    val scan = bins.foldLeft(Scan()) {
      case (state@Scan(columnId, cols), bin) =>
        val column = BinColumn.BinValue(columnId + 1, bin.low, bin.high, bin.count, x.length)
        Scan(column.columnId, cols :+ column)
    }

    val columns = scan.columns

    // Update first and last bins to catch out-of-sample values
    BinColumn.toLowerBin(columns.head) +: columns.drop(1).dropRight(1) :+ BinColumn.toUpperBin(columns.last)
  }
}
Example 16
Source File: ModelFeatureSpec.scala from modelmatrix with Apache License 2.0

package com.collective.modelmatrix

import com.collective.modelmatrix.transform.{Bins, Identity, Index, Top}
import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.SqlParser
import org.scalatest.FlatSpec

import scalaz.syntax.validation._

class ModelFeatureSpec extends FlatSpec {

  private val isActive = true
  private val notActive = false
  private val isAllOther = true

  private val features = ConfigFactory.load("./matrix-model.conf").getConfig("features")

  "Model Feature" should "parse 'identity' feature" in {
    val adNetwork = ModelFeature.parse("ad_network", features.getConfig("ad_network"))
    assert(adNetwork == ModelFeature(
      isActive, "advertisement", "ad_network", "network", Identity
    ).successNel)
  }

  it should "parse 'top' feature" in {
    val adType = ModelFeature.parse("ad_type", features.getConfig("ad_type"))
    assert(adType == ModelFeature(
      isActive, "advertisement", "ad_type", "type", Top(95.0, isAllOther)
    ).successNel)
  }

  it should "parse 'index' feature" in {
    val adSize = ModelFeature.parse("ad_size", features.getConfig("ad_size"))
    assert(adSize == ModelFeature(
      isActive, "advertisement", "ad_size", "size", Index(0.5, isAllOther)
    ).successNel)
  }

  it should "parse 'bins' feature" in {
    val adPerformance = ModelFeature.parse("ad_performance", features.getConfig("ad_performance"))
    assert(adPerformance == ModelFeature(
      isActive, "performance", "ad_performance", "pct_clicks", Bins(10, 100, 1.0)
    ).successNel)
  }

  it should "parse extract expression" in {
    val popDensity = ModelFeature.parse("pop_density", features.getConfig("pop_density"))
    assert(popDensity.isSuccess)
  }

  it should "fail to parse bad extract expression" in {
    val popDensity = ModelFeature.parse("pop_density_err", features.getConfig("pop_density_err"))
    assert(popDensity.isFailure)
  }

  it should "parse deactivated feature" in {
    val adVisibility = ModelFeature.parse("ad_visibility", features.getConfig("ad_visibility"))
    assert(adVisibility == ModelFeature(
      notActive, "advertisement", "ad_visibility", "visibility", Top(95.0, isAllOther)
    ).successNel)
  }

  it should "fail on wrong transformation type" in {
    val adTag = ModelFeature.parse("ad_tag", features.getConfig("ad_tag"))
    assert(adTag == "Unknown transform type: magic-transform".failureNel)
  }

  it should "fail on wrong transformation parameter" in {
    val adPosition = ModelFeature.parse("ad_position", features.getConfig("ad_position"))
    assert(adPosition.isFailure)
  }
}
Example 17
Source File: Sink.scala from modelmatrix with Apache License 2.0

package com.collective.modelmatrix.cli

import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext}

import scala.util.{Failure, Success, Try}

sealed trait Sink {
  def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit
}

object Sink {
  private val hive = "hive://(.*)".r
  private val parquet = "parquet://(.*)".r

  def validate(sink: String): Either[String, Unit] = {
    Try(apply(sink)) match {
      case Success(s) => Right(())
      case Failure(err) => Left(s"Unsupported sink type: $sink")
    }
  }

  def apply(sink: String): Sink = sink match {
    case hive(table) => HiveSink(table)
    case parquet(path) => ParquetSink(path)
  }
}

object NoSink extends Sink {
  def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = {
    sys.error(s"Sink is not defined")
  }

  override def toString: String = "Sink is not defined"
}

case class HiveSink(
  tableName: String
) extends Sink {
  def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = {
    df.saveAsTable(tableName, SaveMode.Overwrite)
  }

  override def toString: String = s"Hive table: $tableName"
}

case class ParquetSink(
  path: String
) extends Sink {
  def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = {
    df.saveAsParquetFile(path)
  }

  override def toString: String = s"Parquet: $path"
}
Example 18
Source File: Source.scala from modelmatrix with Apache License 2.0

package com.collective.modelmatrix.cli

import org.apache.spark.sql.{DataFrame, SQLContext}

import scala.util.{Failure, Success, Try}

sealed trait Source {
  def asDataFrame(implicit sqlContext: SQLContext): DataFrame
}

object Source {
  private val hive = "hive://(.*)".r
  private val parquet = "parquet://(.*)".r

  def validate(source: String): Either[String, Unit] = {
    Try(apply(source)) match {
      case Success(s) => Right(())
      case Failure(err) => Left(s"Unsupported source type: $source")
    }
  }

  def apply(source: String): Source = source match {
    case hive(table) => HiveSource(table)
    case parquet(path) => ParquetSource(path)
  }
}

object NoSource extends Source {
  def asDataFrame(implicit sqlContext: SQLContext): DataFrame = {
    sys.error(s"Source is not defined")
  }

  override def toString: String = "Source is not defined"
}

case class HiveSource(
  tableName: String
) extends Source {
  def asDataFrame(implicit sqlContext: SQLContext): DataFrame = {
    sqlContext.sql(s"SELECT * FROM $tableName")
  }

  override def toString: String = {
    s"Hive table: $tableName"
  }
}

case class ParquetSource(
  path: String
) extends Source {
  def asDataFrame(implicit sqlContext: SQLContext): DataFrame = {
    sqlContext.parquetFile(path)
  }

  override def toString: String = {
    s"Parquet: $path"
  }
}
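A brief usage sketch for the factory above. The Hive table name is a placeholder, and sqlContext is assumed to be an existing Spark 1.x SQLContext.

// Hedged sketch: resolve a CLI source string into a Source and load it as a DataFrame.
val source: Source = Source("hive://model_matrix.input_features") // hypothetical table name
val df: DataFrame = source.asDataFrame(sqlContext)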
Example 19
Source File: LogisticRegressionRecommender.scala from wordpress-posts-recommender with Apache License 2.0

package wordpressworkshop

import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

case class LogisticRegressionRecommender(training: DataFrame) {

  val lr = new LogisticRegression()
  val paramMap = ParamMap(lr.maxIter -> 20)
    .put(lr.regParam -> 0.01)
    .put(lr.probabilityCol -> "probability")

  val model: LogisticRegressionModel = lr.fit(training, paramMap)

  def metrics(testData: DataFrame) = {
    val predictionAndLabels: RDD[(Double, Double)] =
      model.transform(testData).map(row =>
        row.getAs[Vector]("probability")(1) -> row.getAs[Double]("label"))

    new BinaryClassificationMetrics(predictionAndLabels)
  }

  def likeScores(testData: DataFrame): RDD[(Long, Long, Double)] =
    model.transform(testData)
      .map(row => (row.getAs[Long]("userId"), row.getAs[Long]("postId"), row.getAs[Vector]("probability")(1)))
}
Example 20
Source File: IrisKMeansClustering.scala from spark-spec with MIT License

package com.github.mrpowers.spark.spec.ml.clustering

import com.github.mrpowers.spark.spec.Config
import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.DataFrame

object IrisKMeansClustering extends SparkSessionWrapper {

  val irisDF = spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(Config.get("irisData"))

  val Array(trainingDF, testDF) = irisDF.randomSplit(Array(0.7, 0.3), seed = 12345)

  def withVectorizedFeatures(
    featureColNames: Array[String] = Array("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"),
    outputColName: String = "features"
  )(df: DataFrame): DataFrame = {
    val assembler: VectorAssembler = new VectorAssembler()
      .setInputCols(featureColNames)
      .setOutputCol(outputColName)
    assembler.transform(df)
  }

  def model(df: DataFrame = trainingDF): KMeansModel = {
    val trainFeatures: DataFrame = df
      .transform(withVectorizedFeatures())

    new KMeans()
      .setK(3) // # of clusters
      .setSeed(2L)
      .fit(trainFeatures)
  }

  def persistModel(): Unit = {
    model().save("./tmp/iris_kMeans_model/")
  }
}
Example 21
Source File: TitanicLogisticRegression.scala from spark-spec with MIT License

package com.github.mrpowers.spark.spec.ml.classification

import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.DataFrame

object TitanicLogisticRegression extends SparkSessionWrapper {

  def withVectorizedFeatures(
    featureColNames: Array[String] = Array("Gender", "Age", "SibSp", "Parch", "Fare"),
    outputColName: String = "features"
  )(df: DataFrame): DataFrame = {
    val assembler: VectorAssembler = new VectorAssembler()
      .setInputCols(featureColNames)
      .setOutputCol(outputColName)
    assembler.transform(df)
  }

  def withLabel(
    inputColName: String = "Survived",
    outputColName: String = "label"
  )(df: DataFrame) = {
    val labelIndexer: StringIndexer = new StringIndexer()
      .setInputCol(inputColName)
      .setOutputCol(outputColName)

    labelIndexer
      .fit(df)
      .transform(df)
  }

  def model(df: DataFrame = TitanicData.trainingDF()): LogisticRegressionModel = {
    val trainFeatures: DataFrame = df
      .transform(withVectorizedFeatures())
      .transform(withLabel())
      .select("features", "label") // only uses the features and label columns

    new LogisticRegression()
      .fit(trainFeatures)
  }

  def persistModel(): Unit = {
    model().save("./tmp/titanic_model/")
  }
}
Example 22
Source File: TitanicData.scala from spark-spec with MIT License

package com.github.mrpowers.spark.spec.ml.classification

import org.apache.spark.sql.functions._
import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper
import org.apache.spark.sql.DataFrame

object TitanicData extends SparkSessionWrapper {

  def trainingDF(
    titanicDataDirName: String = "./src/test/resources/titanic/"
  ): DataFrame = {
    spark
      .read
      .option("header", "true")
      .csv(titanicDataDirName + "train.csv")
      .withColumn(
        "Gender",
        when(col("Sex").equalTo("male"), 0)
          .when(col("Sex").equalTo("female"), 1)
          .otherwise(null)
      )
      .select(
        col("Gender").cast("double"),
        col("Survived").cast("double"),
        col("Pclass").cast("double"),
        col("Age").cast("double"),
        col("SibSp").cast("double"),
        col("Parch").cast("double"),
        col("Fare").cast("double")
      )
      .filter(
        col("Gender").isNotNull &&
          col("Survived").isNotNull &&
          col("Pclass").isNotNull &&
          col("Age").isNotNull &&
          col("SibSp").isNotNull &&
          col("Parch").isNotNull &&
          col("Fare").isNotNull
      )
  }

  def testDF(
    titanicDataDirName: String = "./src/test/resources/titanic/"
  ): DataFrame = {
    val rawTestDF = spark
      .read
      .option("header", "true")
      .csv(titanicDataDirName + "test.csv")

    val genderSubmissionDF = spark
      .read
      .option("header", "true")
      .csv(titanicDataDirName + "gender_submission.csv")

    rawTestDF
      .join(
        genderSubmissionDF,
        Seq("PassengerId")
      )
      .withColumn(
        "Gender",
        when(col("Sex").equalTo("male"), 0)
          .when(col("Sex").equalTo("female"), 1)
          .otherwise(null)
      )
      .select(
        col("Gender").cast("double"),
        col("Survived").cast("double"),
        col("Pclass").cast("double"),
        col("Age").cast("double"),
        col("SibSp").cast("double"),
        col("Parch").cast("double"),
        col("Fare").cast("double")
      )
      .filter(
        col("Gender").isNotNull &&
          col("Pclass").isNotNull &&
          col("Age").isNotNull &&
          col("SibSp").isNotNull &&
          col("Parch").isNotNull &&
          col("Fare").isNotNull
      )
  }
}
Example 23
Source File: IrisKMeansClusteringSpec.scala from spark-spec with MIT License

package com.github.mrpowers.spark.spec.ml.clustering

import com.github.mrpowers.spark.daria.sql.SparkSessionExt._
import com.github.mrpowers.spark.fast.tests.ColumnComparer
import com.github.mrpowers.spark.spec.SparkSessionTestWrapper
import org.apache.spark.ml.evaluation.ClusteringEvaluator
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType
import org.scalatest.FunSpec

class IrisKMeansClusteringSpec
  extends FunSpec
  with SparkSessionTestWrapper
  with ColumnComparer {

  describe("withVectorizedFeatures") {

    it("converts all the features to a vector without blowing up") {
      val df = spark.createDF(
        List(
          (5.1, 3.5, 1.4, 0.2)
        ),
        List(
          ("SepalLengthCm", DoubleType, true),
          ("SepalWidthCm", DoubleType, true),
          ("PetalLengthCm", DoubleType, true),
          ("PetalWidthCm", DoubleType, true)
        )
      ).transform(IrisKMeansClustering.withVectorizedFeatures())

      df.show()
      df.printSchema()
    }

  }

  describe("model") {

    it("prints the cluster centers") {
      println("Cluster Centers: ")
      IrisKMeansClustering.model().clusterCenters.foreach(println)
    }

    it("trains a KMeans Clustering model that's Silhouette with squared euclidean distance above 0.70 percent") {
      val trainData: DataFrame = IrisKMeansClustering.trainingDF
        .transform(IrisKMeansClustering.withVectorizedFeatures())
        .select("features")

      val testData: DataFrame = IrisKMeansClustering.testDF
        .transform(IrisKMeansClustering.withVectorizedFeatures())
        .select("features")

      val predictions: DataFrame = IrisKMeansClustering
        .model()
        .transform(testData)
        .select(
          col("features"),
          col("prediction")
        )

      val res = new ClusteringEvaluator()
        .evaluate(predictions)

      assert(res >= 0.60)
    }

  }

}
Example 24
Source File: package.scala from spark-iqmulus with Apache License 2.0

package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame, Row }
import org.apache.spark.sql.types.StructType

package object ply {

  implicit class PlyDataFrameReader(reader: DataFrameReader) {
    def ply: String => DataFrame = reader.format("fr.ign.spark.iqmulus.ply").load
  }

  implicit class PlyDataFrame(df: DataFrame) {
    def saveAsPly(location: String, littleEndian: Boolean = true) = {
      val df_id = df.drop("pid").drop("fid")
      val schema = df_id.schema
      val saver = (key: Int, iter: Iterator[Row]) =>
        Iterator(iter.saveAsPly(s"$location/$key.ply", schema, littleEndian))
      df_id.rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class PlyRowIterator(iter: Iterator[Row]) {
    def saveAsPly(
      filename: String,
      schema: StructType,
      littleEndian: Boolean
    ) = {
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val rows = iter.toArray
      val count = rows.size.toLong
      val header = new PlyHeader(filename, littleEndian, Map("vertex" -> ((count, schema))))
      val dos = new java.io.DataOutputStream(f);
      dos.write(header.toString.getBytes)
      val ros = new RowOutputStream(dos, littleEndian, schema)
      rows.foreach(ros.write)
      dos.close
      header
    }
  }
}
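A brief usage sketch for the implicits above. The file paths are placeholders, and sqlContext is assumed to be an existing SQLContext.

// Hedged sketch: the implicit classes add .ply to DataFrameReader and .saveAsPly to DataFrame.
import fr.ign.spark.iqmulus.ply._
val cloud: DataFrame = sqlContext.read.ply("/data/lidar/tile_001.ply") // hypothetical input path
cloud.saveAsPly("/data/out/tile_001", littleEndian = true)             // writes one .ply file per partition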
Example 25
Source File: package.scala from spark-iqmulus with Apache License 2.0

package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame, Row }
import org.apache.spark.sql.types.{ FloatType, StructType }

package object xyz {

  implicit class XyzDataFrameReader(reader: DataFrameReader) {
    def xyz: String => DataFrame = reader.format("fr.ign.spark.iqmulus.xyz").load
  }

  implicit class XyzDataFrame(df: DataFrame) {
    def saveAsXyz(location: String) = {
      val df_id = df.drop("id")
      require(df_id.schema.fieldNames.take(3) sameElements Array("x", "y", "z"))
      require(df_id.schema.fields.map(_.dataType).take(3).forall(_ == FloatType))
      val saver = (key: Int, iter: Iterator[Row]) => Iterator(iter.saveXyz(s"$location/$key.xyz"))
      df_id.rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class XyzRowIterator(iter: Iterator[Row]) {
    def saveXyz(filename: String) = {
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val dos = new java.io.DataOutputStream(f)
      var count = 0L
      iter.foreach(row => { count += 1; dos.writeBytes(row.mkString("", "\t", "\n")) })
      dos.close
      (filename, count)
    }
  }
}
Example 26
Source File: package.scala from spark-iqmulus with Apache License 2.0

package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame }
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.Row

package object las {

  implicit class LasDataFrameReader(reader: DataFrameReader) {
    def las: String => DataFrame = reader.format("fr.ign.spark.iqmulus.las").load
  }

  implicit class LasDataFrame(df: DataFrame) {
    def saveAsLas(
      location: String,
      formatOpt: Option[Byte] = None,
      version: Version = Version(),
      scale: Array[Double] = Array(0.01, 0.01, 0.01),
      offset: Array[Double] = Array(0, 0, 0)
    ) = {
      val format = formatOpt.getOrElse(LasHeader.formatFromSchema(df.schema))
      val schema = LasHeader.schema(format) // no user types for now
      val cols = schema.fieldNames.intersect(df.schema.fieldNames)
      val saver = (key: Int, iter: Iterator[Row]) =>
        Iterator(iter.saveAsLas(s"$location/$key.las", schema, format, scale, offset, version))
      df.select(cols.head, cols.tail: _*).rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class LasRowIterator(iter: Iterator[Row]) {
    def saveAsLas(
      filename: String, schema: StructType, format: Byte,
      scale: Array[Double], offset: Array[Double], version: Version = Version()
    ) = {
      // materialize the partition to access it in a single pass, TODO workaround that
      val rows = iter.toArray
      val count = rows.length.toLong
      val pmin = Array.fill[Double](3)(Double.PositiveInfinity)
      val pmax = Array.fill[Double](3)(Double.NegativeInfinity)
      val countByReturn = Array.fill[Long](15)(0)
      rows.foreach { row =>
        val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble
        val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble
        val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble
        val ret = row.getAs[Byte]("flags") & 0x3
        countByReturn(ret) += 1
        pmin(0) = Math.min(pmin(0), x)
        pmin(1) = Math.min(pmin(1), y)
        pmin(2) = Math.min(pmin(2), z)
        pmax(0) = Math.max(pmax(0), x)
        pmax(1) = Math.max(pmax(1), y)
        pmax(2) = Math.max(pmax(2), z)
      }
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val header = new LasHeader(filename, format, count, pmin, pmax, scale, offset,
        version = version, pdr_return_nb = countByReturn)
      val dos = new java.io.DataOutputStream(f);
      header.write(dos)
      val ros = new RowOutputStream(dos, littleEndian = true, schema)
      rows.foreach(ros.write)
      dos.close
      header
    }
  }
}
Example 27
Source File: DataFrameConverterSpec.scala from incubator-toree with Apache License 2.0

package org.apache.toree.utils

import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row}
import org.mockito.Mockito._
import org.scalatest.mock.MockitoSugar
import org.scalatest.{BeforeAndAfterAll, FunSpec, Matchers}
import play.api.libs.json.{JsArray, JsString, Json}
import test.utils.SparkContextProvider

import scala.collection.mutable

class DataFrameConverterSpec extends FunSpec with MockitoSugar with Matchers with BeforeAndAfterAll {

  lazy val spark = SparkContextProvider.sparkContext

  override protected def afterAll(): Unit = {
    spark.stop()
    super.afterAll()
  }

  val dataFrameConverter: DataFrameConverter = new DataFrameConverter
  val mockDataFrame = mock[DataFrame]
  val mockRdd = spark.parallelize(Seq(Row(new mutable.WrappedArray.ofRef(Array("test1", "test2")), 2, null)))
  val mockStruct = mock[StructType]
  val columns = Seq("foo", "bar").toArray

  doReturn(mockStruct).when(mockDataFrame).schema
  doReturn(columns).when(mockStruct).fieldNames
  doReturn(mockRdd).when(mockDataFrame).rdd

  describe("DataFrameConverter") {
    describe("#convert") {
      it("should convert to a valid JSON object") {
        val someJson = dataFrameConverter.convert(mockDataFrame, "json")
        val jsValue = Json.parse(someJson.get)
        jsValue \ "columns" should be (JsArray(Seq(JsString("foo"), JsString("bar"))))
        jsValue \ "rows" should be (JsArray(Seq(
          JsArray(Seq(JsString("[test1, test2]"), JsString("2"), JsString("null")))
        )))
      }
      it("should convert to csv") {
        val csv = dataFrameConverter.convert(mockDataFrame, "csv").get
        val values = csv.split("\n")
        values(0) shouldBe "foo,bar"
        values(1) shouldBe "[test1, test2],2,null"
      }
      it("should convert to html") {
        val html = dataFrameConverter.convert(mockDataFrame, "html").get
        html.contains("<th>foo</th>") should be(true)
        html.contains("<th>bar</th>") should be(true)
        html.contains("<td>[test1, test2]</td>") should be(true)
        html.contains("<td>2</td>") should be(true)
        html.contains("<td>null</td>") should be(true)
      }
      it("should convert limit the selection") {
        val someLimited = dataFrameConverter.convert(mockDataFrame, "csv", 1)
        val limitedLines = someLimited.get.split("\n")
        limitedLines.length should be(2)
      }
      it("should return a Failure for invalid types") {
        val result = dataFrameConverter.convert(mockDataFrame, "Invalid Type")
        result.isFailure should be(true)
      }
    }
  }
}
Example 28
Source File: HBaseSource.scala from hbase-connectors with Apache License 2.0

package org.apache.hadoop.hbase.spark.example.datasources

import org.apache.hadoop.hbase.spark.datasources.HBaseTableCatalog
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
case class HBaseRecord(
  col0: String,
  col1: Boolean,
  col2: Double,
  col3: Float,
  col4: Int,
  col5: Long,
  col6: Short,
  col7: String,
  col8: Byte)

@InterfaceAudience.Private
object HBaseRecord {
  def apply(i: Int): HBaseRecord = {
    val s = s"""row${"%03d".format(i)}"""
    HBaseRecord(s,
      i % 2 == 0,
      i.toDouble,
      i.toFloat,
      i,
      i.toLong,
      i.toShort,
      s"String$i extra",
      i.toByte)
  }
}

@InterfaceAudience.Private
object HBaseSource {
  val cat = s"""{
               |"table":{"namespace":"default", "name":"HBaseSourceExampleTable"},
               |"rowkey":"key",
               |"columns":{
               |"col0":{"cf":"rowkey", "col":"key", "type":"string"},
               |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"},
               |"col2":{"cf":"cf2", "col":"col2", "type":"double"},
               |"col3":{"cf":"cf3", "col":"col3", "type":"float"},
               |"col4":{"cf":"cf4", "col":"col4", "type":"int"},
               |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"},
               |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"},
               |"col7":{"cf":"cf7", "col":"col7", "type":"string"},
               |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"}
               |}
               |}""".stripMargin

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("HBaseSourceExample")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)

    import sqlContext.implicits._

    def withCatalog(cat: String): DataFrame = {
      sqlContext
        .read
        .options(Map(HBaseTableCatalog.tableCatalog -> cat))
        .format("org.apache.hadoop.hbase.spark")
        .load()
    }

    val data = (0 to 255).map { i =>
      HBaseRecord(i)
    }

    sc.parallelize(data).toDF.write.options(
      Map(HBaseTableCatalog.tableCatalog -> cat, HBaseTableCatalog.newTable -> "5"))
      .format("org.apache.hadoop.hbase.spark")
      .save()

    val df = withCatalog(cat)
    df.show()
    df.filter($"col0" <= "row005")
      .select($"col0", $"col1").show
    df.filter($"col0" === "row005" || $"col0" <= "row005")
      .select($"col0", $"col1").show
    df.filter($"col0" > "row250")
      .select($"col0", $"col1").show
    df.registerTempTable("table1")
    val c = sqlContext.sql("select count(col1) from table1 where col0 < 'row050'")
    c.show()
  }
}
Example 29
Source File: IntermediateCacher.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{ParamMap, StringArrayParam} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} class IntermediateCacher(override val uid: String) extends Transformer with DefaultParamsWritable { def this() = { this(Identifiable.randomUID("intermediateCacher")) } val inputCols = new StringArrayParam(this, "inputCols", "Input column names") def getInputCols: Array[String] = $(inputCols) def setInputCols(value: Array[String]): this.type = set(inputCols, value) setDefault(inputCols -> Array.empty[String]) override def transformSchema(schema: StructType): StructType = { schema } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema) val intermediateDF = if ($(inputCols).isEmpty) dataset.toDF() else dataset.select($(inputCols).map(col(_)): _*) intermediateDF.cache() } override def copy(extra: ParamMap): IntermediateCacher = { defaultCopy(extra) } } object IntermediateCacher extends DefaultParamsReadable[IntermediateCacher]
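A usage sketch (not part of the albedo source above) that drops the cacher into a standard Spark ML Pipeline so the projected columns are cached before the next stage runs; the column names are illustrative.

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.SparkSession
import ws.vinta.albedo.transformers.IntermediateCacher

val spark = SparkSession.builder().master("local[2]").appName("cacher-sketch").getOrCreate()
import spark.implicits._

// Illustrative data; the column names are not taken from the albedo project.
val rawDF = Seq((1, 100, 1), (1, 101, 0), (2, 100, 1)).toDF("user_id", "repo_id", "starring")

// Keep only the listed columns and cache the projection between the two stages.
val cacher = new IntermediateCacher().setInputCols(Array("user_id", "repo_id", "starring"))
val indexer = new StringIndexer().setInputCol("repo_id").setOutputCol("repo_id_index")

val pipeline = new Pipeline().setStages(Array(cacher, indexer))
pipeline.fit(rawDF).transform(rawDF).show()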
Example 30
Source File: RankingMetricFormatter.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} import ws.vinta.albedo.closures.UDFs._ import ws.vinta.albedo.evaluators.RankingEvaluator._ class RankingMetricFormatter(override val uid: String, val sourceType: String) extends Transformer with DefaultParamsWritable { def this(sourceType: String) = { this(Identifiable.randomUID("rankingMetricFormatter"), sourceType) } val userCol = new Param[String](this, "userCol", "User column name") def getUserCol: String = $(userCol) def setUserCol(value: String): this.type = set(userCol, value) setDefault(userCol -> "user") val itemCol = new Param[String](this, "itemCol", "Item column name") def getItemCol: String = $(itemCol) def setItemCol(value: String): this.type = set(itemCol, value) setDefault(itemCol -> "item") val predictionCol = new Param[String](this, "predictionCol", "Prediction column name") def getPredictionCol: String = $(predictionCol) def setPredictionCol(value: String): this.type = set(predictionCol, value) setDefault(predictionCol -> "prediction") val topK = new IntParam(this, "topK", "Recommend top-k items for every user") def getTopK: Int = $(topK) def setTopK(value: Int): this.type = set(topK, value) setDefault(topK -> 15) override def transformSchema(schema: StructType): StructType = { Map($(userCol) -> IntegerType, $(itemCol) -> IntegerType) .foreach{ case(columnName: String, expectedDataType: DataType) => { val actualDataType = schema(columnName).dataType require(actualDataType.equals(expectedDataType), s"Column $columnName must be of type $expectedDataType but was actually $actualDataType.") } } schema } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema) sourceType match { case "als" => dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), col($(predictionCol)).desc, $(topK))) case "lr" => dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), toArrayUDF(col($(predictionCol))).getItem(1).desc, $(topK))) } } override def copy(extra: ParamMap): RankingMetricFormatter = { val copied = new RankingMetricFormatter(uid, sourceType) copyValues(copied, extra) } } object RankingMetricFormatter extends DefaultParamsReadable[RankingMetricFormatter]
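A usage sketch (not part of the original) for the "als" source type. It assumes a DataFrame of ALS-style predictions and relies on intoUserPredictedItems from RankingEvaluator, which is not shown in this listing; the column names are illustrative.

import org.apache.spark.sql.SparkSession
import ws.vinta.albedo.transformers.RankingMetricFormatter

val spark = SparkSession.builder().master("local[2]").appName("formatter-sketch").getOrCreate()
import spark.implicits._

// Illustrative ALS-style predictions: (user, item, predicted rating).
// transformSchema above requires the user and item columns to be IntegerType.
val alsPredictionDF = Seq((1, 100, 0.9f), (1, 101, 0.4f), (2, 100, 0.7f))
  .toDF("user_id", "repo_id", "prediction")

val formatter = new RankingMetricFormatter("als")
  .setUserCol("user_id")
  .setItemCol("repo_id")
  .setPredictionCol("prediction")
  .setTopK(30)

// One row per user with its top-k predicted items, shaped by intoUserPredictedItems.
val userPredictedItemsDF = formatter.transform(alsPredictionDF)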
Example 31
Source File: UserRepoTransformer.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{ParamMap, StringArrayParam} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} import ws.vinta.albedo.closures.UDFs._ class UserRepoTransformer(override val uid: String) extends Transformer with DefaultParamsWritable { def this() = { this(Identifiable.randomUID("userRepoTransformer")) } val inputCols: StringArrayParam = new StringArrayParam(this, "inputCols", "Input column names") def getInputCols: Array[String] = $(inputCols) def setInputCols(value: Array[String]): this.type = set(inputCols, value) override def transformSchema(schema: StructType): StructType = { $(inputCols).foreach((inputColName: String) => { require(schema.fieldNames.contains(inputColName), s"Input column $inputColName must exist.") }) val newFields: Array[StructField] = Array( StructField("repo_language_index_in_user_recent_repo_languages", IntegerType, nullable = false), StructField("repo_language_count_in_user_recent_repo_languages", IntegerType, nullable = false) ) StructType(schema.fields ++ newFields) } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema) import dataset.sparkSession.implicits._ dataset .withColumn("repo_language_index_in_user_recent_repo_languages", repoLanguageIndexInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages")) .withColumn("repo_language_count_in_user_recent_repo_languages", repoLanguageCountInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages")) } override def copy(extra: ParamMap): UserRepoTransformer = { defaultCopy(extra) } } object UserRepoTransformer extends DefaultParamsReadable[UserRepoTransformer]
Example 32
Source File: ContentRecommender.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.recommenders

import org.apache.http.HttpHost
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.elasticsearch.action.search.SearchRequest
import org.elasticsearch.client.{RestClient, RestHighLevelClient}
import org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item
import org.elasticsearch.index.query.QueryBuilders._
import org.elasticsearch.search.SearchHit
import org.elasticsearch.search.builder.SearchSourceBuilder
import ws.vinta.albedo.closures.DBFunctions._

class ContentRecommender(override val uid: String) extends Recommender {

  def this() = {
    this(Identifiable.randomUID("contentRecommender"))
  }

  val enableEvaluationMode = new Param[Boolean](this, "enableEvaluationMode", "Should be enabled for evaluation only")

  def getEnableEvaluationMode: Boolean = $(enableEvaluationMode)

  def setEnableEvaluationMode(value: Boolean): this.type = set(enableEvaluationMode, value)
  setDefault(enableEvaluationMode -> false)

  override def source = "content"

  override def recommendForUsers(userDF: Dataset[_]): DataFrame = {
    transformSchema(userDF.schema)

    import userDF.sparkSession.implicits._

    val userRecommendedItemDF = userDF
      .as[Int]
      .flatMap {
        case (userId) => {
          // Because a More Like This query that uses document ids as its conditions
          // filters those conditioning document ids out of the results,
          // which is not appropriate when evaluating,
          // we use the later k starred repos as the query conditions instead.
          val limit = $(topK)
          val offset = if ($(enableEvaluationMode)) $(topK) else 0
          val repoIds = selectUserStarredRepos(userId, limit, offset)

          val lowClient = RestClient.builder(new HttpHost("127.0.0.1", 9200, "http")).build()
          val highClient = new RestHighLevelClient(lowClient)

          val fields = Array("description", "full_name", "language", "topics")
          val texts = Array("")
          val items = repoIds.map((itemId: Int) => new Item("repo", "repo_info_doc", itemId.toString))
          val queryBuilder = moreLikeThisQuery(fields, texts, items)
            .minTermFreq(2)
            .maxQueryTerms(50)

          val searchSourceBuilder = new SearchSourceBuilder()
          searchSourceBuilder.query(queryBuilder)
          searchSourceBuilder.size($(topK))
          searchSourceBuilder.from(0)

          val searchRequest = new SearchRequest()
          searchRequest.indices("repo")
          searchRequest.types("repo_info_doc")
          searchRequest.source(searchSourceBuilder)

          val searchResponse = highClient.search(searchRequest)
          val hits = searchResponse.getHits
          val searchHits = hits.getHits

          val userItemScoreTuples = searchHits.map((searchHit: SearchHit) => {
            val itemId = searchHit.getId.toInt
            val score = searchHit.getScore
            (userId, itemId, score)
          })

          lowClient.close()

          userItemScoreTuples
        }
      }
      .toDF($(userCol), $(itemCol), $(scoreCol))
      .withColumn($(sourceCol), lit(source))

    userRecommendedItemDF
  }
}
Example 33
Source File: PopularityRecommender.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.recommenders import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import ws.vinta.albedo.utils.DatasetUtils._ class PopularityRecommender(override val uid: String) extends Recommender { def this() = { this(Identifiable.randomUID("popularityRecommender")) } override def source = "popularity" override def recommendForUsers(userDF: Dataset[_]): DataFrame = { transformSchema(userDF.schema) implicit val spark: SparkSession = userDF.sparkSession import spark.implicits._ val popularRepoDF = loadPopularRepoDF() .limit($(topK)) .cache() def calculateScoreUDF = udf((stargazers_count: Int, created_at: java.sql.Timestamp) => { val valueScore = math.round(math.log10(stargazers_count) * 1000.0) / 1000.0 val timeScore = (created_at.getTime / 1000.0) / (60 * 60 * 24 * 30 * 12) / 5.0 valueScore + timeScore }) userDF .select($(userCol)) .crossJoin(popularRepoDF) .select(col($(userCol)), $"repo_id".alias($(itemCol)), calculateScoreUDF($"repo_stargazers_count", $"repo_created_at").alias($(scoreCol))) .withColumn($(sourceCol), lit(source)) } }
Example 34
Source File: CurationRecommender.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.recommenders import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import ws.vinta.albedo.utils.DatasetUtils._ class CurationRecommender(override val uid: String) extends Recommender { def this() = { this(Identifiable.randomUID("curationRecommender")) } override def source = "curation" override def recommendForUsers(userDF: Dataset[_]): DataFrame = { transformSchema(userDF.schema) implicit val spark: SparkSession = userDF.sparkSession import spark.implicits._ val rawStarringDS = loadRawStarringDS().cache() val curatorIds = Array(652070, 1912583, 59990, 646843, 28702) // vinta, saiday, tzangms, fukuball, wancw val curatedRepoDF = rawStarringDS .select($"repo_id", $"starred_at") .where($"user_id".isin(curatorIds: _*)) .groupBy($"repo_id") .agg(max($"starred_at").alias("starred_at")) .orderBy($"starred_at".desc) .limit($(topK)) .cache() def calculateScoreUDF = udf((starred_at: java.sql.Timestamp) => { starred_at.getTime / 1000.0 }) userDF .select($(userCol)) .crossJoin(curatedRepoDF) .select(col($(userCol)), $"repo_id".alias($(itemCol)), calculateScoreUDF($"starred_at").alias($(scoreCol))) .withColumn($(sourceCol), lit(source)) } }
Example 35
Source File: ALSRecommender.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.recommenders import com.github.fommil.netlib.F2jBLAS import org.apache.spark.ml.recommendation.ALSModel import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset} import ws.vinta.albedo.settings class ALSRecommender(override val uid: String) extends Recommender { def this() = { this(Identifiable.randomUID("alsRecommender")) } private def alsModel: ALSModel = { val alsModelPath = s"${settings.dataDir}/${settings.today}/alsModel.parquet" ALSModel.load(alsModelPath) } def blockify(factors: Dataset[(Int, Array[Float])], blockSize: Int = 4096): Dataset[Seq[(Int, Array[Float])]] = { import factors.sparkSession.implicits._ factors.mapPartitions(_.grouped(blockSize)) } override def source = "als" override def recommendForUsers(userDF: Dataset[_]): DataFrame = { transformSchema(userDF.schema) import userDF.sparkSession.implicits._ val activeUsers = userDF.select(col($(userCol)).alias("id")) val userFactors = alsModel.userFactors.join(activeUsers, Seq("id")) val itemFactors = alsModel.itemFactors val rank = alsModel.rank val num = $(topK) val userFactorsBlocked = blockify(userFactors.as[(Int, Array[Float])]) val itemFactorsBlocked = blockify(itemFactors.as[(Int, Array[Float])]) val ratings = userFactorsBlocked.crossJoin(itemFactorsBlocked) .as[(Seq[(Int, Array[Float])], Seq[(Int, Array[Float])])] .flatMap { case (srcIter, dstIter) => val m = srcIter.size val n = math.min(dstIter.size, num) val output = new Array[(Int, Int, Float)](m * n) var i = 0 val pq = new BoundedPriorityQueue[(Int, Float)](num)(Ordering.by(_._2)) srcIter.foreach { case (srcId, srcFactor) => dstIter.foreach { case (dstId, dstFactor) => val score = new F2jBLAS().sdot(rank, srcFactor, 1, dstFactor, 1) pq += dstId -> score } pq.foreach { case (dstId, score) => output(i) = (srcId, dstId, score) i += 1 } pq.clear() } output.toSeq } ratings .toDF($(userCol), $(itemCol), $(scoreCol)) .withColumn($(sourceCol), lit(source)) } }
Example 36
Source File: SimpleVectorAssembler.scala From albedo with MIT License | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkException import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import scala.collection.mutable.ArrayBuilder def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val schema = dataset.schema val assembleFunc = udf { r: Row => SimpleVectorAssembler.assemble(r.toSeq: _*) } val args = $(inputCols).map { c => schema(c).dataType match { case DoubleType => dataset(c) case _: VectorUDT => dataset(c) case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid") } } dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol))) } override def transformSchema(schema: StructType): StructType = { val inputColNames = $(inputCols) val outputColName = $(outputCol) val inputDataTypes = inputColNames.map(name => schema(name).dataType) inputDataTypes.foreach { case _: NumericType | BooleanType => case t if t.isInstanceOf[VectorUDT] => case other => throw new IllegalArgumentException(s"Data type $other is not supported.") } if (schema.fieldNames.contains(outputColName)) { throw new IllegalArgumentException(s"Output column $outputColName already exists.") } StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true)) } override def copy(extra: ParamMap): SimpleVectorAssembler = defaultCopy(extra) } object SimpleVectorAssembler extends DefaultParamsReadable[SimpleVectorAssembler] { override def load(path: String): SimpleVectorAssembler = super.load(path) def assemble(vv: Any*): Vector = { val indices = ArrayBuilder.make[Int] val values = ArrayBuilder.make[Double] var cur = 0 vv.foreach { case v: Double => if (v != 0.0) { indices += cur values += v } cur += 1 case vec: Vector => vec.foreachActive { case (i, v) => if (v != 0.0) { indices += cur + i values += v } } cur += vec.size case null => // TODO: output Double.NaN? throw new SparkException("Values to assemble cannot be null.") case o => throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.") } Vectors.sparse(cur, indices.result(), values.result()).compressed } }
Example 37
Source File: CouchbaseSink.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.sql.streaming import com.couchbase.spark.Logging import org.apache.spark.sql.{DataFrame, SaveMode} import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.sql.types.StringType import com.couchbase.spark.sql._ import com.couchbase.spark._ import com.couchbase.client.core.CouchbaseException import com.couchbase.client.java.document.JsonDocument import com.couchbase.client.java.document.json.JsonObject import scala.concurrent.duration._ class CouchbaseSink(options: Map[String, String]) extends Sink with Logging { override def addBatch(batchId: Long, data: DataFrame): Unit = { val bucketName = options.get("bucket").orNull val idFieldName = options.getOrElse("idField", DefaultSource.DEFAULT_DOCUMENT_ID_FIELD) val removeIdField = options.getOrElse("removeIdField", "true").toBoolean val timeout = options.get("timeout").map(v => Duration(v.toLong, MILLISECONDS)) val createDocument = options.get("expiry").map(_.toInt) .map(expiry => (id: String, content: JsonObject) => JsonDocument.create(id, expiry, content)) .getOrElse((id: String, content: JsonObject) => JsonDocument.create(id, content)) data.toJSON .queryExecution .toRdd .map(_.get(0, StringType).asInstanceOf[UTF8String].toString()) .map { rawJson => val encoded = JsonObject.fromJson(rawJson) val id = encoded.get(idFieldName) if (id == null) { throw new Exception(s"Could not find ID field $idFieldName in $encoded") } if (removeIdField) { encoded.removeKey(idFieldName) } createDocument(id.toString, encoded) } .saveToCouchbase(bucketName, StoreMode.UPSERT, timeout) } }
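A hedged sketch (not part of the original) of driving this sink from Structured Streaming. It assumes the connector's DefaultSource, used by name for reads in the N1qlSpec example below, also registers this Sink for streaming writes; only the options read in addBatch above are used.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[2]").appName("couchbase-sink-sketch").getOrCreate()

// A rate source keeps the sketch self-contained; any streaming DataFrame with an id field works.
val streamDF = spark.readStream.format("rate").option("rowsPerSecond", "1").load()
  .withColumnRenamed("value", "id")

val query = streamDF.writeStream
  .format("com.couchbase.spark.sql.DefaultSource") // assumed to wire up CouchbaseSink
  .option("bucket", "default")
  .option("idField", "id")
  .option("removeIdField", "true")
  .option("checkpointLocation", "/tmp/couchbase-sink-checkpoint")
  .start()

query.awaitTermination()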
Example 38
Source File: DataFrameReaderFunctions.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.sql import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, DataFrameReader} class DataFrameReaderFunctions(@transient val dfr: DataFrameReader) extends Serializable { private def buildFrame(options: Map[String, String] = null, schema: StructType = null, schemaFilter: Option[Filter] = null): DataFrame = { val builder = dfr .format(source) .schema(schema) val filter = schemaFilter.map(N1QLRelation.filterToExpression) if (filter.isDefined) { builder.option("schemaFilter", filter.get) } if (options != null) { builder.options(options) } builder.load() } }
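A usage sketch (not part of the original) of the reader extension from the caller's side; it mirrors the read.couchbase calls and Spark configuration used in the CouchbaseDataFrameSpec example further down this listing.

import com.couchbase.spark.sql._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.sources.EqualTo

val spark = SparkSession.builder()
  .master("local[2]")
  .appName("couchbase-read-sketch")
  .config("spark.couchbase.nodes", "127.0.0.1")
  .config("com.couchbase.username", "Administrator")
  .config("com.couchbase.password", "password")
  .config("com.couchbase.bucket.travel-sample", "")
  .getOrCreate()

// Infer the schema from documents whose type field is "airline" and load them as a DataFrame.
val airlines = spark.sqlContext.read.couchbase(EqualTo("type", "airline"), Map("bucket" -> "travel-sample"))
airlines.printSchema()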
Example 39
Source File: N1qlSpec.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.n1ql import com.couchbase.client.core.CouchbaseException import com.couchbase.client.java.error.QueryExecutionException import com.couchbase.client.java.query.N1qlQuery import org.apache.spark.{SparkConf, SparkContext, SparkException} import org.apache.spark.sql.sources.EqualTo import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} import org.scalatest._ import com.couchbase.spark._ import com.couchbase.spark.connection.CouchbaseConnection import com.couchbase.spark.sql.N1QLRelation import org.apache.spark.sql.types.{StringType, StructField, StructType} import scala.util.control.NonFatal class N1qlSpec extends FunSuite with Matchers with BeforeAndAfterAll { private val master = "local[2]" private val appName = "cb-int-specs1" private var spark: SparkSession = _ override def beforeAll(): Unit = { spark = SparkSession .builder() .master(master) .appName(appName) .config("spark.couchbase.username", "Administrator") .config("spark.couchbase.password", "password") // Open 2 buckets as tests below rely on it .config("com.couchbase.bucket.default", "") .config("com.couchbase.bucket.travel-sample", "") .getOrCreate() } override def afterAll(): Unit = { CouchbaseConnection().stop() spark.stop() } test("Creating N1QLRelation with default bucket, when two buckets exist, should fail") { assertThrows[IllegalStateException] { spark.read .format("com.couchbase.spark.sql.DefaultSource") .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline"))) .option("schemaFilter", "`type` = 'airline'") .schema(StructType(StructField("name", StringType) :: Nil)) .load() } } test("Creating N1QLRelation with non-default bucket, when two buckets exist, should succeed") { spark.read .format("com.couchbase.spark.sql.DefaultSource") .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline"))) .option("schemaFilter", "`type` = 'airline'") .option("bucket", "travel-sample") .schema(StructType(StructField("name", StringType) :: Nil)) .load() } test("N1QL failures should fail the Observable") { try { spark.sparkContext .couchbaseQuery(N1qlQuery.simple("BAD QUERY"), bucketName = "default") .collect() .foreach(println) fail() } catch { case e: SparkException => assert (e.getCause.isInstanceOf[QueryExecutionException]) val err = e.getCause.asInstanceOf[QueryExecutionException] assert (err.getMessage == "syntax error - at QUERY") case NonFatal(e) => println(e) fail() } } }
Example 40
Source File: CouchbaseDataFrameSpec.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.sql import com.couchbase.spark.connection.CouchbaseConnection import org.apache.avro.generic.GenericData.StringType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession} import org.apache.spark.sql.sources.EqualTo import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest._ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class CouchbaseDataFrameSpec extends FlatSpec with Matchers with BeforeAndAfterAll { private val master = "local[2]" private val appName = "cb-int-specs1" private var spark: SparkSession = null override def beforeAll(): Unit = { val conf = new SparkConf() .setMaster(master) .setAppName(appName) .set("spark.couchbase.nodes", "127.0.0.1") .set("com.couchbase.username", "Administrator") .set("com.couchbase.password", "password") .set("com.couchbase.bucket.default", "") .set("com.couchbase.bucket.travel-sample", "") spark = SparkSession.builder().config(conf).getOrCreate() loadData() } override def afterAll(): Unit = { CouchbaseConnection().stop() spark.stop() } def loadData(): Unit = { } "If two buckets are used and the bucket is specified the API" should "not fail" in { val ssc = spark.sqlContext ssc.read.couchbase(EqualTo("type", "airline"), Map("bucket" -> "travel-sample")) } "The DataFrame API" should "infer the schemas" in { val ssc = spark.sqlContext import com.couchbase.spark.sql._ val airline = ssc.read.couchbase(EqualTo("type", "airline"), Map("bucket" -> "travel-sample")) val airport = ssc.read.couchbase(EqualTo("type", "airport"), Map("bucket" -> "travel-sample")) val route = ssc.read.couchbase(EqualTo("type", "route"), Map("bucket" -> "travel-sample")) val landmark = ssc.read.couchbase(EqualTo("type", "landmark"), Map("bucket" -> "travel-sample")) airline .limit(10) .write .mode(SaveMode.Overwrite) .couchbase(Map("bucket" -> "default")) // TODO: validate schemas which are inferred on a field and type basis } it should "write and ignore" in { val ssc = spark.sqlContext import com.couchbase.spark.sql._ // create df, write it twice val data = ("Michael", 28, true) val df = ssc.createDataFrame(spark.sparkContext.parallelize(Seq(data))) df.write .mode(SaveMode.Ignore) .couchbase(options = Map("idField" -> "_1", "bucket" -> "default")) df.write .mode(SaveMode.Ignore) .couchbase(options = Map("idField" -> "_1", "bucket" -> "default")) } it should "filter based on a function" in { val ssc = spark.sqlContext import com.couchbase.spark.sql._ val airlineBySubstrCountry: DataFrame = ssc.read.couchbase( EqualTo("'substr(country, 0, 6)'", "United"), Map("bucket" -> "travel-sample")) airlineBySubstrCountry.count() should equal(6797) } }
Example 41
Source File: CustomConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.DataFrame import CustomConstraint.{FailureMsg, SuccessMsg} import scala.util.Try case class CustomConstraint(name: String, constraintFunction: DataFrame => Either[FailureMsg, SuccessMsg] ) extends Constraint { val fun = (df: DataFrame) => { val tryFun = Try(constraintFunction(df)) val messagePrefix = s"Custom constraint '$name'" val message = tryFun.map { case Left(failureMsg) => s"$messagePrefix failed: $failureMsg" case Right(successMsg) => s"$messagePrefix succeeded: $successMsg" }.recover { case throwable => s"$messagePrefix errored: $throwable" }.get val status = ConstraintUtil.tryToStatus[Either[FailureMsg, SuccessMsg]](tryFun, _.isRight) CustomConstraintResult(this, message, status) } } case class CustomConstraintResult(constraint: CustomConstraint, message: String, status: ConstraintStatus) extends ConstraintResult[CustomConstraint] object CustomConstraint { type SuccessMsg = String type FailureMsg = String }
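A usage sketch (not part of the original) that runs a custom constraint directly through its fun, the same way the DDQ reporter tests further down call constraint.fun(df).

import de.frosner.ddq.constraints.CustomConstraint
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[2]").appName("custom-constraint-sketch").getOrCreate()
import spark.implicits._

val df = Seq(1, 2, 3).toDF("id")

// Right carries a success message, Left a failure message, matching the type aliases above.
val nonEmpty = CustomConstraint("non-empty", d =>
  if (d.count() > 0) Right(s"${d.count()} rows found") else Left("no rows found"))

val result = nonEmpty.fun(df)
println(result.status)  // ConstraintSuccess
println(result.message) // Custom constraint 'non-empty' succeeded: 3 rows found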
Example 42
Source File: FunctionalDependencyConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class FunctionalDependencyConstraint(determinantSet: Seq[String], dependentSet: Seq[String]) extends Constraint { require(determinantSet.nonEmpty, "determinantSet must not be empty") require(dependentSet.nonEmpty, "dependentSet must not be empty") val fun = (df: DataFrame) => { val determinantColumns = determinantSet.map(columnName => new Column(columnName)) val dependentColumns = dependentSet.map(columnName => new Column(columnName)) val maybeRelevantSelection = Try(df.select(determinantColumns ++ dependentColumns: _*)) val maybeDeterminantValueCounts = maybeRelevantSelection.map(_.distinct.groupBy(determinantColumns: _*).count) val maybeViolatingDeterminantValuesCount = maybeDeterminantValueCounts.map(_.filter(new Column("count") =!= 1).count) FunctionalDependencyConstraintResult( constraint = this, data = maybeViolatingDeterminantValuesCount.toOption.map(FunctionalDependencyConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeViolatingDeterminantValuesCount, _ == 0) ) } } case class FunctionalDependencyConstraintResult(constraint: FunctionalDependencyConstraint, data: Option[FunctionalDependencyConstraintResultData], status: ConstraintStatus) extends ConstraintResult[FunctionalDependencyConstraint] { val message: String = { val maybeFailedRows = data.map(_.failedRows) val maybeRowPluralS = maybeFailedRows.map(failedRows => if (failedRows == 1) "" else "s") val dependentSet = constraint.dependentSet val determinantString = s"${constraint.determinantSet.mkString(", ")}" val dependentString = s"${dependentSet.mkString(", ")}" val (columnPluralS, columnVerb) = if (dependentSet.size == 1) ("", "is") else ("s", "are") (status, maybeFailedRows, maybeRowPluralS) match { case (ConstraintSuccess, Some(0), _) => s"Column$columnPluralS $dependentString $columnVerb functionally dependent on $determinantString." case (ConstraintFailure, Some(failedRows), Some(rowPluralS)) => s"Column$columnPluralS $dependentString $columnVerb not functionally dependent on " + s"$determinantString ($failedRows violating determinant value$rowPluralS)." case (ConstraintError(throwable), None, None) => s"Checking whether column$columnPluralS $dependentString $columnVerb functionally " + s"dependent on $determinantString failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class FunctionalDependencyConstraintResultData(failedRows: Long)
Example 43
Source File: NumberOfRowsConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.functions.count import org.apache.spark.sql.{Column, DataFrame} case class NumberOfRowsConstraint private[ddq] (expected: Column) extends Constraint { val fun = (df: DataFrame) => { val countDf = df.agg(count(new Column("*")).as(NumberOfRowsConstraint.countKey)) val actual = countDf.collect().map(_.getLong(0)).apply(0) val satisfied = countDf.select(expected).collect().map(_.getBoolean(0)).apply(0) NumberOfRowsConstraintResult( constraint = this, actual = actual, status = if (satisfied) ConstraintSuccess else ConstraintFailure ) } } object NumberOfRowsConstraint { private[constraints] val countKey: String = "count" def apply(expected: Column => Column): NumberOfRowsConstraint = { new NumberOfRowsConstraint(expected(new Column(countKey))) } def greaterThan(expected: Int): NumberOfRowsConstraint = { NumberOfRowsConstraint(_ > expected) } def lessThan(expected: Int): NumberOfRowsConstraint = { NumberOfRowsConstraint(_ < expected) } def equalTo(expected: Int): NumberOfRowsConstraint = { NumberOfRowsConstraint(_ === expected) } } case class NumberOfRowsConstraintResult(constraint: NumberOfRowsConstraint, actual: Long, status: ConstraintStatus) extends ConstraintResult[NumberOfRowsConstraint] { val message: String = { val expected = constraint.expected status match { case ConstraintSuccess => s"The number of rows satisfies $expected." case ConstraintFailure => s"The actual number of rows $actual does not satisfy $expected." case default => throw IllegalConstraintResultException(this) } } }
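A usage sketch (not part of the original) of the companion's factory methods and the Column-based apply shown above.

import de.frosner.ddq.constraints.NumberOfRowsConstraint
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[2]").appName("row-count-sketch").getOrCreate()
import spark.implicits._

val df = Seq(1, 2, 3, 4, 5).toDF("id")

println(NumberOfRowsConstraint.equalTo(5).fun(df).message)      // satisfied
println(NumberOfRowsConstraint.greaterThan(10).fun(df).message) // not satisfied

// The apply variant takes a predicate over the internal "count" column.
val between = NumberOfRowsConstraint(count => count >= 3 && count <= 10)
println(between.fun(df).message)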
Example 44
Source File: AlwaysNullConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class AlwaysNullConstraint(columnName: String) extends Constraint { override val fun = (df: DataFrame) => { val tryNotNullCount = Try(df.filter(new Column(columnName).isNotNull).count) AlwaysNullConstraintResult( constraint = this, status = ConstraintUtil.tryToStatus[Long](tryNotNullCount, _ == 0), data = tryNotNullCount.toOption.map(AlwaysNullConstraintResultData) ) } } case class AlwaysNullConstraintResult(constraint: AlwaysNullConstraint, status: ConstraintStatus, data: Option[AlwaysNullConstraintResultData] ) extends ConstraintResult[AlwaysNullConstraint] { val message: String = { val columnName = constraint.columnName val maybeNonNullRows = data.map(_.nonNullRows) val maybePluralS = maybeNonNullRows.map(n => if (n == 1) "" else "s") (status, maybeNonNullRows, maybePluralS) match { case (ConstraintError(throwable), None, None) => s"Checking column $columnName for being always null failed: $throwable" case (ConstraintSuccess, Some(0), Some(pluralS)) => s"Column $columnName is always null." case (ConstraintFailure, Some(nonNullRows), Some(pluralS)) => s"Column $columnName contains $nonNullRows non-null row$pluralS (should always be null)." case default => throw IllegalConstraintResultException(this) } } } case class AlwaysNullConstraintResultData(nonNullRows: Long)
Example 45
Source File: StringColumnConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.DataFrame import scala.util.Try case class StringColumnConstraint(constraintString: String) extends Constraint { val fun = (df: DataFrame) => { val maybeSucceedingRows = Try(df.filter(constraintString).count) val count = df.count val maybeFailingRows = maybeSucceedingRows.map(succeedingRows => count - succeedingRows) StringColumnConstraintResult( constraint = this, data = maybeFailingRows.toOption.map(StringColumnConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeFailingRows, _ == 0) ) } } case class StringColumnConstraintResult(constraint: StringColumnConstraint, data: Option[StringColumnConstraintResultData], status: ConstraintStatus) extends ConstraintResult[StringColumnConstraint] { val message: String = ColumnConstraintUtil.createColumnConstraintMessage( status = status, constraintResult = this, constraintString = constraint.constraintString, maybeViolatingRows = data.map(_.failedRows) ) } case class StringColumnConstraintResultData(failedRows: Long)
Example 46
Source File: DateFormatConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import java.text.SimpleDateFormat import org.apache.spark.sql.functions._ import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class DateFormatConstraint(columnName: String, formatString: String) extends Constraint { val fun = (df: DataFrame) => { val cannotBeDate = udf((column: String) => column != null && Try { val format = new SimpleDateFormat(formatString) format.setLenient(false) format.parse(column) }.isFailure) val maybeCannotBeDateCount = Try(df.filter(cannotBeDate(new Column(columnName))).count) DateFormatConstraintResult( this, data = maybeCannotBeDateCount.toOption.map(DateFormatConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeCannotBeDateCount, _ == 0) ) } } case class DateFormatConstraintResult(constraint: DateFormatConstraint, data: Option[DateFormatConstraintResultData], status: ConstraintStatus) extends ConstraintResult[DateFormatConstraint] { val message: String = { val format = constraint.formatString val columnName = constraint.columnName val maybeFailedRows = data.map(_.failedRows) val maybePluralS = maybeFailedRows.map(failedRows => if (failedRows == 1) "" else "s") val maybeVerb = maybeFailedRows.map(failedRows => if (failedRows == 1) "is" else "are") (status, maybeFailedRows, maybePluralS, maybeVerb) match { case (ConstraintSuccess, Some(0), _, _) => s"Column $columnName is formatted by $format." case (ConstraintFailure, Some(failedRows), Some(pluralS), Some(verb)) => s"Column $columnName contains $failedRows row$pluralS that $verb not formatted by $format." case (ConstraintError(throwable), None, None, None) => s"Checking whether column $columnName is formatted by $format failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class DateFormatConstraintResultData(failedRows: Long)
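A usage sketch (not part of the original): the constraint flags rows that a non-lenient SimpleDateFormat cannot parse, while null values are ignored by the UDF above.

import de.frosner.ddq.constraints.DateFormatConstraint
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[2]").appName("date-format-sketch").getOrCreate()
import spark.implicits._

val df = Seq("2020-01-31", "2020-02-30", null).toDF("signup_date")

// "2020-02-30" fails to parse, so the result reports one violating row.
val result = DateFormatConstraint("signup_date", "yyyy-MM-dd").fun(df)
println(result.status)
println(result.message)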
Example 47
Source File: TypeConversionConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class TypeConversionConstraint(columnName: String, convertedType: DataType) extends Constraint { val fun = (df: DataFrame) => { val originalColumn = new Column(columnName) val castedColumnName = columnName + "_casted" val maybeCasted = Try(df.select(originalColumn, originalColumn.cast(convertedType).as(castedColumnName))) val maybeFailedCastsAndOriginalType = maybeCasted.map(casted => { val failedCastsCount = casted.filter(new Column(castedColumnName).isNull && originalColumn.isNotNull).count val originalType = df.schema.find(_.name == columnName).get.dataType (failedCastsCount, originalType) }) TypeConversionConstraintResult( constraint = this, data = maybeFailedCastsAndOriginalType.toOption.map{ case (failedCastsCount, originalType) => TypeConversionConstraintResultData( originalType = originalType, failedRows = failedCastsCount ) }, status = ConstraintUtil.tryToStatus[Long](maybeFailedCastsAndOriginalType.map{ case (failedCastsCount, originalType) => failedCastsCount }, _ == 0) ) } } case class TypeConversionConstraintResult(constraint: TypeConversionConstraint, data: Option[TypeConversionConstraintResultData], status: ConstraintStatus) extends ConstraintResult[TypeConversionConstraint] { val message: String = { val convertedType = constraint.convertedType val columnName = constraint.columnName val maybePluralSVerb = data.map(data => if (data.failedRows == 1) ("", "is") else ("s", "are")) (status, data, maybePluralSVerb) match { case (ConstraintSuccess, Some(TypeConversionConstraintResultData(originalType, 0)), _) => s"Column $columnName can be converted from $originalType to $convertedType." case (ConstraintFailure, Some(TypeConversionConstraintResultData(originalType, failedRows)), Some((pluralS, verb))) => s"Column $columnName cannot be converted from $originalType to $convertedType. " + s"$failedRows row$pluralS could not be converted." case (ConstraintError(throwable), None, None) => s"Checking whether column $columnName can be converted to $convertedType failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class TypeConversionConstraintResultData(originalType: DataType, failedRows: Long)
Example 48
Source File: ConditionalColumnConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class ConditionalColumnConstraint(statement: Column, implication: Column) extends Constraint { val fun = (df: DataFrame) => { val maybeFailingRows = Try { val succeedingRows = df.filter(!statement || implication).count df.count - succeedingRows } ConditionalColumnConstraintResult( constraint = this, data = maybeFailingRows.toOption.map(ConditionalColumnConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeFailingRows, _ == 0) ) } } case class ConditionalColumnConstraintResult(constraint: ConditionalColumnConstraint, data: Option[ConditionalColumnConstraintResultData], status: ConstraintStatus) extends ConstraintResult[ConditionalColumnConstraint] { val message: String = ColumnConstraintUtil.createColumnConstraintMessage( status = status, constraintResult = this, constraintString = s"${constraint.statement} -> ${constraint.implication}", maybeViolatingRows = data.map(_.failedRows) ) } case class ConditionalColumnConstraintResultData(failedRows: Long)
Example 49
Source File: ColumnColumnConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class ColumnColumnConstraint(constraintColumn: Column) extends Constraint { val fun = (df: DataFrame) => { val maybeFailingRows = Try { val succeedingRows = df.filter(constraintColumn).count df.count - succeedingRows } ColumnColumnConstraintResult( constraint = this, data = maybeFailingRows.toOption.map(ColumnColumnConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeFailingRows, _ == 0) ) } } case class ColumnColumnConstraintResult(constraint: ColumnColumnConstraint, data: Option[ColumnColumnConstraintResultData], status: ConstraintStatus) extends ConstraintResult[ColumnColumnConstraint] { val message: String = ColumnConstraintUtil.createColumnConstraintMessage( status = status, constraintResult = this, constraintString = constraint.constraintColumn.toString, maybeViolatingRows = data.map(_.failedRows) ) } case class ColumnColumnConstraintResultData(failedRows: Long)
Example 50
Source File: JoinableConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class JoinableConstraint(columnNames: Seq[(String, String)], referenceTable: DataFrame) extends Constraint { val fun = (df: DataFrame) => { val columnsMap = columnNames.toMap val renamedColumns = columnNames.map{ case (baseColumn, refColumn) => ("b_" + baseColumn, "r_" + refColumn)} val (baseColumns, refColumns) = columnNames.unzip val (renamedBaseColumns, renamedRefColumns) = renamedColumns.unzip val maybeNonUniqueRows = Try( referenceTable.groupBy(refColumns.map(new Column(_)):_*).count.filter(new Column("count") > 1).count ) // rename all columns to avoid ambiguous column references val maybeRenamedDfAndRef = maybeNonUniqueRows.map(_ => { val renamedDf = df.select(baseColumns.zip(renamedBaseColumns).map { case (original, renamed) => new Column(original).as(renamed) }: _*) val renamedRef = referenceTable.select(refColumns.zip(renamedRefColumns).map { case (original, renamed) => new Column(original).as(renamed) }: _*) (renamedDf, renamedRef) }) // check if join yields some values val maybeDistinctBeforeAndMatchingRows = maybeRenamedDfAndRef.map { case (renamedDf, renamedRef) => val renamedDfDistinct = renamedDf.distinct val distinctBefore = renamedDfDistinct.count val joinCondition = renamedColumns.map{ case (baseColumn, refColumn) => new Column(baseColumn) === new Column(refColumn) }.reduce(_ && _) val join = renamedDfDistinct.join(renamedRef, joinCondition) val matchingRows = join.distinct.count (distinctBefore, matchingRows) } JoinableConstraintResult( constraint = this, data = maybeDistinctBeforeAndMatchingRows.toOption.map{ case (distinctBefore, matchingRows) => JoinableConstraintResultData( distinctBefore = distinctBefore, matchingKeys = matchingRows ) }, status = ConstraintUtil.tryToStatus[Long](maybeDistinctBeforeAndMatchingRows.map{ case (distinctBefore, matchingRows) => matchingRows }, _ > 0) ) } } case class JoinableConstraintResult(constraint: JoinableConstraint, data: Option[JoinableConstraintResultData], status: ConstraintStatus) extends ConstraintResult[JoinableConstraint] { val maybeMatchRatio: Option[Double] = data.map(d => d.matchingKeys.toDouble / d.distinctBefore) val message: String = { val columnNames = constraint.columnNames val columnsString = columnNames.map{ case (baseCol, refCol) => baseCol + "->" + refCol }.mkString(", ") val maybeMatchPercentage = maybeMatchRatio.map(_ * 100.0) (status, data, maybeMatchPercentage) match { case (ConstraintSuccess, Some(JoinableConstraintResultData(distinctBefore, matchingKeys)), Some(matchPercentage)) => s"Key $columnsString can be used for joining. " + s"Join columns cardinality in base table: $distinctBefore. " + s"Join columns cardinality after joining: $matchingKeys (${"%.2f".format(matchPercentage)}" + "%)." case (ConstraintFailure, Some(_), Some(_)) => s"Key $columnsString cannot be used for joining (no result)." case (ConstraintError(throwable), None, None) => s"Checking whether $columnsString can be used for joining failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class JoinableConstraintResultData(distinctBefore: Long, matchingKeys: Long)
Example 51
Source File: AnyOfConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class AnyOfConstraint(columnName: String, allowedValues: Set[Any]) extends Constraint { val fun = (df: DataFrame) => { val maybeError = Try(df.select(new Column(columnName))) // check if column is not ambiguous val maybeColumnIndex = maybeError.map(_ => df.columns.indexOf(columnName)) val maybeNotAllowedCount = maybeColumnIndex.map(columnIndex => df.rdd.filter(row => !row.isNullAt(columnIndex) && !allowedValues.contains(row.get(columnIndex))).count) AnyOfConstraintResult( constraint = this, data = maybeNotAllowedCount.toOption.map(AnyOfConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeNotAllowedCount, _ == 0) ) } } case class AnyOfConstraintResult(constraint: AnyOfConstraint, data: Option[AnyOfConstraintResultData], status: ConstraintStatus) extends ConstraintResult[AnyOfConstraint] { val message: String = { val allowed = constraint.allowedValues val columnName = constraint.columnName val maybeFailedRows = data.map(_.failedRows) val maybePluralSAndVerb = maybeFailedRows.map(failedRows => if (failedRows == 1) ("", "is") else ("s", "are")) (status, maybeFailedRows, maybePluralSAndVerb) match { case (ConstraintSuccess, Some(0), Some((pluralS, verb))) => s"Column $columnName contains only values in $allowed." case (ConstraintFailure, Some(failedRows), Some((pluralS, verb))) => s"Column $columnName contains $failedRows row$pluralS that $verb not in $allowed." case (ConstraintError(throwable), None, None) => s"Checking whether column $columnName contains only values in $allowed failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class AnyOfConstraintResultData(failedRows: Long)
Example 52
Source File: ForeignKeyConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class ForeignKeyConstraint(columnNames: Seq[(String, String)], referenceTable: DataFrame) extends Constraint { val fun = (df: DataFrame) => { val renamedColumns = columnNames.map{ case (baseColumn, refColumn) => ("b_" + baseColumn, "r_" + refColumn)} val (baseColumns, refColumns) = columnNames.unzip val (renamedBaseColumns, renamedRefColumns) = renamedColumns.unzip // check if foreign key is a key in reference table val maybeNonUniqueRows = Try( referenceTable.groupBy(refColumns.map(new Column(_)):_*).count.filter(new Column("count") > 1).count ) if (maybeNonUniqueRows.toOption.exists(_ > 0)) { ForeignKeyConstraintResult( constraint = this, data = Some(ForeignKeyConstraintResultData(numNonMatchingRefs = None)), status = ConstraintFailure ) } else { // rename all columns to avoid ambiguous column references val maybeRenamedDfAndRef = maybeNonUniqueRows.map(_ => { val renamedDf = df.select(baseColumns.zip(renamedBaseColumns).map { case (original, renamed) => new Column(original).as(renamed) }: _*) val renamedRef = referenceTable.select(refColumns.zip(renamedRefColumns).map { case (original, renamed) => new Column(original).as(renamed) }: _*) (renamedDf, renamedRef) }) // check if left outer join yields some null values val maybeLeftOuterJoin = maybeRenamedDfAndRef.map { case (renamedDf, renamedRef) => val joinCondition = renamedColumns.map { case (baseColumn, refColumn) => new Column(baseColumn) === new Column(refColumn) }.reduce(_ && _) renamedDf.distinct.join(renamedRef, joinCondition, "outer") } val maybeNotMatchingRefs = maybeLeftOuterJoin.map(_.filter(renamedRefColumns.map(new Column(_).isNull).reduce(_ && _)).count) ForeignKeyConstraintResult( constraint = this, data = maybeNotMatchingRefs.toOption.map(Some(_)).map(ForeignKeyConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeNotMatchingRefs, _ == 0) ) } } } case class ForeignKeyConstraintResult(constraint: ForeignKeyConstraint, data: Option[ForeignKeyConstraintResultData], status: ConstraintStatus) extends ConstraintResult[ForeignKeyConstraint] { val message: String = { val referenceTable = constraint.referenceTable val columnNames = constraint.columnNames val columnsString = columnNames.map { case (baseCol, refCol) => baseCol + "->" + refCol }.mkString(", ") val isPlural = columnNames.length > 1 val (columnDo, columnDefine, columnIs, columnPluralS) = if (isPlural) ("do", "define", "are", "s") else ("does", "defines", "is", "") val columnNoun = "Column" + columnPluralS val maybeNumNonMatchingRefs = data.map(_.numNonMatchingRefs) (status, maybeNumNonMatchingRefs) match { case (ConstraintSuccess, Some(Some(0))) => s"$columnNoun $columnsString $columnDefine a foreign key " + s"pointing to the reference table $referenceTable." case (ConstraintFailure, Some(None)) => s"$columnNoun $columnsString $columnIs not a key in the reference table." case (ConstraintFailure, Some(Some(nonMatching))) => val (rowsNoun, rowsDo) = if (nonMatching != 1) ("rows", "do") else ("row", "does") s"$columnNoun $columnsString $columnDo not define a foreign key " + s"pointing to $referenceTable. $nonMatching $rowsNoun $rowsDo not match." case (ConstraintError(throwable), None) => s"Checking whether ${columnNoun.toLowerCase} $columnsString $columnDefine a foreign key failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class ForeignKeyConstraintResultData(numNonMatchingRefs: Option[Long])
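A usage sketch (not part of the original) with two small illustrative tables; the constraint fails here because one row references a customer id that is missing from the reference table.

import de.frosner.ddq.constraints.ForeignKeyConstraint
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[2]").appName("fk-sketch").getOrCreate()
import spark.implicits._

val customers = Seq((1, "a"), (2, "b")).toDF("id", "name")
val orders = Seq((10, 1), (11, 1), (12, 3)).toDF("order_id", "customer_id")

// customer_id -> id; order 12 points at customer 3, which does not exist in customers.
val fk = ForeignKeyConstraint(Seq("customer_id" -> "id"), customers)
println(fk.fun(orders).message)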
Example 53
Source File: ExactEqualityConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class ExactEqualityConstraint(other: DataFrame) extends Constraint { val fun = (df: DataFrame) => { val tryEquality = Try { if (df.schema != other.schema) { throw new IllegalArgumentException("Schemas do not match") } val dfGroupCount = df.groupBy(df.columns.map(new Column(_)):_*).count() val otherGroupCount = other.groupBy(df.columns.map(new Column(_)):_*).count() val diffCount1 = dfGroupCount.except(otherGroupCount).count() val diffCount2 = otherGroupCount.except(dfGroupCount).count() (diffCount1, diffCount2) } ExactEqualityConstraintResult( constraint = this, data = tryEquality.toOption.map { case (leftToRightCount, rightToLeftCount) => ExactEqualityConstraintData(leftToRightCount, rightToLeftCount) }, status = ConstraintUtil.tryToStatus[(Long, Long)](tryEquality, { case (leftToRightCount, rightToLeftCount) => leftToRightCount + rightToLeftCount == 0 }) ) } } case class ExactEqualityConstraintResult(constraint: ExactEqualityConstraint, data: Option[ExactEqualityConstraintData], status: ConstraintStatus) extends ConstraintResult[ExactEqualityConstraint] { val message: String = { val otherName = constraint.other.toString() val maybeNonMatchingRows = data.map(data => (data.numNonMatchingLeftToRight, data.numNonMatchingRightToLeft)) val maybePluralS = maybeNonMatchingRows.map { case (leftToRightCount, rightToLeftCount) => ( if (leftToRightCount == 1) "" else "s", if (rightToLeftCount == 1) "" else "s" ) } val maybeVerb = maybeNonMatchingRows.map { case (leftToRightCount, rightToLeftCount) => ( if (leftToRightCount == 1) "is" else "are", if (rightToLeftCount == 1) "is" else "are" ) } (status, maybeNonMatchingRows, maybePluralS, maybeVerb) match { case (ConstraintSuccess, Some(_), Some(_), Some(_)) => s"It is equal to $otherName." case ( ConstraintFailure, Some((leftToRightRows, rightToLeftRows)), Some((leftToRightPluralS, rightToLeftPluralS)), Some((leftToRightVerb, rightToLeftVerb)) ) => s"It is not equal ($leftToRightRows distinct count row$leftToRightPluralS $leftToRightVerb " + s"present in the checked dataframe but not in the other " + s"and $rightToLeftRows distinct count row$rightToLeftPluralS $rightToLeftVerb " + s"present in the other dataframe but not in the checked one) to $otherName." case (ConstraintError(throwable), None, None, None) => s"Checking equality with $otherName failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class ExactEqualityConstraintData(numNonMatchingLeftToRight: Long, numNonMatchingRightToLeft: Long)
Example 54
Source File: NeverNullConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class NeverNullConstraint(columnName: String) extends Constraint { val fun = (df: DataFrame) => { val tryNullCount = Try(df.filter(new Column(columnName).isNull).count) NeverNullConstraintResult( constraint = this, data = tryNullCount.toOption.map(NeverNullConstraintResultData), status = ConstraintUtil.tryToStatus[Long](tryNullCount, _ == 0) ) } } case class NeverNullConstraintResult(constraint: NeverNullConstraint, data: Option[NeverNullConstraintResultData], status: ConstraintStatus) extends ConstraintResult[NeverNullConstraint] { val message: String = { val columnName = constraint.columnName val maybeNullRows = data.map(_.nullRows) val maybePluralS = maybeNullRows.map(nullRows => if (nullRows == 1) "" else "s") val maybeVerb = maybeNullRows.map(nullRows => if (nullRows == 1) "is" else "are") (status, maybeNullRows, maybePluralS, maybeVerb) match { case (ConstraintSuccess, Some(0), Some(pluralS), Some(verb)) => s"Column $columnName is never null." case (ConstraintFailure, Some(nullRows), Some(pluralS), Some(verb)) => s"Column $columnName contains $nullRows row$pluralS that $verb null (should never be null)." case (ConstraintError(throwable), None, None, None) => s"Checking column $columnName for being never null failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class NeverNullConstraintResultData(nullRows: Long)
Example 55
Source File: UniqueKeyConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class UniqueKeyConstraint(columnNames: Seq[String]) extends Constraint { require(columnNames.nonEmpty) val fun = (df: DataFrame) => { val columns = columnNames.map(name => new Column(name)) val maybeNonUniqueRows = Try(df.groupBy(columns: _*).count.filter(new Column("count") > 1).count) UniqueKeyConstraintResult( constraint = this, data = maybeNonUniqueRows.toOption.map(UniqueKeyConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeNonUniqueRows, _ == 0) ) } } case class UniqueKeyConstraintResult(constraint: UniqueKeyConstraint, data: Option[UniqueKeyConstraintResultData], status: ConstraintStatus) extends ConstraintResult[UniqueKeyConstraint] { val message: String = { val columnNames = constraint.columnNames val columnsString = columnNames.mkString(", ") val isPlural = columnNames.length > 1 val columnNoun = "Column" + (if (isPlural) "s" else "") val columnVerb = if (isPlural) "are" else "is" val maybeNumNonUniqueTuples = data.map(_.numNonUniqueTuples) val maybePluralS = maybeNumNonUniqueTuples.map(numNonUniqueTuples => if (numNonUniqueTuples != 1) "s" else "") (status, maybeNumNonUniqueTuples, maybePluralS) match { case (ConstraintSuccess, Some(0), _) => s"$columnNoun $columnsString $columnVerb a key." case (ConstraintFailure, Some(numNonUniqueTuples), Some(pluralS)) => s"$columnNoun $columnsString $columnVerb not a key ($numNonUniqueTuples non-unique tuple$pluralS)." case (ConstraintError(throwable), None, None) => s"Checking whether ${columnNoun.toLowerCase()} $columnsString $columnVerb a key failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class UniqueKeyConstraintResultData(numNonUniqueTuples: Long)
Example 56
Source File: RegexConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import java.util.regex.Pattern import org.apache.spark.sql.functions._ import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class RegexConstraint(columnName: String, regex: String) extends Constraint { val fun = (df: DataFrame) => { val pattern = Pattern.compile(regex) val doesNotMatch = udf((column: String) => column != null && !pattern.matcher(column).find()) val maybeDoesNotMatchCount = Try(df.filter(doesNotMatch(new Column(columnName))).count) RegexConstraintResult( constraint = this, data = maybeDoesNotMatchCount.toOption.map(RegexConstraintResultData), status = ConstraintUtil.tryToStatus[Long](maybeDoesNotMatchCount, _ == 0) ) } } case class RegexConstraintResult(constraint: RegexConstraint, data: Option[RegexConstraintResultData], status: ConstraintStatus) extends ConstraintResult[RegexConstraint] { val message: String = { val columnName = constraint.columnName val regex = constraint.regex val maybeFailedRows = data.map(_.failedRows) val maybePluralSAndVerb = maybeFailedRows.map(failedRows => if (failedRows == 1) ("", "does") else ("s", "do")) (status, maybeFailedRows, maybePluralSAndVerb) match { case (ConstraintSuccess, Some(0), _) => s"Column $columnName matches $regex" case (ConstraintFailure, Some(failedRows), Some((pluralS, verb))) => s"Column $columnName contains $failedRows row$pluralS that $verb not match $regex" case (ConstraintError(throwable), None, None) => s"Checking whether column $columnName matches $regex failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class RegexConstraintResultData(failedRows: Long)
Example 57
Source File: MarkdownReporterTest.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.reporters import java.io.{ByteArrayOutputStream, PrintStream} import de.frosner.ddq.constraints._ import de.frosner.ddq.core._ import de.frosner.ddq.testutils.{DummyConstraint, DummyConstraintResult} import org.apache.spark.sql.DataFrame import org.mockito.Mockito._ import org.scalatest.mock.MockitoSugar import org.scalatest.{FlatSpec, Matchers} class MarkdownReporterTest extends FlatSpec with Matchers with MockitoSugar { "A Markdown reporter" should "produce correct output for a check with constraints" in { val baos = new ByteArrayOutputStream() val markdownReporter = new MarkdownReporter(new PrintStream(baos)) val df = mock[DataFrame] val dfName = "myDf" val dfColumns = Array("1", "2") val dfCount = 5 when(df.columns).thenReturn(dfColumns) val header = s"Checking $dfName" val prologue = s"It has a total number of ${dfColumns.size} columns and $dfCount rows." val message1 = "1" val status1 = ConstraintSuccess val constraint1 = DummyConstraint(message1, status1) val result1 = constraint1.fun(df) val message2 = "2" val status2 = ConstraintFailure val constraint2 = DummyConstraint(message2, status2) val result2 = constraint2.fun(df) val message3 = "3" val status3 = ConstraintError(new IllegalArgumentException()) val constraint3 = DummyConstraint(message3, status3) val result3 = DummyConstraintResult(constraint3, message3, status3) val constraints = Map[Constraint, ConstraintResult[Constraint]]( constraint1 -> result1, constraint2 -> result2, constraint3 -> result3 ) val check = Check(df, Some(dfName), Option.empty, constraints.keys.toSeq) markdownReporter.report(CheckResult(constraints, check, dfCount)) val expectedOutput = s"""**$header** $prologue - *SUCCESS*: ${result1.message} - *FAILURE*: ${result2.message} - *ERROR*: ${result3.message} """ baos.toString shouldBe expectedOutput } it should "produce correct output for a check without constraint" in { val baos = new ByteArrayOutputStream() val markdownReporter = new MarkdownReporter(new PrintStream(baos)) val df = mock[DataFrame] val dfName = "myDf" val dfColumns = Array("1", "2") val dfCount = 5 when(df.columns).thenReturn(dfColumns) val header = s"Checking $dfName" val prologue = s"It has a total number of ${dfColumns.size} columns and $dfCount rows." val check = Check(df, Some(dfName), Option.empty, Seq.empty) markdownReporter.report(CheckResult(Map.empty, check, dfCount)) val expectedOutput = s"""**$header** $prologue Nothing to check! """ baos.toString shouldBe expectedOutput } }
Example 58
Source File: ConsoleReporterTest.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.reporters import java.io.{ByteArrayOutputStream, PrintStream} import de.frosner.ddq.constraints._ import de.frosner.ddq.core._ import de.frosner.ddq.testutils.{DummyConstraint, DummyConstraintResult} import org.apache.spark.sql.DataFrame import org.mockito.Mockito._ import org.scalatest.mock.MockitoSugar import org.scalatest.{FlatSpec, Matchers} class ConsoleReporterTest extends FlatSpec with Matchers with MockitoSugar { "A Console reporter" should "produce correct output for a check with constraints" in { val baos = new ByteArrayOutputStream() val consoleReporter = new ConsoleReporter(new PrintStream(baos)) val df = mock[DataFrame] val displayName = "myDf" val dfColumns = Array("1", "2") val dfCount = 5 when(df.columns).thenReturn(dfColumns) val header = s"Checking $displayName" val prologue = s"It has a total number of ${dfColumns.size} columns and $dfCount rows." val message1 = "1" val status1 = ConstraintSuccess val constraint1 = DummyConstraint(message1, status1) val result1 = constraint1.fun(df) val message2 = "2" val status2 = ConstraintFailure val constraint2 = DummyConstraint(message2, status2) val result2 = constraint2.fun(df) val message3 = "3" val status3 = ConstraintError(new IllegalArgumentException()) val constraint3 = DummyConstraint(message3, status3) val result3 = DummyConstraintResult(constraint3, message3, status3) val constraints = Map[Constraint, ConstraintResult[Constraint]]( constraint1 -> result1, constraint2 -> result2, constraint3 -> result3 ) val check = Check(df, Some(displayName), Option.empty, constraints.keys.toSeq) consoleReporter.report(CheckResult(constraints, check, dfCount)) val expectedOutput = s"""${Console.BLUE}$header${Console.RESET} ${Console.BLUE}$prologue${Console.RESET} ${Console.GREEN}- ${result1.message}${Console.RESET} ${Console.RED}- ${result2.message}${Console.RESET} ${Console.YELLOW}- ${result3.message}${Console.RESET} """ baos.toString shouldBe expectedOutput } it should "produce correct output for a check without constraint" in { val baos = new ByteArrayOutputStream() val consoleReporter = new ConsoleReporter(new PrintStream(baos)) val df = mock[DataFrame] val displayName = "myDf" val dfColumns = Array("1", "2") val dfCount = 5 when(df.columns).thenReturn(dfColumns) val header = s"Checking $displayName" val prologue = s"It has a total number of ${dfColumns.size} columns and $dfCount rows." val check = Check(df, Some(displayName), Option.empty, Seq.empty) consoleReporter.report(CheckResult(Map.empty, check, dfCount)) val expectedOutput = s"""${Console.BLUE}$header${Console.RESET} ${Console.BLUE}$prologue${Console.RESET} ${Console.BLUE}Nothing to check!${Console.RESET} """ baos.toString shouldBe expectedOutput } }
Example 59
Source File: TestData.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.testutils import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Row, SparkSession} object TestData { def makeIntegerDf(spark: SparkSession, numbers: Seq[Int]): DataFrame = spark.createDataFrame( spark.sparkContext.makeRDD(numbers.map(Row(_))), StructType(List(StructField("column", IntegerType, nullable = false))) ) def makeNullableStringDf(spark: SparkSession, strings: Seq[String]): DataFrame = spark.createDataFrame(spark.sparkContext.makeRDD(strings.map(Row(_))), StructType(List(StructField("column", StringType, nullable = true)))) def makeIntegersDf(spark: SparkSession, row1: Seq[Int], rowN: Seq[Int]*): DataFrame = { val rows = row1 :: rowN.toList val numCols = row1.size val rdd = spark.sparkContext.makeRDD(rows.map(Row(_:_*))) val schema = StructType((1 to numCols).map(idx => StructField("column" + idx, IntegerType, nullable = false))) spark.createDataFrame(rdd, schema) } }
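A small sketch of how these helpers might be called from a test, assuming a local SparkSession; the values are illustrative.

import org.apache.spark.sql.SparkSession
import de.frosner.ddq.testutils.TestData

object TestDataSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("testdata-sketch").getOrCreate()

  // a single non-nullable integer column named "column"
  val ints = TestData.makeIntegerDf(spark, Seq(1, 2, 3))
  ints.show()

  // two rows with columns column1..column3
  val matrix = TestData.makeIntegersDf(spark, Seq(1, 2, 3), Seq(4, 5, 6))
  matrix.printSchema()

  spark.stop()
}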
Example 60
Source File: RunnerTest.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.core import de.frosner.ddq.constraints.{ConstraintFailure, ConstraintSuccess} import de.frosner.ddq.reporters.Reporter import de.frosner.ddq.testutils.DummyConstraint import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel import org.mockito.Mockito._ import org.scalatest.mock.MockitoSugar import org.scalatest.{FlatSpec, Matchers} class RunnerTest extends FlatSpec with Matchers with MockitoSugar { "A runner" should "run with multiple checks" in { val df1 = mock[DataFrame] val df2 = mock[DataFrame] val message1 = "1" val status1 = ConstraintSuccess val constraint1 = DummyConstraint(message1, status1) val result1 = constraint1.fun(df1) val message2 = "2" val status2 = ConstraintFailure val constraint2 = DummyConstraint(message2, status2) val result2 = constraint2.fun(df2) val check1 = Check(df1, None, None, Seq(constraint1)) val check2 = Check(df2, None, None, Seq(constraint2)) val checkResults = Runner.run(List(check1, check2), List.empty) checkResults.size shouldBe 2 val checkResult1 = checkResults(check1) val checkResult2 = checkResults(check2) checkResult1.check shouldBe check1 checkResult1.constraintResults shouldBe Map((constraint1, result1)) checkResult2.check shouldBe check2 checkResult2.constraintResults shouldBe Map((constraint2, result2)) } it should "persist and unpersist the data frame if a persist method is specified" in { val storageLevel = StorageLevel.MEMORY_AND_DISK val df = mock[DataFrame] when(df.persist(storageLevel)).thenReturn(df.asInstanceOf[df.type]) val check = Check(df, None, Some(storageLevel), Seq(DummyConstraint("test", ConstraintSuccess))) val checkResult = Runner.run(List(check), List.empty)(check) verify(df).persist(storageLevel) verify(df).unpersist() } it should "not persist and unpersist the data frame if no persist method is specified" in { val df = mock[DataFrame] val check = Check(df, None, None, Seq(DummyConstraint("test", ConstraintSuccess))) val checkResult = Runner.run(List(check), List.empty)(check) verify(df, never()).persist() verify(df, never()).unpersist() } it should "report to all reporters what it returns" in { val df = mock[DataFrame] val check = Check(df, None, None, Seq(DummyConstraint("test", ConstraintSuccess))) val checkResult = Runner.run(List(check), List.empty)(check) val reporter1 = mock[Reporter] val reporter2 = mock[Reporter] Runner.run(List(check), List(reporter1, reporter2)) verify(reporter1).report(checkResult) verify(reporter2).report(checkResult) } }
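A sketch of running a real check end to end with the pieces shown in these examples (RegexConstraint, Check, Runner, ConsoleReporter), assuming a local SparkSession; the data and column name are illustrative.

import de.frosner.ddq.constraints.RegexConstraint
import de.frosner.ddq.core.{Check, Runner}
import de.frosner.ddq.reporters.ConsoleReporter
import org.apache.spark.sql.SparkSession

object RunnerSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("runner-sketch").getOrCreate()
  import spark.implicits._

  val users = Seq("alice@example.com", "broken").toDF("email")

  // one check with a single regex constraint, reported to the console
  val check = Check(users, Some("users"), Option.empty, Seq(RegexConstraint("email", "^[^@]+@[^@]+$")))
  Runner.run(List(check), List(new ConsoleReporter(System.out)))

  spark.stop()
}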
Example 61
Source File: QueryFunctions.scala From azure-sqldb-spark with MIT License | 5 votes |
package com.microsoft.azure.sqldb.spark.query

import java.sql.{Connection, SQLException}

import com.microsoft.azure.sqldb.spark.connect.ConnectionUtils._
import com.microsoft.azure.sqldb.spark.LoggingTrait
import com.microsoft.azure.sqldb.spark.config.{Config, SqlDBConfig}
import com.microsoft.azure.sqldb.spark.connect._
import org.apache.spark.sql.{DataFrame, SQLContext}

// The enclosing wrapper class was elided in this excerpt; it is restored here with an
// assumed signature (based on the use of `sqlContext` below) so that the snippet compiles.
private[spark] case class QueryFunctions(@transient sqlContext: SQLContext) extends LoggingTrait {

  def sqlDBQuery(config: Config): Either[DataFrame, Boolean] = {
    var connection: Connection = null

    val sql = config.get[String](SqlDBConfig.QueryCustom).getOrElse(
      throw new IllegalArgumentException("Query not found in QueryCustom in Config")
    )

    try {
      connection = getConnection(config)
      val statement = connection.createStatement()

      if (statement.execute(sql)) {
        Left(sqlContext.read.sqlDB(config))
      } else {
        Right(true)
      }
    } catch {
      case sqlException: SQLException =>
        sqlException.printStackTrace()
        Right(false)
      case exception: Exception =>
        exception.printStackTrace()
        Right(false)
    } finally {
      connection.close()
    }
  }
}
Example 62
Source File: testData.scala From sparkGLM with Apache License 2.0 | 5 votes |
package com.Alteryx.testUtils.data import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.test._ import org.apache.spark.sql.test.TestSQLContext.implicits._ object testData { val numericDF: DataFrame = TestSQLContext.read.json( "./src/test/scala/com/Alteryx/testUtils/data/linear_reg_all_numeric.json") val mixedDF: DataFrame = TestSQLContext.read.json( "./src/test/scala/com/Alteryx/testUtils/data/linear_reg_mixed.json") case class testRow(intField: Int, strField: String, numField: Double) val dummyDF: DataFrame = { TestSQLContext.sparkContext.parallelize( testRow(1, "a", 1.0) :: testRow(2, "b", 2.0) :: testRow(3, "c", 3.0) :: Nil).toDF() } val oneLessCategoryDF: DataFrame = { TestSQLContext.sparkContext.parallelize( testRow(1, "a", 1.0) :: testRow(2, "b", 2.0) :: testRow(3, "a", 3.0) :: Nil).toDF() } val testRDD = TestSQLContext.sparkContext.parallelize(Seq( Array(1.0, 1.1, 21.4), Array(1.0, 2.2, 36.5), Array(1.0, 3.3, 15.0), Array(1.0, 4.4, 62.5), Array(1.0, 5.5, 36.1), Array(1.0, 6.6, 12.0), Array(1.0, 7.7, 37.0), Array(1.0, 8.8, 41.0), Array(1.0, 9.9, 36.6), Array(1.0, 11.0, 17.9), Array(1.0, 12.1, 53.1), Array(1.0, 13.2, 29.6), Array(1.0, 14.3, 8.3), Array(1.0, 15.4, -24.7), Array(1.0, 16.5, 41.0), Array(1.0, 17.6, 16.5), Array(1.0, 18.7, 16.0), Array(1.0, 19.8, 34.1), Array(1.0, 20.9, 30.5), Array(1.0, 22.0, 24.9), Array(1.0, 23.1, 30.3), Array(1.0, 24.2, 26.4), Array(1.0, 25.3, 11.2), Array(1.0, 26.4, -31.2), Array(1.0, 27.5, 19.9), Array(1.0, 28.6, 5.3), Array(1.0, 29.7, 2.2), Array(1.0, 30.8, -25.2), Array(1.0, 31.9, -6.5), Array(1.0, 33.0, 10.4), Array(1.0, 34.1, 28.1), Array(1.0, 35.2, -2.3), Array(1.0, 36.3, 6.5), Array(1.0, 37.4, -3.5), Array(1.0, 38.5, -31.0), Array(1.0, 39.6, -12.9), Array(1.0, 40.7, -13.6), Array(1.0, 41.8, -8.0), Array(1.0, 42.9, 14.1), Array(1.0, 44.0, 6.3), Array(1.0, 45.1, -13.4), Array(1.0, 46.2, -16.3), Array(1.0, 47.3, 1.6), Array(1.0, 48.4, -2.3), Array(1.0, 49.5, -28.3), Array(1.0, 50.6, -29.7), Array(1.0, 51.7, -9.4), Array(1.0, 52.8, -2.4), Array(1.0, 53.9, -21.1), Array(1.0, 55.0, -2.4) ), 4).map(x => Row(x(0), x(1), x(2))) val testSchema = StructType( StructField("intercept", DoubleType, true) :: StructField("x", DoubleType, true) :: StructField("y", DoubleType, true) :: Nil) val testDFSinglePart: DataFrame = { TestSQLContext.createDataFrame(testRDD, testSchema).coalesce(1) } val testDFMultiPart: DataFrame = { TestSQLContext.createDataFrame(testRDD, testSchema) } }
Example 63
Source File: DataFrameToMleap.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.converter

import com.truecar.mleap.runtime.types.StringArrayType
import com.truecar.mleap.spark
import com.truecar.mleap.spark.SparkDataset
import com.truecar.mleap.runtime.types
import com.truecar.mleap.spark.SparkLeapFrame
import org.apache.spark.ml.mleap
import org.apache.spark.mllib.linalg.VectorUDT
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.types._
import com.truecar.mleap.runtime.{Row => MleapRow}

case class DataFrameToMleap(dataset: DataFrame) {
  def toMleap: SparkLeapFrame = {
    val mleapFields = dataset.schema.fields.flatMap { field =>
      field.dataType match {
        // StringType has its own case below; including it in this alternation as well
        // made that case unreachable and mapped string columns to DoubleType
        case _: NumericType | BooleanType =>
          Seq(types.StructField(field.name, types.DoubleType))
        case _: VectorUDT =>
          Seq(types.StructField(field.name, types.VectorType))
        case _: StringType =>
          Seq(types.StructField(field.name, types.StringType))
        case dataType: ArrayType =>
          dataType.elementType match {
            case StringType => Seq(types.StructField(field.name, StringArrayType))
            case _ => Seq()
          }
        case _ => Seq()
      }
    }

    toMleap(types.StructType(mleapFields))
  }

  def toMleap(schema: types.StructType): SparkLeapFrame = {
    val sparkSchema = dataset.schema

    // cast MLeap field numeric types to DoubleTypes
    val mleapCols = schema.fields.map { field =>
      field.dataType match {
        case types.DoubleType => dataset.col(field.name).cast(DoubleType).as(s"mleap.${field.name}")
        case types.StringType => dataset.col(field.name).cast(StringType).as(s"mleap.${field.name}")
        case types.VectorType => dataset.col(field.name).cast(new mleap.VectorUDT()).as(s"mleap.${field.name}")
        case types.StringArrayType => dataset.col(field.name).cast(new ArrayType(StringType, containsNull = false)).as(s"mleap.${field.name}")
      }
    }
    val cols = Seq(dataset.col("*")) ++ mleapCols
    val castDataset = dataset.select(cols: _*)

    val sparkIndices = sparkSchema.fields.indices
    val mleapIndices = (sparkSchema.fields.length until (sparkSchema.fields.length + schema.fields.length)).toArray

    val rdd = castDataset.rdd.map { row =>
      // finish converting Spark data structure to MLeap
      // TODO: make a Spark UDT for MleapVector and just
      // cast like we do for numeric types
      val mleapValues = mleapIndices.map(row.get)
      val mleapRow = MleapRow(mleapValues: _*)
      val sparkValues: IndexedSeq[Any] = sparkIndices.map(row.get)

      (mleapRow, sparkValues)
    }
    val mleapDataset = SparkDataset(rdd)

    SparkLeapFrame(schema, sparkSchema, mleapDataset)
  }
}
Example 64
Source File: LeapFrameToSpark.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.converter import com.truecar.mleap.core.linalg.Vector import com.truecar.mleap.runtime.types.StructType import com.truecar.mleap.spark.{SparkLeapFrame, MleapSparkSupport} import org.apache.spark.sql.{types, Row, DataFrame, SQLContext} import MleapSparkSupport._ trait LeapFrameToSpark[T] { def toSpark(t: T)(implicit sqlContext: SQLContext): DataFrame } case class LeapFrameToSparkWrapper[T: LeapFrameToSpark](t: T) { def toSpark(implicit sqlContext: SQLContext): DataFrame = { implicitly[LeapFrameToSpark[T]].toSpark(t) } } object LeapFrameToSpark { implicit object SparkLeapFrameToSpark extends LeapFrameToSpark[SparkLeapFrame] { override def toSpark(t: SparkLeapFrame) (implicit sqlContext: SQLContext): DataFrame = { val outputNames = t.schema.fields.map(_.name).toSet -- t.sparkSchema.fields.map(_.name).toSet val outputs = outputNames.map { name => (t.schema(name), t.schema.indexOf(name)) }.toArray.sortBy(_._2) val (outputFields, outputIndices) = outputs.unzip val outputMleapSchema = StructTypeToSpark(StructType(outputFields)).toSpark val outputSchema = types.StructType(t.sparkSchema.fields ++ outputMleapSchema.fields) val rows = t.dataset.rdd.map { case (mleapRow, sparkValues) => val mleapData = outputIndices.map { index => mleapRow.get(index) match { case value: Vector => value.toSpark case value => value } } Row(sparkValues ++ mleapData: _*) } sqlContext.createDataFrame(rows, outputSchema) } } }
Example 65
Source File: MleapSparkSupport.scala From mleap with Apache License 2.0 | 5 votes |
package com.truecar.mleap.spark import com.truecar.mleap.core.linalg import com.truecar.mleap.runtime.transformer.{Transformer => MleapTransformer} import com.truecar.mleap.runtime.{types, Row => MleapRow} import org.apache.spark.ml.classification.DecisionTreeClassificationModel import org.apache.spark.ml.mleap.converter._ import org.apache.spark.ml.mleap.converter.runtime.{BaseTransformerConverter, TransformerToMleap} import org.apache.spark.ml.mleap.converter.runtime.classification.DecisionTreeClassificationModelToMleap import org.apache.spark.ml.mleap.converter.runtime.regression.DecisionTreeRegressionModelToMleap import org.apache.spark.ml.regression.DecisionTreeRegressionModel import org.apache.spark.ml.tree._ import org.apache.spark.ml.Transformer import org.apache.spark.mllib.linalg._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, SQLContext} trait MleapSparkSupport extends BaseTransformerConverter { import scala.language.implicitConversions implicit def transformerToMleapLifted[T <: Transformer] (t: T) (implicit transformerToMleap: TransformerToMleap[T, _ <: MleapTransformer]): MleapTransformer = { transformerToMleap.toMleapLifted(t) } implicit def mleapTransformerWrapper[T <: MleapTransformer](t: T): MleapTransformerWrapper[T] = { MleapTransformerWrapper(t) } implicit def vectorToSpark(vector: linalg.Vector): VectorToSpark = VectorToSpark(vector) implicit def vectorToMleap(vector: Vector): VectorToMleap = VectorToMleap(vector) implicit def dataFrameToMleap(dataset: DataFrame): DataFrameToMleap = DataFrameToMleap(dataset) implicit def decisionTreeRegressionModelToMleap(tree: DecisionTreeRegressionModel): DecisionTreeRegressionModelToMleap = DecisionTreeRegressionModelToMleap(tree) implicit def decisionTreeClassificationModelToMleap(tree: DecisionTreeClassificationModel): DecisionTreeClassificationModelToMleap = DecisionTreeClassificationModelToMleap(tree) implicit def nodeToMleap(node: Node): NodeToMleap = NodeToMleap(node) implicit def splitToMleap(split: Split): SplitToMleap = SplitToMleap(split) implicit def structTypeToMleap(schema: StructType): StructTypeToMleap = StructTypeToMleap(schema) implicit def rowToSpark(row: MleapRow): RowToSpark = RowToSpark(row) implicit def structTypeToSpark(schema: types.StructType): StructTypeToSpark = StructTypeToSpark(schema) implicit def leapFrameToSpark[T: LeapFrameToSpark](frame: T): LeapFrameToSparkWrapper[T] = { LeapFrameToSparkWrapper(frame) } implicit def leapFrameToSparkConvert[T: LeapFrameToSpark](frame: T) (implicit sqlContext: SQLContext): DataFrame = { implicitly[LeapFrameToSpark[T]].toSpark(frame) } implicit def dataFrameToLeapFrame(dataFrame: DataFrame): SparkLeapFrame = dataFrame.toMleap } object MleapSparkSupport extends MleapSparkSupport
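A tiny sketch, assuming an implicit SQLContext is in scope, that round-trips a DataFrame through the MLeap frame types using the implicit conversions defined above.

import com.truecar.mleap.spark.MleapSparkSupport._
import org.apache.spark.sql.{DataFrame, SQLContext}

// round-trip sketch: DataFrame -> SparkLeapFrame -> DataFrame, relying on the
// dataFrameToMleap and leapFrameToSpark implicits brought in above
object MleapRoundTripSketch {
  def roundTrip(df: DataFrame)(implicit sqlContext: SQLContext): DataFrame = {
    val leapFrame = df.toMleap   // convert the Spark DataFrame to an MLeap SparkLeapFrame
    leapFrame.toSpark            // and back, keeping the original Spark columns
  }
}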
Example 66
Source File: DataPreprocess.scala From xgbspark-text-classification with Apache License 2.0 | 5 votes |
package com.lenovo.ml

import org.apache.spark.sql.{SparkSession, DataFrame, Dataset}
import scala.collection.mutable
import scala.util.matching.Regex
import org.ansj.library.DicLibrary
import org.ansj.recognition.impl.StopRecognition
import org.ansj.splitWord.analysis.DicAnalysis

object DataPreprocess {
  def textCleaner(sparkSession: SparkSession, rawText: DataFrame): Dataset[String] = {
    // Filter timestamps, URLs and email addresses out of the text
    val regex1 = new Regex("""[-—0-9a-z]+[:]+[0-9a-z]+[:]?""")
    val regex2 = new Regex("""[0-9]+年|[0-9]+月|[0-9]+[日]|[0-9]+[天]|[0-9]+[号]|[0-9]+[次]""")
    val regex3 = new Regex("""http[s]?://[a-z0-9./?=_-]+""")
    val regex4 = new Regex("""[0-9_a-z]+([-+.][0-9_a-z]+)*@[0-9_a-z]+([-.][0-9_a-z]+)*\.[0-9_a-z]+([-.][0-9_a-z]+)*""")

    import sparkSession.implicits._
    rawText.map(x => x.toString).map(x => x.substring(1,x.length - 1).toLowerCase).map(x => regex1.replaceAllIn(x,""))
      .map(x => regex2.replaceAllIn(x,"")).map(x => regex3.replaceAllIn(x,"")).map(x => regex4.replaceAllIn(x,""))
  }

  def segWords(sparkSession: SparkSession, stopWordsPath: String, dictionaryPath: String, synonymWordsPath: String,
               singleWordsPath: String, rawText: DataFrame): DataFrame = {
    val filter = new StopRecognition()
    // Configure the parts of speech to treat as stop words
    filter.insertStopNatures("w","ns","nr","t","r","u","e","y","o")
    // Load the stop-word list
    val stopWords = sparkSession.sparkContext.textFile(stopWordsPath).cache()
    stopWords.collect().foreach{line => filter.insertStopWords(line)}
    // Load the custom dictionary
    val dictionary = sparkSession.sparkContext.textFile(dictionaryPath).cache()
    dictionary.collect().foreach{line => DicLibrary.insert(DicLibrary.DEFAULT, line)}
    stopWords.collect().foreach{line => DicLibrary.insert(DicLibrary.DEFAULT, line)}
    // Build the synonym table
    val synonymWords = sparkSession.sparkContext.textFile(synonymWordsPath).cache()
    var synonymMap: Map[String, String] = Map()
    synonymWords.collect().foreach{line =>
      val data = line.split(" ",2)
      synonymMap = synonymMap + (data(0) -> data(1))
    }
    // Build the single-character whitelist
    val singleWords = sparkSession.sparkContext.textFile(singleWordsPath).cache()
    val singleWhiteList: mutable.Set[String] = mutable.Set()
    singleWords.collect().foreach{line => singleWhiteList.add(line)}
    // Broadcast the dictionaries to the worker nodes
    val stop = sparkSession.sparkContext.broadcast(filter)
    val dic = sparkSession.sparkContext.broadcast(DicLibrary.get(DicLibrary.DEFAULT))
    val synonym = sparkSession.sparkContext.broadcast(synonymMap)
    val single = sparkSession.sparkContext.broadcast(singleWhiteList)
    // Read the text data, clean it, then segment it into words
    import sparkSession.implicits._
    textCleaner(sparkSession, rawText).map { x =>
      val parse = DicAnalysis.parse(x, dic.value).recognition(stop.value)
      // Extract the segmented words without part-of-speech tags
      val words = for(i<-Range(0,parse.size())) yield parse.get(i).getName
      val filterWords = words.map(_.trim).filter(x => x.length > 1 || single.value.contains(x))
      filterWords.map(x => if(synonym.value.contains(x)) synonym.value(x) else x).mkString(" ")
    }.toDF("words")
  }
}
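A sketch of calling the segWords helper above on a small in-memory DataFrame, assuming a local SparkSession; the dictionary paths are placeholders and must point to real resource files.

import org.apache.spark.sql.SparkSession
import com.lenovo.ml.DataPreprocess

object SegWordsSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("seg-words-sketch").getOrCreate()
  import spark.implicits._

  // a single-column DataFrame of raw text; the resource paths below are placeholders
  val rawText = Seq("Contact me at test@example.com, see https://example.com for details").toDF("text")

  val words = DataPreprocess.segWords(spark,
    "hdfs:///dict/stop_words.txt",
    "hdfs:///dict/dictionary.txt",
    "hdfs:///dict/synonyms.txt",
    "hdfs:///dict/single_words.txt",
    rawText)
  words.show(truncate = false)

  spark.stop()
}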
Example 67
Source File: RedshiftReaderM.scala From SqlShift with MIT License | 5 votes |
package com.databricks.spark.redshift import com.amazonaws.auth.AWSCredentials import com.amazonaws.services.s3.AmazonS3Client import org.apache.spark.SparkContext import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.{DataFrame, SQLContext} object RedshiftReaderM { val endpoint = "s3.ap-south-1.amazonaws.com" def getS3Client(provider: AWSCredentials): AmazonS3Client = { val client = new AmazonS3Client(provider) client.setEndpoint(endpoint) client } def getDataFrameForConfig(configs: Map[String, String], sparkContext: SparkContext, sqlContext: SQLContext): DataFrame = { val source: DefaultSource = new DefaultSource(new JDBCWrapper(), getS3Client) val br: BaseRelation = source.createRelation(sqlContext, configs) sqlContext.baseRelationToDataFrame(br) } }
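A usage sketch for the reader above; the connection settings are hypothetical and the option keys mirror those used in the test helper that follows.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import com.databricks.spark.redshift.RedshiftReaderM

object RedshiftReaderSketch extends App {
  val sc = new SparkContext(new SparkConf().setAppName("redshift-reader-sketch").setMaster("local[*]"))
  val sqlContext = new SQLContext(sc)

  // hypothetical connection settings
  val configs = Map(
    "dbtable" -> "public.my_table",
    "user" -> "redshift_user",
    "password" -> "secret",
    "url" -> "jdbc:redshift://redshift-host:5439/mydb",
    "tempdir" -> "s3a://my-bucket/tmp/",
    "aws_iam_role" -> "arn:aws:iam::123456789012:role/redshift-role"
  )
  RedshiftReaderM.getDataFrameForConfig(configs, sc, sqlContext).show(5)
}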
Example 68
Source File: SparkNRedshiftUtil.scala From SqlShift with MIT License | 5 votes |
package com.goibibo.sqlshift import java.sql.{Connection, DriverManager} import java.util.Properties import com.databricks.spark.redshift.RedshiftReaderM import com.typesafe.config.Config import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterAll, Suite} import org.slf4j.{Logger, LoggerFactory} trait SparkNRedshiftUtil extends BeforeAndAfterAll { self: Suite => private val logger: Logger = LoggerFactory.getLogger(this.getClass) @transient private var _sc: SparkContext = _ @transient private var _sqlContext: SQLContext = _ def sc: SparkContext = _sc def sqlContext: SQLContext = _sqlContext private def getRedshiftConnection(config: Config): Connection = { val mysql = config.getConfig("redshift") val connectionProps = new Properties() connectionProps.put("user", mysql.getString("username")) connectionProps.put("password", mysql.getString("password")) val jdbcUrl = s"jdbc:redshift://${mysql.getString("hostname")}:${mysql.getInt("portno")}/${mysql.getString("database")}?useSSL=false" Class.forName("com.amazon.redshift.jdbc4.Driver") DriverManager.getConnection(jdbcUrl, connectionProps) } val getSparkContext: (SparkContext, SQLContext) = { val sparkConf: SparkConf = new SparkConf().setAppName("Full Dump Testing").setMaster("local") val sc: SparkContext = new SparkContext(sparkConf) val sqlContext: SQLContext = new SQLContext(sc) System.setProperty("com.amazonaws.services.s3.enableV4", "true") sc.hadoopConfiguration.set("fs.s3a.endpoint", "s3.ap-south-1.amazonaws.com") sc.hadoopConfiguration.set("fs.s3a.fast.upload", "true") (sc, sqlContext) } def readTableFromRedshift(config: Config, tableName: String): DataFrame = { val redshift: Config = config.getConfig("redshift") val options = Map("dbtable" -> tableName, "user" -> redshift.getString("username"), "password" -> redshift.getString("password"), "url" -> s"jdbc:redshift://${redshift.getString("hostname")}:${redshift.getInt("portno")}/${redshift.getString("database")}", "tempdir" -> config.getString("s3.location"), "aws_iam_role" -> config.getString("redshift.iamRole") ) RedshiftReaderM.getDataFrameForConfig(options, sc, sqlContext) } def dropTableRedshift(config: Config, tables: String*): Unit = { logger.info("Droping table: {}", tables) val conn = getRedshiftConnection(config) val statement = conn.createStatement() try { val dropTableQuery = s"""DROP TABLE ${tables.mkString(",")}""" logger.info("Running query: {}", dropTableQuery) statement.executeUpdate(dropTableQuery) } finally { statement.close() conn.close() } } override protected def beforeAll(): Unit = { super.beforeAll() val (sc, sqlContext) = getSparkContext _sc = sc _sqlContext = sqlContext } override protected def afterAll(): Unit = { super.afterAll() _sc.stop() } }
Example 69
Source File: LogisticRegressionSuite.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml.classification import com.ibm.aardpfark.pfa.ProbClassifierResult import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.{DataFrame, Row} class LogisticRegressionSuite extends SparkClassifierPFASuiteBase[ProbClassifierResult] { import spark.implicits._ def getOutput(df: DataFrame) = { df.select(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).map { case Row(p: Double, raw: Vector, pr: Vector) => (p, raw.toArray, pr.toArray) }.toDF(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).toJSON.collect() } val binaryData = spark.read.format("libsvm").load("data/sample_libsvm_data.txt") val multiData = spark.read.format("libsvm").load("data/sample_multiclass_classification_data.txt") val clf = new LogisticRegression() override val sparkTransformer = clf.fit(binaryData) val result = sparkTransformer.transform(binaryData) override val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() override val expectedOutput = getOutput(result) // Additional tests test("LogisticRegression w/o fitIntercept") { val sparkTransformer = clf.setFitIntercept(false).fit(binaryData) val result = sparkTransformer.transform(binaryData) val expectedOutput = getOutput(result) parityTest(sparkTransformer, input, expectedOutput) } test("LogisticRegression w/ non-default threshold") { val sparkTransformer = clf.setThreshold(0.0).fit(binaryData) val result = sparkTransformer.transform(binaryData) val expectedOutput = getOutput(result) parityTest(sparkTransformer, input, expectedOutput) val sparkTransformer2 = clf.setThreshold(1.0).fit(binaryData) val result2 = sparkTransformer2.transform(binaryData) val expectedOutput2 = getOutput(result2) parityTest(sparkTransformer2, input, expectedOutput2) } test("MLOR w/ intercept") { val sparkTransformer = clf.fit(multiData) val result = sparkTransformer.transform(multiData) val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() val expectedOutput = getOutput(result) parityTest(sparkTransformer, input, expectedOutput) } test("MLOR w/o intercept") { val sparkTransformer = clf.setFitIntercept(false).fit(multiData) val result = sparkTransformer.transform(multiData) val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() val expectedOutput = getOutput(result) parityTest(sparkTransformer, input, expectedOutput) } test("MLOR w/ thresholds") { val sparkTransformer = clf.setThresholds(Array(0.1, 0.6, 0.3)).fit(multiData) val result = sparkTransformer.transform(multiData) val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() val expectedOutput = getOutput(result) parityTest(sparkTransformer, input, expectedOutput) } test("MLOR w/ thresholds - one zero") { val sparkTransformer = clf.setThresholds(Array(0.0, 0.6, 0.3)).fit(multiData) val result = sparkTransformer.transform(multiData) val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() val expectedOutput = getOutput(result) parityTest(sparkTransformer, input, expectedOutput) } }
Example 70
Source File: DataCoder.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.hbase.examples

import org.apache.spark.sql.execution.datasources.hbase._
import org.apache.spark.sql.{DataFrame, SparkSession}

case class DCRecord(
    col00: String,
    col01: Int,
    col1: Boolean,
    col2: Double,
    col3: Float,
    col4: Int,
    col5: Long,
    col6: Short,
    col7: String,
    col8: Byte)

object DCRecord {
  def apply(i: Int): DCRecord = {
    DCRecord(s"row${"%03d".format(i)}",
      if (i % 2 == 0) { i } else { -i },
      i % 2 == 0,
      i.toDouble,
      i.toFloat,
      i,
      i.toLong,
      i.toShort,
      s"String$i extra",
      i.toByte)
  }
}

object DataCoder {
  def cat = s"""{
               |"table":{"namespace":"default", "name":"shcExampleDCTable", "tableCoder":"Phoenix", "version":"2.0"},
               |"rowkey":"key1:key2",
               |"columns":{
               |"col00":{"cf":"rowkey", "col":"key1", "type":"string"},
               |"col01":{"cf":"rowkey", "col":"key2", "type":"int"},
               |"col1":{"cf":"CF1", "col":"COL1", "type":"boolean"},
               |"col2":{"cf":"CF1", "col":"COL2", "type":"double"},
               |"col3":{"cf":"CF2", "col":"COL3", "type":"float"},
               |"col4":{"cf":"CF2", "col":"COL4", "type":"int"},
               |"col5":{"cf":"CF3", "col":"COL5", "type":"bigint"},
               |"col6":{"cf":"CF3", "col":"COL6", "type":"smallint"},
               |"col7":{"cf":"CF4", "col":"COL7", "type":"string"},
               |"col8":{"cf":"CF4", "col":"COL8", "type":"tinyint"}
               |}
               |}""".stripMargin

  def main(args: Array[String]){
    val spark = SparkSession.builder()
      .appName("DataCoderExample")
      .getOrCreate()
    val sc = spark.sparkContext
    val sqlContext = spark.sqlContext

    import sqlContext.implicits._

    def withCatalog(cat: String): DataFrame = {
      sqlContext
        .read
        .options(Map(HBaseTableCatalog.tableCatalog->cat))
        .format("org.apache.spark.sql.execution.datasources.hbase")
        .load()
    }

    // populate table with composite key
    val data = (0 to 255).map { i =>
      DCRecord(i)
    }
    sc.parallelize(data).toDF.write
      .options(Map(HBaseTableCatalog.tableCatalog -> cat, HBaseTableCatalog.newTable -> "5"))
      .format("org.apache.spark.sql.execution.datasources.hbase")
      .save()

    val df = withCatalog(cat)
    df.show
    // the catalog maps the first rowkey field to "col00", so the filters reference that column
    df.filter($"col00" <= "row005")
      .select($"col00", $"col1").show
    df.filter($"col00" === "row005" || $"col00" <= "row005")
      .select($"col00", $"col1").show
    df.filter($"col00" > "row250")
      .select($"col00", $"col1").show
    df.registerTempTable("table1")
    val c = sqlContext.sql("select count(col1) from table1 where col00 < 'row050'")
    c.show()
  }
}
Example 71
Source File: LRJobForDataSources.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.hbase.examples import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.execution.datasources.hbase.{HBaseRelation, HBaseTableCatalog} case class LRRecord( key: Int, col1: Boolean, col2: Double, col3: Float) object LRRecord { def apply(i: Int): LRRecord = { LRRecord(i, i % 2 == 0, i.toDouble, i.toFloat) } } // long running job for different data sources object LRJobForDataSources { val cat = s"""{ |"table":{"namespace":"default", "name":"shcExampleTable", "tableCoder":"PrimitiveType"}, |"rowkey":"key", |"columns":{ |"key":{"cf":"rowkey", "col":"key", "type":"int"}, |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"}, |"col2":{"cf":"cf2", "col":"col2", "type":"double"}, |"col3":{"cf":"cf3", "col":"col3", "type":"float"} |} |}""".stripMargin def main(args: Array[String]) { if (args.length < 1) { System.err.println("Usage: LRJobAccessing2Clusters <hiveTableName> [sleepTime]") System.exit(1) } val hiveTableName = args(0) val sleepTime = if (args.length > 1) args(1).toLong else 2 * 60 * 1000 // sleep 2 min by default val spark = SparkSession.builder() .appName("LRJobForDataSources") .enableHiveSupport() .getOrCreate() val sc = spark.sparkContext val sqlContext = spark.sqlContext import sqlContext.implicits._ import spark.sql def withCatalog(cat: String): DataFrame = { sqlContext .read .options(Map(HBaseTableCatalog.tableCatalog->cat)) .format("org.apache.spark.sql.execution.datasources.hbase") .load() } val timeEnd = System.currentTimeMillis() + (25 * 60 * 60 * 1000) // 25h later while (System.currentTimeMillis() < timeEnd) { // Part 1: write data into Hive table and read data from it, which accesses HDFS sql(s"DROP TABLE IF EXISTS $hiveTableName") sql(s"CREATE TABLE $hiveTableName(key INT, col1 BOOLEAN, col2 DOUBLE, col3 FLOAT)") for (i <- 1 to 3) { sql(s"INSERT INTO $hiveTableName VALUES ($i, ${i % 2 == 0}, ${i.toDouble}, ${i.toFloat})") } val df1 = sql(s"SELECT * FROM $hiveTableName") df1.show() // Part 2: create HBase table, write data into it, read data from it val data = (0 to 40).map { i => LRRecord(i) } sc.parallelize(data).toDF.write.options( Map(HBaseTableCatalog.tableCatalog -> cat, HBaseTableCatalog.newTable -> "5")) .format("org.apache.spark.sql.execution.datasources.hbase") .save() val df2 = withCatalog(cat) df2.show df2.filter($"key" <= "5").select($"key", $"col1").show // Part 3: join the two dataframes val s1 = df1.filter($"key" <= "40").select("key", "col1") val s2 = df2.filter($"key" <= "20" && $"key" >= "1").select("key", "col2") val result = s1.join(s2, Seq("key")) result.show() Thread.sleep(sleepTime) } spark.stop() } }
Example 72
Source File: Kudu.scala From kafka-examples with Apache License 2.0 | 5 votes |
package com.cloudera.streaming.refapp

import org.apache.kudu.spark.kudu._
import org.apache.spark.sql.streaming.{DataStreamWriter, Trigger}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

class KuduSink(master: String, database: String, checkpointLocation: String => String) {

  def writeTable(sinkName: String, triggerSeconds: Int = 10) = new Sink {
    override def createDataStreamWriter(df: DataFrame): DataStreamWriter[Row] = {
      val fullTableName = s"impala::$database.$name"
      df
        .writeStream
        .format("kudu")
        .option("kudu.master", master)
        .option("kudu.table", fullTableName)
        .option("checkpointLocation", checkpointLocation(name))
        .option("retries", "3")
        // triggerSeconds was unused in this excerpt; wiring it into the trigger here is an assumption
        .trigger(Trigger.ProcessingTime(s"$triggerSeconds seconds"))
        .outputMode("update")
    }

    override val name: String = sinkName
  }
}
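A sketch of wiring the sink factory above into a streaming query, assuming a streaming DataFrame and illustrative Kudu master, database and checkpoint values.

import org.apache.spark.sql.DataFrame
import com.cloudera.streaming.refapp.KuduSink

object KuduSinkSketch {
  def start(streamingDf: DataFrame): Unit = {
    // hypothetical Kudu master, Impala database and checkpoint root
    val kudu = new KuduSink(
      master = "kudu-master:7051",
      database = "streaming_ref",
      checkpointLocation = name => s"/tmp/checkpoints/$name")

    // build the writer for a sink named "customers" and start the query
    kudu.writeTable("customers").createDataStreamWriter(streamingDf).start()
  }
}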
Example 73
Source File: KuduSink.scala From kafka-examples with Apache License 2.0 | 5 votes |
package com.cloudera.streaming.refapp.kudu import org.apache.kudu.spark.kudu.KuduContext import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.{DataFrame, SQLContext} import org.slf4j.LoggerFactory import scala.util.control.NonFatal object KuduSink { def withDefaultContext(sqlContext: SQLContext, parameters: Map[String, String]) = new KuduSink(new KuduContext(parameters("kudu.master"), sqlContext.sparkContext), parameters) } class KuduSink(initKuduContext: => KuduContext, parameters: Map[String, String]) extends Sink { private val logger = LoggerFactory.getLogger(getClass) private var kuduContext = initKuduContext private val tablename = parameters("kudu.table") private val retries = parameters.getOrElse("retries", "1").toInt require(retries >= 0, "retries must be non-negative") logger.info(s"Created Kudu sink writing to table $tablename") override def addBatch(batchId: Long, data: DataFrame): Unit = { for (attempt <- 0 to retries) { try { kuduContext.upsertRows(data, tablename) return } catch { case NonFatal(e) => if (attempt < retries) { logger.warn("Kudu upsert error, retrying...", e) kuduContext = initKuduContext } else { logger.error("Kudu upsert error, exhausted", e) throw e } } } } }
Example 74
Source File: Memory.scala From kafka-examples with Apache License 2.0 | 5 votes |
package com.cloudera.streaming.refapp import org.apache.spark.sql.streaming.{DataStreamWriter, OutputMode} import org.apache.spark.sql.{DataFrame, Row} object Memory { def memorySink(sinkName: String) = new Sink { override def createDataStreamWriter(df: DataFrame): DataStreamWriter[Row] = df .writeStream .outputMode(OutputMode.Append) .queryName(name) .format("memory") override val name: String = sinkName } }
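A sketch of using the memory sink above for local debugging, assuming a rate source as a trivial streaming input; the sink name and timings are illustrative.

import org.apache.spark.sql.SparkSession
import com.cloudera.streaming.refapp.Memory

object MemorySinkSketch {
  def run(): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("memory-sink-sketch").getOrCreate()

    // a rate source provides a trivial streaming DataFrame to demonstrate the sink
    val stream = spark.readStream.format("rate").option("rowsPerSecond", "1").load()

    // the results become queryable as an in-memory table named after the sink
    val query = Memory.memorySink("debug_out").createDataStreamWriter(stream).start()
    query.awaitTermination(5000)
    spark.sql("SELECT * FROM debug_out").show()
    query.stop()
  }
}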
Example 75
Source File: KuduSinkUnitTest.scala From kafka-examples with Apache License 2.0 | 5 votes |
package com.cloudera.streaming.refapp.kudu

import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.sql.DataFrame
import org.mockito.Mockito._
import org.scalatest._
import org.scalatest.mockito.MockitoSugar

class KuduSinkUnitTest extends FunSuite with MockitoSugar {

  private val frame = mock[DataFrame]

  private def setupKuduContextMock(kuduContext: KuduContext, failTimes: Int): KuduContext = {
    if (failTimes > 0) {
      val stubber = doThrow(new RuntimeException)
      for (_ <- 2 to failTimes) {
        stubber.doThrow(new RuntimeException)
      }
      stubber.doCallRealMethod()
        .when(kuduContext).upsertRows(frame, "table")
    }
    kuduContext
  }

  test("kudu upsert fails, retries once") {
    val helper = new KuduSinkWithMockedContext(setupKuduContextMock(mock[KuduContext], failTimes = 1), 1)

    helper.sink.addBatch(0, frame)
    assert(helper.initialized == 1, "context should be initialized once")
  }

  test("kudu upsert fails twice, retries once, fails") {
    val helper = new KuduSinkWithMockedContext(setupKuduContextMock(mock[KuduContext], failTimes = 2), 1)

    intercept[RuntimeException] {
      helper.sink.addBatch(0, frame)
    }
    assert(helper.initialized == 1, "context should be initialized once")
  }

  test("kudu upsert fails 3 times, retries 3 times") {
    val helper = new KuduSinkWithMockedContext(setupKuduContextMock(mock[KuduContext], failTimes = 3), 3)

    helper.sink.addBatch(0, frame)
    assert(helper.initialized == 3, "context should be initialized three times")
  }

  test("kudu upsert fails 3 times, retries 4 times") {
    val helper = new KuduSinkWithMockedContext(setupKuduContextMock(mock[KuduContext], failTimes = 3), 4)

    helper.sink.addBatch(0, frame)
    assert(helper.initialized == 3, "context should be initialized only three times")
  }
}

class KuduSinkWithMockedContext(kuduContext: KuduContext, retries: Int) {
  // the KuduSink constructor initializes the context once, hence the counter starts at -1
  var initialized = -1

  private def initKuduContext: KuduContext = {
    initialized += 1
    kuduContext
  }

  val sink = new KuduSink(initKuduContext, Map(
    "kudu.table" -> "table",
    "kudu.master" -> "master",
    "retries" -> retries.toString))
}
Example 77
Source File: TestSparkContext.scala From spark-images with Apache License 2.0 | 5 votes |
package org.apache.spark.image import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types._ import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{Row, DataFrame, SQLContext, SparkSession} import scala.reflect.runtime.universe._ import org.scalatest.{FunSuite, BeforeAndAfterAll} // This context is used for all tests in this project trait TestSparkContext extends BeforeAndAfterAll { self: FunSuite => @transient var sc: SparkContext = _ @transient var sqlContext: SQLContext = _ @transient lazy val spark: SparkSession = { val conf = new SparkConf() .setMaster("local[*]") .setAppName("Spark-Image-Test") .set("spark.ui.port", "4079") .set("spark.sql.shuffle.partitions", "4") // makes small tests much faster val sess = SparkSession.builder().config(conf).getOrCreate() sess.sparkContext.setLogLevel("WARN") sess } override def beforeAll() { super.beforeAll() sc = spark.sparkContext sqlContext = spark.sqlContext import spark.implicits._ } override def afterAll() { sqlContext = null if (sc != null) { sc.stop() } sc = null super.afterAll() } def makeDF[T: TypeTag](xs: Seq[T], col: String): DataFrame = { sqlContext.createDataFrame(xs.map(Tuple1.apply)).toDF(col) } def compareRows(r1: Array[Row], r2: Seq[Row]): Unit = { val a = r1.sortBy(_.toString()) val b = r2.sortBy(_.toString()) assert(a === b) } }
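A minimal sketch of a suite that mixes in the shared context above; the assertions are illustrative.

import org.apache.spark.image.TestSparkContext
import org.scalatest.FunSuite

// mixes in the shared Spark context and uses its makeDF helper
class MakeDFSketchSuite extends FunSuite with TestSparkContext {
  test("makeDF builds a single-column DataFrame") {
    val df = makeDF(Seq(1, 2, 3), "value")
    assert(df.count() == 3)
    assert(df.columns.toSeq == Seq("value"))
  }
}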
Example 78
Source File: SpreadsheetRelation.scala From spark-google-spreadsheets with Apache License 2.0 | 5 votes |
package com.github.potix2.spark.google.spreadsheets import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService.SparkSpreadsheetContext import com.github.potix2.spark.google.spreadsheets.util._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation, TableScan} import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Row, SQLContext} case class SpreadsheetRelation protected[spark] ( context:SparkSpreadsheetContext, spreadsheetName: String, worksheetName: String, userSchema: Option[StructType] = None)(@transient val sqlContext: SQLContext) extends BaseRelation with TableScan with InsertableRelation { import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService._ override def schema: StructType = userSchema.getOrElse(inferSchema()) private lazy val aWorksheet: SparkWorksheet = findWorksheet(spreadsheetName, worksheetName)(context) match { case Right(aWorksheet) => aWorksheet case Left(e) => throw e } private lazy val rows: Seq[Map[String, String]] = aWorksheet.rows private[spreadsheets] def findWorksheet(spreadsheetName: String, worksheetName: String)(implicit ctx: SparkSpreadsheetContext): Either[Throwable, SparkWorksheet] = for { sheet <- findSpreadsheet(spreadsheetName).toRight(new RuntimeException(s"no such spreadsheet: $spreadsheetName")).right worksheet <- sheet.findWorksheet(worksheetName).toRight(new RuntimeException(s"no such worksheet: $worksheetName")).right } yield worksheet override def buildScan(): RDD[Row] = { val aSchema = schema sqlContext.sparkContext.makeRDD(rows).mapPartitions { iter => iter.map { m => var index = 0 val rowArray = new Array[Any](aSchema.fields.length) while(index < aSchema.fields.length) { val field = aSchema.fields(index) rowArray(index) = if (m.contains(field.name)) { TypeCast.castTo(m(field.name), field.dataType, field.nullable) } else { null } index += 1 } Row.fromSeq(rowArray) } } } override def insert(data: DataFrame, overwrite: Boolean): Unit = { if(!overwrite) { sys.error("Spreadsheet tables only support INSERT OVERWRITE for now.") } findWorksheet(spreadsheetName, worksheetName)(context) match { case Right(w) => w.updateCells(data.schema, data.collect().toList, Util.toRowData) case Left(e) => throw e } } private def inferSchema(): StructType = StructType(aWorksheet.headers.toList.map { fieldName => StructField(fieldName, StringType, nullable = true) }) }
Example 79
Source File: DefaultSource.scala From spark-google-spreadsheets with Apache License 2.0 | 5 votes |
package com.github.potix2.spark.google.spreadsheets import java.io.File import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { final val DEFAULT_CREDENTIAL_PATH = "/etc/gdata/credential.p12" override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = { createRelation(sqlContext, parameters, null) } private[spreadsheets] def pathToSheetNames(parameters: Map[String, String]): (String, String) = { val path = parameters.getOrElse("path", sys.error("'path' must be specified for spreadsheets.")) val elems = path.split('/') if (elems.length < 2) throw new Exception("'path' must be formed like '<spreadsheet>/<worksheet>'") (elems(0), elems(1)) } override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType) = { val (spreadsheetName, worksheetName) = pathToSheetNames(parameters) val context = createSpreadsheetContext(parameters) createRelation(sqlContext, context, spreadsheetName, worksheetName, schema) } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val (spreadsheetName, worksheetName) = pathToSheetNames(parameters) implicit val context = createSpreadsheetContext(parameters) val spreadsheet = SparkSpreadsheetService.findSpreadsheet(spreadsheetName) if(!spreadsheet.isDefined) throw new RuntimeException(s"no such a spreadsheet: $spreadsheetName") spreadsheet.get.addWorksheet(worksheetName, data.schema, data.collect().toList, Util.toRowData) createRelation(sqlContext, context, spreadsheetName, worksheetName, data.schema) } private[spreadsheets] def createSpreadsheetContext(parameters: Map[String, String]) = { val serviceAccountIdOption = parameters.get("serviceAccountId") val credentialPath = parameters.getOrElse("credentialPath", DEFAULT_CREDENTIAL_PATH) SparkSpreadsheetService(serviceAccountIdOption, new File(credentialPath)) } private[spreadsheets] def createRelation(sqlContext: SQLContext, context: SparkSpreadsheetService.SparkSpreadsheetContext, spreadsheetName: String, worksheetName: String, schema: StructType): SpreadsheetRelation = if (schema == null) { createRelation(sqlContext, context, spreadsheetName, worksheetName, None) } else { createRelation(sqlContext, context, spreadsheetName, worksheetName, Some(schema)) } private[spreadsheets] def createRelation(sqlContext: SQLContext, context: SparkSpreadsheetService.SparkSpreadsheetContext, spreadsheetName: String, worksheetName: String, schema: Option[StructType]): SpreadsheetRelation = SpreadsheetRelation(context, spreadsheetName, worksheetName, schema)(sqlContext) }
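A read sketch for the data source above, assuming the package name can be used as the format string and that the service account, credential path and sheet names are placeholders.

import org.apache.spark.sql.SparkSession

object SpreadsheetReadSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("spreadsheet-sketch").getOrCreate()

  // hypothetical service account and spreadsheet/worksheet names
  val df = spark.read
    .format("com.github.potix2.spark.google.spreadsheets")
    .option("serviceAccountId", "my-account@my-project.iam.gserviceaccount.com")
    .option("credentialPath", "/etc/gdata/credential.p12")
    .load("MySpreadsheet/Sheet1")

  df.printSchema()
  df.show(5)
}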
Example 80
Source File: DatasetUtil.scala From sona with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata} import org.apache.spark.sql.{Column, DataFrame, Dataset} object DatasetUtil { def withColumns[T](ds: Dataset[T], colNames: Seq[String], cols: Seq[Column], metadata: Seq[Metadata]): DataFrame = { require(colNames.size == cols.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of columns: ${cols.size}") require(colNames.size == metadata.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of metadata elements: ${metadata.size}") val sparkSession = ds.sparkSession val queryExecution = ds.queryExecution val resolver = sparkSession.sessionState.analyzer.resolver val output = queryExecution.analyzed.output checkColumnNameDuplication(colNames, "in given column names", sparkSession.sessionState.conf.caseSensitiveAnalysis) val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) => colName -> col.as(colName, metadata) }.toMap val replacedAndExistingColumns = output.map { field => columnMap.find { case (colName, _) => resolver(field.name, colName) } match { case Some((colName: String, col: Column)) => col.as(colName) case _ => new Column(field) } } val newColumns = columnMap.filter { case (colName, col) => !output.exists(f => resolver(f.name, colName)) }.map { case (colName, col) => col.as(colName) } ds.select(replacedAndExistingColumns ++ newColumns: _*) } def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = { withColumns(ds, Seq(colName), Seq(col), Seq(metadata)) } private def checkColumnNameDuplication(columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = { val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase) if (names.distinct.length != names.length) { val duplicateColumns = names.groupBy(identity).collect { case (x, ys) if ys.length > 1 => s"`$x`" } throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}") } } /** * Cast a column in a Dataset to Vector type. * * The supported data types of the input column are * - Vector * - float/double type Array. * * Note: The returned column does not have Metadata. * * @param dataset input DataFrame * @param colName column name. * @return Vector column */ def columnToVector(dataset: Dataset[_], colName: String): Column = { val columnDataType = dataset.schema(colName).dataType columnDataType match { case _: VectorUDT => col(colName) case fdt: ArrayType => val transferUDF = fdt.elementType match { case _: FloatType => udf(f = (vector: Seq[Float]) => { val inputArray = Array.fill[Double](vector.size)(0.0) vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble) Vectors.dense(inputArray) }) case _: DoubleType => udf((vector: Seq[Double]) => { Vectors.dense(vector.toArray) }) case other => throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector") } transferUDF(col(colName)) case other => throw new IllegalArgumentException(s"$other column cannot be cast to Vector") } } }
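A sketch of converting an array column to a Vector column with the helpers above, assuming a local SparkSession; column names are illustrative.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.Metadata
import org.apache.spark.util.DatasetUtil

object DatasetUtilSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("datasetutil-sketch").getOrCreate()
  import spark.implicits._

  // an array-of-double column cast to a Vector column and appended under a new name
  val df = Seq((0, Seq(1.0, 2.0)), (1, Seq(3.0, 4.0))).toDF("id", "features")
  val withVec = DatasetUtil.withColumn(df, "featureVec",
    DatasetUtil.columnToVector(df, "features"), Metadata.empty)
  withVec.printSchema()

  spark.stop()
}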
Example 81
Source File: SQLTransformer.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature import com.tencent.angel.sona.ml.Transformer import com.tencent.angel.sona.ml.param.{Param, ParamMap} import com.tencent.angel.sona.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.types.StructType /** * Implements the transformations which are defined by SQL statement. * Currently we only support SQL syntax like 'SELECT ... FROM __THIS__ ...' * where '__THIS__' represents the underlying table of the input dataset. * The select clause specifies the fields, constants, and expressions to display in * the output, it can be any select clause that Spark SQL supports. Users can also * use Spark SQL built-in function and UDFs to operate on these selected columns. * For example, [[SQLTransformer]] supports statements like: * {{{ * SELECT a, a + b AS a_b FROM __THIS__ * SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5 * SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b * }}} */ class SQLTransformer(override val uid: String) extends Transformer with DefaultParamsWritable { def this() = this(Identifiable.randomUID("sql")) /** * SQL statement parameter. The statement is provided in string form. * * @group param */ final val statement: Param[String] = new Param[String](this, "statement", "SQL statement") def setStatement(value: String): this.type = set(statement, value) def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val tableName = Identifiable.randomUID(uid) dataset.createOrReplaceTempView(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val result = dataset.sparkSession.sql(realStatement) // Call SessionCatalog.dropTempView to avoid unpersisting the possibly cached dataset. dataset.sparkSession.catalog.dropTempView(tableName) // Compatible.sessionstate.catalog.dropTempView(tableName) result } override def transformSchema(schema: StructType): StructType = { val spark = SparkSession.builder().getOrCreate() val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty)) val dummyDF = spark.createDataFrame(dummyRDD, schema) val tableName = Identifiable.randomUID(uid) val realStatement = $(statement).replace(tableIdentifier, tableName) dummyDF.createOrReplaceTempView(tableName) val outputSchema = spark.sql(realStatement).schema spark.catalog.dropTempView(tableName) outputSchema } override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { override def load(path: String): SQLTransformer = super.load(path) }
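A usage sketch for the transformer above, assuming a local SparkSession; the statement and column names are illustrative.

import org.apache.spark.sql.SparkSession
import com.tencent.angel.sona.ml.feature.SQLTransformer

object SQLTransformerSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("sqltransformer-sketch").getOrCreate()
  import spark.implicits._

  val df = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")

  // __THIS__ is replaced by a temp view over the input dataset
  val transformer = new SQLTransformer().setStatement(
    "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
  transformer.transform(df).show()

  spark.stop()
}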
Example 82
Source File: BinaryClassificationSummaryImpl.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.evaluation.evaluating import com.tencent.angel.sona.ml.evaluation.BinaryClassMetrics.BinaryPredictedResult import com.tencent.angel.sona.ml.evaluation.{BinaryClassMetrics, BinaryClassificationSummary} import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} class BinaryClassificationSummaryImpl(df: DataFrame, probabilityCol: String, labelCol: String) extends BinaryClassificationSummary with Serializable with Logging { private lazy val data: RDD[BinaryPredictedResult] = df.select(probabilityCol, labelCol).rdd.map { case Row(probability: Double, label: Double) => BinaryPredictedResult(probability, label.toInt) } lazy val binaryMetrics: BinaryClassMetrics = data.aggregate(new BinaryClassMetrics)( seqOp = (metrics: BinaryClassMetrics, pres: BinaryPredictedResult) => metrics.add(pres), combOp = (metrics1: BinaryClassMetrics, metrics2: BinaryClassMetrics) => metrics1.merge(metrics2) ) protected lazy val (tp: Double, fp: Double, fn: Double, tn: Double) = ( binaryMetrics.getTP, binaryMetrics.getFP, binaryMetrics.getFN, binaryMetrics.getTN) }
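A sketch of feeding a scored DataFrame into the summary above, assuming a local SparkSession; the probabilities and labels are illustrative.

import org.apache.spark.sql.SparkSession
import com.tencent.angel.sona.ml.evaluation.evaluating.BinaryClassificationSummaryImpl

object BinarySummarySketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("binary-summary-sketch").getOrCreate()
  import spark.implicits._

  // (probability of the positive class, true label)
  val scored = Seq((0.9, 1.0), (0.2, 0.0), (0.7, 1.0), (0.4, 0.0)).toDF("probability", "label")
  val summary = new BinaryClassificationSummaryImpl(scored, "probability", "label")

  // the aggregated confusion counts used by the summary
  val m = summary.binaryMetrics
  println(s"TP=${m.getTP} FP=${m.getFP} FN=${m.getFN} TN=${m.getTN}")

  spark.stop()
}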
Example 83
Source File: RegressionSummaryImpl.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.evaluation.evaluating import com.tencent.angel.sona.ml.evaluation.RegressionMetrics.RegressionPredictedResult import com.tencent.angel.sona.ml.evaluation.{RegressionMetrics, RegressionSummary} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} class RegressionSummaryImpl(df: DataFrame, predictionCol: String, labelCol: String) extends RegressionSummary with Serializable { private lazy val data: RDD[RegressionPredictedResult] = df.select(predictionCol, labelCol).rdd.map { case Row(probability: Double, label: Double) => RegressionPredictedResult(probability, label.toInt) } override val regMetrics: RegressionMetrics = data.aggregate(new RegressionMetrics)( seqOp = (metrics: RegressionMetrics, pres: RegressionPredictedResult) => metrics.add(pres), combOp = (metrics1: RegressionMetrics, metrics2: RegressionMetrics) => metrics1.merge(metrics2) ) }
Example 84
Source File: MultiClassificationSummaryImpl.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.evaluation.evaluating

import com.tencent.angel.sona.ml.evaluation.{MultiClassMetrics, MultiClassificationSummary}
import com.tencent.angel.sona.ml.evaluation.MultiClassMetrics.MultiPredictedResult
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}

class MultiClassificationSummaryImpl(df: DataFrame, predictionCol: String, labelCol: String)
  extends MultiClassificationSummary with Serializable with Logging {

  private lazy val data: RDD[MultiPredictedResult] = df.select(predictionCol, labelCol).rdd.map {
    case Row(prediction: Double, label: Double) => MultiPredictedResult(prediction.toInt, label.toInt)
  }

  lazy val multiMetrics: MultiClassMetrics = data.aggregate(new MultiClassMetrics)(
    seqOp = (metrics: MultiClassMetrics, pres: MultiPredictedResult) => metrics.add(pres),
    combOp = (metrics1: MultiClassMetrics, metrics2: MultiClassMetrics) => metrics1.merge(metrics2)
  )
}
Example 85
Source File: ChiSquareTest.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.stat import com.tencent.angel.sona.ml.feature.LabeledPoint import org.apache.spark.linalg import org.apache.spark.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.util.SONASchemaUtils import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col /** * :: Experimental :: * * Chi-square hypothesis testing for categorical data. * * See <a href="http://en.wikipedia.org/wiki/Chi-squared_test">Wikipedia</a> for more information * on the Chi-squared test. */ object ChiSquareTest { private case class ChiSquareResult( pValues: linalg.Vector, degreesOfFreedom: Array[Int], statistics: linalg.Vector) /** * Conduct Pearson's independence test for every feature against the label. For each feature, the * (feature, label) pairs are converted into a contingency matrix for which the Chi-squared * statistic is computed. All label and feature values must be categorical. * * The null hypothesis is that the occurrence of the outcomes is statistically independent. * * @param dataset DataFrame of categorical labels and categorical features. * Real-valued features will be treated as categorical for each distinct value. * @param featuresCol Name of features column in dataset, of type `Vector` (`VectorUDT`) * @param labelCol Name of label column in dataset, of any numerical type * @return DataFrame containing the test result for every feature against the label. * This DataFrame will contain a single Row with the following fields: * - `pValues: Vector` * - `degreesOfFreedom: Array[Int]` * - `statistics: Vector` * Each of these fields has one value per feature. */ def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = { val spark = dataset.sparkSession import spark.implicits._ SONASchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT) SONASchemaUtils.checkNumericType(dataset.schema, labelCol) val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol)).as[(Double, linalg.Vector)] .rdd.map { case (label, features) => LabeledPoint(label, features) } val testResults = Statistics.chiSqTest(rdd) val pValues: linalg.Vector = Vectors.dense(testResults.map(_.pValue)) val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom) val statistics: linalg.Vector = Vectors.dense(testResults.map(_.statistic)) spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics))) } }
Example 86
Source File: Correlation.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.stat import org.apache.spark.linalg.{SQLDataTypes, Vector} import scala.collection.JavaConverters._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.types.{StructField, StructType} /** * API for correlation functions in MLlib, compatible with DataFrames and Datasets. * * The functions in this package generalize the functions in [[org.apache.spark.sql.Dataset#stat]] * to spark.ml's Vector types. */ object Correlation { /** * :: Experimental :: * Compute the correlation matrix for the input Dataset of Vectors using the specified method. * Methods currently supported: `pearson` (default), `spearman`. * * @param dataset A dataset or a dataframe * @param column The name of the column of vectors for which the correlation coefficient needs * to be computed. This must be a column of the dataset, and it must contain * Vector objects. * @param method String specifying the method to use for computing correlation. * Supported: `pearson` (default), `spearman` * @return A dataframe that contains the correlation matrix of the column of vectors. This * dataframe contains a single row and a single column of name * '$METHODNAME($COLUMN)'. * @throws IllegalArgumentException if the column is not a valid column in the dataset, or if * the content of this column is not of type Vector. * * Here is how to access the correlation coefficient: * {{{ * val data: Dataset[Vector] = ... * val Row(coeff: Matrix) = Correlation.corr(data, "value").head * // coeff now contains the Pearson correlation matrix. * }}} * * @note For Spearman, a rank correlation, we need to create an RDD[Double] for each column * and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector], * which is fairly costly. Cache the input Dataset before calling corr with `method = "spearman"` * to avoid recomputing the common lineage. */ def corr(dataset: Dataset[_], column: String, method: String): DataFrame = { val rdd = dataset.select(column).rdd.map { case Row(v: Vector) => v } val oldM = Statistics.corr(rdd, method) val name = s"$method($column)" val schema = StructType(Array(StructField(name, SQLDataTypes.MatrixType, nullable = false))) dataset.sparkSession.createDataFrame(Seq(Row(oldM)).asJava, schema) } /** * Compute the Pearson correlation matrix for the input Dataset of Vectors. */ def corr(dataset: Dataset[_], column: String): DataFrame = { corr(dataset, column, "pearson") } }
Example 87
Source File: GraphIO.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.graph.utils import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} object GraphIO { private val DELIMITER = "delimiter" private val HEADER = "header" private val int2Long = udf[Long, Int](_.toLong) private val string2Long = udf[Long, String](_.toLong) private val int2Float = udf[Float, Int](_.toFloat) private val long2Float = udf[Float, Long](_.toFloat) private val double2Float = udf[Float, Double](_.toFloat) private val string2Float = udf[Float, String](_.toFloat) def convert2Float(df: DataFrame, structField: StructField, tmpSuffix: String): DataFrame = { val tmpName = structField.name + tmpSuffix structField.dataType match { case _: LongType => df.withColumn(tmpName, long2Float(df(structField.name))) .drop(structField.name) .withColumnRenamed(tmpName, structField.name) case _: IntegerType => df.withColumn(tmpName, int2Float(df(structField.name))) .drop(structField.name) .withColumnRenamed(tmpName, structField.name) case _: DoubleType => df.withColumn(tmpName, double2Float(df(structField.name))) .drop(structField.name) .withColumnRenamed(tmpName, structField.name) case _: StringType => df.withColumn(tmpName, string2Float(df(structField.name))) .drop(structField.name) .withColumnRenamed(tmpName, structField.name) case _: FloatType => df case t => throw new Exception(s"$t can't convert to Float") } } def convert2Long(df: DataFrame, structField: StructField, tmpSuffix: String): DataFrame = { val tmpName = structField.name + tmpSuffix structField.dataType match { case _: LongType => df case _: IntegerType => df.withColumn(tmpName, int2Long(df(structField.name))) .drop(structField.name) .withColumnRenamed(tmpName, structField.name) case _: StringType => df.withColumn(tmpName, string2Long(df(structField.name))) .drop(structField.name) .withColumnRenamed(tmpName, structField.name) case t => throw new Exception(s"$t can't convert to Long") } } def load(input: String, isWeighted: Boolean, srcIndex: Int = 0, dstIndex: Int = 1, weightIndex: Int = 2, sep: String = " "): DataFrame = { val ss = SparkSession.builder().getOrCreate() val schema = if (isWeighted) { StructType(Seq( StructField("src", LongType, nullable = false), StructField("dst", LongType, nullable = false), StructField("weight", FloatType, nullable = false) )) } else { StructType(Seq( StructField("src", LongType, nullable = false), StructField("dst", LongType, nullable = false) )) } ss.read .option("sep", sep) .option("header", "false") .schema(schema) .csv(input) } def save(df: DataFrame, output: String, seq: String = "\t"): Unit = { df.printSchema() df.write .mode(SaveMode.Overwrite) .option(HEADER, "false") .option(DELIMITER, seq) .csv(output) } def defaultCheckpointDir: Option[String] = { val sparkContext = SparkContext.getOrCreate() sparkContext.getConf.getOption("spark.yarn.stagingDir") .map { base => new Path(base, s".sparkStaging/${sparkContext.getConf.getAppId}").toString } } }
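A brief usage sketch for GraphIO; the HDFS paths are placeholders.

import com.tencent.angel.sona.graph.utils.GraphIO
import org.apache.spark.sql.DataFrame

// Load a whitespace-separated, weighted edge list into (src, dst, weight) columns
val edges: DataFrame = GraphIO.load("hdfs://path/to/edges", isWeighted = true)

// Write the edges back out as tab-separated text without a header
GraphIO.save(edges, "hdfs://path/to/output")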
Example 88
Source File: KCore.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.graph.kcore import com.tencent.angel.sona.context.PSContext import org.apache.spark.SparkContext import com.tencent.angel.sona.graph.params._ import com.tencent.angel.sona.ml.Transformer import com.tencent.angel.sona.ml.param.ParamMap import com.tencent.angel.sona.ml.util.Identifiable import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.storage.StorageLevel class KCore(override val uid: String) extends Transformer with HasSrcNodeIdCol with HasDstNodeIdCol with HasOutputNodeIdCol with HasOutputCoreIdCol with HasStorageLevel with HasPartitionNum with HasPSPartitionNum with HasUseBalancePartition { def this() = this(Identifiable.randomUID("KCore")) override def transform(dataset: Dataset[_]): DataFrame = { val edges = dataset.select($(srcNodeIdCol), $(dstNodeIdCol)).rdd .map(row => (row.getLong(0), row.getLong(1))) .filter(e => e._1 != e._2) edges.persist(StorageLevel.DISK_ONLY) val maxId = edges.map(e => math.max(e._1, e._2)).max() + 1 val minId = edges.map(e => math.min(e._1, e._2)).min() val nodes = edges.flatMap(e => Iterator(e._1, e._2)) val numEdges = edges.count() println(s"minId=$minId maxId=$maxId numEdges=$numEdges level=${$(storageLevel)}") // Start PS and init the model println("start to run ps") PSContext.getOrCreate(SparkContext.getOrCreate()) val model = KCorePSModel.fromMinMax(minId, maxId, nodes, $(psPartitionNum), $(useBalancePartition)) var graph = edges.flatMap(e => Iterator((e._1, e._2), (e._2, e._1))) .groupByKey($(partitionNum)) .mapPartitionsWithIndex((index, edgeIter) => Iterator(KCoreGraphPartition.apply(index, edgeIter))) graph.persist($(storageLevel)) graph.foreachPartition(_ => Unit) graph.foreach(_.initMsgs(model)) var curIteration = 0 var numMsgs = model.numMsgs() var prev = graph println(s"numMsgs=$numMsgs") do { curIteration += 1 graph = prev.map(_.process(model, numMsgs, curIteration == 1)) graph.persist($(storageLevel)) graph.count() prev.unpersist(true) prev = graph model.resetMsgs() numMsgs = model.numMsgs() println(s"curIteration=$curIteration numMsgs=$numMsgs") } while (numMsgs > 0) val retRDD = graph.map(_.save()).flatMap{case (nodes,cores) => nodes.zip(cores)} .map(r => Row.fromSeq(Seq[Any](r._1, r._2))) dataset.sparkSession.createDataFrame(retRDD, transformSchema(dataset.schema)) } override def transformSchema(schema: StructType): StructType = { StructType(Seq( StructField(s"${$(outputNodeIdCol)}", LongType, nullable = false), StructField(s"${$(outputCoreIdCol)}", IntegerType, nullable = false) )) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) }
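A sketch of driving the KCore transformer from an edge DataFrame. It assumes the usual set<Param> setters generated by the Has* param traits the class mixes in; the column names are hypothetical and the remaining parameters keep their defaults.

import com.tencent.angel.sona.graph.kcore.KCore
import org.apache.spark.sql.DataFrame

// edges: a DataFrame with Long-typed "src" and "dst" columns (e.g. from GraphIO.load)
def runKCore(edges: DataFrame): DataFrame =
  new KCore()
    .setSrcNodeIdCol("src")
    .setDstNodeIdCol("dst")
    .setOutputNodeIdCol("node")
    .setOutputCoreIdCol("core")
    .transform(edges) // yields (node, core) rows, matching transformSchema above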
Example 89
Source File: TokenizerSuite.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature import com.tencent.angel.sona.ml.util.{DefaultReadWriteTest, MLTest} import scala.beans.BeanInfo import org.apache.spark.sql.{DataFrame, Row} @BeanInfo case class TokenizerTestData(rawText: String, wantedTokens: Array[String]) class TokenizerSuite extends MLTest with DefaultReadWriteTest { test("read/write") { val t = new Tokenizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } } class RegexTokenizerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ def testRegexTokenizer(t: RegexTokenizer, dataframe: DataFrame): Unit = { testTransformer[(String, Seq[String])](dataframe, t, "tokens", "wantedTokens") { case Row(tokens, wantedTokens) => assert(tokens === wantedTokens) } } test("RegexTokenizer") { val tokenizer0 = new RegexTokenizer() .setGaps(false) .setPattern("\\w+|\\p{Punct}") .setInputCol("rawText") .setOutputCol("tokens") val dataset0 = Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")), TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct")) ).toDF() testRegexTokenizer(tokenizer0, dataset0) val dataset1 = Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")), TokenizerTestData("Te,st. punct", Array("punct")) ).toDF() tokenizer0.setMinTokenLength(3) testRegexTokenizer(tokenizer0, dataset1) val tokenizer2 = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") val dataset2 = Seq( TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")), TokenizerTestData("Te,st. punct", Array("te,st.", "punct")) ).toDF() testRegexTokenizer(tokenizer2, dataset2) } test("RegexTokenizer with toLowercase false") { val tokenizer = new RegexTokenizer() .setInputCol("rawText") .setOutputCol("tokens") .setToLowercase(false) val dataset = Seq( TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")), TokenizerTestData("java scala", Array("java", "scala")) ).toDF() testRegexTokenizer(tokenizer, dataset) } test("read/write") { val t = new RegexTokenizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMinTokenLength(2) .setGaps(false) .setPattern("hi") .setToLowercase(false) testDefaultReadWrite(t) } }
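Outside the test harness, the same RegexTokenizer configuration can be applied directly with transform; a minimal spark-shell style sketch:

import com.tencent.angel.sona.ml.feature.RegexTokenizer
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("tokenizer-example").getOrCreate()
import spark.implicits._

val df = Seq("Test for tokenization.", "Te,st. punct").toDF("rawText")
val tokenizer = new RegexTokenizer()
  .setGaps(false)
  .setPattern("\\w+|\\p{Punct}")
  .setInputCol("rawText")
  .setOutputCol("tokens")
tokenizer.transform(df).show(false)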
Example 90
Source File: NormalizerSuite.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature import org.apache.spark.linalg import org.apache.spark.linalg.{DenseVector, IntSparseVector, LongSparseVector, Vectors} import com.tencent.angel.sona.ml.util.{DefaultReadWriteTest, MLTest} import com.tencent.angel.sona.ml.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row} class NormalizerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @transient var data: Array[linalg.Vector] = _ @transient var l1Normalized: Array[linalg.Vector] = _ @transient var l2Normalized: Array[linalg.Vector] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))), Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))), Vectors.sparse(3, Seq[(Int, Double)]()), Vectors.sparse( size= 3L, Seq[(Long, Double)]()) ) l1Normalized = Array( Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.12765957, -0.23404255, -0.63829787), Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))), Vectors.dense(0.625, 0.07894737, 0.29605263), Vectors.sparse(3, Seq[(Int, Double)]()), Vectors.sparse(3L, Seq[(Long, Double)]()) ) l2Normalized = Array( Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.184549876, -0.3383414, -0.922749378), Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))), Vectors.dense(0.897906166, 0.113419726, 0.42532397), Vectors.sparse(3, Seq[(Int, Double)]()), Vectors.sparse(3L, Seq[(Long, Double)]()) ) } def assertTypeOfVector(lhs: linalg.Vector, rhs: linalg.Vector): Unit = { assert((lhs, rhs) match { case (v1: DenseVector, v2: DenseVector) => true case (v1: IntSparseVector, v2: IntSparseVector) => true case (v1: LongSparseVector, v2: LongSparseVector) => true case _ => false }, "The vector type should be preserved after normalization.") } def assertValues(lhs: linalg.Vector, rhs: linalg.Vector): Unit = { assert(lhs ~== rhs absTol 1E-5, "The vector value is not correct after normalization.") } test("Normalization with default parameter") { val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized") val dataFrame: DataFrame = data.zip(l2Normalized).seq.toDF("features", "expected") testTransformer[(linalg.Vector, linalg.Vector)](dataFrame, normalizer, "features", "normalized", "expected") { case Row(features: linalg.Vector, normalized: linalg.Vector, expected: linalg.Vector) => assertTypeOfVector(normalized, features) assertValues(normalized, expected) } } test("Normalization with setter") { val dataFrame: DataFrame = data.zip(l1Normalized).seq.toDF("features", "expected") val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized").setP(1) testTransformer[(linalg.Vector, linalg.Vector)](dataFrame, normalizer, "features", "normalized", "expected") { case Row(features: linalg.Vector, normalized: linalg.Vector, expected: linalg.Vector) => assertTypeOfVector(normalized, features) assertValues(normalized, expected) } } test("read/write") { val t = new Normalizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setP(3.0) testDefaultReadWrite(t) } }
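A compact sketch of using the Normalizer outside the suite; it assumes encoders for the linalg Vector type are available, as testImplicits provides above.

import com.tencent.angel.sona.ml.feature.Normalizer
import org.apache.spark.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("normalizer-example").getOrCreate()
import spark.implicits._

val df = Seq(Tuple1(Vectors.dense(0.6, -1.1, -3.0))).toDF("features")
val normalizer = new Normalizer()
  .setInputCol("features")
  .setOutputCol("normalized")
  .setP(1.0) // L1 normalization; the default is the L2 norm
normalizer.transform(df).show(false)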
Example 91
Source File: NGramSuite.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature import com.tencent.angel.sona.ml.util.{DefaultReadWriteTest, MLTest} import scala.beans.BeanInfo import org.apache.spark.sql.{DataFrame, Row} @BeanInfo case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String]) class NGramSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ test("default behavior yields bigram features") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") val dataset = Seq(NGramTestData( Array("Test", "for", "ngram", "."), Array("Test for", "for ngram", "ngram .") )).toDF() testNGram(nGram, dataset) } test("NGramLength=4 yields length 4 n-grams") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(4) val dataset = Seq(NGramTestData( Array("a", "b", "c", "d", "e"), Array("a b c d", "b c d e") )).toDF() testNGram(nGram, dataset) } test("empty input yields empty output") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(4) val dataset = Seq(NGramTestData(Array(), Array())).toDF() testNGram(nGram, dataset) } test("input array < n yields empty output") { val nGram = new NGram() .setInputCol("inputTokens") .setOutputCol("nGrams") .setN(6) val dataset = Seq(NGramTestData( Array("a", "b", "c", "d", "e"), Array() )).toDF() testNGram(nGram, dataset) } test("read/write") { val t = new NGram() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setN(3) testDefaultReadWrite(t) } def testNGram(t: NGram, dataFrame: DataFrame): Unit = { testTransformer[(Seq[String], Seq[String])](dataFrame, t, "nGrams", "wantedNGrams") { case Row(actualNGrams : Seq[_], wantedNGrams: Seq[_]) => assert(actualNGrams === wantedNGrams) } } }
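A minimal sketch applying the NGram transformer directly, mirroring the default bigram behavior tested above:

import com.tencent.angel.sona.ml.feature.NGram
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("ngram-example").getOrCreate()
import spark.implicits._

val df = Seq(Tuple1(Array("Test", "for", "ngram", "."))).toDF("inputTokens")
val nGram = new NGram()
  .setInputCol("inputTokens")
  .setOutputCol("nGrams") // defaults to bigrams when setN is not called
nGram.transform(df).show(false) // ["Test for", "for ngram", "ngram ."]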
Example 92
Source File: Glow.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow import java.util.ServiceLoader import scala.collection.JavaConverters._ import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.scala.DefaultScalaModule import org.apache.spark.sql.{DataFrame, SQLUtils, SparkSession} import io.projectglow.common.Named import io.projectglow.sql.{GlowSQLExtensions, SqlExtensionProvider} import io.projectglow.transformers.util.{SnakeCaseMap, StringUtils} def transform(operationName: String, df: DataFrame, options: Map[String, Any]): DataFrame = { val stringValuedMap = options.mapValues { case s: String => s case v => mapper.writeValueAsString(v) }.map(identity) // output of mapValues is not serializable: https://github.com/scala/bug/issues/7005 lookupTransformer(operationName) match { case Some(transformer) => transformer.transform(df, new SnakeCaseMap(stringValuedMap)) case None => throw new IllegalArgumentException(s"No transformer with name $operationName") } } def transform(operationName: String, df: DataFrame, options: (String, Any)*): DataFrame = { transform(operationName, df, options.toMap) } def transform( operationName: String, df: DataFrame, options: java.util.Map[String, String]): DataFrame = { transform(operationName, df, options.asScala.toMap) } private def lookupTransformer(name: String): Option[DataFrameTransformer] = synchronized { transformerLoader.reload() transformerLoader .iterator() .asScala .find(n => StringUtils.toSnakeCase(n.name) == StringUtils.toSnakeCase(name)) } private val transformerLoader = ServiceLoader .load(classOf[DataFrameTransformer]) } object Glow extends GlowBase trait DataFrameTransformer extends Named { def transform(df: DataFrame, options: Map[String, String]): DataFrame }
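A hedged sketch of calling Glow.transform, reusing the pipe transformer options that appear in the TextPiperSuite example later on this page; option keys can be camelCase or snake_case because both are normalized before lookup.

import io.projectglow.Glow
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("glow-transform-example").getOrCreate()
import spark.implicits._

val df = Seq("hello", "world").toDF()
val piped = Glow.transform(
  "pipe",
  df,
  Map("inputFormatter" -> "text", "outputFormatter" -> "text", "cmd" -> """["cat", "-"]"""))
piped.show()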
Example 93
Source File: VCFInputFormatter.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.vcf import java.io.OutputStream import scala.collection.JavaConverters._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.InternalRow import io.projectglow.common.GlowLogging import io.projectglow.transformers.pipe.{InputFormatter, InputFormatterFactory} class VCFInputFormatter(converter: InternalRowToVariantContextConverter, sampleIdInfo: SampleIdInfo) extends InputFormatter with GlowLogging { private var writer: VCFStreamWriter = _ private var stream: OutputStream = _ override def init(stream: OutputStream): Unit = { this.stream = stream this.writer = new VCFStreamWriter( stream, converter.vcfHeader.getMetaDataInInputOrder.asScala.toSet, sampleIdInfo, writeHeader = true) } override def write(record: InternalRow): Unit = { converter.convert(record).foreach(writer.write) } override def close(): Unit = { logger.info("Closing VCF input formatter") writer.close() } } class VCFInputFormatterFactory extends InputFormatterFactory { override def name: String = "vcf" override def makeInputFormatter(df: DataFrame, options: Map[String, String]): InputFormatter = { val (headerLineSet, sampleIdInfo) = VCFHeaderUtils.parseHeaderLinesAndSamples( options, None, df.schema, df.sparkSession.sparkContext.hadoopConfiguration) val rowConverter = new InternalRowToVariantContextConverter( df.schema, headerLineSet, VCFOptionParser.getValidationStringency(options) ) rowConverter.validate() new VCFInputFormatter(rowConverter, sampleIdInfo) } }
Example 94
Source File: VCFWriterUtils.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.vcf import htsjdk.variant.variantcontext.{VariantContext, VariantContextBuilder} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{ArrayType, StructType} import io.projectglow.common.GlowLogging object VCFWriterUtils extends GlowLogging { def throwMixedSamplesFailure(): Unit = { throw new IllegalArgumentException("Cannot mix missing and non-missing sample IDs.") } def throwSampleInferenceFailure(): Unit = { throw new IllegalArgumentException( "Cannot infer sample ids because they are not the same in every row.") } def inferSampleIdsIfPresent(data: DataFrame): SampleIdInfo = { val genotypeSchemaOpt = data .schema .find(_.name == "genotypes") .map(_.dataType.asInstanceOf[ArrayType].elementType.asInstanceOf[StructType]) if (genotypeSchemaOpt.isEmpty) { logger.info("No genotypes column, no sample IDs will be inferred.") return SampleIds(Seq.empty) } val genotypeSchema = genotypeSchemaOpt.get import data.sparkSession.implicits._ val hasSampleIdsColumn = genotypeSchema.exists(_.name == "sampleId") if (hasSampleIdsColumn) { val distinctSampleIds = data .selectExpr("explode(genotypes.sampleId)") .distinct() .as[String] .collect val numPresentSampleIds = distinctSampleIds.count(!sampleIsMissing(_)) if (numPresentSampleIds > 0) { if (numPresentSampleIds < distinctSampleIds.length) { throwMixedSamplesFailure() } return SampleIds(distinctSampleIds) } } val numGenotypesPerRow = data .selectExpr("size(genotypes)") .distinct() .as[Int] .collect if (numGenotypesPerRow.length > 1) { throw new IllegalArgumentException( "Rows contain varying number of missing samples; cannot infer sample IDs.") } logger.warn("Detected missing sample IDs, inferring sample IDs.") InferSampleIds } def sampleIsMissing(s: String): Boolean = { s == null || s.isEmpty } def convertVcAttributesToStrings(vc: VariantContext): VariantContextBuilder = { val vcBuilder = new VariantContextBuilder(vc) val iterator = vc.getAttributes.entrySet().iterator() while (iterator.hasNext) { // parse to string, then write, as the VCF encoder messes up double precisions val entry = iterator.next() vcBuilder.attribute( entry.getKey, VariantContextToInternalRowConverter.parseObjectAsString(entry.getValue)) } vcBuilder } } case class SampleIds(unsortedSampleIds: Seq[String]) extends SampleIdInfo { val sortedSampleIds: Seq[String] = unsortedSampleIds.sorted } case object InferSampleIds extends SampleIdInfo { def fromNumberMissing(numMissingSamples: Int): Seq[String] = { (1 to numMissingSamples).map { idx => "sample_" + idx } } } sealed trait SampleIdInfo
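A small sketch of the sample-ID inference helper above; the input DataFrame is assumed to carry a genotypes array column, for example one read from VCF.

import io.projectglow.vcf.{InferSampleIds, SampleIds, VCFWriterUtils}
import org.apache.spark.sql.DataFrame

def describeSamples(df: DataFrame): Unit =
  VCFWriterUtils.inferSampleIdsIfPresent(df) match {
    case SampleIds(ids) => println(s"found ${ids.size} explicit sample IDs")
    case InferSampleIds => println("missing sample IDs will be inferred at write time")
  }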
Example 95
Source File: BlockVariantsAndSamplesTransformer.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.transformers.blockvariantsandsamples import io.projectglow.DataFrameTransformer import io.projectglow.common.logging.HlsUsageLogging import org.apache.spark.sql.DataFrame class BlockVariantsAndSamplesTransformer extends DataFrameTransformer with HlsUsageLogging { import BlockVariantsAndSamplesTransformer._ override def name: String = TRANSFORMER_NAME override def transform(df: DataFrame, options: Map[String, String]): DataFrame = { val variantsPerBlock = validateIntegerOption(options, VARIANTS_PER_BLOCK) val sampleBlockCount = validateIntegerOption(options, SAMPLE_BLOCK_COUNT) VariantSampleBlockMaker.makeVariantAndSampleBlocks(df, variantsPerBlock, sampleBlockCount) } } object BlockVariantsAndSamplesTransformer { val TRANSFORMER_NAME = "block_variants_and_samples" val VARIANTS_PER_BLOCK = "variants_per_block" val SAMPLE_BLOCK_COUNT = "sample_block_count" def validateIntegerOption(options: Map[String, String], optionName: String): Int = { try { (options.get(optionName).get.toInt) } catch { case _: Throwable => throw new IllegalArgumentException( s"$optionName is not provided or cannot be cast as an integer!" ) } } }
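Because the transformer registers itself by name, it is normally invoked through Glow.transform; a brief sketch with illustrative option values.

import io.projectglow.Glow
import org.apache.spark.sql.DataFrame

// df: a variant DataFrame, e.g. one read from VCF
def blockVariants(df: DataFrame): DataFrame =
  Glow.transform(
    "block_variants_and_samples",
    df,
    Map("variants_per_block" -> "20", "sample_block_count" -> "7"))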
Example 96
Source File: CSVInputFormatter.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.transformers.pipe import java.io.{OutputStream, PrintWriter} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.csv.SGUnivocityGenerator import org.apache.spark.sql.types.StructType import io.projectglow.SparkShim.CSVOptions class CSVInputFormatter(schema: StructType, parsedOptions: CSVOptions) extends InputFormatter { private var writer: PrintWriter = _ private var univocityGenerator: SGUnivocityGenerator = _ override def init(stream: OutputStream): Unit = { writer = new PrintWriter(stream) univocityGenerator = new SGUnivocityGenerator(schema, writer, parsedOptions) if (parsedOptions.headerFlag) { univocityGenerator.writeHeaders() } } override def write(record: InternalRow): Unit = { univocityGenerator.write(record) } override def close(): Unit = { writer.close() univocityGenerator.close() } } class CSVInputFormatterFactory extends InputFormatterFactory { override def name: String = "csv" override def makeInputFormatter( df: DataFrame, options: Map[String, String] ): InputFormatter = { val sqlConf = df.sparkSession.sessionState.conf val parsedOptions = new CSVOptions( options, sqlConf.csvColumnPruning, sqlConf.sessionLocalTimeZone ) new CSVInputFormatter(df.schema, parsedOptions) } }
Example 97
Source File: UTF8TextInputFormatter.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.transformers.pipe

import java.io.{OutputStream, PrintWriter}

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLUtils.dataTypesEqualExceptNullability
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.StringType

class UTF8TextInputFormatter() extends InputFormatter {

  private var writer: PrintWriter = _

  override def init(stream: OutputStream): Unit = {
    writer = new PrintWriter(stream)
  }

  override def write(record: InternalRow): Unit = {
    if (!record.isNullAt(0)) {
      writer.println(record.getUTF8String(0)) // scalastyle:ignore
    }
  }

  override def close(): Unit = {
    writer.close()
  }
}

class UTF8TextInputFormatterFactory extends InputFormatterFactory {
  override def name: String = "text"

  override def makeInputFormatter(df: DataFrame, options: Map[String, String]): InputFormatter = {
    require(df.schema.length == 1, "Input dataframe must have one column.")
    require(
      dataTypesEqualExceptNullability(df.schema.head.dataType, StringType),
      "Input dataframe must have one string column.")
    new UTF8TextInputFormatter
  }
}
Example 98
Source File: BigFileDatasource.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.sql import java.net.URI import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} import io.projectglow.common.{GlowLogging, WithUtils} def write(rdd: RDD[Array[Byte]], path: String) { val uri = new URI(path) uploaders.find(_.canUpload(rdd.sparkContext.hadoopConfiguration, path)) match { case Some(uploader) => uploader.upload(rdd, path) case None => logger.info(s"Could not find a parallel uploader for $path, uploading from the driver") writeFileFromDriver(new Path(uri), rdd) } } private def writeFileFromDriver(path: Path, byteRdd: RDD[Array[Byte]]): Unit = { val sc = byteRdd.sparkContext val fs = path.getFileSystem(sc.hadoopConfiguration) WithUtils.withCloseable(fs.create(path)) { stream => WithUtils.withCachedRDD(byteRdd) { cachedRdd => cachedRdd.count() cachedRdd.toLocalIterator.foreach { chunk => stream.write(chunk) } } } } }
Example 99
Source File: BigBgenDatasource.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.bgen import java.io.ByteArrayOutputStream import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SQLUtils} import org.apache.spark.sql.sources.DataSourceRegister import io.projectglow.common.logging.{HlsEventRecorder, HlsTagValues} import io.projectglow.sql.BigFileDatasource import io.projectglow.sql.util.ComDatabricksDataSource class BigBgenDatasource extends BigFileDatasource with DataSourceRegister { override def shortName(): String = "bigbgen" override def serializeDataFrame( options: Map[String, String], data: DataFrame): RDD[Array[Byte]] = { BigBgenDatasource.serializeDataFrame(options, data) } } class ComDatabricksBigBgenDatasource extends BigBgenDatasource with ComDatabricksDataSource object BigBgenDatasource extends HlsEventRecorder { import io.projectglow.common.BgenOptions._ private def parseOptions(options: Map[String, String]): BigBgenOptions = { val bitsPerProb = options.getOrElse(BITS_PER_PROB_KEY, BITS_PER_PROB_DEFAULT_VALUE).toInt val maxPloidy = options.getOrElse(MAX_PLOIDY_KEY, MAX_PLOIDY_VALUE).toInt val defaultPloidy = options.getOrElse(DEFAULT_PLOIDY_KEY, DEFAULT_PLOIDY_VALUE).toInt val defaultPhasing = options.getOrElse(DEFAULT_PHASING_KEY, DEFAULT_PHASING_VALUE).toBoolean BigBgenOptions(bitsPerProb, maxPloidy, defaultPloidy, defaultPhasing) } private def logBgenWrite(parsedOptions: BigBgenOptions): Unit = { val logOptions = Map( BITS_PER_PROB_KEY -> parsedOptions.bitsPerProb, MAX_PLOIDY_KEY -> parsedOptions.maxPloidy, DEFAULT_PLOIDY_KEY -> parsedOptions.defaultPloidy, DEFAULT_PHASING_KEY -> parsedOptions.defaultPhasing ) recordHlsEvent(HlsTagValues.EVENT_BGEN_WRITE, logOptions) } def serializeDataFrame(options: Map[String, String], data: DataFrame): RDD[Array[Byte]] = { val parsedOptions = parseOptions(options) logBgenWrite(parsedOptions) val dSchema = data.schema val numVariants = data.count val rawRdd = data.queryExecution.toRdd val inputRdd = if (rawRdd.getNumPartitions == 0) { logger.warn("Writing BGEN header only as the input DataFrame has zero partitions.") SQLUtils.createEmptyRDD(data.sparkSession) } else { rawRdd } inputRdd.mapPartitionsWithIndex { case (idx, it) => val baos = new ByteArrayOutputStream() val writeHeader = idx == 0 val writer = new BgenRecordWriter( baos, dSchema, writeHeader, numVariants, parsedOptions.bitsPerProb, parsedOptions.maxPloidy, parsedOptions.defaultPloidy, parsedOptions.defaultPhasing ) it.foreach { row => writer.write(row) } writer.close() Iterator(baos.toByteArray) } } } case class BigBgenOptions( bitsPerProb: Int, maxPloidy: Int, defaultPloidy: Int, defaultPhasing: Boolean)
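Since the class registers the short name bigbgen, a DataFrame in a BGEN-compatible schema can be written through the usual DataFrameWriter API; a hedged sketch with an illustrative path (Append mode is rejected by BigFileDatasource, as the suite further down shows).

import org.apache.spark.sql.{DataFrame, SaveMode}

// df: a DataFrame whose schema matches what BgenRecordWriter expects
def writeBigBgen(df: DataFrame, path: String): Unit =
  df.write
    .mode(SaveMode.Overwrite)
    .format("bigbgen")
    .save(path)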
Example 100
Source File: TextPiperSuite.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.transformers.pipe import scala.collection.JavaConverters._ import org.apache.spark.SparkException import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{StringType, StructField, StructType} import io.projectglow.Glow import io.projectglow.sql.GlowBaseTest class TextPiperSuite extends GlowBaseTest { override def afterEach(): Unit = { Glow.transform("pipe_cleanup", spark.emptyDataFrame) super.afterEach() } def pipeText(df: DataFrame): DataFrame = { val options = Map("inputFormatter" -> "text", "outputFormatter" -> "text", "cmd" -> """["cat", "-"]""") new PipeTransformer().transform(df, options) } test("text input and output") { val sess = spark import sess.implicits._ val output = pipeText(Seq("hello", "world").toDF()) assert(output.count() == 2) assert(output.schema == StructType(Seq(StructField("text", StringType)))) assert(output.orderBy("text").as[String].collect.toSeq == Seq("hello", "world")) } test("text input requires one column") { val sess = spark import sess.implicits._ val df = Seq(Seq("hello", "world"), Seq("foo", "bar")).toDF() assertThrows[IllegalArgumentException](pipeText(df)) } test("text input requires string column") { val sess = spark import sess.implicits._ val df = Seq(Seq(5), Seq(6)).toDF() assertThrows[IllegalArgumentException](pipeText(df)) } test("does not break on null row") { val sess = spark import sess.implicits._ val df = Seq("hello", null, "hello").toDF() val output = pipeText(df) assert(output.count() == 2) assert(output.filter("text = 'hello'").count == 2) } test("command fails") { val sess = spark import sess.implicits._ val df = Seq("hello", "world").toDF() val options = Map( "inputFormatter" -> "text", "outputFormatter" -> "text", "cmd" -> """["bash", "-c", "exit 1"]""") val ex = intercept[SparkException] { new PipeTransformer().transform(df, options) } assert(ex.getMessage.contains("Subprocess exited with status 1")) // threads should still be cleaned up eventually { assert( !Thread .getAllStackTraces .asScala .keySet .exists(_.getName.startsWith(ProcessHelper.STDIN_WRITER_THREAD_PREFIX))) assert( !Thread .getAllStackTraces .asScala .keySet .exists(_.getName.startsWith(ProcessHelper.STDERR_READER_THREAD_PREFIX))) } } }
Example 101
Source File: GlowSuite.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow import org.apache.spark.sql.DataFrame import io.projectglow.sql.GlowBaseTest class GlowSuite extends GlowBaseTest { def checkTransform(df: DataFrame): Unit = { val sess = spark import sess.implicits._ assert(df.count() == 2) assert(df.as[String].collect.toSeq == Seq("camel", "snake")) } test("uses service provider") { val df = Glow.transform( "dummy_transformer", spark.emptyDataFrame, Map("camel_animal" -> "camel", "snake_animal" -> "snake")) checkTransform(df) } test("transformer names are converted to snake case") { val df = Glow.transform( "dummyTransformer", spark.emptyDataFrame, Map("camel_animal" -> "camel", "snake_animal" -> "snake")) checkTransform(df) } test("options are converted to snake case") { val df = Glow.transform( "dummyTransformer", spark.emptyDataFrame, Map("camelAnimal" -> "camel", "snake_animal" -> "snake")) checkTransform(df) } test("java map options") { val javaMap = new java.util.HashMap[String, String] javaMap.put("camelAnimal", "camel") javaMap.put("snake_animal", "snake") val df = Glow.transform("dummyTransformer", spark.emptyDataFrame, javaMap) checkTransform(df) } test("tuple options") { val df = Glow.transform( "dummyTransformer", spark.emptyDataFrame, ("camelAnimal", "camel"), ("snake_animal", "snake")) checkTransform(df) } test("accept non-string values") { intercept[IllegalArgumentException] { Glow.transform("dummyTransformer", spark.emptyDataFrame, Map("must_be_true" -> false)) } Glow.transform("dummyTransformer", spark.emptyDataFrame, Map("must_be_true" -> true)) } test("float arguments") { intercept[IllegalArgumentException] { Glow.transform("dummyTransformer", spark.emptyDataFrame, Map("pi" -> 15.48)) } Glow.transform("dummyTransformer", spark.emptyDataFrame, Map("pi" -> 3.14159)) Glow.transform("dummyTransformer", spark.emptyDataFrame, Map("pi" -> "3.14159")) } } class DummyTransformer extends DataFrameTransformer { override def name: String = "dummy_transformer" override def transform(df: DataFrame, options: Map[String, String]): DataFrame = { val animals = Seq(options.get("camel_animal"), options.get("snake_animal")).flatten if (!options.get("must_be_true").forall(_.toBoolean)) { throw new IllegalArgumentException("if provided, this arg must be true") } options.get("pi").foreach { pi => require(Math.abs(pi.toDouble - Math.PI) < Math.PI * 0.0001) } df.sparkSession.createDataFrame(animals.map(StringWrapper)).sort() } } case class StringWrapper(s: String)
Example 102
Source File: BigFileDatasourceSuite.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.sql import java.nio.file.{Files, Paths} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SaveMode} class BigFileDatasourceSuite extends GlowBaseTest { test("save mode: append") { val outFile = Files.createTempFile("tmp", ".tmp").toString val e = intercept[RuntimeException] { spark .emptyDataFrame .write .mode(SaveMode.Append) .format("io.projectglow.sql.DummyBigFileDatasource") .save(outFile) } assert( e.getMessage .contains("Append mode is not supported by io.projectglow.sql.DummyBigFileDatasource")) } test("save mode: overwrite") { val outDir = Files.createTempDirectory("tmp").toString spark .emptyDataFrame .write .mode(SaveMode.Overwrite) .format("io.projectglow.sql.DummyBigFileDatasource") .save(outDir) val filePath = Paths.get(outDir) assert(Files.isRegularFile(filePath)) val writtenBytes = Files.readAllBytes(filePath) assert(writtenBytes.toSeq == Seq(0, 1, 2).map(_.toByte)) } test("save mode: error if exists") { val outFile = Files.createTempFile("tmp", ".tmp").toString val e = intercept[RuntimeException] { spark .emptyDataFrame .write .mode(SaveMode.ErrorIfExists) .format("io.projectglow.sql.DummyBigFileDatasource") .save(outFile) } assert(e.getMessage.contains(s"Path $outFile already exists")) } test("save mode: ignore") { val outDir = Files.createTempDirectory("tmp").toString spark .emptyDataFrame .write .mode(SaveMode.Ignore) .format("io.projectglow.sql.DummyBigFileDatasource") .save(outDir) val dirPath = Paths.get(outDir) assert(Files.isDirectory(dirPath)) } } class DummyBigFileDatasource extends BigFileDatasource { override def serializeDataFrame( options: Map[String, String], data: DataFrame): RDD[Array[Byte]] = { data.sqlContext.sparkContext.parallelize(Seq(Array(0, 1, 2).map(_.toByte))) } }
Example 103
Source File: SparkOperationTestPimpers.scala From sparkplug with MIT License | 5 votes |
package springnz.sparkplug.testkit import com.typesafe.scalalogging.{ LazyLogging, Logger } import org.apache.spark.rdd.RDD import org.apache.spark.sql.{ DataFrame, SQLContext } import springnz.sparkplug.core.SparkOperation import springnz.sparkplug.util.Logging import scala.reflect.ClassTag object SparkOperationTestPimpers extends LazyLogging { private def persistTestResource[A: ClassTag](rdd: RDD[A], rddName: String, overwrite: Boolean = false)( implicit projectName: ProjectName): RDD[A] = { val path = RDDPersister.getPath(projectName.name, rddName) if (overwrite || (!overwrite && !path.exists)) { if (path.exists) { logger.info(s"deleting existing RDD at ${path.pathAsString}") path.delete() } RDDPersister.persistRDD(path.pathAsString, rdd) } else { // (!overwrite && path.exists) logger.info(s"Not persisting RDD that already exists at path [${path.pathAsString}]") rdd } } class RDDExtensions[A: ClassTag](operation: SparkOperation[RDD[A]]) { import RDDSamplers._ def saveTo(rddName: String, sampler: RDD[A] ⇒ RDD[A] = identitySampler)( implicit projectName: ProjectName): SparkOperation[RDD[A]] = operation.map { rdd ⇒ val sampled = sampler(rdd) persistTestResource(sampled, rddName, overwrite = false) sampled } def sourceFrom(rddName: String, sampler: RDD[A] ⇒ RDD[A] = identitySampler)( implicit projectName: ProjectName): SparkOperation[RDD[A]] = SparkOperation { ctx ⇒ val path = RDDPersister.getPath(projectName.name, rddName) if (path.exists) ctx.objectFile[A](path.pathAsString) else { val rdd = operation.run(ctx) val sampled = sampler(rdd) persistTestResource(sampled, rddName, overwrite = false) sampled } } } class DataFrameExtensions(operation: SparkOperation[DataFrame]) { import RDDSamplers._ def saveTo(rddName: String, overwrite: Boolean = false, sampler: RDD[String] ⇒ RDD[String] = identitySampler)( implicit projectName: ProjectName): SparkOperation[DataFrame] = operation.map { df ⇒ val rdd: RDD[String] = df.toJSON val sampled = sampler(rdd) persistTestResource(sampled, rddName, overwrite) val sqlContext = new SQLContext(sampled.sparkContext) sqlContext.read.json(sampled) } def sourceFrom(dataFrameName: String, overwrite: Boolean = false, sampler: RDD[String] ⇒ RDD[String] = rdd ⇒ rdd)( implicit projectName: ProjectName, log: Logger): SparkOperation[DataFrame] = SparkOperation { ctx ⇒ val path = RDDPersister.getPath(projectName.name, dataFrameName) val sampledRDD = if (path.exists) ctx.objectFile[String](path.pathAsString) else { val df = operation.run(ctx) val rdd: RDD[String] = df.toJSON val sampled = sampler(rdd) persistTestResource(sampled, dataFrameName, overwrite) sampled } val sqlContext = new SQLContext(ctx) sqlContext.read.json(sampledRDD) } } }
Example 104
Source File: QueryPeopleTest.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.dataset import com.github.dnvriend.TestSpec import org.apache.spark.sql.{ Column, DataFrame } class QueryPeopleTest extends TestSpec { it should "query using DSL" in withSparkSession { spark => import spark.implicits._ import org.apache.spark.sql.functions._ val people: DataFrame = spark.read.parquet(TestSpec.PeopleParquet).cache() // name, age people.select('name).limit(1).as[String].head() shouldBe "foo" people.select($"name").limit(1).as[String].head() shouldBe "foo" people.select("name").limit(1).as[String].head() shouldBe "foo" people.select('age).limit(1).as[Int].head() shouldBe 30 people.select($"age").limit(1).as[Int].head() shouldBe 30 people.select("age").limit(1).as[Int].head() shouldBe 30 // select a column from the Dataset val col1: Column = people("name") val col2: Column = people.col("name") val departments: DataFrame = Seq((1, "sales"), (2, "administration"), (3, "human resources")) .toDF("department_id", "department_name").cache() people .withColumn("department_id", lit(1)) .withColumn("age_plus_ten", people("age") + 10) .as[(String, Int, Int, Int)].limit(1).head() shouldBe ("foo", 30, 1, 40) people .withColumn("department_id", lit(1)) .withColumn("age_plus_ten", people("age") + 10) .as('people_dep_age) .join(departments, col("people_dep_age.department_id").equalTo(departments.col("department_id"))) .select($"people_dep_age.name", col("people_dep_age.age"), departments.col("department_name")) .as[(String, Int, String)].limit(1).head() shouldBe ("foo", 30, "sales") val peopleDepAge: DataFrame = people .withColumn("department_id", lit(1)) .withColumn("age_plus_ten", people("age") + 10) peopleDepAge .join(departments, peopleDepAge("department_id") === departments("department_id")) .select(peopleDepAge("name"), peopleDepAge("age"), departments("department_name")) .as[(String, Int, String)].limit(1).head() shouldBe ("foo", 30, "sales") peopleDepAge.filter($"age" > 30) .join(departments, peopleDepAge("department_id") === departments("department_id")) .agg(avg($"age"), max($"age")).limit(1) .as[(Double, Int)].head() shouldBe (45.0, 50) } }
Example 105
Source File: DataFrameWordCountTest.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.dataframe import com.github.dnvriend.TestSpec import org.apache.spark.sql.{ DataFrame, Dataset } class DataFrameWordCountTest extends TestSpec { it should "wordcount alice in wonderland" in withSparkSession { spark => import org.apache.spark.sql.functions._ import spark.implicits._ val lines: Dataset[String] = spark.read.text(TestSpec.AliceInWonderlandText).as[String] lines.count shouldBe 3599 // alice in wonderland contains 3599 lines val words: DataFrame = lines.flatMap((line: String) => line.split(" ")).map(_.trim).filter(_.nonEmpty).toDF("word") words.count() shouldBe 26467 // there are 26,467 words in the book, excluding spaces val wordCount: Dataset[(String, Long)] = words.groupBy('word).agg(count('word).as("count")).orderBy('count.desc).as[(String, Long)].cache wordCount.take(1).head shouldBe ("the", 1505) // the word 'the' is used 1505 times wordCount.filter(lower('word) === "alice").take(1).head shouldBe ("Alice", 221) wordCount.filter(lower('word) === "queen").take(1).head shouldBe ("Queen", 34) wordCount.filter(lower('word) === "rabbit").take(1).head shouldBe ("Rabbit", 29) wordCount.filter(lower('word) === "cheshire").take(1).head shouldBe ("Cheshire", 6) } }
Example 106
Source File: JdbcDatasourceTest.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.datasources import com.github.dnvriend.TestSpec import com.github.dnvriend.spark._ import com.github.dnvriend.spark.datasources.SparkImplicits._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.DataFrame object JdbcDatasourceTest { implicit val H2Options: Map[String, String] = Map( "url" -> "jdbc:h2:mem:test;INIT=runscript from 'src/test/resources/create.sql'\\;runscript from 'src/test/resources/init.sql'", "dbtable" -> "customer", "driver" -> "org.h2.Driver", "user" -> "root", "password" -> "root" ) implicit val PostgresOptions: Map[String, String] = Map( "url" -> "jdbc:postgresql://localhost:5432/docker?reWriteBatchedInserts=true", "driver" -> "org.postgresql.Driver", "user" -> "postgres", "password" -> "" ) } class JdbcDatasourceTest extends TestSpec { ignore should "join JDBC and parquet" in withSparkSession { spark => import spark.implicits._ // implicit val jdbcOptions = JdbcDatasourceTest.PostgresOptions implicit val jdbcOptions = JdbcDatasourceTest.H2Options val orders = spark.read.parquet(TestSpec.OrdersParquet).as[Order].cache() val customers = spark.read.jdbc("customer").cache() customers.count() shouldBe 7 val orderCustomer = orders .join(customers, orders("customer_id") === customers("customer_id")) .select(orders("order_id"), 'customer_name, 'customer_age) orderCustomer.as[(Int, String, Int)].collect() shouldBe Seq( (10308, "Ollie Olson", 34), (10309, "Craig Hahn", 21) ) orderCustomer.write.append.jdbc("order_customer") val order_cust: DataFrame = spark.read.jdbc("order_customer") order_cust.printSchema() order_cust.show() } // http://stackoverflow.com/questions/2901453/sql-standard-to-escape-column-names // // The SQL-99 standard specifies that double quote (") is used to delimit identifiers. // //Oracle, PostgreSQL, MySQL, MSSQL and SQlite all support " as the identifier delimiter // (though they don't all use " as the 'default' - // // for example, you have to be running MySQL in ANSI mode and SQL Server only supports it when QUOTED_IDENTIFIER is ON.) }
Example 107
Source File: PersonDataSourceTest.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.datasources import com.github.dnvriend.TestSpec import com.github.dnvriend.spark.datasources.SparkImplicits._ import org.apache.spark.sql.DataFrame class PersonDataSourceTest extends TestSpec { it should "read a simple person xml file using a custom data source" in withSparkSession { spark => import spark.implicits._ val result: DataFrame = spark.read .format("person") .load("src/test/resources/people.xml") result.as[(Long, String, Int)].collect shouldBe Seq( (1, "Jonathan Archer", 41), (2, "Reginald Barclay", 45), (3, "Julian Bashir", 28), (4, "Pavel Chekov", 52), (5, "Beverly Crusher", 32), (6, "Jadzia Dax", 21), (7, "Geordi La Forge", 35) ) } it should "read a simple person xml file using implicit conversion" in withSparkSession { spark => import spark.implicits._ val result: DataFrame = spark.read.person("src/test/resources/people.xml") result.as[(Long, String, Int)].collect shouldBe Seq( (1, "Jonathan Archer", 41), (2, "Reginald Barclay", 45), (3, "Julian Bashir", 28), (4, "Pavel Chekov", 52), (5, "Beverly Crusher", 32), (6, "Jadzia Dax", 21), (7, "Geordi La Forge", 35) ) } }
Example 108
Source File: StructuredIdentity.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.structuredstreaming.application import com.intel.hibench.common.streaming.metrics.KafkaReporter import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig import org.apache.spark.sql.Column import org.apache.spark.sql.DataFrame import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.Row import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions._ class StructuredIdentity() extends StructuredBenchBase { override def process(ds: DataFrame, config: SparkBenchConfig) = { // Get the singleton instance of SparkSession val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate() import spark.implicits._ val query = ds.writeStream .foreach(new ForeachWriter[Row] { var reporter: KafkaReporter = _ def open(partitionId: Long, version: Long): Boolean = { val reportTopic = config.reporterTopic val brokerList = config.brokerList reporter = new KafkaReporter(reportTopic, brokerList) true } def close(errorOrNull: Throwable): Unit = {} def process(record: Row): Unit = { val inTime = record(0).asInstanceOf[String].toLong val outTime = System.currentTimeMillis() reporter.report(inTime, outTime) } }) .start() query.awaitTermination() } }
Example 109
Source File: StructuredRepartition.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.structuredstreaming.application import com.intel.hibench.common.streaming.metrics.KafkaReporter import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig import org.apache.spark.sql.Column import org.apache.spark.sql.DataFrame import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.Row import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions._ class StructuredRepartition() extends StructuredBenchBase { override def process(ds: DataFrame, config: SparkBenchConfig) = { // Get the singleton instance of SparkSession val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate() import spark.implicits._ val results = ds.repartition(config.coreNumber) val query = results.writeStream .foreach(new ForeachWriter[Row] { var reporter: KafkaReporter = _ def open(partitionId: Long, version: Long): Boolean = { val reportTopic = config.reporterTopic val brokerList = config.brokerList reporter = new KafkaReporter(reportTopic, brokerList) true } def close(errorOrNull: Throwable): Unit = {} def process(record: Row): Unit = { val inTime = record(0).asInstanceOf[String].toLong val outTime = System.currentTimeMillis() reporter.report(inTime, outTime) } }) .start() query.awaitTermination() } }
Example 110
Source File: IntermediateYaml.scala From sope with Apache License 2.0 | 5 votes |
package com.sope.etl.yaml import com.sope.etl._ import com.sope.etl.transform.Transformer import com.sope.etl.transform.exception.YamlDataTransformException import com.sope.etl.transform.model.TransformModelWithoutSourceTarget import org.apache.spark.sql.DataFrame def getTransformedDFs(dataFrames: DataFrame*): Seq[(String, DataFrame)] = { val sources = model.sources.data if (sources.size != dataFrames.size) throw new YamlDataTransformException("Invalid Dataframes provided or incorrect yaml config") val sqlContext = dataFrames.headOption.getOrElse { throw new YamlDataTransformException("Empty Dataframe List") }.sqlContext performRegistrations(sqlContext) val sourceDFMap = sources.zip(dataFrames).map { case (source, df) => (source, { df.createOrReplaceTempView(source) df.alias(source) }) } new Transformer(getYamlFileName, sourceDFMap.toMap, model).transform } }
Example 111
Source File: BigQueryReader.scala From sope with Apache License 2.0 | 5 votes |
package com.sope.spark.utils.google import com.google.cloud.hadoop.io.bigquery.{BigQueryConfiguration, GsonBigQueryInputFormat} import com.google.gson.JsonObject import com.sope.utils.Logging import org.apache.hadoop.io.LongWritable import org.apache.spark.sql.{DataFrame, SQLContext} def load(): DataFrame = { import sqlContext.implicits._ // Load data from BigQuery. val tableData = sc.newAPIHadoopRDD( conf, classOf[GsonBigQueryInputFormat], classOf[LongWritable], classOf[JsonObject]) .map(_._2.toString) sqlContext.read.json(tableData.toDS) } }
Example 112
Source File: BigQueryWriter.scala From sope with Apache License 2.0 | 5 votes |
package com.sope.spark.utils.google import com.google.cloud.hadoop.io.bigquery.output.{BigQueryOutputConfiguration, BigQueryTableFieldSchema, BigQueryTableSchema, IndirectBigQueryOutputFormat} import com.google.cloud.hadoop.io.bigquery.{BigQueryConfiguration, BigQueryFileFormat} import com.sope.utils.Logging import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import scala.collection.JavaConversions._ def save(): Unit = { val projectId = hadoopConf.get("fs.gs.project.id") val bucket = hadoopConf.get("fs.gs.system.bucket") log.info("GCP Project ID: {}", projectId) log.info("GCP Bucket for temporary storage: {} ", bucket) val outputGcsPath = s"gs://$bucket/hadoop/tmp/bigquery/$targetBQTable" log.info("GCP Path for temporary storage: {} ", outputGcsPath) hadoopConf.set(BigQueryConfiguration.PROJECT_ID_KEY, projectId) hadoopConf.set(BigQueryConfiguration.GCS_BUCKET_KEY, bucket) hadoopConf.set("mapreduce.job.outputformat.class", classOf[IndirectBigQueryOutputFormat[_, _]].getName) if (overwriteTable) hadoopConf.set(BigQueryConfiguration.OUTPUT_TABLE_WRITE_DISPOSITION_KEY, "WRITE_TRUNCATE") BigQueryOutputConfiguration.configure( hadoopConf, targetBQTable, getBQSchema, outputGcsPath, BigQueryFileFormat.NEWLINE_DELIMITED_JSON, classOf[TextOutputFormat[_, _]]) val jsonDF = sourceDF.withColumn("json_data", to_json(struct(sourceColumns.map(col): _*))).select(JsonColumn) jsonDF.rdd .map(row => (null, row.getAs[String](0))) .saveAsNewAPIHadoopDataset(hadoopConf) } } object BigQueryWriter { private val JsonColumn = "json_data" }
Example 113
Source File: FunctionTest.scala From sope with Apache License 2.0 | 5 votes |
package com.sope import com.sope.model.{Class, Person, Student} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import com.sope.spark.sql._ import com.sope.TestContext.getSQlContext import org.apache.spark.sql.types.{StringType, IntegerType} import org.scalatest.{FlatSpec, Matchers} class FunctionTest extends FlatSpec with Matchers { private val sqlContext = getSQlContext import sqlContext.implicits._ private val testSData = Seq( Person("Sherlock", "Holmes", "baker street", "[email protected]", "999999"), Person("John", "Watson", "east street", "[email protected]", "55555") ).toDF private val studentDF = Seq( Student("A", "B", 1, 10), Student("B", "C", 2, 10), Student("C", "E", 4, 9), Student("E", "F", 5, 9), Student("F", "G", 6, 10), Student("G", "H", 7, 10), Student("H", "I", 9, 8), Student("H", "I", 9, 7) ).toDF private val classDF = Seq( Class(1, 10, "Tenth"), Class(2, 9, "Ninth"), Class(3, 8, "Eighth") ).toDF "Dataframe Function transformations" should "generate the transformations correctly" in { val nameUpperFunc = (df: DataFrame) => df.withColumn("first_name", upper(col("first_name"))) val nameConcatFunc = (df: DataFrame) => df.withColumn("name", concat(col("first_name"), col("last_name"))) val addressUpperFunc = (df: DataFrame) => df.withColumn("address", upper(col("address"))) val transformed = testSData.applyDFTransformations(Seq(nameUpperFunc, nameConcatFunc, addressUpperFunc)) transformed.show(false) transformed.schema.fields.map(_.name) should contain("name") } "Group by as list Function Transformation" should "generate the transformations correctly" in { val grouped = studentDF.groupByAsList(Seq("cls")) .withColumn("grouped_data", explode($"grouped_data")) .unstruct("grouped_data", keepStructColumn = false) grouped.show(false) grouped.filter("cls = 10").head.getAs[Long]("grouped_count") should be(4) } "Cast Transformation" should "generate the transformations correctly" in { val casted = studentDF.castColumns(IntegerType, StringType) casted.dtypes.count(_._2 == "StringType") should be(4) } "Update Keys Transformation" should "generate the transformations correctly" in { val updatedWithKey = studentDF .updateKeys(Seq("cls"), classDF.renameColumns(Map("cls" -> "class")), "class", "key") .dropColumns(Seq("last_name", "roll_no")) updatedWithKey.show(false) updatedWithKey.filter("first_name = 'A'").head.getAs[Long]("cls_key") should be(1) } }
Example 114
Source File: ConstructVector.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.loader.command import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType import com.actian.spark_vector.vector.VectorOps._ import com.actian.spark_vector.vector.VectorJDBC import com.actian.spark_vector.vector.VectorConnectionProperties import com.actian.spark_vector.vector.TableSchemaGenerator import com.actian.spark_vector.loader.options.{ UserOptions, VectorOptions } import com.actian.spark_vector.loader.parsers.Args import resource.managed object ConstructVector { val jdbc = new VectorJDBC(conn) jdbc.createTable(config.vector.targetTable, source.schema) } val mapping = getFieldMapping(source.schema, config.general.colsToLoad.getOrElse(Seq[String]()), conn, config.vector.targetTable) val df = checkSchemaDefaults(source, mapping, conn, config.vector.targetTable) df.rdd.loadVector(df.schema, conn, config.vector.targetTable, config.vector.preSQL, config.vector.postSQL, Option(mapping)) } private def checkSchemaDefaults(source: DataFrame, fieldMapping: Map[String, String], conn: VectorConnectionProperties, table: String): DataFrame = { val jdbc = new VectorJDBC(conn) val defaults = collection.mutable.Map(jdbc.columnDefaults(table).toSeq: _*) jdbc.columnMetadata(table).foreach(c => if(c.nullable) defaults.remove(c.name)) val sourceDefaults = defaults.map(f => (fieldMapping.find(_._2 == f._1).get._1 -> f._2)) source.na.fill(sourceDefaults.toMap) } private def getFieldMapping(sourceSchema: StructType, colsToLoad: Seq[String], conn: VectorConnectionProperties, table: String): Map[String, String] = { val jdbc = new VectorJDBC(conn) val tableSchema = jdbc.columnMetadata(table) require(colsToLoad.size == tableSchema.size || sourceSchema.size == tableSchema.size, "Number of source columns to load does not match number of target columns in table") val fieldMapping = if (!colsToLoad.isEmpty) { require(colsToLoad.size == tableSchema.size, "Number of columns to load does not match number of target columns in table") (for (i <- 0 until colsToLoad.size) yield (colsToLoad(i) -> tableSchema(i).name)).toMap } else { require(sourceSchema.size == tableSchema.size, "Number of source columns do not match number of target columns in table") (for (i <- 0 until sourceSchema.size) yield (sourceSchema(i).name -> tableSchema(i).name)).toMap } fieldMapping } }
Example 115
Source File: SparkSqlTable.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.sql

import java.util.concurrent.atomic.AtomicLong

import org.apache.spark.sql.DataFrame

sealed trait SparkSqlTable {
  def tableName: String
  def quotedName: String = sparkQuote(tableName)
  def close(): Unit
}

case class HiveTable(override val tableName: String) extends SparkSqlTable {
  override def close(): Unit = {}
}

class TempTable private (override val tableName: String, df: DataFrame) extends SparkSqlTable {
  private def register(): Unit = df.createOrReplaceTempView(tableName)
  override def close(): Unit = df.sqlContext.dropTempTable(tableName)
}

object TempTable {
  private val id = new AtomicLong(0L)

  def apply(tableNameBase: String, df: DataFrame): TempTable = {
    val tableName = s"${tableNameBase}_${id.incrementAndGet}"
    val tt = new TempTable(tableName, df)
    tt.register()
    tt
  }
}
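TempTable above is a thin wrapper over the standard temp-view lifecycle. A minimal sketch of the same register-use-drop pattern using only stock Spark APIs (the helper name and loan-pattern shape are illustrative, not part of the project):

import java.util.concurrent.atomic.AtomicLong

import org.apache.spark.sql.{DataFrame, SparkSession}

object TempViewSketch {
  private val id = new AtomicLong(0L)

  // Registers df under a unique view name, runs the query body, and always drops the view.
  def withTempView[A](spark: SparkSession, df: DataFrame, base: String)(body: String => A): A = {
    val name = s"${base}_${id.incrementAndGet}"
    df.createOrReplaceTempView(name)
    try body(name)
    finally spark.catalog.dropTempView(name)
  }
}

Typical use: withTempView(spark, df, "orders") { v => spark.sql(s"SELECT count(*) FROM $v").show() }.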
Example 116
Source File: DefaultSource.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.sql import org.apache.spark.sql.{ DataFrame, SQLContext, SaveMode } import org.apache.spark.sql.sources.{ BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider, SchemaRelationProvider } import org.apache.spark.sql.types.StructType import com.actian.spark_vector.util.Logging import com.actian.spark_vector.vector.VectorJDBC class DefaultSource extends DataSourceRegister with RelationProvider with SchemaRelationProvider with CreatableRelationProvider with Logging { override def shortName(): String = "vector" override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = VectorRelation(TableRef(parameters), sqlContext, parameters) override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = VectorRelation(TableRef(parameters), Some(schema), sqlContext, parameters) override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val tableRef = TableRef(parameters) val table = VectorRelation(tableRef, sqlContext, parameters) mode match { case SaveMode.Overwrite => table.insert(data, true) case SaveMode.ErrorIfExists => val isEmpty = VectorJDBC.withJDBC(tableRef.toConnectionProps) { _.isTableEmpty(tableRef.table) } if (isEmpty) { table.insert(data, false) } else { throw new UnsupportedOperationException("Writing to a non-empty Vector table is not allowed with mode ErrorIfExists.") } case SaveMode.Append => table.insert(data, false) case SaveMode.Ignore => val isEmpty = VectorJDBC.withJDBC(tableRef.toConnectionProps) { _.isTableEmpty(tableRef.table) } if (isEmpty) { table.insert(data, false) } } table } }
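Because DefaultSource registers the short name "vector" and implements CreatableRelationProvider, writes go through the ordinary DataFrameWriter path. A minimal usage sketch is below; the option key "table" is an assumption (the real keys are whatever TableRef(parameters) expects, which this listing does not show), and the data is a toy frame.

import org.apache.spark.sql.{SaveMode, SparkSession}

object VectorWriteSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("vector-write").getOrCreate()
  import spark.implicits._

  val df = Seq((1, "a"), (2, "b")).toDF("id", "name")

  // SaveMode.Append maps to table.insert(data, false) in the provider above.
  df.write
    .format("vector")                 // short name registered by DefaultSource
    .mode(SaveMode.Append)
    .option("table", "target_table")  // assumed option key; consult TableRef for the real ones
    .save()

  spark.stop()
}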
Example 117
Source File: package.scala From sparkpipe-core with Apache License 2.0 | 5 votes |
package software.uncharted.sparkpipe.ops.core.dataframe

import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.types.{StructType, StructField}

// NOTE: the declaration of the enclosing (package) object was dropped by the example
// extractor; only the `write` op survives, and the trailing brace at the end of this
// listing closes that elided object.

// Can't test because DataFrameWriter is currently marked final
// $COVERAGE-OFF$
def write(
  path: String,
  format: String = "parquet",
  options: Map[String, String] = Map[String, String]()
)(input: DataFrame): DataFrame = {
  if (path.length > 0) {
    input.write.format(format).options(options).save(path)
  } else {
    input.write.format(format).options(options).save()
  }
  input
}
// $COVERAGE-ON$
}
Example 118
Source File: Extractors.scala From streamliner-starter with Apache License 2.0 | 5 votes |
package com.memsql.streamliner.starter import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.sql.types._ import org.apache.spark.streaming.StreamingContext import com.memsql.spark.etl.api.{Extractor, PhaseConfig} import com.memsql.spark.etl.utils.PhaseLogger // This extract just returns a static range of 5 integers each batch interval class BasicExtractor extends Extractor { override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Option[DataFrame] = { logger.info("extracting a constant sequence DataFrame") val schema = StructType(StructField("number", IntegerType, false) :: Nil) val sampleData = List(1,2,3,4,5) val rowRDD = sqlContext.sparkContext.parallelize(sampleData).map(Row(_)) val df = sqlContext.createDataFrame(rowRDD, schema) Some(df) } }
Example 119
Source File: Transformers.scala From streamliner-starter with Apache License 2.0 | 5 votes |
package com.memsql.streamliner.starter import org.apache.spark.sql.{Row, DataFrame, SQLContext} import org.apache.spark.sql.types._ import com.memsql.spark.etl.api.{Transformer, PhaseConfig} import com.memsql.spark.etl.utils.PhaseLogger // A helper object to extract the first column of a schema object ExtractFirstStructField { def unapply(schema: StructType): Option[(String, DataType, Boolean, Metadata)] = schema.fields match { case Array(first: StructField, _*) => Some((first.name, first.dataType, first.nullable, first.metadata)) } } // This transformer expects an input DataFrame and returns it class BasicTransformer extends Transformer { def transform(sqlContext: SQLContext, df: DataFrame, config: PhaseConfig, logger: PhaseLogger): DataFrame = { logger.info("transforming the DataFrame") // check that the first column is of type IntegerType and return its name val column = df.schema match { case ExtractFirstStructField(name: String, dataType: IntegerType, _, _) => name case _ => throw new IllegalArgumentException("The first column of the input DataFrame should be IntegerType") } // filter the dataframe, returning only even numbers df.filter(s"$column % 2 = 0") } }
Example 120
Source File: SparkSqlRunner.scala From amaterasu with Apache License 2.0 | 5 votes |
package org.apache.amaterasu.executor.execution.actions.runners.spark.SparkSql

import java.io.File

import org.apache.amaterasu.common.execution.actions.Notifier
import org.apache.amaterasu.common.logging.Logging
import org.apache.amaterasu.common.runtime.Environment
import org.apache.commons.io.FilenameUtils
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession}

// NOTE: the `class SparkSqlRunner` declaration and its mutable members
// (env, jobId, actionName, notifier, sc, spark) were dropped by the example
// extractor; only `findFileType` and the companion object survive, so the first
// closing brace below ends the elided class body.

  def findFileType(folderName: File): Array[String] = {
    // get all the files from a directory
    val files: Array[File] = folderName.listFiles()
    val extensions: Array[String] = files.map(file => FilenameUtils.getExtension(file.toString))
    extensions
  }
}

object SparkSqlRunner {

  def apply(env: Environment,
            jobId: String,
            actionName: String,
            notifier: Notifier,
            sc: SparkContext): SparkSqlRunner = {

    val sparkSqlRunnerObj = new SparkSqlRunner

    sparkSqlRunnerObj.env = env
    sparkSqlRunnerObj.jobId = jobId
    sparkSqlRunnerObj.actionName = actionName
    sparkSqlRunnerObj.notifier = notifier
    sparkSqlRunnerObj.sc = sc
    sparkSqlRunnerObj.spark = SparkSession.builder().config(sc.getConf).enableHiveSupport().getOrCreate()
    sparkSqlRunnerObj
  }
}
Example 121
Source File: SparkConsoleEgress.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example import pipelines.streamlets.StreamletShape import pipelines.streamlets.avro._ import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic, StreamletQueryExecution } import pipelines.spark.sql.SQLImplicits._ import org.apache.spark.sql.streaming.Trigger import org.apache.spark.sql.functions._ import org.apache.spark.sql.DataFrame class SparkConsoleEgress extends SparkStreamlet { val in1 = AvroInlet[Data]("in1") val in2 = AvroInlet[Data]("in2") val shape = StreamletShape.withInlets(in1, in2) def asTimestamp = udf((t: Long) ⇒ new java.sql.Timestamp(t)) def elapsedTime = udf((t1: Long, t0: Long) ⇒ t1 - t0) override def createLogic() = new SparkStreamletLogic { override def buildStreamingQueries = { val stream1 = readStream(in1).withColumn("source", lit("spark")).withColumn("elapsed", elapsedTime($"t2", $"t1")) val stream2 = readStream(in2).withColumn("source", lit("akka")).withColumn("elapsed", elapsedTime($"t2", $"t1")) // commented-out process: simple stats to compute min/max/mean on a window // val dataCount = stream1.union(stream2).withColumn("ts", asTimestamp($"timestamp")) // val stats = dataCount // .withWatermark("ts", "1 second") // .groupBy(window($"ts", "5 minutes", "1 minute"), $"source") // //.agg(max($"elapsed"), min($"elapsed"), avg($"elapsed"), count($"source")) val quantiles: (String ⇒ Long ⇒ (DataFrame, Long) ⇒ Unit) = { name ⇒ period ⇒ (df, time) ⇒ df.cache() val count = df.count() val cps = count.toDouble / period val quans = df.stat.approxQuantile("elapsed", Array(0.1, 0.5, 0.9, 0.99), 0.01) println(s"$time, $name, $count, $cps, " + quans.mkString(", ")) } val period = 60 * 5 // seconds val q1 = stream1.writeStream.foreachBatch(quantiles("spark")(period)) .trigger(Trigger.ProcessingTime(s"$period seconds")) .option("checkpointLocation", context.checkpointDir("console-egress-q1")) .start() val q2 = stream2.writeStream.foreachBatch(quantiles("akka")(period)) .trigger(Trigger.ProcessingTime(s"$period seconds")) .option("checkpointLocation", context.checkpointDir("console-egress-q2")) .start() new Thread() { override def run(): Unit = { while (true) { val progress = q1.lastProgress if (progress != null) { println("***************** [PROGRESS] *********************") println(progress.toString()) println("**************************************************") } Thread.sleep(60 * 1000) } } } //.start // uncomment to enable the query progress StreamletQueryExecution(q1, q2) } } }
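The quantiles callback above leans on DataFrameStatFunctions.approxQuantile. A minimal batch-mode sketch of that call on its own, using a toy column (the data, column name and error tolerance are illustrative):

import org.apache.spark.sql.SparkSession

object QuantileSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("quantiles").getOrCreate()
  import spark.implicits._

  val df = (1 to 1000).map(_.toDouble).toDF("elapsed")

  // Approximate quantiles at the requested probabilities, within 1% relative error.
  val q = df.stat.approxQuantile("elapsed", Array(0.1, 0.5, 0.9, 0.99), 0.01)
  println(q.mkString(", "))

  spark.stop()
}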
Example 122
Source File: Checkers.scala From spark3D with Apache License 2.0 | 5 votes |
package com.astrolabsoftware.spark3d import org.apache.spark.sql.Row import org.apache.spark.sql.DataFrame import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.spark_partition_id object Checkers { def returnFracSize(part: Iterator[Row], numberOfElements: Long): Iterator[Double] = { // Number of elements in the partition val sizePartition = part.size // Use Double val frac : Double = sizePartition.toDouble / numberOfElements.toDouble * 100 // Return an Iterator Iterator(frac) } def returnSize(part: Iterator[Row]): Iterator[Double] = { // Return an Iterator Iterator(part.size) } def checkLoadBalancing(df: DataFrame, kind: String = "frac", numberOfElements: Long = -1L) : DataFrame = { // Need to import implicits to use toDF method val spark2 = SparkSession.getActiveSession.get import spark2.implicits._ // Total number of elements in the DF. val numberOfElementsPriv: Long = numberOfElements match { case -1 => { kind match { case "frac" => df.count() // If not kind="frac", we do not need to total number of rows. case _ => -1L } } case x if x > 0 => numberOfElements case _ => throw new AssertionError(""" Total number of elements in the DataFrame must be Long greater than 0! If you do not know it, set it to -1, and we will compute it for you. """) } // Output a DataFrame containing detail of the load balancing. val dfout = kind match { case "frac" => df.rdd.mapPartitions(part => returnFracSize(part, numberOfElementsPriv)).toDF("Load (%)") case "size" => df.rdd.mapPartitions(returnSize).toDF("Load (#Rows)") case _ => throw new AssertionError(""" Wrong value for `kind`! You must choose between - "frac": Output a DataFrame containing the size of each partition relative to the total size of the dataset (in percent). - "size": Output a DataFrame containing the size of each partition in terms of number of rows. """) } dfout.withColumn("partition_id", spark_partition_id()) } }
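checkLoadBalancing measures per-partition sizes with mapPartitions; the same quick inspection can be done with spark_partition_id alone. A minimal sketch on synthetic data (the partition count and row count are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.spark_partition_id

object LoadBalanceSketch extends App {
  val spark = SparkSession.builder().master("local[4]").appName("load-balance").getOrCreate()

  val df = spark.range(0, 1000000).toDF("id").repartition(8)

  // One row per partition with its row count, similar to kind = "size" above.
  df.groupBy(spark_partition_id().as("partition_id"))
    .count()
    .orderBy("partition_id")
    .show()

  spark.stop()
}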
Example 123
Source File: package.scala From spark-athena with Apache License 2.0 | 5 votes |
package io.github.tmheo.spark import java.util.Properties import com.amazonaws.athena.jdbc.shaded.com.amazonaws.regions.Regions import org.apache.spark.sql.{DataFrame, DataFrameReader} import scala.collection.JavaConverters._ package object athena { implicit class AthenaDataFrameReader(reader: DataFrameReader) { def athena(table: String): DataFrame = { reader.format("io.github.tmheo.spark.athena") .option(JDBCOptions.JDBC_TABLE_NAME, table) .load } def athena(table: String, region: String, s3StatingDir: String): DataFrame = { reader.format("io.github.tmheo.spark.athena") .option(JDBCOptions.JDBC_TABLE_NAME, table) .option("region", region) .option("s3_staging_dir", s3StatingDir) .load } def athena(table: String, s3StatingDir: String): DataFrame = { athena(table, Regions.getCurrentRegion.getName, s3StatingDir) } def athena(table: String, properties: Properties): DataFrame = { val options = properties.asScala options += (JDBCOptions.JDBC_TABLE_NAME -> table) reader.format("io.github.tmheo.spark.athena").options(options).load } } }
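With the implicit AthenaDataFrameReader above in scope, a read collapses to a single call. A minimal sketch follows; the database/table name, region and S3 staging directory are placeholders, and the package import is assumed to bring the implicit class into scope.

import org.apache.spark.sql.SparkSession
import io.github.tmheo.spark.athena._ // brings AthenaDataFrameReader into scope

object AthenaReadSketch extends App {
  val spark = SparkSession.builder().appName("athena-read").getOrCreate()

  // Placeholders: substitute a real Athena table, region and S3 staging directory.
  val df = spark.read.athena("my_db.my_table", "us-east-1", "s3://my-bucket/athena-staging/")
  df.printSchema()

  spark.stop()
}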
Example 124
Source File: RddToDataFrame.scala From spark-sframe with BSD 2-Clause "Simplified" License | 5 votes |
package org.apache.spark.turi import org.graphlab.create.GraphLabUtil import org.apache.spark.sql.{SQLContext, Row, DataFrame} import org.apache.spark.rdd.RDD import scala.collection.JavaConversions._ import org.apache.spark.sql.types._ import scala.collection.mutable.ListBuffer import scala.collection.mutable.ArrayBuffer import scala.collection.immutable.Map import java.util.HashMap import java.util.ArrayList import java.util.{Date,GregorianCalendar} import java.sql.Date object EvaluateRDD { def inferSchema(obj: Any): DataType = { if(obj.isInstanceOf[Int]) { IntegerType } else if(obj.isInstanceOf[String]) { StringType } else if(obj.isInstanceOf[Double]) { DoubleType } else if(obj.isInstanceOf[Long]) { LongType } else if(obj.isInstanceOf[Float]) { FloatType } else if(obj.isInstanceOf[Map[_,_]]) { MapType(inferSchema(obj.asInstanceOf[Map[_,_]].head._1),inferSchema(obj.asInstanceOf[Map[_,_]].head._2)) } else if(obj.isInstanceOf[java.util.HashMap[_,_]]) { MapType(inferSchema(obj.asInstanceOf[java.util.HashMap[_,_]].head._1),inferSchema(obj.asInstanceOf[java.util.HashMap[_,_]].head._2)) } else if(obj.isInstanceOf[Array[_]]) { ArrayType(inferSchema(obj.asInstanceOf[Array[_]](0))) } else if(obj.isInstanceOf[java.util.ArrayList[_]]) { ArrayType(inferSchema(obj.asInstanceOf[java.util.ArrayList[_]](0))) } else if(obj.isInstanceOf[java.util.GregorianCalendar]) { TimestampType } else if(obj.isInstanceOf[java.util.Date] || obj.isInstanceOf[java.sql.Date]) { DateType } else { StringType } } def toScala(obj: Any): Any = { if (obj.isInstanceOf[java.util.HashMap[_,_]]) { val jmap = obj.asInstanceOf[java.util.HashMap[_,_]] jmap.map { case (k,v) => toScala(k) -> toScala(v) }.toMap } else if(obj.isInstanceOf[java.util.ArrayList[_]]) { val buf = ArrayBuffer[Any]() val jArray = obj.asInstanceOf[java.util.ArrayList[_]] for(item <- jArray) { buf += toScala(item) } buf.toArray } else if(obj.isInstanceOf[java.util.GregorianCalendar]) { new java.sql.Timestamp(obj.asInstanceOf[java.util.GregorianCalendar].getTime().getTime()) } else { obj } } def toSparkDataFrame(sqlContext: SQLContext, rdd: RDD[java.util.HashMap[String,_]]): DataFrame = { val scalaRDD = rdd.map(l => toScala(l)) val rowRDD = scalaRDD.map(l => Row.fromSeq(l.asInstanceOf[Map[_,_]].values.toList)) var sample_data: java.util.HashMap[String,_] = rdd.take(1)(0) var schema_list: ListBuffer[StructField] = new ListBuffer[StructField]() for ((name,v) <- sample_data) { schema_list.append(StructField(name,inferSchema(v))) } sqlContext.createDataFrame(rowRDD,StructType(schema_list)) } }
Example 125
Source File: DataframeToDriverCsvFileWriter.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.csv import java.io.PrintWriter import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types._ import io.deepsense.sparkutils.readwritedataframe.ManagedResource object DataframeToDriverCsvFileWriter { def write( dataFrame: DataFrame, options: Map[String, String], dataSchema: StructType, pathWithoutScheme: String): Unit = { val data = dataFrame.rdd.collect() val params = new CSVOptions(options) ManagedResource( new LocalCsvOutputWriter(dataSchema, params, pathWithoutScheme) ) { writer => data.foreach(row => { writer.write(row.toSeq.map(_.asInstanceOf[String])) }) } } } class LocalCsvOutputWriter( dataSchema: StructType, params: CSVOptions, driverPath: String) { private val driverFileWriter = new PrintWriter(driverPath) private val FLUSH_BATCH_SIZE = 1024L private var records: Long = 0L private val csvWriter = new LineCsvWriter(params, dataSchema.fieldNames.toSeq) def write(row: Seq[String]): Unit = { csvWriter.writeRow(row, records == 0L && params.headerFlag) records += 1 if (records % FLUSH_BATCH_SIZE == 0) { flush() } } private def flush(): Unit = { val lines = csvWriter.flush() if (lines.nonEmpty) { driverFileWriter.write(lines) } } def close(): Unit = { flush() driverFileWriter.close() } }
Example 126
Source File: DataframeToRawCsvRDD.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.sparkutils.readwritedataframe import org.apache.commons.csv.QuoteMode import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame object DataframeToRawCsvRDD { val defaultCsvFormat = com.databricks.spark.csv.defaultCsvFormat def apply(dataFrame: DataFrame, parameters: Map[String, String] = Map()) (implicit sparkContext: SparkContext): RDD[String] = { val delimiter = parameters.getOrElse("delimiter", ",") val delimiterChar = if (delimiter.length == 1) { delimiter.charAt(0) } else { throw new Exception("Delimiter cannot be more than one character.") } val escape = parameters.getOrElse("escape", null) val escapeChar: Character = if (escape == null) { null } else if (escape.length == 1) { escape.charAt(0) } else { throw new Exception("Escape character cannot be more than one character.") } val quote = parameters.getOrElse("quote", "\"") val quoteChar: Character = if (quote == null) { null } else if (quote.length == 1) { quote.charAt(0) } else { throw new Exception("Quotation cannot be more than one character.") } val quoteModeString = parameters.getOrElse("quoteMode", "MINIMAL") val quoteMode: QuoteMode = if (quoteModeString == null) { null } else { QuoteMode.valueOf(quoteModeString.toUpperCase) } val nullValue = parameters.getOrElse("nullValue", "null") val csvFormat = defaultCsvFormat .withDelimiter(delimiterChar) .withQuote(quoteChar) .withEscape(escapeChar) .withQuoteMode(quoteMode) .withSkipHeaderRecord(false) .withNullString(nullValue) val generateHeader = parameters.getOrElse("header", "false").toBoolean val headerRdd = if (generateHeader) { sparkContext.parallelize(Seq( csvFormat.format(dataFrame.columns.map(_.asInstanceOf[AnyRef]): _*) )) } else { sparkContext.emptyRDD[String] } val rowsRdd = dataFrame.rdd.map(row => { csvFormat.format(row.toSeq.map(_.asInstanceOf[AnyRef]): _*) }) headerRdd union rowsRdd } }
Example 127
Source File: DataframeToDriverCsvFileWriter.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.csv import java.io.PrintWriter import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types._ import io.deepsense.sparkutils.readwritedataframe.{DataframeToRawCsvRDD, ManagedResource} object DataframeToDriverCsvFileWriter { def write( dataFrame: DataFrame, options: Map[String, String], dataSchema: StructType, pathWithoutScheme: String): Unit = { val rawCsvLines = DataframeToRawCsvRDD(dataFrame, options)(dataFrame.sqlContext.sparkContext) writeRddToDriverFile(pathWithoutScheme, rawCsvLines) } // TODO extract to commons from DriverFiles private def writeRddToDriverFile(driverPath: String, lines: RDD[String]): Unit = { val recordSeparator = System.getProperty("line.separator", "\n") ManagedResource(new PrintWriter(driverPath)) { writer => lines.collect().foreach(line => writer.write(line + recordSeparator)) } } }
Example 128
Source File: SerializableSparkModel.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.serialization import org.apache.spark.ml.Model import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType import io.deepsense.sparkutils.ML class SerializableSparkModel[M <: Model[M]](val sparkModel: M) extends ML.Model[SerializableSparkModel[M]] with MLWritable { override def copy(extra: ParamMap): SerializableSparkModel[M] = new SerializableSparkModel(sparkModel.copy(extra)) override def write: MLWriter = { sparkModel match { case w: MLWritable => w.write case _ => new DefaultMLWriter(this) } } override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset) override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema) override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae" } // This class may seem unused, but it is used reflectively by spark deserialization mechanism object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] { override def read: MLReader[SerializableSparkModel[_]] = { new DefaultMLReader[SerializableSparkModel[_]]() } }
Example 129
Source File: DistributionCalculator.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.dataframe.report.distribution import org.apache.spark.mllib.stat.MultivariateStatisticalSummary import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types._ import io.deepsense.deeplang.doperables.dataframe.report.distribution.continuous.ContinuousDistributionBuilderFactory import io.deepsense.deeplang.doperables.dataframe.report.distribution.discrete.DiscreteDistributionBuilderFactory import io.deepsense.deeplang.utils.aggregators.AggregatorBatch import io.deepsense.reportlib.model._ object DistributionCalculator { def distributionByColumn( sparkDataFrame: org.apache.spark.sql.DataFrame, multivarStats: MultivariateStatisticalSummary): Map[String, Distribution] = { val dataFrameEmpty = multivarStats.count == 0 if (dataFrameEmpty) { noDistributionBecauseOfNoData(sparkDataFrame.schema) } else { distributionForNonEmptyDataFrame(sparkDataFrame, multivarStats) } } private def noDistributionBecauseOfNoData(schema: StructType): Map[String, Distribution] = { for (columnName <- schema.fieldNames) yield { columnName -> NoDistribution( columnName, NoDistributionReasons.NoData ) } }.toMap private def distributionForNonEmptyDataFrame( sparkDataFrame: DataFrame, multivarStats: MultivariateStatisticalSummary): Map[String, Distribution] = { val schema = sparkDataFrame.schema val distributionBuilders = for { (structField, columnIndex) <- sparkDataFrame.schema.zipWithIndex } yield { DistributionType.forStructField(structField) match { case DistributionType.Discrete => DiscreteDistributionBuilderFactory.prepareBuilder(columnIndex, structField) case DistributionType.Continuous => ContinuousDistributionBuilderFactory.prepareBuilder( columnIndex, structField, multivarStats) case DistributionType.NotApplicable => NoDistributionBuilder( structField.name, NoDistributionReasons.NotApplicableForType(structField.dataType)) } } val results = { val aggregators = distributionBuilders.flatMap(_.allAggregators) AggregatorBatch.executeInBatch(sparkDataFrame.rdd, aggregators) } val distributions = distributionBuilders.map(_.build(results)) distributions.map(d => d.name -> d).toMap } }
Example 130
Source File: EstimatorModelWrapperFixtures.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.estimators import scala.language.reflectiveCalls import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml import org.apache.spark.ml.param.{ParamMap, Param => SparkParam} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{IntegerType, StructField, StructType} import io.deepsense.deeplang.ExecutionContext import io.deepsense.deeplang.doperables.report.Report import io.deepsense.deeplang.doperables.serialization.SerializableSparkModel import io.deepsense.deeplang.doperables.{SparkEstimatorWrapper, SparkModelWrapper} import io.deepsense.deeplang.params.wrappers.spark.SingleColumnCreatorParamWrapper import io.deepsense.deeplang.params.{Param, Params} import io.deepsense.sparkutils.ML object EstimatorModelWrapperFixtures { class SimpleSparkModel private[EstimatorModelWrapperFixtures]() extends ML.Model[SimpleSparkModel] { def this(x: String) = this() override val uid: String = "modelId" val predictionCol = new SparkParam[String](uid, "name", "description") def setPredictionCol(value: String): this.type = set(predictionCol, value) override def copy(extra: ParamMap): this.type = defaultCopy(extra) override def transformDF(dataset: DataFrame): DataFrame = { dataset.selectExpr("*", "1 as " + $(predictionCol)) } @DeveloperApi override def transformSchema(schema: StructType): StructType = ??? } class SimpleSparkEstimator extends ML.Estimator[SimpleSparkModel] { def this(x: String) = this() override val uid: String = "estimatorId" val predictionCol = new SparkParam[String](uid, "name", "description") override def fitDF(dataset: DataFrame): SimpleSparkModel = new SimpleSparkModel().setPredictionCol($(predictionCol)) override def copy(extra: ParamMap): ML.Estimator[SimpleSparkModel] = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { schema.add(StructField($(predictionCol), IntegerType, nullable = false)) } } trait HasPredictionColumn extends Params { val predictionColumn = new SingleColumnCreatorParamWrapper[ ml.param.Params { val predictionCol: SparkParam[String] }]( "prediction column", None, _.predictionCol) setDefault(predictionColumn, "abcdefg") def getPredictionColumn(): String = $(predictionColumn) def setPredictionColumn(value: String): this.type = set(predictionColumn, value) } class SimpleSparkModelWrapper extends SparkModelWrapper[SimpleSparkModel, SimpleSparkEstimator] with HasPredictionColumn { override val params: Array[Param[_]] = Array(predictionColumn) override def report: Report = ??? override protected def loadModel( ctx: ExecutionContext, path: String): SerializableSparkModel[SimpleSparkModel] = ??? } class SimpleSparkEstimatorWrapper extends SparkEstimatorWrapper[SimpleSparkModel, SimpleSparkEstimator, SimpleSparkModelWrapper] with HasPredictionColumn { override val params: Array[Param[_]] = Array(predictionColumn) override def report: Report = ??? } }
Example 131
Source File: CustomCodeEntryPoint.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.workflowexecutor.customcode import java.util.concurrent.TimeoutException import java.util.concurrent.atomic.AtomicReference import scala.annotation.tailrec import scala.concurrent.duration._ import scala.concurrent.{Await, Promise} import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.DataFrame import org.apache.spark.{SparkConf, SparkContext} import io.deepsense.commons.utils.Logging import io.deepsense.deeplang._ import io.deepsense.sparkutils.SparkSQLSession class CustomCodeEntryPoint( val sparkContext: SparkContext, val sparkSQLSession: SparkSQLSession, val dataFrameStorage: DataFrameStorage, val operationExecutionDispatcher: OperationExecutionDispatcher) extends Logging { import io.deepsense.workflowexecutor.customcode.CustomCodeEntryPoint._ def getSparkContext: JavaSparkContext = sparkContext def getSparkSQLSession: SparkSQLSession = sparkSQLSession def getNewSparkSQLSession: SparkSQLSession = sparkSQLSession.newSession() def getSparkConf: SparkConf = sparkContext.getConf private val codeExecutor: AtomicReference[Promise[CustomCodeExecutor]] = new AtomicReference(Promise()) private val pythonPort: AtomicReference[Promise[Int]] = new AtomicReference(Promise()) def getCodeExecutor(timeout: Duration): CustomCodeExecutor = getFromPromise(codeExecutor.get, timeout) def getPythonPort(timeout: Duration): Int = getFromPromise(pythonPort.get, timeout) def registerCodeExecutor(newCodeExecutor: CustomCodeExecutor): Unit = replacePromise(codeExecutor, newCodeExecutor) def registerCallbackServerPort(newPort: Int): Unit = replacePromise(pythonPort, newPort) def retrieveInputDataFrame(workflowId: String, nodeId: String, portNumber: Int): DataFrame = dataFrameStorage.getInputDataFrame(workflowId, nodeId, portNumber).get def retrieveOutputDataFrame(workflowId: String, nodeId: String, portNumber: Int): DataFrame = dataFrameStorage.getOutputDataFrame(workflowId, nodeId, portNumber).get def registerOutputDataFrame( workflowId: String, nodeId: String, portNumber: Int, dataFrame: DataFrame): Unit = dataFrameStorage.setOutputDataFrame(workflowId, nodeId, portNumber, dataFrame) def executionCompleted(workflowId: String, nodeId: String): Unit = operationExecutionDispatcher.executionEnded(workflowId, nodeId, Right(())) def executionFailed(workflowId: String, nodeId: String, error: String): Unit = operationExecutionDispatcher.executionEnded(workflowId, nodeId, Left(error)) } object CustomCodeEntryPoint { private case class PromiseReplacedException() extends Exception @tailrec private def getFromPromise[T](promise: => Promise[T], timeout: Duration): T = { try { Await.result(promise.future, timeout) } catch { case e: TimeoutException => throw e case e: PromiseReplacedException => getFromPromise(promise, timeout) } } private def replacePromise[T](promise: AtomicReference[Promise[T]], newValue: T): Unit = { val oldPromise = promise.getAndSet(Promise.successful(newValue)) try { oldPromise.failure(new PromiseReplacedException) } catch { // The oldPromise will have been completed always, except for the first time. // The illegal state is expected, but we have to complete the oldPromise, // since someone might be waiting on it. case e: IllegalStateException => () } } case class CustomCodeEntryPointConfig( pyExecutorSetupTimeout: Duration = 5.seconds) }
Example 132
Source File: LinearRegressionDataGen.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.datageneration.mlgenerator import org.apache.spark.mllib.util.LinearDataGenerator import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row, SparkSession} import com.ibm.sparktc.sparkbench.utils.{SaveModes, SparkBenchException} import com.ibm.sparktc.sparkbench.utils.GeneralFunctions.{getOrDefault, getOrThrow, time} import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults} import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} object LinearRegressionDataGen extends WorkloadDefaults { val name = "data-generation-lr" // Application parameters #1million points have 200M data size val numOfExamples: Int = 40000 val numOfFeatures: Int = 4 val eps: Double = 0.5 val intercepts: Double = 0.1 val numOfPartitions: Int = 10 val maxIteration: Int = 3 override def apply(m: Map[String, Any]) = new LinearRegressionDataGen( numRows = getOrThrow(m, "rows").asInstanceOf[Int], numCols = getOrThrow(m, "cols").asInstanceOf[Int], output = Some(getOrThrow(m, "output").asInstanceOf[String]), saveMode = getOrDefault[String](m, "save-mode", SaveModes.error), eps = getOrDefault[Double](m, "eps", eps), intercepts = getOrDefault[Double](m, "intercepts", intercepts), numPartitions = getOrDefault[Int](m, "partitions", numOfPartitions) ) } case class LinearRegressionDataGen ( numRows: Int, numCols: Int, input: Option[String] = None, output: Option[String], saveMode: String, eps: Double, intercepts: Double, numPartitions: Int ) extends Workload { override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = { val timestamp = System.currentTimeMillis() val (generateTime, data): (Long, RDD[LabeledPoint]) = time { LinearDataGenerator.generateLinearRDD( spark.sparkContext, numRows, numCols, eps, numPartitions, intercepts ) } import spark.implicits._ val (convertTime, dataDF) = time { data.toDF } val (saveTime, _) = time { val outputstr = output.get if(outputstr.endsWith(".csv")) throw SparkBenchException("LabeledPoints cannot be saved to CSV. Please try outputting to Parquet instead.") writeToDisk(output.get, saveMode, dataDF, spark) }//TODO you can't output this to CSV. Parquet is fine val timeResultSchema = StructType( List( StructField("name", StringType, nullable = false), StructField("timestamp", LongType, nullable = false), StructField("generate", LongType, nullable = true), StructField("convert", LongType, nullable = true), StructField("save", LongType, nullable = true), StructField("total_runtime", LongType, nullable = false) ) ) val total = generateTime + convertTime + saveTime val timeList = spark.sparkContext.parallelize(Seq(Row("kmeans", timestamp, generateTime, convertTime, saveTime, total))) spark.createDataFrame(timeList, timeResultSchema) } }
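spark-bench workloads are constructed from plain config maps, so the generator above can be driven directly from code. A minimal sketch is below; the output path is a placeholder and the key names mirror those read in LinearRegressionDataGen.apply. Note also that the timing row in the listing is labelled "kmeans", which looks like a carry-over from the k-means generator rather than a deliberate choice.

import org.apache.spark.sql.SparkSession
import com.ibm.sparktc.sparkbench.datageneration.mlgenerator.LinearRegressionDataGen

object LinearRegressionDataGenSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("lr-datagen").getOrCreate()

  // Keys match those read in LinearRegressionDataGen.apply; the output path is a placeholder.
  val conf = Map[String, Any](
    "rows" -> 100000,
    "cols" -> 10,
    "output" -> "/tmp/lr-data.parquet",
    "partitions" -> 8
  )

  val workload = LinearRegressionDataGen(conf)
  val timings = workload.doWorkload(None, spark) // returns the timing DataFrame built above
  timings.show(false)

  spark.stop()
}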
Example 133
Source File: KMeansDataGen.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.datageneration.mlgenerator import com.ibm.sparktc.sparkbench.workload.ml.KMeansWorkload import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults} import com.ibm.sparktc.sparkbench.utils.GeneralFunctions._ import com.ibm.sparktc.sparkbench.utils.SaveModes import org.apache.spark.mllib.util.KMeansDataGenerator import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.types._ object KMeansDataGen extends WorkloadDefaults { val name = "data-generation-kmeans" override def apply(m: Map[String, Any]) = new KMeansDataGen( numRows = getOrThrow(m, "rows").asInstanceOf[Int], numCols = getOrThrow(m, "cols").asInstanceOf[Int], output = Some(getOrThrow(m, "output").asInstanceOf[String]), saveMode = getOrDefault[String](m, "save-mode", SaveModes.error), k = getOrDefault[Int](m, "k", KMeansWorkload.numOfClusters), scaling = getOrDefault[Double](m, "scaling", KMeansWorkload.scaling), numPartitions = getOrDefault[Int](m, "partitions", KMeansWorkload.numOfPartitions) ) } case class KMeansDataGen( numRows: Int, numCols: Int, input: Option[String] = None, output: Option[String], saveMode: String, k: Int, scaling: Double, numPartitions: Int ) extends Workload { override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = { val timestamp = System.currentTimeMillis() val (generateTime, data): (Long, RDD[Array[Double]]) = time { KMeansDataGenerator.generateKMeansRDD( spark.sparkContext, numRows, k, numCols, scaling, numPartitions ) } val (convertTime, dataDF) = time { val schemaString = data.first().indices.map(i => "c" + i.toString).mkString(" ") val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, DoubleType, nullable = false)) val schema = StructType(fields) val rowRDD = data.map(arr => Row(arr:_*)) spark.createDataFrame(rowRDD, schema) } val (saveTime, _) = time { writeToDisk(output.get, saveMode, dataDF, spark) } val timeResultSchema = StructType( List( StructField("name", StringType, nullable = false), StructField("timestamp", LongType, nullable = false), StructField("generate", LongType, nullable = true), StructField("convert", LongType, nullable = true), StructField("save", LongType, nullable = true), StructField("total_runtime", LongType, nullable = false) ) ) val total = generateTime + convertTime + saveTime val timeList = spark.sparkContext.parallelize(Seq(Row("kmeans", timestamp, generateTime, convertTime, saveTime, total))) spark.createDataFrame(timeList, timeResultSchema) } }
Example 134
Source File: KMeansWorkloadTest.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.workload.ml import java.io.File import com.holdenkarau.spark.testing.Utils import com.ibm.sparktc.sparkbench.testfixtures.SparkSessionProvider import com.ibm.sparktc.sparkbench.utils.SaveModes import com.ibm.sparktc.sparkbench.utils.SparkFuncs.{load, writeToDisk} import org.apache.spark.mllib.util.KMeansDataGenerator import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} class KMeansWorkloadTest extends FlatSpec with Matchers with BeforeAndAfterEach { private val spark = SparkSessionProvider.spark private val fileName = s"/tmp/spark-bench-scalatest/kmeans-${java.util.UUID.randomUUID.toString}.csv" override def afterEach() { Utils.deleteRecursively(new File(fileName)) } def makeDataFrame(): DataFrame = { val data: RDD[Array[Double]] = KMeansDataGenerator.generateKMeansRDD( spark.sparkContext, 1, 1, 1, KMeansWorkload.scaling, KMeansWorkload.numOfPartitions ) val schemaString = data.first().indices.map(_.toString).mkString(" ") val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, DoubleType, nullable = false)) val schema = StructType(fields) val rowRDD = data.map(arr => Row(arr: _*)) spark.createDataFrame(rowRDD, schema) } "reconcileSchema" should "handle a StringType schema and turn it into a DoubleType Schema" in { val df2Disk = makeDataFrame() writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv")) val conf = Map("name" -> "kmeans", "input" -> fileName) val work = KMeansWorkload(conf) val df = load(spark, fileName) val ddf = work.reconcileSchema(df) ddf.schema.head.dataType shouldBe DoubleType } "The load function" should "parse the DataFrame it's given into an RDD[Vector]" in { val df = makeDataFrame() val conf = Map("name" -> "kmeans", "input" -> "") val work = KMeansWorkload(conf) val ddf = work.reconcileSchema(df) val (_, rdd) = work.loadToCache(ddf, spark) rdd.first() } it should "work even when we've pulled the data from disk" in { val df2Disk = makeDataFrame() writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv")) val conf = Map("name" -> "kmeans", "input" -> fileName) val work = KMeansWorkload(conf) val df = load(spark, fileName) val ddf = work.reconcileSchema(df) val (_, rdd) = work.loadToCache(ddf, spark) rdd.first() } "doWorkload" should "work" in { val df2Disk = makeDataFrame() writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv")) val conf = Map("name" -> "kmeans", "input" -> fileName) val work = KMeansWorkload(conf) val df = load(spark, fileName) val ddf = work.reconcileSchema(df) work.doWorkload(Some(ddf), spark) } }
Example 135
Source File: DataFrameExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml import java.io.File import scopt.OptionParser import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.util.Utils object DataFrameExample { case class Params(input: String = "data/mllib/sample_libsvm_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") .text(s"input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val spark = SparkSession .builder .appName(s"DataFrameExample with $params") .getOrCreate() // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") val df: DataFrame = spark.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") // Show statistical summary of labels. val labelSummary = df.describe("label") labelSummary.show() // Convert features column to an RDD of vectors. val features = df.select("features").rdd.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(Vectors.fromML(feat)), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") // Save the records in a parquet file. val tmpDir = Utils.createTempDir() val outputDir = new File(tmpDir, "dataframe").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) println(s"Schema from Parquet:") newDF.printSchema() spark.stop() } } // scalastyle:on println
Example 136
Source File: HashingTF.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
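A minimal end-to-end usage sketch for the transformer above, tokenizing a toy corpus and hashing the terms into a small feature space (the data and feature count are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

object HashingTFSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("hashing-tf").getOrCreate()
  import spark.implicits._

  val docs = Seq((0, "spark makes dataframes easy"), (1, "hashing tf maps terms to indices"))
    .toDF("id", "text")

  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
  val hashingTF = new HashingTF()
    .setInputCol("words")
    .setOutputCol("features")
    .setNumFeatures(1 << 10) // small feature space, just for the example

  val featurized = hashingTF.transform(tokenizer.transform(docs))
  featurized.select("id", "features").show(false)

  spark.stop()
}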
Example 137
Source File: SQLTransformer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.Transformer import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.types.StructType @Since("1.6.0") def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val tableName = Identifiable.randomUID(uid) dataset.createOrReplaceTempView(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val result = dataset.sparkSession.sql(realStatement) dataset.sparkSession.catalog.dropTempView(tableName) result } @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { val spark = SparkSession.builder().getOrCreate() val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty)) val dummyDF = spark.createDataFrame(dummyRDD, schema) val tableName = Identifiable.randomUID(uid) val realStatement = $(statement).replace(tableIdentifier, tableName) dummyDF.createOrReplaceTempView(tableName) val outputSchema = spark.sql(realStatement).schema spark.catalog.dropTempView(tableName) outputSchema } @Since("1.6.0") override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } @Since("1.6.0") object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { @Since("1.6.0") override def load(path: String): SQLTransformer = super.load(path) }
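A minimal usage sketch for the transformer above, showing how the __THIS__ placeholder is swapped for a temp view over the input (the toy columns are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.SQLTransformer

object SQLTransformerSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("sql-transformer").getOrCreate()
  import spark.implicits._

  val df = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")

  // __THIS__ is replaced with a temp view over the input dataset, as in transform() above.
  val sqlTrans = new SQLTransformer()
    .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

  sqlTrans.transform(df).show()

  spark.stop()
}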
Example 138
Source File: MultilayerPerceptronClassifierWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier} import org.apache.spark.ml.feature.{IndexToString, RFormula} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} private[r] class MultilayerPerceptronClassifierWrapper private ( val pipeline: PipelineModel ) extends MLWritable { import MultilayerPerceptronClassifierWrapper._ val mlpModel: MultilayerPerceptronClassificationModel = pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel] val weights: Array[Double] = mlpModel.weights.toArray val layers: Array[Int] = mlpModel.layers def transform(dataset: Dataset[_]): DataFrame = { pipeline.transform(dataset) .drop(mlpModel.getFeaturesCol) .drop(mlpModel.getLabelCol) .drop(PREDICTED_LABEL_INDEX_COL) } override def read: MLReader[MultilayerPerceptronClassifierWrapper] = new MultilayerPerceptronClassifierWrapperReader override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path) class MultilayerPerceptronClassifierWrapperReader extends MLReader[MultilayerPerceptronClassifierWrapper]{ override def load(path: String): MultilayerPerceptronClassifierWrapper = { implicit val format = DefaultFormats val pipelinePath = new Path(path, "pipeline").toString val pipeline = PipelineModel.load(pipelinePath) new MultilayerPerceptronClassifierWrapper(pipeline) } } class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper) extends MLWriter { override protected def saveImpl(path: String): Unit = { val rMetadataPath = new Path(path, "rMetadata").toString val pipelinePath = new Path(path, "pipeline").toString val rMetadata = "class" -> instance.getClass.getName val rMetadataJson: String = compact(render(rMetadata)) sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath) instance.pipeline.save(pipelinePath) } } }
Example 139
Source File: Transformer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// NOTE: the enclosing class declaration (the UnaryTransformer abstract class in Spark's
// Transformer.scala, which also defines createTransformFunc and outputDataType) was dropped
// by the example extractor; the members below belong to it.

  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val transformUDF = udf(this.createTransformFunc, outputDataType)
    dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
}
Example 140
Source File: VectorSlicerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{StructField, StructType} class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { val slicer = new VectorSlicer().setInputCol("feature") ParamsSuite.checkParams(slicer) assert(slicer.getIndices.length === 0) assert(slicer.getNames.length === 0) withClue("VectorSlicer should not have any features selected by default") { intercept[IllegalArgumentException] { slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true)))) } } } test("feature validity checks") { import VectorSlicer._ assert(validIndices(Array(0, 1, 8, 2))) assert(validIndices(Array.empty[Int])) assert(!validIndices(Array(-1))) assert(!validIndices(Array(1, 2, 1))) assert(validNames(Array("a", "b"))) assert(validNames(Array.empty[String])) assert(!validNames(Array("", "b"))) assert(!validNames(Array("a", "b", "a"))) } test("Test vector slicer") { val data = Array( Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3), Vectors.sparse(5, Seq()) ) // Expected after selecting indices 1, 4 val expected = Array( Vectors.sparse(2, Seq((0, 2.3))), Vectors.dense(2.3, 1.0), Vectors.dense(0.0, 0.0), Vectors.dense(-1.1, 3.3), Vectors.sparse(2, Seq()) ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName) val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]]) val resultAttrs = Array("f1", "f4").map(defaultAttr.withName) val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]]) val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) } val df = spark.createDataFrame(rdd, StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField()))) val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") def validateResults(df: DataFrame): Unit = { df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 === vec2) } val resultMetadata = AttributeGroup.fromStructField(df.schema("result")) val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected")) assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) => assert(a === b) } } vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array(1)).setNames(Array("f4")) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4")) validateResults(vectorSlicer.transform(df)) } test("read/write") { val t = new VectorSlicer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setIndices(Array(1, 3)) .setNames(Array("a", "d")) testDefaultReadWrite(t) } }
Example 141
Source File: BinarizerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ @transient var data: Array[Double] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4) } test("params") { ParamsSuite.checkParams(new Binarizer) } test("Binarize continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(defaultBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize continuous features with setter") { val threshold: Double = 0.2 val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(thresholdBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with setter") { val threshold: Double = 0.2 val defaultBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("read/write") { val t = new Binarizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setThreshold(0.1) testDefaultReadWrite(t) } }
Example 142
Source File: SQLBuilderTest.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst import scala.util.control.NonFatal import org.apache.spark.sql.{DataFrame, Dataset, QueryTest} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.hive.test.TestHiveSingleton abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton { protected def checkSQL(e: Expression, expectedSQL: String): Unit = { val actualSQL = e.sql try { assert(actualSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following expression: | |${e.prettyName} | |$cause """.stripMargin) } } protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = { val generatedSQL = try new SQLBuilder(plan).toSQL catch { case NonFatal(e) => fail( s"""Cannot convert the following logical query plan to SQL: | |${plan.treeString} """.stripMargin) } try { assert(generatedSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following logical query plan: | |${plan.treeString} | |$cause """.stripMargin) } checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan)) } protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = { checkSQL(df.queryExecution.analyzed, expectedSQL) } }
Example 143
Source File: JdbcRelationProvider.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} class JdbcRelationProvider extends CreatableRelationProvider with RelationProvider with DataSourceRegister { override def shortName(): String = "jdbc" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val partitionColumn = jdbcOptions.partitionColumn val lowerBound = jdbcOptions.lowerBound val upperBound = jdbcOptions.upperBound val numPartitions = jdbcOptions.numPartitions val partitionInfo = if (partitionColumn == null) { null } else { JDBCPartitioningInfo( partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt) } val parts = JDBCRelation.columnPartition(partitionInfo) JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val url = jdbcOptions.url val table = jdbcOptions.table val createTableOptions = jdbcOptions.createTableOptions val isTruncate = jdbcOptions.isTruncate val conn = JdbcUtils.createConnectionFactory(jdbcOptions)() try { val tableExists = JdbcUtils.tableExists(conn, url, table) if (tableExists) { mode match { case SaveMode.Overwrite => if (isTruncate && isCascadingTruncateTable(url) == Some(false)) { // In this case, we should truncate table and then load. truncateTable(conn, table) saveTable(df, url, table, jdbcOptions) } else { // Otherwise, do not truncate the table, instead drop and recreate it dropTable(conn, table) createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } case SaveMode.Append => saveTable(df, url, table, jdbcOptions) case SaveMode.ErrorIfExists => throw new AnalysisException( s"Table or view '$table' already exists. SaveMode: ErrorIfExists.") case SaveMode.Ignore => // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected // to not save the contents of the DataFrame and to not change the existing data. // Therefore, it is okay to do nothing here and then just return the relation below. } } else { createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } } finally { conn.close() } createRelation(sqlContext, parameters) } }
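From the user side, this provider is driven through DataFrameWriter. A minimal sketch follows; the JDBC URL, table and credentials are placeholders, and setting option("truncate", "true") with SaveMode.Overwrite selects the truncate-then-load branch shown above instead of drop-and-recreate.

import java.util.Properties

import org.apache.spark.sql.{SaveMode, SparkSession}

object JdbcWriteSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("jdbc-write").getOrCreate()
  import spark.implicits._

  val df = Seq((1, "a"), (2, "b")).toDF("id", "name")

  val props = new Properties()
  props.setProperty("user", "username")   // placeholder credentials
  props.setProperty("password", "secret")

  df.write
    .mode(SaveMode.Overwrite)
    .option("truncate", "true")           // truncate and reload instead of dropping the table
    .jdbc("jdbc:postgresql://localhost/testdb", "target_table", props) // placeholder URL/table

  spark.stop()
}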
Example 144
Source File: FrequentItems.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ object FrequentItems extends Logging { def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
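singlePassFreqItems is the implementation behind the public DataFrameStatFunctions API; a hedged usage sketch, assuming a DataFrame df with columns a and b:

// Returns a single-row DataFrame with columns `a_freqItems` and `b_freqItems`,
// each holding an array of items whose frequency is at least the given support.
val freq = df.stat.freqItems(Seq("a", "b"), 0.01)
freq.show(truncate = false)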
Example 145
Source File: FileStreamSink.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.datasources.{FileFormat, FileFormatWriter} object FileStreamSink { // The name of the subdirectory that is used to store metadata about which files are valid. val metadataDir = "_spark_metadata" } class FileStreamSink( sparkSession: SparkSession, path: String, fileFormat: FileFormat, partitionColumnNames: Seq[String], options: Map[String, String]) extends Sink with Logging { private val basePath = new Path(path) private val logPath = new Path(basePath, FileStreamSink.metadataDir) private val fileLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, logPath.toUri.toString) private val hadoopConf = sparkSession.sessionState.newHadoopConf() override def addBatch(batchId: Long, data: DataFrame): Unit = { if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) { logInfo(s"Skipping already committed batch $batchId") } else { val committer = FileCommitProtocol.instantiate( className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass, jobId = batchId.toString, outputPath = path, isAppend = false) committer match { case manifestCommitter: ManifestFileCommitProtocol => manifestCommitter.setupManifestOptions(fileLog, batchId) case _ => // Do nothing } // Get the actual partition columns as attributes after matching them by name with // the given columns names. val partitionColumns: Seq[Attribute] = partitionColumnNames.map { col => val nameEquality = data.sparkSession.sessionState.conf.resolver data.logicalPlan.output.find(f => nameEquality(f.name, col)).getOrElse { throw new RuntimeException(s"Partition column $col not found in schema ${data.schema}") } } FileFormatWriter.write( sparkSession = sparkSession, queryExecution = data.queryExecution, fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec(path, Map.empty), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = None, refreshFunction = _ => (), options = options) } } override def toString: String = s"FileSink[$path]" }
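This sink is what a file-based streaming write resolves to; a minimal, hedged sketch (streamingDF and the paths are placeholders):

// Writing a streaming DataFrame to Parquet files; Spark instantiates a FileStreamSink
// underneath and records committed batches under <path>/_spark_metadata.
val query = streamingDF.writeStream
  .format("parquet")
  .option("path", "/tmp/stream-output")
  .option("checkpointLocation", "/tmp/stream-checkpoint")
  .partitionBy("date")
  .start()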
Example 146
Source File: console.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
import org.apache.spark.sql.streaming.OutputMode

class ConsoleSink(options: Map[String, String]) extends Sink with Logging {
  // Number of rows to display, by default 20 rows
  private val numRowsToShow = options.get("numRows").map(_.toInt).getOrElse(20)

  // Truncate the displayed data if it is too long, by default it is true
  private val isTruncated = options.get("truncate").map(_.toBoolean).getOrElse(true)

  // Track the batch id
  private var lastBatchId = -1L

  override def addBatch(batchId: Long, data: DataFrame): Unit = synchronized {
    val batchIdStr = if (batchId <= lastBatchId) {
      s"Rerun batch: $batchId"
    } else {
      lastBatchId = batchId
      s"Batch: $batchId"
    }

    // scalastyle:off println
    println("-------------------------------------------")
    println(batchIdStr)
    println("-------------------------------------------")
    // scalastyle:on println

    data.sparkSession.createDataFrame(
      data.sparkSession.sparkContext.parallelize(data.collect()), data.schema)
      .show(numRowsToShow, isTruncated)
  }
}

class ConsoleSinkProvider extends StreamSinkProvider with DataSourceRegister {
  def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new ConsoleSink(parameters)
  }

  def shortName(): String = "console"
}
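The two options read in the constructor map directly to writeStream options; a hedged sketch (streamingDF is a placeholder):

val query = streamingDF.writeStream
  .format("console")        // resolved via ConsoleSinkProvider.shortName()
  .option("numRows", 50)    // overrides the default of 20
  .option("truncate", false)
  .start()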
Example 147
Source File: RowDataSourceStrategySuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.sql.DriverManager import java.util.Properties import org.scalatest.BeforeAndAfter import org.apache.spark.SparkFunSuite import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.sources._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.util.Utils class RowDataSourceStrategySuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext { import testImplicits._ val url = "jdbc:h2:mem:testdb0" val urlWithUserAndPass = "jdbc:h2:mem:testdb0;user=testUser;password=testPass" var conn: java.sql.Connection = null before { Utils.classForName("org.h2.Driver") // Extra properties that will be specified for our database. We need these to test // usage of parameters from OPTIONS clause in queries. val properties = new Properties() properties.setProperty("user", "testUser") properties.setProperty("password", "testPass") properties.setProperty("rowId", "false") conn = DriverManager.getConnection(url, properties) conn.prepareStatement("create schema test").executeUpdate() conn.prepareStatement("create table test.inttypes (a INT, b INT, c INT)").executeUpdate() conn.prepareStatement("insert into test.inttypes values (1, 2, 3)").executeUpdate() conn.commit() sql( s""" |CREATE TEMPORARY TABLE inttypes |USING org.apache.spark.sql.jdbc |OPTIONS (url '$url', dbtable 'TEST.INTTYPES', user 'testUser', password 'testPass') """.stripMargin.replaceAll("\n", " ")) } after { conn.close() } test("SPARK-17673: Exchange reuse respects differences in output schema") { val df = sql("SELECT * FROM inttypes") val df1 = df.groupBy("a").agg("b" -> "min") val df2 = df.groupBy("a").agg("c" -> "min") val res = df1.union(df2) assert(res.distinct().count() == 2) // would be 1 if the exchange was incorrectly reused } }
Example 148
Source File: TakeOrderedAndProjectSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.util.Random import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ class TakeOrderedAndProjectSuite extends SparkPlanTest with SharedSQLContext { private var rand: Random = _ private var seed: Long = 0 protected override def beforeAll(): Unit = { super.beforeAll() seed = System.currentTimeMillis() rand = new Random(seed) } private def generateRandomInputData(): DataFrame = { val schema = new StructType() .add("a", IntegerType, nullable = false) .add("b", IntegerType, nullable = false) val inputData = Seq.fill(10000)(Row(rand.nextInt(), rand.nextInt())) spark.createDataFrame(sparkContext.parallelize(Random.shuffle(inputData), 10), schema) } private def noOpFilter(plan: SparkPlan): SparkPlan = FilterExec(Literal(true), plan) val limit = 250 val sortOrder = 'a.desc :: 'b.desc :: Nil test("TakeOrderedAndProject.doExecute without project") { withClue(s"seed = $seed") { checkThatPlansAgree( generateRandomInputData(), input => noOpFilter(TakeOrderedAndProjectExec(limit, sortOrder, input.output, input)), input => GlobalLimitExec(limit, LocalLimitExec(limit, SortExec(sortOrder, true, input))), sortAnswers = false) } } test("TakeOrderedAndProject.doExecute with project") { withClue(s"seed = $seed") { checkThatPlansAgree( generateRandomInputData(), input => noOpFilter( TakeOrderedAndProjectExec(limit, sortOrder, Seq(input.output.last), input)), input => GlobalLimitExec(limit, LocalLimitExec(limit, ProjectExec(Seq(input.output.last), SortExec(sortOrder, true, input)))), sortAnswers = false) } } }
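For context, TakeOrderedAndProjectExec is the physical operator Spark selects for an ORDER BY followed by a LIMIT; a hedged sketch of a query shape that triggers it:

import org.apache.spark.sql.functions.col

val df = spark.range(0, 10000).toDF("a").withColumn("b", -col("a"))
// ORDER BY ... LIMIT is planned as TakeOrderedAndProject rather than a global sort plus limit.
val top = df.orderBy(col("a").desc, col("b").desc).limit(250)
top.explain()   // the physical plan should show TakeOrderedAndProject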
Example 149
Source File: XGBoost.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import eleflow.uberdata.models.UberXGBOOSTModel import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.scala.DMatrix import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{ArrayType, DoubleType, StructField, StructType} import scala.reflect.ClassTag class XGBoost[I](override val uid: String, val models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))])( implicit kt: ClassTag[I], ord: Ordering[I] = null) extends ForecastBaseModel[XGBoostSmallModel[I]] with HasInputCol with HasOutputCol with DefaultParamsWritable with HasFeaturesCol with HasNFutures with HasGroupByCol { def this( models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))] )(implicit kt: ClassTag[I], ord: Ordering[I] ) = this(Identifiable.randomUID("xgboost"), models) override def transform(dataSet: Dataset[_]): DataFrame = { val schema = dataSet.schema val predSchema = transformSchema(schema) val joined = models.join(dataSet.rdd.map{case (r: Row) => (r.getAs[I]($(groupByCol).get), r)}) val predictions = joined.map { case (id, ((bestModel, metrics), row)) => val features = row.getAs[Array[org.apache.spark.ml.linalg.Vector]]( IUberdataForecastUtil.FEATURES_COL_NAME ) val label = DataTransformer.toFloat(row.getAs($(featuresCol))) val labelPoint = features.map { vec => val array = vec.toArray.map(_.toFloat) LabeledPoint(label, null, array) } val matrix = new DMatrix(labelPoint.toIterator) val (ownFeaturesPrediction, forecast) = bestModel.boosterInstance .predict(matrix) .flatMap(_.map(_.toDouble)) .splitAt(features.length) Row( row.toSeq :+ Vectors .dense(forecast) :+ SupportedAlgorithm.XGBoostAlgorithm.toString :+ bestModel.params .map(f => f._1 -> f._2.toString) :+ Vectors.dense(ownFeaturesPrediction): _* ) } dataSet.sqlContext.createDataFrame(predictions, predSchema) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { schema.add(StructField($(outputCol), ArrayType(DoubleType))) } override def copy(extra: ParamMap): XGBoostSmallModel[I] = defaultCopy(extra) }
Example 150
Source File: TimeSeriesGenerator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasGroupByCol import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, StructType} import scala.reflect.ClassTag def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataSet: Dataset[_]): DataFrame = { val rdd = dataSet.rdd val sparkContext = dataSet.sqlContext.sparkContext val index = sparkContext.broadcast(dataSet.schema.fieldIndex($(timeCol).get)) val labelColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(groupByCol).get)) val featuresColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(featuresCol))) val grouped = rdd.map { case (row: Row) => val timeColRow = IUberdataForecastUtil.convertColumnToLong(row, index.value) convertColumnToDouble(timeColRow, featuresColIndex) }.groupBy { row => row.getAs[L](labelColIndex.value) }.map { case (key, values) => val toBeUsed = values.toArray.sortBy(row => row.getAs[Long](index.value)) (key, toBeUsed) } val toBeTrained = grouped.map { case (key, values) => org.apache.spark.sql.Row( key, Vectors.dense(values.map(_.getAs[Double](featuresColIndex.value))) ) } val trainSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(toBeTrained, trainSchema) } override def transformSchema(schema: StructType): StructType = { val labelIndex = schema.fieldIndex($(groupByCol).get) StructType( Seq( schema.fields(labelIndex), StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT) ) ) } override def copy(extra: ParamMap): TimeSeriesGenerator[L] = defaultCopy(extra) } object TimeSeriesGenerator extends DefaultParamsReadable[TimeSeriesGenerator[_]] { override def load(path: String): TimeSeriesGenerator[_] = super.load(path) }
Example 151
Source File: XGBoostBigModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.UberXGBoostModel import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.spark.XGBoostModel import ml.dmlc.xgboost4j.LabeledPoint import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.feature.{LabeledPoint => SparkLabeledPoint} import org.apache.spark.ml.param.shared.{HasIdCol, HasLabelCol} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, _} class XGBoostBigModel[I](val uid: String, val models: Seq[(ParamMap, XGBoostModel)]) extends ForecastBaseModel[XGBoostBigModel[I]] with HasLabelCol with HasIdCol { def setLabelcol(label: String): this.type = set(labelCol, label) def setIdcol(id: String): this.type = set(idCol, id) override def copy(extra: ParamMap): XGBoostBigModel[I] = new XGBoostBigModel[I](uid, models) override def transform(dataSet: Dataset[_]): DataFrame = { val prediction = predict(dataSet) val rows = dataSet.rdd .map { case (row: Row) => (DataTransformer.toFloat(row.getAs($(idCol))), row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME) ) } .join(prediction) .map { case (id, (features, predictValue)) => Row(id, features, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue) } dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema)) } protected def predict(dataSet: Dataset[_]) = { val features = dataSet.rdd.map { case (row: Row) => val features = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME) val id = row.getAs[I]($(idCol)) SparkLabeledPoint(DataTransformer.toFloat(id), features) }.cache val (_, model) = models.head UberXGBoostModel.labelPredict(features.map(_.features.toDense), booster = model) } @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(getPredictionSchema) protected def getPredictionSchema: Array[StructField] = { Array( StructField($(idCol), FloatType), StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT), StructField(IUberdataForecastUtil.ALGORITHM, StringType), StructField("prediction", FloatType) ) } }
Example 152
Source File: ArimaBestModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.TimeSeriesModel import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.StructType class ArimaBestModel[L, M <: TimeSeriesModel]( override val uid: String, val bestPrediction: RDD[(L, M)], val validationMetrics: RDD[(L, Seq[ModelParamEvaluation[L]])] ) extends Model[ArimaBestModel[L, M]] with TimeSeriesBestModelFinderParam[L] { //TODO avaliar necessidade override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) dataset.toDF() } override def transformSchema(schema: StructType): StructType = { schema } override def copy(extra: ParamMap): ArimaBestModel[L, M] = { val copied = new ArimaBestModel[L, M](uid, bestPrediction, validationMetrics) copyValues(copied, extra) } }
Example 153
Source File: MovingAverage.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.ml.param.{IntParam, ParamMap} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types._ def setOutputCol(value: String): this.type = set(outputCol, value) setDefault(windowSize -> 3) override def transform(dataSet: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataSet.schema) val sparkContext = dataSet.sqlContext.sparkContext val inputType = outputSchema($(inputCol)).dataType val inputTypeBr = sparkContext.broadcast(inputType) val dataSetRdd = dataSet.rdd val inputColName = sparkContext.broadcast($(inputCol)) val inputColIndex = dataSet.columns.indexOf($(inputCol)) val inputColIndexBr = sparkContext.broadcast(inputColIndex) val windowSizeBr = sparkContext.broadcast($(windowSize)) val maRdd = dataSetRdd.map { case (row: Row) => val (array, rawValue) = if (inputTypeBr.value.isInstanceOf[VectorUDT]) { val vector = row.getAs[org.apache.spark.ml.linalg.Vector](inputColName.value) (vector.toArray, Vectors.dense(vector.toArray.drop(windowSizeBr.value - 1))) } else { val iterable = row.getAs[Iterable[Double]](inputColName.value) (iterable.toArray, Vectors.dense(iterable.toArray.drop(windowSizeBr.value - 1))) } val (before, after) = row.toSeq.splitAt(inputColIndexBr.value) Row( (before :+ rawValue) ++ after.tail :+ MovingAverageCalc .simpleMovingAverageArray(array, windowSizeBr.value): _* ) } dataSet.sqlContext.createDataFrame(maRdd, outputSchema) } override def transformSchema(schema: StructType): StructType = { schema.add(StructField($(outputCol), ArrayType(DoubleType))) } override def copy(extra: ParamMap): MovingAverage[T] = defaultCopy(extra) } object MovingAverageCalc { private[ml] def simpleMovingAverageArray(values: Array[Double], period: Int): Array[Double] = { (for (i <- 1 to values.length) yield //TODO rollback this comment with the right size of features to make the meanaverage return // the features values for the first values of the calc if (i < period) 0d //values(i) else values.slice(i - period, i).sum / period).toArray.dropWhile(_ == 0d) } } object MovingAverage extends DefaultParamsReadable[MovingAverage[_]] { override def load(path: String): MovingAverage[_] = super.load(path) }
Example 154
Source File: VectorizeEncoder.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.core.data.DataTransformer import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, StructType} class VectorizeEncoder(override val uid: String) extends Transformer with HasIdCol with HasTimeCol with HasInputCols with HasLabelCol with HasGroupByCol with HasOutputCol with DefaultParamsWritable { def this() = this(Identifiable.randomUID("vectorizer")) def setIdCol(input: String) = set(idCol, input) def setLabelCol(input: String) = set(labelCol, input) def setGroupByCol(toGroupBy: String) = set(groupByCol, Some(toGroupBy)) def setInputCol(input: Array[String]) = set(inputCols, input) def setTimeCol(time: String) = set(timeCol, Some(time)) def setOutputCol(output: String) = set(outputCol, output) override def transform(dataSet: Dataset[_]): DataFrame = { val context = dataSet.sqlContext.sparkContext val input = context.broadcast($(inputCols)) val allColumnNames = dataSet.schema.map(_.name) val nonInputColumnIndexes = context.broadcast( allColumnNames.zipWithIndex.filter( f => !$(inputCols).contains(f._1) || f._1 == $(groupByCol).get || f._1 == $(idCol) || f._1 == $(timeCol).getOrElse(""))) val result = dataSet.rdd.map { case (row: Row) => val rowSeq = row.toSeq val nonInputColumns = nonInputColumnIndexes.value.map { case (_, index) => rowSeq(index) } val size = input.value.length val (values, indices) = input.value .filter(col => row.getAs(col) != null) .map { column => DataTransformer.toDouble(row.getAs(column)) } .zipWithIndex .filter(f => f._1 != 0d) .unzip Row( nonInputColumns :+ org.apache.spark.ml.linalg.Vectors .sparse(size, indices.toArray, values.toArray): _* ) } val newSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(result, newSchema) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType( schema.filter( col => !$(inputCols).contains(col.name) || col.name == $(groupByCol).getOrElse("") || col.name == $(idCol) || col.name == $(labelCol) || col.name == $(timeCol).getOrElse("") ) ).add(StructField($(outputCol), new VectorUDT)) }
Example 155
Source File: AllColumnsTimeSeriesGenerator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import scala.reflect.ClassTag def setOutputCol(value: String): this.type = set(outputCol, value) // override def transform(dataSet: DataFrame): DataFrame = { override def transform(dataSet: Dataset[_] ): DataFrame = { val rdd = dataSet.rdd val sparkContext = dataSet.sqlContext.sparkContext val labelColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(labelCol))) val keyValueDataSet = rdd.map { case (row: Row) => Row( row.getAs[T](labelColIndex.value), row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol)) ) } val trainSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(keyValueDataSet, trainSchema) } override def transformSchema(schema: StructType): StructType = { StructType( schema.filter(_.name == $(labelCol)).head +: Seq( StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT) ) ) } override def copy(extra: ParamMap): AllColumnsTimeSeriesGenerator[T, U] = defaultCopy(extra) } object AllColumnsTimeSeriesGenerator extends DefaultParamsReadable[AllColumnsTimeSeriesGenerator[_, _]] { override def load(path: String): AllColumnsTimeSeriesGenerator[_, _] = super.load(path) }
Example 156
Source File: HoltWintersEstimator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.TimeSeriesModel import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType import org.apache.spark.sql.Dataset class HoltWintersBestModel[T, M <: TimeSeriesModel]( override val uid: String, val bestPrediction: RDD[(T, M)], val validationMetrics: RDD[(T, ModelParamEvaluation[T])] ) extends Model[HoltWintersBestModel[T, M]] with TimeSeriesBestModelFinderParam[T] { //TODO look for this method usage to see if it can be removed override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) dataset.toDF() } override def transformSchema(schema: StructType): StructType = { schema } override def copy(extra: ParamMap): HoltWintersBestModel[T, M] = { val copied = new HoltWintersBestModel[T, M](uid, bestPrediction, validationMetrics) copyValues(copied, extra) } }
Example 157
Source File: XGBoostBigModelTimeSeries.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import java.sql.Timestamp import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.spark.XGBoostModel import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasTimeCol import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, _} class XGBoostBigModelTimeSeries[I](override val uid: String, override val models: Seq[(ParamMap, XGBoostModel)]) extends XGBoostBigModel[I](uid, models) with HasTimeCol{ def setTimecol(time: String): this.type = set(timeCol, Some(time)) override def transform(dataSet: Dataset[_]): DataFrame = { val prediction = predict(dataSet) val rows = dataSet.rdd .map { case (row: Row) => (DataTransformer.toFloat(row.getAs($(idCol))), (row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME), row.getAs[java.sql.Timestamp]($(timeCol).get))) } .join(prediction) .map { case (id, ((features, time), predictValue)) => Row(id, features, time, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue) } dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema)) } @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(Array( StructField($(idCol), FloatType), StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT), StructField($(timeCol).get, TimestampType), StructField(IUberdataForecastUtil.ALGORITHM, StringType), StructField("prediction", FloatType) ) ) }
Example 158
Source File: HoltWintersBestModelFinder.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.UberHoltWintersModel import org.apache.spark.ml.evaluation.TimeSeriesEvaluator import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasGroupByCol import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import scala.reflect.ClassTag class HoltWintersBestModelFinder[G]( override val uid: String )(implicit kt: ClassTag[G]) extends HoltWintersBestModelEvaluation[G, HoltWintersModel[G]] with DefaultParamsWritable with HasGroupByCol with TimeSeriesBestModelFinder { def setTimeSeriesEvaluator(eval: TimeSeriesEvaluator[G]): this.type = set(timeSeriesEvaluator, eval) def setEstimatorParamMaps(value: Array[ParamMap]): this.type = set(estimatorParamMaps, value) def setNFutures(value: Int): this.type = set(nFutures, value) override def setValidationCol(value: String): this.type = set(validationCol, value) def setLabelCol(label: String): this.type = set(labelCol, label) def setGroupByCol(groupBy: String): this.type = set(groupByCol, Some(groupBy)) def this()(implicit kt: ClassTag[G]) = this(Identifiable.randomUID("arima")) def modelEvaluation( idModels: RDD[(G, Row, Option[UberHoltWintersModel])] ): RDD[(G, (UberHoltWintersModel, ModelParamEvaluation[G]))] = { val eval = $(timeSeriesEvaluator) val broadcastEvaluator = idModels.context.broadcast(eval) idModels.filter(_._3.isDefined).map { case (id, row, models) => val evaluatedModels = models.map { model => holtWintersEvaluation(row, model, broadcastEvaluator, id) }.head log.warn(s"best model reach ${evaluatedModels._2.metricResult}") (id, evaluatedModels) } } override protected def train(dataSet: Dataset[_]): HoltWintersModel[G] = { val splitDs = split(dataSet, $(nFutures)) val idModels = splitDs.rdd.map(train) new HoltWintersModel[G](uid, modelEvaluation(idModels)) .setValidationCol($(validationCol)) .asInstanceOf[HoltWintersModel[G]] } def train(row: Row): (G, Row, Option[UberHoltWintersModel]) = { val id = row.getAs[G]($(groupByCol).get) val result = try { val dense = row.getAs[org.apache.spark.ml.linalg.DenseVector]($(featuresCol)) val ts:org.apache.spark.mllib.linalg.Vector = org.apache.spark.mllib.linalg.Vectors.dense(dense.toArray); Some( UberHoltWintersModel.fitModelWithBOBYQA(ts, $(nFutures)) ) } catch { case e: Exception => log.error( s"Got the following Exception ${e.getLocalizedMessage} in id $id" ) None } (id, row, result) } } object HoltWintersBestModelFinder extends DefaultParamsReadable[HoltWintersBestModelFinder[_]] { override def load(path: String): HoltWintersBestModelFinder[_] = super.load(path) }
Example 159
Source File: IUberdataForecastUtil.scala From uberdata with Apache License 2.0 | 5 votes |
package eleflow.uberdata import eleflow.uberdata.core.IUberdataContext import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions.lit object IUberdataForecastUtil { lazy val FEATURES_PREDICTION_COL_NAME = "featuresPrediction" lazy val FEATURES_COL_NAME = "features" lazy val ALGORITHM = "algorithm" lazy val PARAMS = "parameters" lazy val METRIC_COL_NAME = "metric" def convertColumnToLong(row: Row, columnIndex: Int): Row = { row.get(columnIndex) match { case s: java.sql.Timestamp => val (prior, after) = row.toSeq.splitAt(columnIndex) val result = (prior :+ s.getTime) ++ after.tail :+ s Row(result: _*) case d: Double => val (prior, after) = row.toSeq.splitAt(columnIndex) val result = (prior :+ d.toLong) ++ after.tail :+ d Row(result: _*) case i: Int => val (prior, after) = row.toSeq.splitAt(columnIndex) val result = (prior :+ i.toLong) ++ after.tail :+ i Row(result: _*) case s: Short => val (prior, after) = row.toSeq.splitAt(columnIndex) val result = (prior :+ s.toLong) ++ after.tail :+ s Row(result: _*) case _ => row } } def convertColumnToLongAddAtEnd(row: Row, columnIndex: Int): Row = { val result = row.get(columnIndex) match { case s: java.sql.Timestamp => val result = row.toSeq :+ s.getTime Row(result: _*) case d: Double => val result = row.toSeq :+ d.toLong Row(result: _*) case i: Int => val result = row.toSeq :+ i.toLong Row(result: _*) case s: Short => val result = row.toSeq :+ s.toLong Row(result: _*) case _ => row } result } def createIdColColumn(dataFrame : DataFrame, context : IUberdataContext) : DataFrame = { val arrId = dataFrame.rdd.zipWithIndex.map( x => x._1.toSeq :+ x._2 ).map( x => Row.fromSeq(x)) context.sqlContext.createDataFrame(arrId, dataFrame.withColumn("idCol", lit(1L : Long)).schema) } }
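A small, hedged illustration of convertColumnToLong: the timestamp at the given index is replaced by its epoch milliseconds and the original value is appended at the end of the row.

import java.sql.Timestamp
import org.apache.spark.sql.Row

val ts = Timestamp.valueOf("2017-01-01 00:00:00")
val row = Row(1, ts, "label")
// Column 1 holds the Timestamp; the converted long replaces it in place and the
// original Timestamp becomes the last field.
val converted = IUberdataForecastUtil.convertColumnToLong(row, 1)
// converted == Row(1, ts.getTime, "label", ts)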
Example 160
Source File: TnViewCreator.scala From TopNotch with Apache License 2.0 | 5 votes |
package com.bfm.topnotch.tnview

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SparkSession

/**
 * The class for combining multiple data sets into one that can be used as an input to the diff and assertion commands.
 * This one data set is a "view" of the many used to create it.
 * @param spark The SparkSession to use for creating views
 */
class TnViewCreator(spark: SparkSession) {

  /**
   * Create a view from multiple data sets using a SQL statement
   * @param inputs The inputs to create views from
   * @param params The HiveQL statement used to create the new view and the input tables' names in the statement
   * @return The new view in the form of a dataframe
   */
  def createView(inputs: Seq[DataFrame], params: TnViewParams): DataFrame = {
    // register the views as temporary tables accessible from sql queries
    inputs.zip(params.tableAliases).foreach { case (view, name) => view.createOrReplaceTempView(name) }
    spark.sql(params.query)
  }
}
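A hedged usage sketch. It assumes TnViewParams is a simple case class carrying the table aliases and the HiveQL query (which matches how it is used above); spark, ordersDF, and customersDF are placeholders.

// Assumed shape, inferred from usage in createView:
// case class TnViewParams(tableAliases: Seq[String], query: String)

val viewCreator = new TnViewCreator(spark)
val combined: DataFrame = viewCreator.createView(
  Seq(ordersDF, customersDF),
  TnViewParams(
    tableAliases = Seq("orders", "customers"),
    query = "SELECT o.id, c.name FROM orders o JOIN customers c ON o.customerId = c.id"))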
Example 161
Source File: TnTestHelper.scala From TopNotch with Apache License 2.0 | 5 votes |
package com.bfm.topnotch import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.scalatest.Matchers import scala.io.Source import org.json4s._ import org.json4s.native.JsonMethods._ /** * This class handles some of the TopNotch reusable test code */ object TnTestHelper extends Matchers { val INDEX_COL_NAME = "__INDEX_COL__" /** * Read a file from the resources/src/test/scala/com/bfm/topnotch folder * @param fileName The path to the file relative to the path resources/src/test/scala/com/bfm/topnotch * @return The contents of the file as one string */ def readResourceFileToJson[T](fileName: String, classType: Class[_]): JValue = { parse(Source.fromFile(classType.getResource(fileName).getFile).getLines().mkString("\n")) } /** * Attach an index to rows into a dataframe so we can track them throughout a series of operations * @param df The dataframe to index * @return A dataframe equal to df but with an index column */ def attachIdx(df: DataFrame): DataFrame = df.withColumn(INDEX_COL_NAME, monotonicallyIncreasingId()).cache /** * Get a number greater than or equal to num that is divisible by denomiator */ def numDivisibleBy(num: Int, denomiator: Int) = num / denomiator * denomiator /** * Grow a data frame to a desired size by duplicating rows. */ def growDataFrame(initDF: DataFrame, newSize: Int): DataFrame = { val initCount = initDF.count if (initCount < 1) throw new IllegalArgumentException("initDF's size must be greater than 0") List.fill((newSize / initCount + 1).toInt)(initDF).reduce(_.unionAll(_)).limit(newSize) } /** * Compares two dataframes and ensures that they have the same schema (ignore nullable) and the same values * @param actualDF The DF we want to check for correctness * @param correctDF The correct DF we use for comparison * @param onlySchema only compare the schemas of the dataframes */ def dfEquals(actualDF: DataFrame, correctDF: DataFrame, onlySchema: Boolean = false): Unit = { actualDF.schema.map(f => (f.name, f.dataType)).toSet shouldBe correctDF.schema.map(f => (f.name, f.dataType)).toSet if (!onlySchema) { actualDF.collect.map(_.toSeq.toSet).toSet shouldBe correctDF.collect.map(_.toSeq.toSet).toSet } } }
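A hedged sketch of how these helpers combine in a test (smallDF, actualDF, and expectedDF are placeholder DataFrames):

import com.bfm.topnotch.TnTestHelper._

// Tag rows with a stable index, grow the input to 1000 rows by duplication, and compare
// schema plus contents of two DataFrames while ignoring row order.
val indexed = attachIdx(smallDF)
val bigger = growDataFrame(smallDF, 1000)
dfEquals(actualDF, expectedDF)                      // full comparison
dfEquals(actualDF, expectedDF, onlySchema = true)   // schema-only comparison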
Example 162
Source File: TreeUtils.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute}
import org.apache.spark.sql.DataFrame

object TreeUtils {
  def setMetadata(
      data: DataFrame,
      featuresColName: String,
      featureArity: Array[Int]): DataFrame = {
    val featuresAttributes = featureArity.zipWithIndex.map { case (arity: Int, feature: Int) =>
      if (arity > 0) {
        NominalAttribute.defaultAttr.withIndex(feature).withNumValues(arity)
      } else {
        NumericAttribute.defaultAttr.withIndex(feature)
      }
    }
    val featuresMetadata = new AttributeGroup("features", featuresAttributes).toMetadata()
    data.select(data(featuresColName).as(featuresColName, featuresMetadata))
  }
}
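A hedged usage sketch: marking the first feature as categorical with 3 values and the second as continuous, so tree algorithms can read the split strategy from column metadata (assembledDF is a placeholder DataFrame with a "features" vector column):

// featureArity(i) > 0 => feature i is nominal with that many categories;
// 0 => feature i is numeric/continuous.
val withMeta = TreeUtils.setMetadata(
  data = assembledDF,
  featuresColName = "features",
  featureArity = Array(3, 0))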
Example 163
Source File: Word2Vec.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import scala.util.Random import org.apache.spark.ml import org.apache.spark.ml.{PipelineStage, Transformer} import org.apache.spark.ml.feature.Word2VecModel import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, split} import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object Word2Vec extends BenchmarkAlgorithm with TestFromTraining { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ val df = DataGenerator.generateDoc( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, vocabSize, docLength, "text" ) df.select(split(col("text"), " ").as("text")) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { new ml.feature.Word2Vec().setInputCol("text") } override def testAdditionalMethods( ctx: MLBenchContext, model: Transformer): Map[String, () => _] = { import ctx.params._ val rng = new Random(ctx.seed()) val word2vecModel = model.asInstanceOf[Word2VecModel] val testWord = Vectors.dense(Array.fill(word2vecModel.getVectorSize)(rng.nextGaussian())) Map("findSynonyms" -> (() => { word2vecModel.findSynonyms(testWord, numSynonymsToFind) })) } }
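Outside the benchmark harness, the same stage is driven through the regular ML API; a minimal, hedged sketch (the tiny corpus and parameter values are illustrative):

import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.sql.functions.{col, split}
import spark.implicits._

val docs = Seq("spark sql dataframe", "spark ml word2vec").toDF("text")
  .select(split(col("text"), " ").as("text"))

val model = new Word2Vec()
  .setInputCol("text")
  .setOutputCol("vector")
  .setVectorSize(16)
  .setMinCount(0)
  .fit(docs)

model.findSynonyms("spark", 2).show()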
Example 164
Source File: GaussianMixture.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.clustering import org.apache.spark.ml import org.apache.spark.ml.PipelineStage import org.apache.spark.sql.DataFrame import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} import com.databricks.spark.sql.perf.mllib.data.DataGenerator object GaussianMixture extends BenchmarkAlgorithm with TestFromTraining { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ DataGenerator.generateGaussianMixtureData(ctx.sqlContext, numCenters = k, numExamples = numExamples, seed = ctx.seed(), numPartitions = numPartitions, numFeatures = numFeatures) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.clustering.GaussianMixture() .setK(k) .setSeed(randomSeed.toLong) .setMaxIter(maxIter) .setTol(tol) } // TODO(?) add a scoring method here. }
Example 165
Source File: FPGrowth.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.fpm import org.apache.spark.ml import org.apache.spark.ml.{PipelineStage, Transformer} import org.apache.spark.ml.fpm.FPGrowthModel import org.apache.spark.sql.DataFrame import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object FPGrowth extends BenchmarkAlgorithm with TestFromTraining { def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ DataGenerator.generateItemSet( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numItems, itemSetSize) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { new ml.fpm.FPGrowth() .setItemsCol("items") } override def testAdditionalMethods( ctx: MLBenchContext, model: Transformer): Map[String, () => _] = { val fpModel = model.asInstanceOf[FPGrowthModel] Map("associationRules" -> (() => { fpModel.associationRules.count() })) } }
Example 166
Source File: MLLib.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib import scala.io.Source import scala.language.implicitConversions import org.slf4j.LoggerFactory import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.{SparkConf, SparkContext} import com.databricks.spark.sql.perf._ class MLLib(sqlContext: SQLContext) extends Benchmark(sqlContext) with Serializable { def this() = this(SQLContext.getOrCreate(SparkContext.getOrCreate())) } object MLLib { def run(yamlFile: String = null, yamlConfig: String = null): DataFrame = { logger.info("Starting run") val conf = getConf(yamlFile, yamlConfig) val sparkConf = new SparkConf().setAppName("MLlib QA").setMaster("local[2]") val sc = SparkContext.getOrCreate(sparkConf) sc.setLogLevel("INFO") val b = new com.databricks.spark.sql.perf.mllib.MLLib() val benchmarks = getBenchmarks(conf) println(s"${benchmarks.size} benchmarks identified:") val str = benchmarks.map(_.prettyPrint).mkString("\n") println(str) logger.info("Starting experiments") val e = b.runExperiment( executionsToRun = benchmarks, iterations = 1, // If you want to increase the number of iterations, add more seeds resultLocation = conf.output, forkThread = false) e.waitForFinish(conf.timeout.toSeconds.toInt) logger.info("Run finished") e.getCurrentResults() } }
Example 167
Source File: CarbonLoadParams.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.management import java.text.SimpleDateFormat import java.util import scala.collection.mutable import org.apache.hadoop.conf.Configuration import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.command.UpdateTableModel import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.carbondata.core.indexstore.PartitionSpec import org.apache.carbondata.core.statusmanager.SegmentStatus import org.apache.carbondata.core.util.CarbonProperties import org.apache.carbondata.events.OperationContext import org.apache.carbondata.processing.loading.model.CarbonLoadModel case class CarbonLoadParams( sparkSession: SparkSession, tableName: String, sizeInBytes: Long, isOverwriteTable: Boolean, carbonLoadModel: CarbonLoadModel, hadoopConf: Configuration, logicalPartitionRelation: LogicalRelation, dateFormat : SimpleDateFormat, timeStampFormat : SimpleDateFormat, optionsOriginal: Map[String, String], finalPartition : Map[String, Option[String]], currPartitions: util.List[PartitionSpec], partitionStatus : SegmentStatus, var dataFrame: Option[DataFrame], scanResultRDD : Option[RDD[InternalRow]], updateModel: Option[UpdateTableModel], operationContext: OperationContext) { }
Example 168
Source File: DoubleDataTypeTestCase.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.integration.spark.testsuite.primitiveTypes import java.util.Random import org.apache.spark.sql.test.util.QueryTest import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, SaveMode} import org.scalatest.BeforeAndAfterAll class DoubleDataTypeTestCase extends QueryTest with BeforeAndAfterAll { lazy val df: DataFrame = generateDataFrame private def generateDataFrame(): DataFrame = { val r = new Random() val rdd = sqlContext.sparkContext .parallelize(1 to 10, 2) .map { x => Row(x, "London" + (x % 2), x.toDouble / 13, x.toDouble / 11) } val schema = StructType( Seq( StructField("id", IntegerType, nullable = false), StructField("city", StringType, nullable = false), StructField("m1", DoubleType, nullable = false), StructField("m2", DoubleType, nullable = false) ) ) sqlContext.createDataFrame(rdd, schema) } override def beforeAll { sql("drop table if exists uniq_carbon") sql("drop table if exists uniq_hive") sql("drop table if exists doubleTypeCarbonTable") sql("drop table if exists doubleTypeHiveTable") df.write .format("carbondata") .option("tableName", "doubleTypeCarbonTable") .option("tempCSV", "false") .option("table_blocksize", "32") .mode(SaveMode.Overwrite) .save() df.write .mode(SaveMode.Overwrite) .saveAsTable("doubleTypeHiveTable") } test("detail query") { checkAnswer(sql("select * from doubleTypeCarbonTable order by id"), sql("select * from doubleTypeHiveTable order by id")) } test("duplicate values") { sql("create table uniq_carbon(name string, double_column double) STORED AS carbondata ") sql(s"load data inpath '$resourcesPath/uniq.csv' into table uniq_carbon") sql("create table uniq_hive(name string, double_column double) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','") sql(s"load data local inpath '$resourcesPath/uniqwithoutheader.csv' into table uniq_hive") checkAnswer(sql("select * from uniq_carbon where double_column>=11"), sql("select * from uniq_hive where double_column>=11")) } // test("agg query") { // checkAnswer(sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeCarbonTable group by city"), // sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeHiveTable group by city")) // // checkAnswer(sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeCarbonTable group by city"), // sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeHiveTable group by city")) // } override def afterAll { sql("drop table if exists uniq_carbon") sql("drop table if exists uniq_hive") sql("drop table if exists doubleTypeCarbonTable") sql("drop table if exists doubleTypeHiveTable") } }
Example 169
Source File: TestUpdateAndDeleteWithLargeData.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.iud import java.text.SimpleDateFormat import org.apache.spark.sql.test.util.QueryTest import org.apache.spark.sql.{DataFrame, Row, SaveMode} import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.util.CarbonProperties class TestUpdateAndDeleteWithLargeData extends QueryTest with BeforeAndAfterAll { var df: DataFrame = _ override def beforeAll { dropTable() buildTestData() } private def buildTestData(): Unit = { CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy-MM-dd") // Simulate data and write to table orders import sqlContext.implicits._ val sdf = new SimpleDateFormat("yyyy-MM-dd") df = sqlContext.sparkSession.sparkContext.parallelize(1 to 1500000) .map(value => (value, new java.sql.Date(sdf.parse("2015-07-" + (value % 10 + 10)).getTime), "china", "aaa" + value, "phone" + 555 * value, "ASD" + (60000 + value), 14999 + value, "ordersTable" + value)) .toDF("o_id", "o_date", "o_country", "o_name", "o_phonetype", "o_serialname", "o_salary", "o_comment") createTable() } private def createTable(): Unit = { df.write .format("carbondata") .option("tableName", "orders") .option("tempCSV", "true") .option("compress", "true") .mode(SaveMode.Overwrite) .save() } private def dropTable() = { sql("DROP TABLE IF EXISTS orders") } test("test the update and delete delete functionality for large data") { sql( """ update ORDERS set (o_comment) = ('yyy')""").show() checkAnswer(sql( """select o_comment from orders limit 2 """), Seq(Row("yyy"), Row("yyy"))) sql("delete from orders where exists (select 1 from orders)") checkAnswer(sql( """ SELECT count(*) FROM orders """), Row(0)) } }
Example 170
Source File: BloomCoarseGrainIndexTestUtil.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.index.bloom import java.io.{File, PrintWriter} import java.util.UUID import scala.util.Random import org.apache.spark.sql.test.util.QueryTest import org.apache.spark.sql.DataFrame object BloomCoarseGrainIndexTestUtil extends QueryTest { def createFile(fileName: String, line: Int = 10000, start: Int = 0): Unit = { if (!new File(fileName).exists()) { val write = new PrintWriter(new File(fileName)) for (i <- start until (start + line)) { write.println( s"$i,n$i,city_$i,${ Random.nextInt(80) }," + s"${ UUID.randomUUID().toString },${ UUID.randomUUID().toString }," + s"${ UUID.randomUUID().toString },${ UUID.randomUUID().toString }," + s"${ UUID.randomUUID().toString },${ UUID.randomUUID().toString }," + s"${ UUID.randomUUID().toString },${ UUID.randomUUID().toString }") } write.close() } } def deleteFile(fileName: String): Unit = { val file = new File(fileName) if (file.exists()) { file.delete() } } private def checkSqlHitIndex(sqlText: String, indexName: String, shouldHit: Boolean): DataFrame = { // we will not check whether the query will hit the index because index may be skipped // if the former index pruned all the blocklets sql(sqlText) } def checkBasicQuery(indexName: String, bloomDMSampleTable: String, normalTable: String, shouldHit: Boolean = true): Unit = { checkAnswer( checkSqlHitIndex(s"select * from $bloomDMSampleTable where id = 1", indexName, shouldHit), sql(s"select * from $normalTable where id = 1")) checkAnswer( checkSqlHitIndex(s"select * from $bloomDMSampleTable where id = 999", indexName, shouldHit), sql(s"select * from $normalTable where id = 999")) checkAnswer( checkSqlHitIndex(s"select * from $bloomDMSampleTable where city = 'city_1'", indexName, shouldHit), sql(s"select * from $normalTable where city = 'city_1'")) checkAnswer( checkSqlHitIndex(s"select * from $bloomDMSampleTable where city = 'city_999'", indexName, shouldHit), sql(s"select * from $normalTable where city = 'city_999'")) checkAnswer( sql(s"select min(id), max(id), min(name), max(name), min(city), max(city)" + s" from $bloomDMSampleTable"), sql(s"select min(id), max(id), min(name), max(name), min(city), max(city)" + s" from $normalTable")) } }
Example 171
Source File: CaseClassDataFrameAPIExample.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.examples import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} import org.apache.carbondata.examples.util.ExampleUtils case class People(name: String, occupation: String, id: Int) object CaseClassDataFrameAPIExample { def main(args: Array[String]) { val spark = ExampleUtils.createSparkSession("CaseClassDataFrameAPIExample") exampleBody(spark) spark.close() } def exampleBody(spark : SparkSession): Unit = { val people = List(People("sangeeta", "engineer", 1), People("pallavi", "consultant", 2)) val peopleRDD: RDD[People] = spark.sparkContext.parallelize(people) import spark.implicits._ val peopleDF: DataFrame = peopleRDD.toDF("name", "occupation", "id") // writing data to carbon table peopleDF.write .format("carbondata") .option("tableName", "caseclass_table") .option("compress", "true") .mode(SaveMode.Overwrite) .save() spark.sql("SELECT * FROM caseclass_table").show() spark.sql("DROP TABLE IF EXISTS caseclass_table") } }
Example 172
Source File: TestLikeQueryWithIndex.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.secondaryindex import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, DataFrame, Row} import org.apache.spark.sql.test.util.QueryTest import org.scalatest.BeforeAndAfterAll class TestLikeQueryWithIndex extends QueryTest with BeforeAndAfterAll { override def beforeAll { sql("drop table if exists TCarbon") sql("CREATE TABLE IF NOT EXISTS TCarbon(ID Int, country String, "+ "name String, phonetype String, serialname String) "+ "STORED AS carbondata" ) var csvFilePath = s"$resourcesPath/secindex/secondaryIndexLikeTest.csv" sql( s"LOAD DATA LOCAL INPATH '" + csvFilePath + "' INTO TABLE " + s"TCarbon " + s"OPTIONS('DELIMITER'= ',')" ) sql("create index insert_index on table TCarbon (name) AS 'carbondata'" ) } test("select secondary index like query Contains") { val df = sql("select * from TCarbon where name like '%aaa1%'") secondaryIndexTableCheck(df,_.equalsIgnoreCase("TCarbon")) checkAnswer( sql("select * from TCarbon where name like '%aaa1%'"), Seq(Row(1, "china", "aaa1", "phone197", "A234"), Row(9, "china", "aaa1", "phone756", "A455")) ) } test("select secondary index like query ends with") { val df = sql("select * from TCarbon where name like '%aaa1'") secondaryIndexTableCheck(df,_.equalsIgnoreCase("TCarbon")) checkAnswer( sql("select * from TCarbon where name like '%aaa1'"), Seq(Row(1, "china", "aaa1", "phone197", "A234"), Row(9, "china", "aaa1", "phone756", "A455")) ) } test("select secondary index like query starts with") { val df = sql("select * from TCarbon where name like 'aaa1%'") secondaryIndexTableCheck(df, Set("insert_index","TCarbon").contains(_)) checkAnswer( sql("select * from TCarbon where name like 'aaa1%'"), Seq(Row(1, "china", "aaa1", "phone197", "A234"), Row(9, "china", "aaa1", "phone756", "A455")) ) } def secondaryIndexTableCheck(dataFrame:DataFrame, tableNameMatchCondition :(String) => Boolean): Unit ={ dataFrame.queryExecution.sparkPlan.collect { case bcf: CarbonDatasourceHadoopRelation => if(!tableNameMatchCondition(bcf.carbonTable.getTableUniqueName)){ assert(true) } } } override def afterAll { sql("DROP INDEX if exists insert_index ON TCarbon") sql("drop table if exists TCarbon") } }
Example 173
Source File: ITSelectorSuite.scala From spark-infotheoretic-feature-selection with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.sql.{DataFrame, SQLContext} import org.junit.runner.RunWith import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.scalatest.junit.JUnitRunner import TestHelper._ test("Run ITFS on nci data (nPart = 10, nfeat = 10)") { val df = readCSVData(sqlContext, "test_nci9_s3.csv") val cols = df.columns val pad = 2 val allVectorsDense = true val model = getSelectorModel(sqlContext, df, cols.drop(1), cols.head, 10, 10, allVectorsDense, pad) assertResult("443, 755, 1369, 1699, 3483, 5641, 6290, 7674, 9399, 9576") { model.selectedFeatures.mkString(", ") } } }
Example 174
Source File: SavingStream.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert import com.kakao.mango.concurrent.{NamedExecutors, RichExecutorService} import com.kakao.mango.text.ThreadSafeDateFormat import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Time import org.apache.spark.streaming.dstream.DStream import java.util.concurrent.{Future => JFuture} import scala.reflect.runtime.universe.TypeTag object SavingStream { val yyyyMMdd = ThreadSafeDateFormat("yyyy-MM-dd") val hh = ThreadSafeDateFormat("HH") val mm = ThreadSafeDateFormat("mm") val m0 = (ms: Long) => mm(ms).charAt(0) + "0" } @transient var executor: RichExecutorService = _ def ex: RichExecutorService = { if (executor == null) { this.synchronized { if (executor == null) { executor = new RichExecutorService(es.get()) } } } executor } def saveAsPartitionedTable(table: String, path: String, format: String = "orc")(toPartition: Time => Seq[(String, String)]): Unit = { stream.foreachRDD { (rdd, time) => ex.submit { toDF(rdd).appendToExternalTablePartition(table, path, format, toPartition(time): _*) } } } def saveAsDailyPartitionedTable(table: String, path: String, dateColumn: String = "date", format: String = "orc"): Unit = { saveAsPartitionedTable(table, path, format) { time => val ms = time.milliseconds Seq(dateColumn -> yyyyMMdd(ms)) } } def saveAsHourlyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", format: String = "orc"): Unit = { saveAsPartitionedTable(table, path, format) { time => val ms = time.milliseconds Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms)) } } def saveAsTenMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = { saveAsPartitionedTable(table, path, format) { time => val ms = time.milliseconds Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> m0(ms)) } } def saveAsMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = { saveAsPartitionedTable(table, path, format) { time => val ms = time.milliseconds Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> mm(ms)) } } } class ProductStream[T <: Product : TypeTag](stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[T](stream) { override def toDF(rdd: RDD[T]) = ctx.createDataFrame(rdd) } class JsonStream(stream: DStream[String])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[String](stream) { override def toDF(rdd: RDD[String]) = ctx.read.json(rdd) } class MapStream[T](stream: DStream[Map[String, T]])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[Map[String, T]](stream) { import com.kakao.mango.json._ override def toDF(rdd: RDD[Map[String, T]]) = ctx.read.json(rdd.map(toJson)) } class RowStream(stream: DStream[Row])(implicit ctx: HiveContext, es: ExecutorSupplier, schema: StructType) extends SavingStream[Row](stream) { override def toDF(rdd: RDD[Row]): DataFrame = ctx.createDataFrame(rdd, schema) }
Example 175
Source File: DefaultSource.scala From memsql-spark-connector with Apache License 2.0 | 5 votes |
package com.memsql.spark import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.metrics.source.MetricsHandler import org.apache.spark.sql.sources.{ BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider } import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} object DefaultSource { val MEMSQL_SOURCE_NAME = "com.memsql.spark" val MEMSQL_SOURCE_NAME_SHORT = "memsql" val MEMSQL_GLOBAL_OPTION_PREFIX = "spark.datasource.memsql." } class DefaultSource extends RelationProvider with DataSourceRegister with CreatableRelationProvider with LazyLogging { override def shortName: String = DefaultSource.MEMSQL_SOURCE_NAME_SHORT private def includeGlobalParams(sqlContext: SQLContext, params: Map[String, String]): Map[String, String] = sqlContext.getAllConfs.foldLeft(params)({ case (params, (k, v)) if k.startsWith(DefaultSource.MEMSQL_GLOBAL_OPTION_PREFIX) => params + (k.stripPrefix(DefaultSource.MEMSQL_GLOBAL_OPTION_PREFIX) -> v) case (params, _) => params }) override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val params = CaseInsensitiveMap(includeGlobalParams(sqlContext, parameters)) val options = MemsqlOptions(params) if (options.disablePushdown) { SQLPushdownRule.ensureRemoved(sqlContext.sparkSession) MemsqlReaderNoPushdown(MemsqlOptions.getQuery(params), options, sqlContext) } else { SQLPushdownRule.ensureInjected(sqlContext.sparkSession) MemsqlReader(MemsqlOptions.getQuery(params), Nil, options, sqlContext) } } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val opts = CaseInsensitiveMap(includeGlobalParams(sqlContext, parameters)) val conf = MemsqlOptions(opts) val table = MemsqlOptions .getTable(opts) .getOrElse( throw new IllegalArgumentException( s"To write a dataframe to MemSQL you must specify a table name via the '${MemsqlOptions.TABLE_NAME}' parameter" ) ) JdbcHelpers.prepareTableForWrite(conf, table, mode, data.schema) val isReferenceTable = JdbcHelpers.isReferenceTable(conf, table) val partitionWriterFactory = if (conf.onDuplicateKeySQL.isEmpty) { new LoadDataWriterFactory(table, conf) } else { new BatchInsertWriterFactory(table, conf) } val schema = data.schema var totalRowCount = 0L data.foreachPartition(partition => { val writer = partitionWriterFactory.createDataWriter(schema, TaskContext.getPartitionId(), 0, isReferenceTable, mode) try { partition.foreach(record => { writer.write(record) totalRowCount += 1 }) writer.commit() MetricsHandler.setRecordsWritten(totalRowCount) } catch { case e: Exception => { writer.abort() throw e } } }) createRelation(sqlContext, parameters) } }
Example 176
Source File: ReferenceTableTest.scala From memsql-spark-connector with Apache License 2.0 | 5 votes |
package com.memsql.spark import com.github.mrpowers.spark.daria.sql.SparkSessionExt._ import org.apache.spark.sql.types.IntegerType import org.apache.spark.sql.{DataFrame, SaveMode} import scala.util.Try class ReferenceTableTest extends IntegrationSuiteBase { val childAggregatorHost = "localhost" val childAggregatorPort = "5508" val dbName = "testdb" val commonCollectionName = "test_table" val referenceCollectionName = "reference_table" override def beforeEach(): Unit = { super.beforeEach() // Set child aggregator as a dmlEndpoint spark.conf .set("spark.datasource.memsql.dmlEndpoints", s"${childAggregatorHost}:${childAggregatorPort}") } def writeToTable(tableName: String): Unit = { val df = spark.createDF( List(4, 5, 6), List(("id", IntegerType, true)) ) df.write .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT) .mode(SaveMode.Append) .save(s"${dbName}.${tableName}") } def readFromTable(tableName: String): DataFrame = { spark.read .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT) .load(s"${dbName}.${tableName}") } def writeAndReadFromTable(tableName: String): Unit = { writeToTable(tableName) val dataFrame = readFromTable(tableName) val sqlRows = dataFrame.collect(); assert(sqlRows.length == 3) } def dropTable(tableName: String): Unit = executeQuery(s"drop table if exists $dbName.$tableName") describe("Success during write operations") { it("to common table") { dropTable(commonCollectionName) executeQuery( s"create table if not exists $dbName.$commonCollectionName (id INT NOT NULL, PRIMARY KEY (id))") writeAndReadFromTable(commonCollectionName) } it("to reference table") { dropTable(referenceCollectionName) executeQuery( s"create reference table if not exists $dbName.$referenceCollectionName (id INT NOT NULL, PRIMARY KEY (id))") writeAndReadFromTable(referenceCollectionName) } } describe("Success during creating") { it("common table") { dropTable(commonCollectionName) writeAndReadFromTable(commonCollectionName) } } describe("Failure because of") { it("database name not specified") { spark.conf.set("spark.datasource.memsql.database", "") val df = spark.createDF( List(4, 5, 6), List(("id", IntegerType, true)) ) val result = Try { df.write .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT) .mode(SaveMode.Append) .save(s"${commonCollectionName}") } assert(SQLHelper.isSQLExceptionWithCode(result.failed.get, List(1046))) } } }
Example 177
Source File: DeltaLoad.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.algo

import com.adidas.analytics.algo.DeltaLoad._
import com.adidas.analytics.algo.core.Algorithm
import com.adidas.analytics.algo.shared.DateComponentDerivation
import com.adidas.analytics.config.DeltaLoadConfiguration.PartitionedDeltaLoadConfiguration
import com.adidas.analytics.util.DataFrameUtils._
import com.adidas.analytics.util._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.slf4j.{Logger, LoggerFactory}

// Note: the class header and most of its members (businessKey, technicalKey, rankingColumnName,
// targetPartitions, ...) were elided in the original listing. The constructor below is inferred
// from the companion object's apply method; the mixins are an assumption based on the imports.
class DeltaLoad protected (spark: SparkSession, dfs: DFSWrapper, configLocation: String)
  extends Algorithm with PartitionedDeltaLoadConfiguration with DateComponentDerivation {

  private def getUpsertRecords(deltaRecords: Dataset[Row], resultColumns: Seq[String]): Dataset[Row] = {
    // Create partition window - Partitioning by delta records logical key (i.e. technical key of active records)
    val partitionWindow = Window
      .partitionBy(businessKey.map(col): _*)
      .orderBy(technicalKey.map(component => col(component).desc): _*)

    // Ranking & projection
    val rankedDeltaRecords = deltaRecords
      .withColumn(rankingColumnName, row_number().over(partitionWindow))
      .filter(upsertRecordsModesFilterFunction)

    rankedDeltaRecords
      .filter(rankedDeltaRecords(rankingColumnName) === 1)
      .selectExpr(resultColumns: _*)
  }

  protected def withDatePartitions(spark: SparkSession, dfs: DFSWrapper, dataFrames: Vector[DataFrame]): Vector[DataFrame] = {
    logger.info("Adding partitioning information if needed")
    try {
      dataFrames.map { df =>
        if (df.columns.toSeq.intersect(targetPartitions) != targetPartitions) {
          df.transform(withDateComponents(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions))
        } else df
      }
    } catch {
      case e: Throwable =>
        logger.error("Cannot add partitioning information for data frames.", e)
        //TODO: Handle failure case properly
        throw new RuntimeException("Unable to transform data frames.", e)
    }
  }
}

object DeltaLoad {

  private val logger: Logger = LoggerFactory.getLogger(getClass)

  def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): DeltaLoad = {
    new DeltaLoad(spark, dfs, configLocation)
  }
}
Example 178
Source File: PartitionHelpers.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.algo.core

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}

trait PartitionHelpers {

  protected def getDistinctPartitions(outputDataFrame: DataFrame, targetPartitions: Seq[String]): Dataset[Row] = {
    val targetPartitionsColumns: Seq[Column] = targetPartitions.map(partitionString => col(partitionString))
    outputDataFrame.select(targetPartitionsColumns: _*).distinct
  }

  protected def getParameterValue(row: Row, partitionString: String): String =
    createParameterValue(row.get(row.fieldIndex(partitionString)))

  protected def createParameterValue(partitionRawValue: Any): String =
    partitionRawValue match {
      case value: java.lang.Short => value.toString
      case value: java.lang.Integer => value.toString
      case value: scala.Predef.String => "'" + value + "'"
      case null => throw new Exception("Partition Value is null. No support for null partitions!")
      case value => throw new Exception("Unsupported partition DataType: " + value.getClass)
    }
}
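A short sketch of how the trait above might be mixed in, using only the members shown; the consumer object and the partition column names are placeholders:

import com.adidas.analytics.algo.core.PartitionHelpers
import org.apache.spark.sql.DataFrame

// Hypothetical consumer: list the distinct (year, month) partitions of an output DataFrame.
object PartitionLister extends PartitionHelpers {
  def distinctYearMonth(output: DataFrame): Seq[(String, String)] =
    getDistinctPartitions(output, Seq("year", "month"))
      .collect()
      .map(row => (getParameterValue(row, "year"), getParameterValue(row, "month")))
}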
Example 179
Source File: DataFrameUtils.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.util import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, functions} import org.slf4j.{Logger, LoggerFactory} object DataFrameUtils { private val logger: Logger = LoggerFactory.getLogger(getClass) type FilterFunction = Row => Boolean type PartitionCriteria = Seq[(String, String)] def mapPartitionsToDirectories(partitionCriteria: PartitionCriteria): Seq[String] = { partitionCriteria.map { case (columnName, columnValue) => s"$columnName=$columnValue" } } def buildPartitionsCriteriaMatcherFunc(multiplePartitionsCriteria: Seq[PartitionCriteria], schema: StructType): FilterFunction = { val targetPartitions = multiplePartitionsCriteria.flatten.map(_._1).toSet val fieldNameToMatchFunctionMapping = schema.fields.filter { case StructField(name, _, _, _) => targetPartitions.contains(name) }.map { case StructField(name, _: ByteType, _, _) => name -> ((r: Row, value: String) => r.getAs[Byte](name) == value.toByte) case StructField(name, _: ShortType, _, _) => name -> ((r: Row, value: String) => r.getAs[Short](name) == value.toShort) case StructField(name, _: IntegerType, _, _) => name -> ((r: Row, value: String) => r.getAs[Int](name) == value.toInt) case StructField(name, _: LongType, _, _) => name -> ((r: Row, value: String) => r.getAs[Long](name) == value.toLong) case StructField(name, _: FloatType, _, _) => name -> ((r: Row, value: String) => r.getAs[Float](name) == value.toFloat) case StructField(name, _: DoubleType, _, _) => name -> ((r: Row, value: String) => r.getAs[Double](name) == value.toDouble) case StructField(name, _: BooleanType, _, _) => name -> ((r: Row, value: String) => r.getAs[Boolean](name) == value.toBoolean) case StructField(name, _: StringType, _, _) => name -> ((r: Row, value: String) => r.getAs[String](name) == value) }.toMap def convertPartitionCriteriaToFilterFunctions(partitionCriteria: PartitionCriteria): Seq[FilterFunction] = partitionCriteria.map { case (name, value) => (row: Row) => fieldNameToMatchFunctionMapping(name)(row, value) } def joinSinglePartitionFilterFunctionsWithAnd(partitionFilterFunctions: Seq[FilterFunction]): FilterFunction = partitionFilterFunctions .reduceOption((predicate1, predicate2) => (row: Row) => predicate1(row) && predicate2(row)) .getOrElse((_: Row) => false) multiplePartitionsCriteria .map(convertPartitionCriteriaToFilterFunctions) .map(joinSinglePartitionFilterFunctionsWithAnd) .reduceOption((predicate1, predicate2) => (row: Row) => predicate1(row) || predicate2(row)) .getOrElse((_: Row) => false) } implicit class DataFrameHelper(df: DataFrame) { def collectPartitions(targetPartitions: Seq[String]): Seq[PartitionCriteria] = { logger.info(s"Collecting unique partitions for partitions columns (${targetPartitions.mkString(", ")})") val partitions = df.selectExpr(targetPartitions: _*).distinct().collect() partitions.map { row => targetPartitions.map { columnName => Option(row.getAs[Any](columnName)) match { case Some(columnValue) => columnName -> columnValue.toString case None => throw new RuntimeException(s"Partition column '$columnName' contains null value") } } } } def addMissingColumns(targetSchema: StructType): DataFrame = { val dataFieldsSet = df.schema.fieldNames.toSet val selectColumns = targetSchema.fields.map { field => if (dataFieldsSet.contains(field.name)) { functions.col(field.name) } else { functions.lit(null).cast(field.dataType).as(field.name) } } df.select(selectColumns: _*) } def isEmpty: Boolean = df.head(1).isEmpty def nonEmpty: Boolean = df.head(1).nonEmpty } }
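A hedged sketch of the helpers above in use; the DataFrame `df` and the partition columns are placeholders:

import com.adidas.analytics.util.DataFrameUtils._

// Collect the distinct partition values of a DataFrame and turn them into directory names.
val criteria: Seq[PartitionCriteria] = df.collectPartitions(Seq("year", "month", "day"))
val directories: Seq[String] = criteria.flatMap(mapPartitionsToDirectories)
// e.g. Seq("year=2016", "month=01", "day=15", ...)

// Build a row-level filter that matches any of the collected partitions.
val matchesPartition: FilterFunction = buildPartitionsCriteriaMatcherFunc(criteria, df.schema)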
Example 180
Source File: InputReader.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.util

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

// Note: the base trait and the opening of the companion object (including its other factory
// methods) were elided in the original listing; the declarations below are inferred from the
// overrides and should be treated as assumptions.
sealed trait InputReader {
  protected val logger: Logger = LoggerFactory.getLogger(getClass)
  def read(sparkSession: SparkSession): DataFrame
}

object InputReader {

  def newTableLocationReader(table: String, format: DataFormat, options: Map[String, String] = Map.empty): TableLocationReader = {
    TableLocationReader(table, format, options)
  }

  case class TableReader(table: String, options: Map[String, String]) extends InputReader {
    override def read(sparkSession: SparkSession): DataFrame = {
      logger.info(s"Reading data from table $table")
      sparkSession.read.options(options).table(table)
    }
  }

  case class FileSystemReader(location: String, format: DataFormat, options: Map[String, String]) extends InputReader {
    override def read(sparkSession: SparkSession): DataFrame = {
      logger.info(s"Reading data from location $location")
      format.read(sparkSession.read.options(options), location)
    }
  }

  case class TableLocationReader(table: String, format: DataFormat, options: Map[String, String]) extends InputReader {
    override def read(sparkSession: SparkSession): DataFrame = {
      val location = HiveTableAttributeReader(table, sparkSession).getTableLocation
      logger.info(s"Reading data from location $location")
      format.read(sparkSession.read.options(options), location)
    }
  }
}
Example 181
Source File: TestUtils.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.utils

import org.apache.spark.sql.functions.{col, count, lit}
import org.apache.spark.sql.{DataFrame, Row}

object TestUtils {

  implicit class ExtendedDataFrame(df: DataFrame) {

    def hasDiff(anotherDf: DataFrame): Boolean = {
      def printDiff(incoming: Boolean)(row: Row): Unit = {
        if (incoming) print("+ ") else print("- ")
        println(row)
      }

      val groupedDf = df.groupBy(df.columns.map(col): _*).agg(count(lit(1))).collect().toSet
      val groupedAnotherDf = anotherDf.groupBy(anotherDf.columns.map(col): _*).agg(count(lit(1))).collect().toSet

      groupedDf.diff(groupedAnotherDf).foreach(printDiff(incoming = true))
      groupedAnotherDf.diff(groupedDf).foreach(printDiff(incoming = false))

      groupedDf.diff(groupedAnotherDf).nonEmpty || groupedAnotherDf.diff(groupedDf).nonEmpty
    }
  }
}
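For example, a test might assert frame equality with the implicit above (the two DataFrames are placeholders):

import com.adidas.utils.TestUtils._

// hasDiff prints the extra/missing rows and returns true when the two frames differ.
assert(!actualDf.hasDiff(expectedDf), "actual result differs from the expected result")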
Example 182
Source File: FileReader.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.utils import com.adidas.analytics.util.DataFormat import com.adidas.analytics.util.DataFormat.{DSVFormat, JSONFormat, ParquetFormat} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SparkSession} class FileReader(format: DataFormat, options: Map[String, String]) { def read(spark: SparkSession, location: String, fillNulls: Boolean = false): DataFrame = { val df = format.read(spark.read.options(options), location) if (fillNulls) { df.na.fill("") } else { df } } } object FileReader { def newDSVFileReader(optionalSchema: Option[StructType] = None, delimiter: Char = '|', header: Boolean = false): FileReader = { val options = Map("delimiter" -> delimiter.toString, "header" -> header.toString) if (optionalSchema.isEmpty) { new FileReader(DSVFormat(optionalSchema), options + ("inferSchema" -> "true")) } else { new FileReader(DSVFormat(optionalSchema), options) } } def newParquetFileReader(): FileReader = { new FileReader(ParquetFormat(), Map.empty[String, String]) } def newJsonFileReader(optionalSchema: Option[StructType] = None): FileReader = { new FileReader(JSONFormat(optionalSchema), Map.empty[String, String]) } def apply(format: DataFormat, options: (String, String)*): FileReader = { new FileReader(format, options.toMap) } def apply(format: DataFormat, options: Map[String, String]): FileReader = { new FileReader(format, options) } }
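A usage sketch for the factory methods above; `spark` is a SparkSession and the file locations are placeholders:

// Read a header-less, pipe-delimited file with an inferred schema,
// and a Parquet file with nulls left untouched.
val dsvDf = FileReader.newDSVFileReader().read(spark, "/data/input/events.dsv")
val parquetDf = FileReader.newParquetFileReader().read(spark, "/data/input/events.parquet", fillNulls = false)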
Example 183
Source File: AggregateImpressionLog.scala From spark-hyperloglog with MIT License | 5 votes |
package com.collective.analytics import com.collective.analytics.schema.{ActivityLog, SegmentLog, ImpressionLog} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.hyperloglog.functions import org.slf4j.LoggerFactory class AggregateImpressionLog(impressionLog: DataFrame) extends Serializable { private val log = LoggerFactory.getLogger(classOf[AggregateImpressionLog]) def segmentLog(): DataFrame = { log.info(s"Compute segment log") import org.apache.spark.sql.functions._ import functions._ impressionLog.select( col(ImpressionLog.ad_id), col(ImpressionLog.site_id), col(ImpressionLog.cookie_id), col(ImpressionLog.impressions), col(ImpressionLog.clicks), explode(col(ImpressionLog.segments)) as SegmentLog.segment ).groupBy( col(SegmentLog.segment) ).agg( hyperLogLog(ImpressionLog.cookie_id) as SegmentLog.cookies_hll, sum(ImpressionLog.impressions) as SegmentLog.impressions, sum(ImpressionLog.clicks) as SegmentLog.clicks ) } }
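A brief sketch of how the aggregation above might be invoked, assuming `impressionLogDf` is a DataFrame with the columns referenced through ImpressionLog:

// One row per segment: a HyperLogLog sketch of cookies plus summed impressions and clicks.
val segmentLogDf = new AggregateImpressionLog(impressionLogDf).segmentLog()
segmentLogDf.show(10)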
Example 184
Source File: SparkEsDataFrameFunctions.scala From Spark2Elasticsearch with Apache License 2.0 | 5 votes |
package com.github.jparkie.spark.elasticsearch.sql

import com.github.jparkie.spark.elasticsearch.SparkEsBulkWriter
import com.github.jparkie.spark.elasticsearch.conf.{ SparkEsMapperConf, SparkEsTransportClientConf, SparkEsWriteConf }
import com.github.jparkie.spark.elasticsearch.transport.SparkEsTransportClientManager
import org.apache.spark.sql.{ DataFrame, Row }

// Note: the enclosing class header was elided in the original listing; the wrapper below (and the
// way sparkContext is derived from the wrapped DataFrame) is an assumption based on the file name
// and on how bulkLoadToEs uses dataFrame and sparkContext.
class SparkEsDataFrameFunctions(dataFrame: DataFrame) extends Serializable {

  private[sql] lazy val sparkContext = dataFrame.sqlContext.sparkContext

  def bulkLoadToEs(
    esIndex: String,
    esType: String,
    sparkEsTransportClientConf: SparkEsTransportClientConf = SparkEsTransportClientConf.fromSparkConf(sparkContext.getConf),
    sparkEsMapperConf: SparkEsMapperConf = SparkEsMapperConf.fromSparkConf(sparkContext.getConf),
    sparkEsWriteConf: SparkEsWriteConf = SparkEsWriteConf.fromSparkConf(sparkContext.getConf)
  )(implicit sparkEsTransportClientManager: SparkEsTransportClientManager = sparkEsTransportClientManager): Unit = {
    val sparkEsWriter = new SparkEsBulkWriter[Row](
      esIndex = esIndex,
      esType = esType,
      esClient = () => sparkEsTransportClientManager.getTransportClient(sparkEsTransportClientConf),
      sparkEsSerializer = new SparkEsDataFrameSerializer(dataFrame.schema),
      sparkEsMapper = new SparkEsDataFrameMapper(sparkEsMapperConf),
      sparkEsWriteConf = sparkEsWriteConf
    )

    sparkContext.runJob(dataFrame.rdd, sparkEsWriter.write _)
  }
}
Example 185
Source File: BigQueryDataFrame.scala From spark-bigquery with Apache License 2.0 | 5 votes |
package com.samelamin.spark.bigquery import com.google.api.services.bigquery.model.{TableReference, TableSchema} import com.google.cloud.hadoop.io.bigquery._ import com.google.gson._ import com.samelamin.spark.bigquery.converters.{BigQueryAdapter, SchemaConverters} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{LongWritable, NullWritable} import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat import org.apache.spark.sql.DataFrame import org.slf4j.LoggerFactory import scala.util.Random def saveAsBigQueryTable(fullyQualifiedOutputTableId: String, isPartitionedByDay: Boolean = false, timePartitionExpiration: Long = 0, writeDisposition: WriteDisposition.Value = null, createDisposition: CreateDisposition.Value = null): Unit = { val destinationTable = BigQueryStrings.parseTableReference(fullyQualifiedOutputTableId) val bigQuerySchema = SchemaConverters.SqlToBQSchema(adaptedDf) val gcsPath = writeDFToGoogleStorage(adaptedDf,destinationTable,bigQuerySchema) bq.load(destinationTable, bigQuerySchema, gcsPath, isPartitionedByDay, timePartitionExpiration, writeDisposition, createDisposition) delete(new Path(gcsPath)) } def writeDFToGoogleStorage(adaptedDf: DataFrame, destinationTable: TableReference, bqSchema: TableSchema): String = { val tableName = BigQueryStrings.toString(destinationTable) BigQueryConfiguration.configureBigQueryOutput(hadoopConf, tableName, bqSchema.toPrettyString()) hadoopConf.set("mapreduce.job.outputformat.class", classOf[BigQueryOutputFormat[_, _]].getName) val bucket = self.sparkSession.conf.get(BigQueryConfiguration.GCS_BUCKET_KEY) val temp = s"spark-bigquery-${System.currentTimeMillis()}=${Random.nextInt(Int.MaxValue)}" val gcsPath = s"gs://$bucket/hadoop/tmp/spark-bigquery/$temp" if(hadoopConf.get(BigQueryConfiguration.TEMP_GCS_PATH_KEY) == null) { hadoopConf.set(BigQueryConfiguration.TEMP_GCS_PATH_KEY, gcsPath) } logger.info(s"Loading $gcsPath into $tableName") adaptedDf .toJSON .rdd .map(json => (null, jsonParser.parse(json))) .saveAsNewAPIHadoopFile(gcsPath, classOf[GsonBigQueryInputFormat], classOf[LongWritable], classOf[TextOutputFormat[NullWritable, JsonObject]], hadoopConf) gcsPath } private def delete(path: Path): Unit = { val fs = FileSystem.get(path.toUri, hadoopConf) fs.delete(path, true) } }
Example 186
Source File: BigQueryAdapter.scala From spark-bigquery with Apache License 2.0 | 5 votes |
package com.samelamin.spark.bigquery.converters import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.functions.current_timestamp import org.apache.spark.sql.types._ object BigQueryAdapter { private def adaptName(name: String, siblings: Array[String]): String = { var newName = name.replaceAll("\\W", "_") if (!newName.equals(name)) { // Avoid duplicates: var counter = 1; while (!siblings.find(_.equals(newName)).isEmpty) { newName = newName + "_" + counter counter = counter + 1 } } newName } private def adaptField(structField: StructField, parentType: StructType): StructField = { new StructField(adaptName(structField.name, parentType.fieldNames), adaptType(structField.dataType), structField.nullable) } private def adaptType(dataType: DataType): DataType = { dataType match { case structType: StructType => new StructType(structType.fields.map(adaptField(_, structType))) case arrayType: ArrayType => new ArrayType(adaptType(arrayType.elementType), arrayType.containsNull) case mapType: MapType => new MapType(adaptType(mapType.keyType), adaptType(mapType.valueType), mapType.valueContainsNull) case other => other } } def apply(df: DataFrame): DataFrame = { val sqlContext = df.sparkSession.sqlContext val sparkContext = df.sparkSession.sparkContext val timestampColumn = sparkContext .hadoopConfiguration.get("timestamp_column","bq_load_timestamp") val newSchema = adaptType(df.schema).asInstanceOf[StructType] val encoder = RowEncoder.apply(newSchema).resolveAndBind() val encodedDF = df .queryExecution .toRdd.map(x=>encoder.fromRow(x)) sqlContext.createDataFrame(encodedDF,newSchema).withColumn(timestampColumn,current_timestamp()) } }
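The adapter is applied to a DataFrame before loading; a minimal sketch in which `rawDf` is a placeholder DataFrame with arbitrary column names:

// Columns with non-word characters are renamed (e.g. "user-id" becomes "user_id"),
// nested struct/array/map types are adapted recursively, and a load-timestamp column is appended.
val adapted = BigQueryAdapter(rawDf)
adapted.printSchema()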
Example 187
Source File: BigQuerySource.scala From spark-bigquery with Apache License 2.0 | 5 votes |
package com.samelamin.spark.bigquery.streaming import java.math.BigInteger import com.google.cloud.hadoop.io.bigquery.BigQueryStrings import com.samelamin.spark.bigquery.BigQueryClient import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.execution.streaming.{Offset, _} import org.apache.spark.sql.types.{BinaryType, StringType, StructField, StructType} import com.samelamin.spark.bigquery._ import com.samelamin.spark.bigquery.converters.SchemaConverters import org.joda.time.DateTime import org.slf4j.LoggerFactory override def getBatch(start: Option[Offset], end: Offset): DataFrame = { val startIndex = start.getOrElse(LongOffset(0L)).asInstanceOf[LongOffset].offset.toLong val endIndex = end.asInstanceOf[LongOffset].offset.toLong val startPartitionTime = new DateTime(startIndex).toLocalDate val endPartitionTime = new DateTime(endIndex).toLocalDate.toString logger.info(s"Fetching data between $startIndex and $endIndex") val query = s""" |SELECT | * |FROM | `${fullyQualifiedOutputTableId.replace(':','.')}` |WHERE | $timestampColumn BETWEEN TIMESTAMP_MILLIS($startIndex) AND TIMESTAMP_MILLIS($endIndex) | AND _PARTITIONTIME BETWEEN TIMESTAMP('$startPartitionTime') AND TIMESTAMP('$endPartitionTime') | """.stripMargin val bigQuerySQLContext = new BigQuerySQLContext(sqlContext) val df = bigQuerySQLContext.bigQuerySelect(query) df } override def stop(): Unit = {} def getConvertedSchema(sqlContext: SQLContext): StructType = { val bigqueryClient = BigQueryClient.getInstance(sqlContext) val tableReference = BigQueryStrings.parseTableReference(fullyQualifiedOutputTableId) SchemaConverters.BQToSQLSchema(bigqueryClient.getTableSchema(tableReference)) } } object BigQuerySource { val DEFAULT_SCHEMA = StructType( StructField("Sample Column", StringType) :: StructField("value", BinaryType) :: Nil ) }
Example 188
Source File: BigQuerySink.scala From spark-bigquery with Apache License 2.0 | 5 votes |
package com.samelamin.spark.bigquery.streaming import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.execution.streaming.Sink import com.samelamin.spark.bigquery._ import org.slf4j.LoggerFactory import scala.util.Try import org.apache.hadoop.fs.Path class BigQuerySink(sparkSession: SparkSession, path: String, options: Map[String, String]) extends Sink { private val logger = LoggerFactory.getLogger(classOf[BigQuerySink]) private val basePath = new Path(path) private val logPath = new Path(basePath, new Path(BigQuerySink.metadataDir,"transaction.json")) private val fileLog = new BigQuerySinkLog(sparkSession, logPath.toUri.toString) override def addBatch(batchId: Long, data: DataFrame): Unit = { if (batchId <= fileLog.getLatest().getOrElse(-1L)) { logger.info(s"Skipping already committed batch $batchId") } else { val fullyQualifiedOutputTableId = options.get("tableReferenceSink").get val isPartitionByDay = Try(options.get("partitionByDay").get.toBoolean).getOrElse(true) val bqDF = new BigQueryDataFrame(data) bqDF.saveAsBigQueryTable(fullyQualifiedOutputTableId, isPartitionByDay) fileLog.writeBatch(batchId) } } } object BigQuerySink { // The name of the subdirectory that is used to store metadata about which files are valid. val metadataDir = "_spark_metadata" }
Example 189
Source File: DataFrameReaderFunctions.scala From spark-bigquery with Apache License 2.0 | 5 votes |
package com.samelamin.spark.bigquery

import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, DataFrameReader}

// Note: the class header, the `source` constant and the public read helpers that call buildFrame
// were elided in the original listing; the minimal declarations below are assumptions added so
// the private helper has an enclosing scope.
class DataFrameReaderFunctions(dfr: DataFrameReader) extends Serializable {

  // Assumed data source name for this connector.
  private val source = "com.samelamin.spark.bigquery"

  private def buildFrame(options: Map[String, String] = null, schema: StructType = null): DataFrame = {
    val builder = dfr
      .format(source)
      .schema(schema)

    if (options != null) {
      builder.options(options)
    }

    builder.load()
  }
}
Example 190
Source File: SqsSource.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.sqs import java.net.URI import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import org.apache.spark.sql.execution.datasources.{DataSource, LogicalRelation} import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.FileStreamSource._ import org.apache.spark.sql.types.StructType class SqsSource(sparkSession: SparkSession, metadataPath: String, options: Map[String, String], override val schema: StructType) extends Source with Logging { private val sourceOptions = new SqsSourceOptions(options) private val hadoopConf = sparkSession.sessionState.newHadoopConf() private val metadataLog = new FileStreamSourceLog(FileStreamSourceLog.VERSION, sparkSession, metadataPath) private var metadataLogCurrentOffset = metadataLog.getLatest().map(_._1).getOrElse(-1L) private val maxFilesPerTrigger = sourceOptions.maxFilesPerTrigger private val maxFileAgeMs: Long = sourceOptions.maxFileAgeMs private val fileFormatClassName = sourceOptions.fileFormatClassName private val shouldSortFiles = sourceOptions.shouldSortFiles private val sqsClient = new SqsClient(sourceOptions, hadoopConf) metadataLog.allFiles().foreach { entry => sqsClient.sqsFileCache.add(entry.path, MessageDescription(entry.timestamp, true, "")) } sqsClient.sqsFileCache.purge() logInfo(s"maxFilesPerBatch = $maxFilesPerTrigger, maxFileAgeMs = $maxFileAgeMs") val batchFiles = sqsClient.sqsFileCache.getUncommittedFiles(maxFilesPerTrigger, shouldSortFiles) if (batchFiles.nonEmpty) { metadataLogCurrentOffset += 1 metadataLog.add(metadataLogCurrentOffset, batchFiles.map { case (path, timestamp, receiptHandle) => FileEntry(path = path, timestamp = timestamp, batchId = metadataLogCurrentOffset) }.toArray) logInfo(s"Log offset set to $metadataLogCurrentOffset with ${batchFiles.size} new files") val messageReceiptHandles = batchFiles.map { case (path, timestamp, receiptHandle) => sqsClient.sqsFileCache.markCommitted(path) logDebug(s"New file: $path") receiptHandle }.toList sqsClient.addToDeleteMessageQueue(messageReceiptHandles) } val numPurged = sqsClient.sqsFileCache.purge() if (!sqsClient.deleteMessageQueue.isEmpty) { sqsClient.deleteMessagesFromQueue() } logTrace( s""" |Number of files selected for batch = ${batchFiles.size} |Number of files purged from tracking map = $numPurged """.stripMargin) FileStreamSourceOffset(metadataLogCurrentOffset) } override def getOffset: Option[Offset] = Some(fetchMaxOffset()).filterNot(_.logOffset == -1) override def commit(end: Offset): Unit = { // No-op for now; SqsSource currently garbage-collects files based on timestamp // and the value of the maxFileAge parameter. } override def stop(): Unit = { if (!sqsClient.sqsScheduler.isTerminated) { sqsClient.sqsScheduler.shutdownNow() } } override def toString: String = s"SqsSource[${sqsClient.sqsUrl}]" }
Example 191
Source File: MQTTStreamSink.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.bahir.sql.streaming.mqtt import scala.collection.JavaConverters._ import scala.collection.mutable import org.eclipse.paho.client.mqttv3.MqttException import org.apache.spark.SparkEnv import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport} import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.bahir.utils.Logging import org.apache.bahir.utils.Retry class MQTTStreamWriter (schema: StructType, parameters: DataSourceOptions) extends StreamWriter with Logging { override def createWriterFactory(): DataWriterFactory[InternalRow] = { // Skipping client identifier as single batch can be distributed to multiple // Spark worker process. MQTT server does not support two connections // declaring same client ID at given point in time. val params = parameters.asMap().asScala.filterNot( _._1.equalsIgnoreCase("clientId") ) MQTTDataWriterFactory(params) } override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} } case class MQTTDataWriterFactory(config: mutable.Map[String, String]) extends DataWriterFactory[InternalRow] { override def createDataWriter( partitionId: Int, taskId: Long, epochId: Long ): DataWriter[InternalRow] = new MQTTDataWriter(config) } case object MQTTWriterCommitMessage extends WriterCommitMessage class MQTTDataWriter(config: mutable.Map[String, String]) extends DataWriter[InternalRow] { private lazy val publishAttempts: Int = SparkEnv.get.conf.getInt("spark.mqtt.client.publish.attempts", -1) private lazy val publishBackoff: Long = SparkEnv.get.conf.getTimeAsMs("spark.mqtt.client.publish.backoff", "5s") private lazy val (_, _, topic, _, _, qos, _, _, _) = MQTTUtils.parseConfigParams(config.toMap) override def write(record: InternalRow): Unit = { val client = CachedMQTTClient.getOrCreate(config.toMap) val message = record.getBinary(0) Retry(publishAttempts, publishBackoff, classOf[MqttException]) { // In case of errors, retry sending the message. client.publish(topic, message, qos, false) } } override def commit(): WriterCommitMessage = MQTTWriterCommitMessage override def abort(): Unit = {} } case class MQTTRelation(override val sqlContext: SQLContext, data: DataFrame) extends BaseRelation { override def schema: StructType = data.schema } class MQTTStreamSinkProvider extends DataSourceV2 with StreamWriteSupport with DataSourceRegister with CreatableRelationProvider { override def createStreamWriter(queryId: String, schema: StructType, mode: OutputMode, options: DataSourceOptions): StreamWriter = { new MQTTStreamWriter(schema, options) } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { MQTTRelation(sqlContext, data) } override def shortName(): String = "mqtt" }
Example 192
Source File: StarsAnalysisDemo.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package applications.analysis

import java.io.{BufferedWriter, FileOutputStream, OutputStreamWriter}

import functions.segment.Segmenter
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object StarsAnalysisDemo {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("Stars Analysis Demo")
      .getOrCreate()

    val filePath = "E:/data/chinaNews/entertainment.txt"

    // Load the data, keep only the year and content fields, and filter the content field
    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) {
        var year: String = tokens(2).split("-")(0)
        if (tokens(2).contains("年")) year = tokens(2).split("年")(0)

        var content = tokens(3)
        if (content.length > 22 && content.substring(0, 20).contains("日电")) {
          content = content.substring(content.indexOf("日电") + 2, content.length).trim
        }
        if (content.startsWith("(")) content = content.substring(content.indexOf(")") + 1, content.length)
        if (content.length > 20 && content.substring(content.length - 20, content.length).contains("记者")) {
          content = content.substring(0, content.lastIndexOf("记者")).trim
        }

        Some(year, content)
      } else None
    }.toDF("year", "content")

    // Segment the text, drop terms of length 1, and keep the part-of-speech tag of every term
    val segmenter = new Segmenter()
      .isAddNature(true)
      .isDelEn(true)
      .isDelNum(true)
      .setMinTermLen(2)
      .setMinTermNum(5)
      .setSegType("StandardSegment")
      .setInputCol("content")
      .setOutputCol("segmented")
    val segDF: DataFrame = segmenter.transform(data)
    segDF.cache()

    val segRDD: RDD[(Int, Seq[String])] = segDF.select("year", "segmented").rdd.map {
      case Row(year: String, terms: Seq[String]) => (Integer.parseInt(year), terms)
    }

    val result: Array[String] = segRDD.map(line => line._1.toString + "\u00ef" + line._2.mkString(",")).collect()
    val writer: BufferedWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("E:/entertainment_seg.txt")))
    result.foreach(line => writer.write(line + "\n"))
    writer.close()

    // Count the stars that appear most frequently in the 2016 news
    val stars2016 = segRDD.filter(_._1 == 2016)
      .flatMap { case (year: Int, termStr: Seq[String]) =>
        val person = termStr
          .map(term => (term.split("/")(0), term.split("/")(1)))
          .filter(_._2.equalsIgnoreCase("nr"))
          .map(term => (term._1, 1L))
        person
      }
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)

    segDF.unpersist()

    stars2016.take(100).foreach(println)
    spark.stop()
  }
}
Example 193
Source File: Preprocessor.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions import config.paramconf.PreprocessParams import functions.clean.Cleaner import functions.segment.Segmenter import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{CountVectorizer, IDF, StopWordsRemover, StringIndexer} import org.apache.spark.sql.DataFrame def preprocess(data: DataFrame): Pipeline = { val spark = data.sparkSession val params = new PreprocessParams val indexModel = new StringIndexer() .setHandleInvalid(params.handleInvalid) .setInputCol("label") .setOutputCol("indexedLabel") .fit(data) val cleaner = new Cleaner() .setFanJian(params.fanjian) .setQuanBan(params.quanban) .setMinLineLen(params.minLineLen) .setInputCol("content") .setOutputCol("cleand") val segmenter = new Segmenter() .isAddNature(params.addNature) .isDelEn(params.delEn) .isDelNum(params.delNum) .isNatureFilter(params.natureFilter) .setMinTermLen(params.minTermLen) .setMinTermNum(params.minTermNum) .setSegType(params.segmentType) .setInputCol(cleaner.getOutputCol) .setOutputCol("segmented") val stopwords = spark.sparkContext.textFile(params.stopwordFilePath).collect() val remover = new StopWordsRemover() .setStopWords(stopwords) .setInputCol(segmenter.getOutputCol) .setOutputCol("removed") val vectorizer = new CountVectorizer() .setMinTF(params.minTF) .setVocabSize(params.vocabSize) .setInputCol(remover.getOutputCol) .setOutputCol("vectorized") val idf = new IDF() .setMinDocFreq(params.minDocFreq) .setInputCol(vectorizer.getOutputCol) .setOutputCol("features") val stages = Array(cleaner, indexModel, segmenter, remover, vectorizer, idf) new Pipeline().setStages(stages) } }
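Since preprocess returns an unfitted Pipeline, a caller inside the same enclosing class would fit it and then transform, roughly as follows; `trainingDf` with `label` and `content` columns is a placeholder:

val pipeline = preprocess(trainingDf)       // build the preprocessing stages
val model = pipeline.fit(trainingDf)        // fit StringIndexer, CountVectorizer and IDF
val prepared = model.transform(trainingDf)  // adds the indexedLabel and features columns
  .select("indexedLabel", "features")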
Example 194
Source File: Cleaner.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions.clean import com.hankcs.hanlp.HanLP import config.paramconf.{HasOutputCol, HasInputCol} import functions.MySchemaUtils import functions.clean.chinese.BCConvert import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, Dataset} setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1) override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema, logging = true) val cleanFunc = udf {line: String => var cleaned = "" getFanJian match { case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line) case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line) case _ => cleaned = line } getQuanBan match { case "q2b" => cleaned = BCConvert.qj2bj(cleaned) case "b2q" => cleaned = BCConvert.bj2qj(cleaned) case _ => cleaned = cleaned } cleaned } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record => val outputIndex = record.fieldIndex($(outputCol)) record.getString(outputIndex).length >= getMinLineLen } } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.typeName.equals(StringType.typeName), s"Input type must be StringType but got $inputType.") MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable) } } object Cleaner extends DefaultParamsReadable[Cleaner] { override def load(path: String): Cleaner = super.load(path) }
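A usage sketch of the transformer above, using the setters that also appear in the Preprocessor example (Example 193); `rawDf` is a placeholder DataFrame with a content column:

val cleaner = new Cleaner()
  .setFanJian("f2j")     // traditional -> simplified Chinese
  .setQuanBan("q2b")     // full-width -> half-width characters
  .setMinLineLen(1)
  .setInputCol("content")
  .setOutputCol("cleaned")

val cleanedDf = cleaner.transform(rawDf)   // rows shorter than minLineLen are dropped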
Example 195
Source File: QueryTest.scala From spark-netezza with Apache License 2.0 | 5 votes |
package com.ibm.spark.netezza.integration import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.{DataFrame, Row} import org.scalatest.FunSuite def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row]): Option[String] = { val isSorted = df.queryExecution.logical.collect { case s: logical.Sort => s }.nonEmpty val sparkAnswer = try df.collect().toSeq catch { case e: Exception => val errorMessage = s""" |Exception thrown while executing query: |${df.queryExecution} |== Exception == |$e |${org.apache.spark.sql.catalyst.util.stackTraceToString(e)} """.stripMargin return Some(errorMessage) } sameRows(expectedAnswer, sparkAnswer, isSorted).map { results => s""" |Results do not match for query: |${df.queryExecution} |== Results == |$results """.stripMargin } } def prepareAnswer(answer: Seq[Row], isSorted: Boolean): Seq[Row] = { // Converts data to types that we can do equality comparison using Scala collections. // For BigDecimal type, the Scala type has a better definition of equality test (similar to // Java's java.math.BigDecimal.compareTo). // For binary arrays, we convert it to Seq to avoid of calling java.util.Arrays.equals for // equality test. val converted: Seq[Row] = answer.map(prepareRow) if (!isSorted) converted.sortBy(_.toString()) else converted } // We need to call prepareRow recursively to handle schemas with struct types. def prepareRow(row: Row): Row = { Row.fromSeq(row.toSeq.map { case null => null case d: java.math.BigDecimal => BigDecimal(d) // Convert array to Seq for easy equality check. case b: Array[_] => b.toSeq case r: Row => prepareRow(r) case o => o }) } def sameRows( expectedAnswer: Seq[Row], sparkAnswer: Seq[Row], isSorted: Boolean = false): Option[String] = { if (prepareAnswer(expectedAnswer, isSorted) != prepareAnswer(sparkAnswer, isSorted)) { val errorMessage = s""" |== Results == |${sideBySide( s"== Correct Answer - ${expectedAnswer.size} ==" +: prepareAnswer(expectedAnswer, isSorted).map(_.toString()), s"== Spark Answer - ${sparkAnswer.size} ==" +: prepareAnswer(sparkAnswer, isSorted).map(_.toString())).mkString("\n")} """.stripMargin return Some(errorMessage) } None } def sideBySide(left: Seq[String], right: Seq[String]): Seq[String] = { val maxLeftSize = left.map(_.size).max val leftPadded = left ++ Seq.fill(math.max(right.size - left.size, 0))("") val rightPadded = right ++ Seq.fill(math.max(left.size - right.size, 0))("") leftPadded.zip(rightPadded).map { case (l, r) => (if (l == r) " " else "!") + l + (" " * ((maxLeftSize - l.size) + 3)) + r } } }
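A sketch of checkAnswer inside a suite that mixes in this helper; it returns None on success and an error message otherwise (`resultDf` is a placeholder):

import org.apache.spark.sql.Row

val failure = checkAnswer(resultDf, Seq(Row(1, "John Doe"), Row(2, "Jeff Smith")))
assert(failure.isEmpty, failure.getOrElse(""))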
Example 196
Source File: TablePartitionColIntegrationTestSuite.scala From spark-netezza with Apache License 2.0 | 5 votes |
package com.ibm.spark.netezza.integration import org.apache.spark.sql.{DataFrame, Row} import org.netezza.error.NzSQLException class TablePartitionColIntegrationTestSuite extends IntegrationSuiteBase with QueryTest { val tabName = "staff" val expected = Seq( Row(1, "John Doe"), Row(2, "Jeff Smith"), Row(3, "Kathy Saunders"), Row(4, null)) val expectedFiltered = Seq(Row(1, "John Doe"), Row(2, "Jeff Smith")) override def beforeAll(): Unit = { super.beforeAll() try {executeJdbcStmt(s"drop table $tabName")} catch { case e: NzSQLException => } executeJdbcStmt(s"create table $tabName(id int , name varchar(20))") executeJdbcStmt(s"insert into $tabName values(1 , 'John Doe')") executeJdbcStmt(s"insert into $tabName values(2 , 'Jeff Smith')") executeJdbcStmt(s"insert into $tabName values(3 , 'Kathy Saunders')") executeJdbcStmt(s"insert into $tabName values(4 , null)") } override def afterAll(): Unit = { try { executeJdbcStmt(s"DROP TABLE $tabName") } finally { super.afterAll() } } private def defaultOpts() = { Map("url" -> testURL, "user" -> user, "password" -> password, "numPartitions" -> Integer.toString(1)) } test("Test table read with column partitions") { val opts = defaultOpts + ("dbtable" -> s"$tabName") + ("partitioncol" -> "ID") + ("numPartitions" -> Integer.toString(4)) + ("lowerbound" -> "1") + ("upperbound" -> "100") val testDf = sqlContext.read.format("com.ibm.spark.netezza").options(opts).load() verifyAnswer(testDf, expected) verifyAnswer(testDf.filter("ID < 3"), expectedFiltered) } test("Test table read specifying lower or upper boundary") { var opts = defaultOpts + ("dbtable" -> s"$tabName") + ("partitioncol" -> "ID") + ("numPartitions" -> Integer.toString(4)) val testOpts = Seq(opts , opts + ("lowerbound" -> "1"), opts + ("upperbound" -> "10")) for (opts <- testOpts) { val testDf = sqlContext.read.format("com.ibm.spark.netezza").options(opts).load() verifyAnswer(testDf, expected) verifyAnswer(testDf.filter("ID < 3"), expectedFiltered) } } test("Test table read with single partition") { val opts = defaultOpts + ("dbtable" -> s"$tabName") + ("partitioncol" -> "ID") + ("numPartitions" -> Integer.toString(1)) val testDf = sqlContext.read.format("com.ibm.spark.netezza").options(opts).load() verifyAnswer(testDf, expected) verifyAnswer(testDf.filter("ID < 3"), expectedFiltered) } test("Test table with number of partitions set to zero.") { val opts = defaultOpts + ("dbtable" -> s"$tabName") + ("partitioncol" -> "ID") + ("numPartitions" -> Integer.toString(0)) val testDf = sqlContext.read.format("com.ibm.spark.netezza").options(opts).load() verifyAnswer(testDf, expected) } }
Example 197
Source File: IntegrationSuiteBase.scala From spark-netezza with Apache License 2.0 | 5 votes |
package com.ibm.spark.netezza.integration import java.sql.Connection import com.ibm.spark.netezza.NetezzaJdbcUtils import com.typesafe.config.ConfigFactory import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{Row, DataFrame, SQLContext} import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory trait IntegrationSuiteBase extends FunSuite with BeforeAndAfterAll with QueryTest{ private val log = LoggerFactory.getLogger(getClass) protected var sc: SparkContext = _ protected var sqlContext: SQLContext = _ protected var conn: Connection = _ protected val prop = new java.util.Properties // Configurable vals protected var configFile = "application" protected var testURL: String = _ protected var testTable: String = _ protected var user: String = _ protected var password: String = _ protected var numPartitions: Int = _ protected var sampleDbmaxNumTables: Int = _ override def beforeAll(): Unit = { super.beforeAll() sc = new SparkContext("local[*]", "IntegrationTest", new SparkConf()) sqlContext = new SQLContext(sc) val conf = ConfigFactory.load(configFile) testURL = conf.getString("test.integration.dbURL") testTable = conf.getString("test.integration.table") user = conf.getString("test.integration.user") password = conf.getString("test.integration.password") numPartitions = conf.getInt("test.integration.partition.number") sampleDbmaxNumTables = conf.getInt("test.integration.max.numtables") prop.setProperty("user", user) prop.setProperty("password", password) log.info("Attempting to get connection from" + testURL) conn = NetezzaJdbcUtils.getConnector(testURL, prop)() log.info("got connection.") } override def afterAll(): Unit = { try { sc.stop() } finally { conn.close() super.afterAll() } } def withTable(tableNames: String*)(f: => Unit): Unit = { try f finally { tableNames.foreach { name => executeJdbcStmt(s"DROP TABLE $name") } } } }
Example 198
Source File: DataFrameExtensions.scala From spark-powerbi-connector with Apache License 2.0 | 5 votes |
package com.microsoft.azure.powerbi.extensions import java.sql.Timestamp import java.util.Date import scala.collection.mutable.ListBuffer import com.microsoft.azure.powerbi.authentication.PowerBIAuthentication import com.microsoft.azure.powerbi.common.PowerBIUtils import com.microsoft.azure.powerbi.models.{table, PowerBIDatasetDetails} import org.apache.spark.sql.DataFrame object DataFrameExtensions { implicit def PowerBIDataFrame(dataFrame: DataFrame): PowerBIDataFrame = new PowerBIDataFrame(dataFrame: DataFrame) class PowerBIDataFrame(dataFrame: DataFrame) extends Serializable{ def toPowerBI(powerbiDatasetDetails: PowerBIDatasetDetails, powerbiTable: table, powerBIAuthentication: PowerBIAuthentication): Unit = { var authenticationToken: String = powerBIAuthentication.getAccessToken dataFrame.foreachPartition { partition => // PowerBI row limit in single request is 10,000. We limit it to 1000. partition.grouped(1000).foreach { group => { val powerbiRowListBuffer: ListBuffer[Map[String, Any]] = ListBuffer[Map[String, Any]]() group.foreach { record => { var powerbiRow: Map[String, Any] = Map[String, Any]() for (i <- 0 until record.length) { powerbiRow += (powerbiTable.columns(i).name -> record(i)) } powerbiRowListBuffer += powerbiRow } var attemptCount = 0 var pushSuccessful = false while (!pushSuccessful && attemptCount < this.retryCount) { try { PowerBIUtils.addMultipleRows(powerbiDatasetDetails, powerbiTable, powerbiRowListBuffer, authenticationToken) pushSuccessful = true } catch { case e: Exception => println(f"Exception inserting multiple rows: ${e.getMessage}") Thread.sleep(secondsBetweenRetry * 1000) attemptCount += 1 authenticationToken = powerBIAuthentication.refreshAccessToken } } } } } } } def countTimelineToPowerBI(powerbiDatasetDetails: PowerBIDatasetDetails, powerbiTable: table, powerBIAuthentication: PowerBIAuthentication): Unit = { var authenticationToken: String = powerBIAuthentication.getAccessToken val currentTimestamp = new Timestamp(new Date().getTime) val powerbiRow = Map(powerbiTable.columns.head.name -> currentTimestamp, powerbiTable.columns(1).name -> dataFrame.count()) var attemptCount = 0 var pushSuccessful = false while (!pushSuccessful && attemptCount < this.retryCount) { try { PowerBIUtils.addRow(powerbiDatasetDetails, powerbiTable, powerbiRow, authenticationToken) pushSuccessful = true } catch { case e: Exception => println("Exception inserting row: " + e.getMessage) Thread.sleep(secondsBetweenRetry * 1000) attemptCount += 1 authenticationToken = powerBIAuthentication.refreshAccessToken } } } private val retryCount: Int = 3 private val secondsBetweenRetry: Int = 1 } }
Example 199
Source File: ClassifierDatasetEncoder.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.ml.tensorflow import com.johnsnowlabs.nlp.Annotation import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{size, explode, col} import scala.collection.mutable class ClassifierDatasetEncoder(val params: ClassifierDatasetEncoderParams) extends Serializable { val tags2Id: Map[String, Int] = params.tags.zipWithIndex .map(p => (p._1, p._2)) .toMap val tags: Array[String] = tags2Id .map(p => (p._2, p._1)) .toArray .sortBy(p => p._1) .map(p => p._2) def encodeTags(labels: Array[String]): Array[Array[Int]] = { labels.map { t => val labelIDsArray = Array.fill(tags.length)(0) labelIDsArray(tags2Id(t)) = 1 labelIDsArray } } def decodeOutputData(tagIds: Array[Array[Float]]): Array[Array[(String, Float)]] = { val scoresMetadata = tagIds.map { scores => scores.zipWithIndex.flatMap { case (score, idx) => val tag = tags2Id.find(_._2 == idx).map(_._1).getOrElse("NA") Map(tag -> score) } } scoresMetadata } } case class ClassifierDatasetEncoderParams(tags: Array[String])
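A small worked example of the one-hot encoding performed by encodeTags, using a three-tag parameter set:

val encoder = new ClassifierDatasetEncoder(
  ClassifierDatasetEncoderParams(tags = Array("negative", "neutral", "positive")))

// Each label becomes a one-hot vector over the tag vocabulary:
// "positive" -> Array(0, 0, 1), "negative" -> Array(1, 0, 0)
val encoded = encoder.encodeTags(Array("positive", "negative"))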
Example 200
Source File: LightPipeline.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import org.apache.spark.ml.{PipelineModel, Transformer} import org.apache.spark.sql.{DataFrame, Dataset} import scala.collection.JavaConverters._ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddingsVectors: Boolean = false) { private var ignoreUnsupported = false def setIgnoreUnsupported(v: Boolean): Unit = ignoreUnsupported = v def getIgnoreUnsupported: Boolean = ignoreUnsupported def getStages: Array[Transformer] = pipelineModel.stages def transform(dataFrame: Dataset[_]): DataFrame = pipelineModel.transform(dataFrame) def fullAnnotate(target: String, startWith: Map[String, Seq[Annotation]] = Map.empty[String, Seq[Annotation]]): Map[String, Seq[Annotation]] = { getStages.foldLeft(startWith)((annotations, transformer) => { transformer match { case documentAssembler: DocumentAssembler => annotations.updated(documentAssembler.getOutputCol, documentAssembler.assemble(target, Map.empty[String, String])) case lazyAnnotator: AnnotatorModel[_] if lazyAnnotator.getLazyAnnotator => annotations case recursiveAnnotator: HasRecursiveTransform[_] with AnnotatorModel[_] => val combinedAnnotations = recursiveAnnotator.getInputCols.foldLeft(Seq.empty[Annotation])((inputs, name) => inputs ++ annotations.getOrElse(name, Nil)) annotations.updated(recursiveAnnotator.getOutputCol, recursiveAnnotator.annotate(combinedAnnotations, pipelineModel)) case annotator: AnnotatorModel[_] => val combinedAnnotations = annotator.getInputCols.foldLeft(Seq.empty[Annotation])((inputs, name) => inputs ++ annotations.getOrElse(name, Nil)) annotations.updated(annotator.getOutputCol, annotator.annotate(combinedAnnotations)) case finisher: Finisher => annotations.filterKeys(finisher.getInputCols.contains) case rawModel: RawAnnotator[_] => if (ignoreUnsupported) annotations else throw new IllegalArgumentException(s"model ${rawModel.uid} does not support LightPipeline." + s" Call setIgnoreUnsupported(boolean) on LightPipeline to ignore") case pipeline: PipelineModel => new LightPipeline(pipeline, parseEmbeddingsVectors).fullAnnotate(target, annotations) case _ => annotations } }) } def fullAnnotate(targets: Array[String]): Array[Map[String, Seq[Annotation]]] = { targets.par.map(target => { fullAnnotate(target) }).toArray } def fullAnnotateJava(target: String): java.util.Map[String, java.util.List[JavaAnnotation]] = { fullAnnotate(target).mapValues(_.map(aa => JavaAnnotation(aa.annotatorType, aa.begin, aa.end, aa.result, aa.metadata.asJava)).asJava).asJava } def fullAnnotateJava(targets: java.util.ArrayList[String]): java.util.List[java.util.Map[String, java.util.List[JavaAnnotation]]] = { targets.asScala.par.map(target => { fullAnnotateJava(target) }).toList.asJava } def annotate(target: String): Map[String, Seq[String]] = { fullAnnotate(target).mapValues(_.map(a => { a.annotatorType match { case (AnnotatorType.WORD_EMBEDDINGS | AnnotatorType.SENTENCE_EMBEDDINGS) if (parseEmbeddingsVectors) => a.embeddings.mkString(" ") case _ => a.result } })) } def annotate(targets: Array[String]): Array[Map[String, Seq[String]]] = { targets.par.map(target => { annotate(target) }).toArray } def annotateJava(target: String): java.util.Map[String, java.util.List[String]] = { annotate(target).mapValues(_.asJava).asJava } def annotateJava(targets: java.util.ArrayList[String]): java.util.List[java.util.Map[String, java.util.List[String]]] = { targets.asScala.par.map(target => { annotateJava(target) }).toList.asJava } }
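A usage sketch for the class above, assuming a fitted PipelineModel of Spark NLP stages is already available as `pipelineModel`:

val light = new LightPipeline(pipelineModel)

// Annotate a single string without going through a DataFrame.
val result: Map[String, Seq[String]] = light.annotate("Spark NLP makes light work of text.")

// Or keep the full Annotation objects, e.g. to inspect begin/end offsets and metadata.
val full = light.fullAnnotate("Spark NLP makes light work of text.")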