org.apache.spark.sql.types.DoubleType Scala Examples
The following examples show how to use org.apache.spark.sql.types.DoubleType.
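Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below; the local SparkSession, object name, and toy data are assumptions) showing the two most common uses of DoubleType: declaring it in a StructType schema and casting an existing column to it.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

object DoubleTypeBasics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("DoubleTypeBasics").getOrCreate()
    import spark.implicits._

    // Declare a DoubleType column explicitly in a schema.
    val schema = StructType(Seq(
      StructField("name", StringType, nullable = true),
      StructField("price", DoubleType, nullable = true)
    ))
    println(schema.treeString)

    // Cast a string column to DoubleType.
    val df = Seq(("a", "1.5"), ("b", "2.0")).toDF("name", "price_str")
      .withColumn("price", col("price_str").cast(DoubleType))
    df.printSchema()

    spark.stop()
  }
}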
Example 1
Source File: BinaryClassificationEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType

  @Since("1.2.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "areaUnderROC")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    val scoreAndLabels =
      dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label)
        case Row(rawPrediction: Double, label: Double) => (rawPrediction, label)
      }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val metric = $(metricName) match {
      case "areaUnderROC" => metrics.areaUnderROC()
      case "areaUnderPR" => metrics.areaUnderPR()
    }
    metrics.unpersist()
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "areaUnderROC" => true
    case "areaUnderPR" => true
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): BinaryClassificationEvaluator = super.load(path)
}
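A short usage sketch to go with the source above (this is not part of the drizzle-spark file; the local SparkSession and the toy `scored` DataFrame are assumptions). It relies on the fact, visible in evaluate(), that rawPredictionCol may be a plain DoubleType score and that labelCol is cast to DoubleType.

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("BinaryEvalSketch").getOrCreate()
import spark.implicits._

// Two Double columns are enough: a raw score and a 0/1 label,
// using the evaluator's default column names "rawPrediction" and "label".
val scored = Seq((0.9, 1.0), (0.8, 1.0), (0.2, 0.0), (0.1, 0.0))
  .toDF("rawPrediction", "label")

val auc = new BinaryClassificationEvaluator()
  .setMetricName("areaUnderROC")
  .evaluate(scored)
println(s"areaUnderROC = $auc")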
Example 2
Source File: MulticlassClassificationEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType

  @Since("1.5.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "f1")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels =
      dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(prediction: Double, label: Double) => (prediction, label)
      }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "f1" => metrics.weightedFMeasure
      case "weightedPrecision" => metrics.weightedPrecision
      case "weightedRecall" => metrics.weightedRecall
      case "accuracy" => metrics.accuracy
    }
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = true

  @Since("1.5.0")
  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object MulticlassClassificationEvaluator
  extends DefaultParamsReadable[MulticlassClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
}
Example 3
Source File: RegressionEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}

  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
}
Example 4
Source File: randomExpressions.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, DoubleType}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom

@ExpressionDescription(
  usage = "_FUNC_(a) - Returns a random column with i.i.d. gaussian random distribution.")
case class Randn(seed: Long) extends RDG {
  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  def this() = this(Utils.random.nextLong())

  def this(seed: Expression) = this(seed match {
    case IntegerLiteral(s) => s
    case _ => throw new AnalysisException("Input argument to randn must be an integer literal.")
  })

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val rngTerm = ctx.freshName("rng")
    val className = classOf[XORShiftRandom].getName
    ctx.addMutableState(className, rngTerm,
      s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());")
    ev.copy(code = s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian();""", isNull = "false")
  }
}
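For context, a hedged sketch of the DataFrame-level entry point to the expression above (not part of the drizzle-spark file; the local SparkSession is an assumption). functions.randn wraps Randn, and the resulting column has DoubleType.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.randn
import org.apache.spark.sql.types.DoubleType

val spark = SparkSession.builder().master("local[*]").appName("RandnSketch").getOrCreate()

// Add a column of i.i.d. standard-gaussian draws with a fixed seed.
val df = spark.range(5).withColumn("gaussian", randn(42L))
assert(df.schema("gaussian").dataType == DoubleType)
df.show()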
Example 5
Source File: MovieLensTestSuite.scala From spark-solr with Apache License 2.0 | 5 votes |
package com.lucidworks.spark

import com.lucidworks.spark.util.{QueryConstants, ConfigurationConstants}
import org.apache.spark.sql.types.DoubleType

class MovieLensTestSuite extends MovielensBuilder {

  test("multiple nested where clauses with NOT and AND") {
    val sql =
      s"""
        | select genre from ${moviesColName} m where
        | ((m.genre IN ('comedy') and (m.title != 'Here Comes Cookie (1935)')))
        | OR
        | (m.genre IN ('action') and m.title = 'Operation Dumbo Drop (1995)')
      """.stripMargin
    val results = sparkSession.sql(sql).collect()
    // (426-1) comedy results
    assert(results.count(r => r.getString(0) === "comedy") == 425)
    assert(results.count(r => r.getString(0) === "action") == 1)
  }

  test("multiple nested where clauses with NOT and multiple AND") {
    val sql =
      s"""
        | select genre from ${moviesColName} m where
        | (m.genre IN ('comedy') and ((m.title != 'Here Comes Cookie (1935)') and (m.title != 'Coneheads (1993)')))
        | OR
        | (m.genre IN ('action') and m.title = 'Operation Dumbo Drop (1995)')
      """.stripMargin
    val results = sparkSession.sql(sql).collect()
    // (426-2) 424 comedy results
    assert(results.count(r => r.getString(0) === "comedy") == 424)
    assert(results.count(r => r.getString(0) === "action") == 1)
  }

  test("mutliple nested where clauses with NOT and multiple OR") {
    val sql =
      s"""
        | select genre from ${moviesColName} m where
        | (m.genre IN ('comedy') and ((m.title != 'Here Comes Cookie (1935)') or (m.title != 'Coneheads (1993)')))
      """.stripMargin
    val results = sparkSession.sql(sql).collect()
    assert(results.length === 424)
  }

  test("Score column in SQL statement pushdown to Solr") {
    val sqlStmt = s"SELECT movie_id,title,score from ${moviesColName} where _query_='title_txt_en:dog' order by score desc LIMIT 100"
    val opts = Map(
      "zkhost" -> zkHost,
      "collection" -> moviesColName,
      ConfigurationConstants.REQUEST_HANDLER -> QueryConstants.QT_SQL,
      ConfigurationConstants.SOLR_SQL_STMT -> sqlStmt)
    val df = sparkSession.read.format("solr").options(opts).load()
    val schema = df.schema
    assert(schema.fieldNames.contains("score"))
    assert(schema("score").dataType == DoubleType)
    val rows = df.take(10)
    assert(rows(0).length == 3)
  }

  test("Provide SQL schema via config") {
    val sqlStmt = s"SELECT movie_id,title,score from ${moviesColName} where _query_='title_txt_en:dog' order by score desc LIMIT 100"
    val sqlSchema = "movie_id:string,title:string,score:double"
    val opts = Map(
      "zkhost" -> zkHost,
      "collection" -> moviesColName,
      ConfigurationConstants.REQUEST_HANDLER -> QueryConstants.QT_SQL,
      ConfigurationConstants.SOLR_SQL_STMT -> sqlStmt,
      ConfigurationConstants.SOLR_SQL_SCHEMA -> sqlSchema)
    val df = sparkSession.read.format("solr").options(opts).load()
    val schema = df.schema
    assert(schema.fieldNames.contains("score"))
    assert(schema("score").dataType == DoubleType)
    val rows = df.take(10)
    assert(rows(0).length == 3)
  }

  test("Test nested where clauses") {
    val opts = Map(
      "zkhost" -> zkHost,
      "collection" -> moviesColName,
      "query" -> "*:*",
      "filters" -> """genre:action,title:"Star Wars (1977)" OR title:"Power 98 (1995)" OR title:"Truth or Consequences, N.M. 
(1997)" OR title:"Romper Stomper (1992)" OR title:"Air Force One (1997)" OR title:"Alien 3 (1992)" OR title:"Best Men (1997)" OR title:"Hellraiser: Bloodline (1996)" OR title:"Alien: Resurrection (1997)" OR title:"Fair Game (1995)" OR title:"Star Trek: First Contact (1996)" OR title:"Long Kiss Goodnight, The (1996)" OR title:"Tomorrow Never Dies (1997)" OR title:"The Deadly Cure (1996)" OR title:"Jaws 2 (1978)" OR title:"Star Trek: The Wrath of Khan (1982)" OR title:"Metro (1997)" OR title:"Rumble in the Bronx (1995)" OR title:"Timecop (1994)" OR title:"Firestorm (1998)" OR title:"Star Trek VI: The Undiscovered Country (1991)" OR title:"Nick of Time (1995)" OR title:"Cliffhanger (1993)" OR title:"In the Line of Duty 2 (1987)" OR title:"Con Air (1997)" OR title:"Rock, The (1996)" OR title:"Crying Game, The (1992)" OR title:"Bloodsport 2 (1995)" OR title:"Mercury Rising (1998)" OR title:"Boot, Das (1981)" OR title:"Mighty Morphin Power Rangers: The Movie (1995)" OR title:"Specialist, The (1994)" OR title:"Bad Company (1995)" OR title:"Good Man in Africa, A (1994)" OR title:"Solo (1996)" OR title:"Palookaville (1996)" OR title:"Rising Sun (1993)" OR title:"Broken Arrow (1996)" OR title:"Heaven & Earth (1993)" OR title:"Star Trek: The Motion Picture (1979)" OR title:"Top Gun (1986)" OR title:"U.S. Marshalls (1998)" OR title:"Stranger, The (1994)" OR title:"Tank Girl (1995)" OR title:"Men With Guns (1997)" OR title:"Deep Rising (1998)" OR title:"Abyss, The (1989)" OR title:"Tokyo Fist (1995)" OR title:"Ben-Hur (1959)" OR title:"Aliens (1986)" OR title:"No Escape (1994)" OR title:"Dead Presidents (1995)" OR title:"Lost World: Jurassic Park, The (1997)" OR title:"Set It Off (1996)" OR title:"Ghost and the Darkness, The (1996)" OR title:"Substitute, The (1996)" OR title:"Star Trek IV: The Voyage Home (1986)" OR title:"Batman (1989)" OR title:"Event Horizon (1997)" OR title:"Stargate (1994)" OR title:"Star Trek III: The Search for Spock (1984)" OR title:"Coldblooded (1995)" OR title:"Raiders of the Lost Ark (1981)" OR title:"Muppet Treasure Island (1996)" OR title:"Batman Forever (1995)" OR title:"Sudden Death (1995)" OR title:"Terminator, The (1984)" OR title:"American Strays (1996)" OR title:"Last Man Standing (1996)" OR title:"Replacement Killers, The (1998)" OR title:"Cowboy Way, The (1994)" OR title:"Glimmer Man, The (1996)" OR title:"Man in the Iron Mask, The (1998)" OR title:"Godfather, The (1972)" OR title:"Demolition Man (1993)" OR title:"Three Musketeers, The (1993)" OR title:"Lost in Space (1998)" OR title:"Last Action Hero (1993)" OR title:"Hunt for Red October, The (1990)" OR title:"Executive Decision (1996)" OR title:"Crow: City of Angels, The (1996)" OR title:"Blown Away (1994)" OR title:"Smilla's Sense of Snow (1997)" OR title:"Conspiracy Theory (1997)" OR title:"Evil Dead II (1987)" OR title:"Crow, The (1994)" OR title:"Shooter, The (1995)" OR title:"Starship Troopers (1997)" OR title:"Fallen (1998)" OR title:"First Knight (1995)" OR title:"Fugitive, The (1993)" OR title:"Transformers: The Movie, The (1986)" OR title:"Young Guns (1988)" OR title:"Bird of Prey (1996)" OR title:"Jaws 3-D (1983)" OR title:"G.I. 
Jane (1997)" OR title:"Terminal Velocity (1994)" OR title:"Jurassic Park (1993)" OR title:"Mirage (1995)" OR title:"Adventures of Robin Hood, The (1938)" OR title:"Steel (1997)" OR title:"Blues Brothers, The (1980)" OR title:"Hunted, The (1995)" OR title:"Die Hard: With a Vengeance (1995)" OR title:"Desperado (1995)" OR title:"Get Shorty (1995)" OR title:"Braveheart (1995)" OR title:"3 Ninjas: High Noon At Mega Mountain (1998)" OR title:"Return of the Jedi (1983)" OR title:"Under Siege 2: Dark Territory (1995)" OR title:"Street Fighter (1994)" OR title:"Program, The (1993)" OR title:"Devil's Own, The (1997)" OR title:"True Lies (1994)" OR title:"Mission: Impossible (1996)" OR title:"Mars Attacks! (1996)" OR title:"Menace II Society (1993)" OR title:"Clear and Present Danger (1994)" OR title:"U Turn (1997)" OR title:"Peacemaker, The (1997)" OR title:"Highlander (1986)" OR title:"Magnificent Seven, The (1954)" OR title:"Escape from L.A. (1996)" OR title:"Pagemaster, The (1994)" OR title:"Next Karate Kid, The (1994)" OR title:"I Love Trouble (1994)" OR title:"Striking Distance (1993)" OR title:"Mortal Kombat (1995)" OR title:"Perfect World, A (1993)" OR title:"Waterworld (1995)" OR title:"Titanic (1997)" OR title:"Beverly Hills Ninja (1997)" OR title:"Money Train (1995)" OR title:"Saint, The (1997)" OR title:"Money Talks (1997)" OR title:"Judgment Night (1993)" OR title:"Time Tracers (1995)" OR title:"Heat (1995)" OR title:"Fled (1996)" OR title:"Cyrano de Bergerac (1990)" OR title:"Lashou shentan (1992)" OR title:"Double Team (1997)" OR title:"Twister (1996)" OR title:"Marked for Death (1990)" OR title:"Mad City (1997)" OR title:"Butch Cassidy and the Sundance Kid (1969)" OR title:"Drop Zone (1994)" OR title:"Shopping (1994)" OR title:"Highlander III: The Sorcerer (1994)" OR title:"Quest, The (1996)" OR title:"Conan the Barbarian (1981)" OR title:"Hard Target (1993)" OR title:"Jumanji (1995)" OR title:"Best of the Best 3: No Turning Back (1995)" OR title:"Tough and Deadly (1995)" OR title:"Jerky Boys, The (1994)" OR title:"Supercop (1992)" OR title:"GoldenEye (1995)" OR title:"Spawn (1997)" OR title:"Getaway, The (1994)" OR title:"Blood Beach (1981)" OR title:"Batman Returns (1992)" OR title:"Fire Down Below (1997)" OR title:"Target (1995)" OR title:"Faster Pussycat! Kill! Kill! (1965)" OR title:"Apollo 13 (1995)" OR title:"Diva (1981)" OR title:"Arrival, The (1996)" OR title:"Barb Wire (1996)" OR title:"In the Line of Fire (1993)" OR title:"Die xue shuang xiong (Killer, The) (1989)" OR title:"Low Down Dirty Shame, A (1994)" OR title:"Bad Boys (1995)" OR title:"Speed (1994)" OR title:"Johnny 100 Pesos (1993)" OR title:"The Courtyard (1995)" OR title:"Star Trek V: The Final Frontier (1989)" OR title:"Independence Day (ID4) (1996)" OR title:"Warriors of Virtue (1997)" OR title:"Godfather: Part II, The (1974)" OR title:"Operation Dumbo Drop (1995)" OR title:"Strange Days (1995)" OR title:"Kull the Conqueror (1997)" OR title:"New York Cop (1996)" OR title:"Face/Off (1997)" OR title:"Indiana Jones and the Last Crusade (1989)" OR title:"Bulletproof (1996)" OR title:"Jackal, The (1997)" OR title:"Hot Shots! 
Part Deux (1993)" OR title:"Judge Dredd (1995)" OR title:"Days of Thunder (1990)" OR title:"Men in Black (1997)" OR title:"Escape from New York (1981)" OR title:"Army of Darkness (1993)" OR title:"Glory (1989)" OR title:"Men of Means (1998)" OR title:"Die Hard 2 (1990)" OR title:"Empire Strikes Back, The (1980)" OR title:"Dragonheart (1996)" OR title:"Shadow, The (1994)" OR title:"Die Hard (1988)" OR title:"River Wild, The (1994)" OR title:"Alien (1979)" OR title:"Police Story 4: Project S (Chao ji ji hua) (1993)" OR title:"From Dusk Till Dawn (1996)" OR title:"Turbo: A Power Rangers Movie (1997)" OR title:"True Romance (1993)" OR title:"Cutthroat Island (1995)" OR title:"Hard Rain (1998)" OR title:"Chain Reaction (1996)" OR title:"Star Trek: Generations (1994)" OR title:"Beverly Hills Cop III (1994)" OR title:"Johnny Mnemonic (1995)" OR title:"Condition Red (1995)" OR title:"Terminator 2: Judgment Day (1991)" OR title:"Jaws (1975)" OR title:"Jackie Chan's First Strike (1996)" OR title:"Blues Brothers 2000 (1998)" OR title:"Hackers (1995)" OR title:"Fifth Element, The (1997)" OR title:"Good, The Bad and The Ugly, The (1966)" OR title:"Batman & Robin (1997)" OR title:"Nemesis 2: Nebula (1995)" OR title:"African Queen, The (1951)" OR title:"Outbreak (1995)" OR title:"Quick and the Dead, The (1995)" OR title:"Last of the Mohicans, The (1992)" OR title:"Speed 2: Cruise Control (1997)" OR title:"Surviving the Game (1994)" OR title:"King of New York (1990)" OR title:"Under Siege (1992)" OR title:"Princess Bride, The (1987)" OR title:"Hostile Intentions (1994)" OR title:"Eraser (1996)" OR title:"Young Guns II (1990)" OR title:"Maximum Risk (1996)" OR title:"Mortal Kombat: Annihilation (1997)" OR title:"Maverick (1994)" OR title:"Lawnmower Man, The (1992)" OR title:"Full Metal Jacket (1987)" OR title:"Stag (1997)" OR title:"Super Mario Bros. (1993)" OR title:"Daylight (1996)" OR title:"Congo (1995)" OR title:"Natural Born Killers (1994)" OR title:"Heavy Metal (1981)" OR title:"Dante's Peak (1997)" OR title:"Anaconda (1997)" OR title:"Breakdown (1997)",movie_id:[* TO *]""", "fields" -> "movie_id,title", "sort" -> "id asc" ) val solrConf = new SolrConf(opts) val filters = solrConf.getFilters assert(filters(0) === "genre:action") assert(filters(2) === "movie_id:[* TO *]") assert(filters.length === 3) val df = sparkSession.read.format("solr").options(opts).load() val rows = df.collectAsList() assert(rows.size() === 251) } }
Example 6
Source File: CovarianceSummarizer.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.{ BaseSummarizerFactory, ColumnList, SummarizerFactory }
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.{ DoubleType, StructType }

case class CovarianceSummarizerFactory(columnX: String, columnY: String)
  extends BaseSummarizerFactory(columnX, columnY) {
  override def apply(inputSchema: StructType): CovarianceSummarizer =
    new CovarianceSummarizer(inputSchema, prefixOpt, requiredColumns)
}

class CovarianceSummarizer(
  override val inputSchema: StructType,
  override val prefixOpt: Option[String],
  override val requiredColumns: ColumnList
) extends AbstractCorrelationSummarizer(inputSchema, prefixOpt, requiredColumns) {
  override val schema = Schema.of(
    s"${columnPrefix}_covariance" -> DoubleType
  )

  override def fromV(v: V): GenericInternalRow =
    new GenericInternalRow(Array[Any](v.covariance))
}
Example 7
Source File: StandardDeviationSummarizer.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.ColumnList.Sequence
import com.twosigma.flint.timeseries.summarize.{ BaseSummarizerFactory, ColumnList }
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.{ DoubleType, StructType }

import scala.math.sqrt

case class StandardDeviationSummarizerFactory(column: String, applyBesselCorrection: Boolean = true)
  extends BaseSummarizerFactory(column) {
  override def apply(inputSchema: StructType): StandardDeviationSummarizer =
    new StandardDeviationSummarizer(inputSchema, prefixOpt, requiredColumns, applyBesselCorrection)
}

class StandardDeviationSummarizer(
  override val inputSchema: StructType,
  override val prefixOpt: Option[String],
  override val requiredColumns: ColumnList,
  val applyBesselCorrection: Boolean
) extends NthCentralMomentSummarizer(inputSchema, prefixOpt, requiredColumns, 2) {
  private val Sequence(Seq(column)) = requiredColumns
  override val schema = Schema.of(s"${column}_stddev" -> DoubleType)

  override def fromV(v: V): GenericInternalRow = {
    var variance = v.nthCentralMoment(2)
    if (applyBesselCorrection) {
      variance = variance * (v.count / (v.count - 1d))
    }
    new GenericInternalRow(Array[Any](sqrt(variance)))
  }
}
Example 8
Source File: VarianceSummarizer.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.ColumnList.Sequence
import com.twosigma.flint.timeseries.summarize.{ BaseSummarizerFactory, ColumnList }
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.{ DoubleType, StructType }

case class VarianceSummarizerFactory(column: String, applyBesselCorrection: Boolean = true)
  extends BaseSummarizerFactory(column) {
  override def apply(inputSchema: StructType): VarianceSummarizer =
    new VarianceSummarizer(inputSchema, prefixOpt, requiredColumns, applyBesselCorrection)
}

class VarianceSummarizer(
  override val inputSchema: StructType,
  override val prefixOpt: Option[String],
  override val requiredColumns: ColumnList,
  val applyBesselCorrection: Boolean
) extends NthCentralMomentSummarizer(inputSchema, prefixOpt, requiredColumns, 2) {
  private val Sequence(Seq(column)) = requiredColumns
  override val schema = Schema.of(s"${column}_variance" -> DoubleType)

  override def fromV(v: V): GenericInternalRow = {
    var variance = v.nthCentralMoment(2)
    if (applyBesselCorrection) {
      variance = variance * (v.count / (v.count - 1d))
    }
    new GenericInternalRow(Array[Any](variance))
  }
}
Example 9
Source File: AssertEqualsSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries

import com.twosigma.flint.timeseries.row.Schema
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{ GenericRowWithSchema => SqlRow }
import org.apache.spark.sql.types.{ ArrayType, DoubleType }
import org.scalatest.exceptions.TestFailedException

import scala.collection.mutable

class AssertEqualsSpec extends TimeSeriesSuite {

  "TimeSeriesSuite" should "assertEquals for two sql rows of DoubleType correctly" in {
    val schema = Schema("x" -> DoubleType)
    val row1 = new SqlRow(Array(1L, 1.0), schema)
    val row2 = new SqlRow(Array(1L, 1.0 + defaultAdditivePrecision * 0.1), schema)
    val row3 = new SqlRow(Array(1L, 1.0 + defaultAdditivePrecision * 10.0), schema)
    assertAlmostEquals(row1, row2)
    intercept[TestFailedException] {
      assertAlmostEquals(row1, row3)
    }
  }

  it should "assertEquals for two sql rows of ArrayType(DoubleType) correctly" in {
    val schema = Schema("x" -> ArrayType(DoubleType))
    val row1: Row = new SqlRow(Array(1L, mutable.WrappedArray.make(Array(1.0))), schema)
    val row2: Row = new SqlRow(
      Array(1L, mutable.WrappedArray.make(Array(1.0 + defaultAdditivePrecision * 0.1))), schema
    )
    val row3: Row = new SqlRow(
      Array(1L, mutable.WrappedArray.make(Array(1.0 + defaultAdditivePrecision * 10.0))), schema
    )
    assertAlmostEquals(row1, row2)
    intercept[TestFailedException] {
      assertAlmostEquals(row1, row3)
    }
  }

  it should "assertEquals for two sql rows of ArrayType(DoubleType) that contain NaN values correctly" in {
    val schema = Schema("x" -> ArrayType(DoubleType))
    val row1 = new SqlRow(Array(1L, mutable.WrappedArray.make(Array(Double.NaN))), schema)
    val row2 = new SqlRow(Array(1L, mutable.WrappedArray.make(Array(Double.NaN))), schema)
    val row3 = new SqlRow(Array(1L, mutable.WrappedArray.make(Array(1.0))), schema)
    assertAlmostEquals(row1, row2)
    intercept[TestFailedException] {
      assertAlmostEquals(row1, row3)
    }
  }
}
Example 10
Source File: SummarizeSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries

import com.twosigma.flint.timeseries.row.Schema
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ LongType, IntegerType, DoubleType }

class SummarizeSpec extends MultiPartitionSuite {

  override val defaultResourceDir: String = "/timeseries/summarize"

  it should "`summarize` correctly" in {
    val expectedSchema = Schema("volume_sum" -> DoubleType)
    val expectedResults = Array[Row](new GenericRowWithSchema(Array(0L, 7800.0), expectedSchema))

    def test(rdd: TimeSeriesRDD): Unit = {
      val results = rdd.summarize(Summarizers.sum("volume"))
      assert(results.schema == expectedSchema)
      assert(results.collect().deep == expectedResults.deep)
    }

    {
      val volumeRdd = fromCSV("Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType))
      withPartitionStrategy(volumeRdd)(DEFAULT)(test)
    }
  }

  it should "`summarize` per key correctly" in {
    val expectedSchema = Schema("id" -> IntegerType, "volume_sum" -> DoubleType)
    val expectedResults = Array[Row](
      new GenericRowWithSchema(Array(0L, 7, 4100.0), expectedSchema),
      new GenericRowWithSchema(Array(0L, 3, 3700.0), expectedSchema)
    )

    def test(rdd: TimeSeriesRDD): Unit = {
      val results = rdd.summarize(Summarizers.sum("volume"), Seq("id"))
      assert(results.schema == expectedSchema)
      assert(results.collect().sortBy(_.getAs[Int]("id")).deep == expectedResults.sortBy(_.getAs[Int]("id")).deep)
    }

    {
      val volumeTSRdd = fromCSV("Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType))
      withPartitionStrategy(volumeTSRdd)(DEFAULT)(test)
    }
  }
}
Example 11
Source File: SummarizeCyclesSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries

import com.twosigma.flint.timeseries.row.Schema
import org.apache.spark.sql.types.{ DoubleType, IntegerType, LongType }

class SummarizeCyclesSpec extends MultiPartitionSuite with TimeSeriesTestData with TimeTypeSuite {

  override val defaultResourceDir: String = "/timeseries/summarizecycles"

  private val volumeSchema = Schema("id" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType)
  private val volume2Schema = Schema("id" -> IntegerType, "volume" -> LongType)
  private val volumeWithGroupSchema = Schema(
    "id" -> IntegerType, "group" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType
  )

  "SummarizeCycles" should "pass `SummarizeSingleColumn` test." in {
    withAllTimeType {
      val resultTSRdd = fromCSV("SummarizeSingleColumn.results", Schema("volume_sum" -> DoubleType))

      def test(rdd: TimeSeriesRDD): Unit = {
        val summarizedVolumeTSRdd = rdd.summarizeCycles(Summarizers.sum("volume"))
        assertEquals(summarizedVolumeTSRdd, resultTSRdd)
      }

      val volumeTSRdd = fromCSV("Volume.csv", volumeSchema)
      withPartitionStrategy(volumeTSRdd)(DEFAULT)(test)
    }
  }

  it should "pass `SummarizeSingleColumnPerKey` test, i.e. with additional a single key." in {
    withAllTimeType {
      val resultTSRdd = fromCSV(
        "SummarizeSingleColumnPerKey.results",
        Schema("id" -> IntegerType, "volume_sum" -> DoubleType)
      )

      def test(rdd: TimeSeriesRDD): Unit = {
        val summarizedVolumeTSRdd = rdd.summarizeCycles(Summarizers.sum("volume"), Seq("id"))
        assertEquals(summarizedVolumeTSRdd, resultTSRdd)
      }

      val volumeTSRdd = fromCSV("Volume2.csv", volume2Schema)
      withPartitionStrategy(volumeTSRdd)(DEFAULT)(test)
    }
  }

  it should "pass `SummarizeSingleColumnPerSeqOfKeys` test, i.e. with additional a sequence of keys." in {
    withAllTimeType {
      val resultTSRdd = fromCSV(
        "SummarizeSingleColumnPerSeqOfKeys.results",
        Schema("id" -> IntegerType, "group" -> IntegerType, "volume_sum" -> DoubleType)
      )

      def test(rdd: TimeSeriesRDD): Unit = {
        val summarizedVolumeTSRdd = rdd.summarizeCycles(Summarizers.sum("volume"), Seq("id", "group"))
        assertEquals(summarizedVolumeTSRdd, resultTSRdd)
      }

      val volumeTSRdd = fromCSV("VolumeWithIndustryGroup.csv", volumeWithGroupSchema)
      withPartitionStrategy(volumeTSRdd)(DEFAULT)(test)
    }
  }

  it should "pass generated cycle data test" in {
    // TODO: The way cycleData works now doesn't support changing time type.
    val testData = cycleData1

    def sum(rdd: TimeSeriesRDD): TimeSeriesRDD = {
      rdd.summarizeCycles(Summarizers.compose(Summarizers.count(), Summarizers.sum("v1")))
    }

    withPartitionStrategyCompare(testData)(DEFAULT)(sum)
  }
}
Example 12
Source File: TimeSeriesRDDCacheSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries

import com.twosigma.flint.timeseries.row.Schema
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ DoubleType, IntegerType }
import org.scalatest.concurrent.Timeouts
import org.scalatest.tagobjects.Slow
import org.scalatest.time.{ Second, Span }

class TimeSeriesRDDCacheSpec extends TimeSeriesSuite with Timeouts {

  "TimeSeriesRDD" should "correctly cache data" taggedAs Slow in {
    withResource("/timeseries/csv/Price.csv") { source =>
      val priceSchema = Schema("id" -> IntegerType, "price" -> DoubleType)
      val timeSeriesRdd = CSV.from(sqlContext, "file://" + source, sorted = true, schema = priceSchema)
      val slowTimeSeriesRdd = timeSeriesRdd.addColumns("new_column" -> DoubleType -> { row: Row =>
        Thread.sleep(500L)
        row.getAs[Double]("price") + 1.0
      })

      // run a dummy addColumns() to initialize TSRDD's internal state
      slowTimeSeriesRdd.addColumns("foo_column" -> DoubleType -> { _ => 1.0 })
      slowTimeSeriesRdd.cache()
      assert(slowTimeSeriesRdd.count() == 12)

      // this test succeeds only if all representations are correctly cached
      failAfter(Span(1, Second)) {
        assert(slowTimeSeriesRdd.toDF.collect().length == 12)
        assert(slowTimeSeriesRdd.orderedRdd.count() == 12)
        assert(slowTimeSeriesRdd.asInstanceOf[TimeSeriesRDDImpl].unsafeOrderedRdd.count == 12)
      }
    }
  }
}
Example 13
Source File: CompositeSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.timeseries.{ CSV, Summarizers, TimeSeriesRDD, TimeSeriesSuite }
import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import org.apache.spark.sql.types.{ DoubleType, IntegerType, StructType }

class CompositeSummarizerSpec extends SummarizerSuite {

  // Reuse mean summarizer data
  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/meansummarizer"

  var priceTSRdd: TimeSeriesRDD = _

  lazy val init: Unit = {
    priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
  }

  "CompositeSummarizer" should "compute `mean` and `stddev` correctly" in {
    init
    val result = priceTSRdd.summarize(
      Summarizers.compose(Summarizers.mean("price"), Summarizers.stddev("price"))
    )
    val row = result.first()
    assert(row.getAs[Double]("price_mean") === 3.25)
    assert(row.getAs[Double]("price_stddev") === 1.8027756377319946)
  }

  it should "throw exception for conflicting output columns" in {
    init
    intercept[Exception] {
      priceTSRdd.summarize(Summarizers.compose(Summarizers.mean("price"), Summarizers.mean("price")))
    }
  }

  it should "handle conflicting output columns using prefix" in {
    init
    val result = priceTSRdd.summarize(
      Summarizers.compose(Summarizers.mean("price"), Summarizers.mean("price").prefix("prefix"))
    )
    val row = result.first()
    assert(row.getAs[Double]("price_mean") === 3.25)
    assert(row.getAs[Double]("prefix_price_mean") === 3.25)
  }

  it should "handle null values" in {
    init
    val inputWithNull = insertNullRows(priceTSRdd, "price")
    val row = inputWithNull.summarize(
      Summarizers.compose(
        Summarizers.count(),
        Summarizers.count("id"),
        Summarizers.count("price")
      )
    ).first()
    val count = priceTSRdd.count()
    assert(row.getAs[Long]("count") == 2 * count)
    assert(row.getAs[Long]("id_count") == 2 * count)
    assert(row.getAs[Long]("price_count") == count)
  }
}
Example 14
Source File: MeanSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import com.twosigma.flint.timeseries.{ Summarizers, TimeSeriesSuite }
import org.apache.spark.sql.types.{ DoubleType, IntegerType }

class MeanSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/meansummarizer"

  "MeanSummarizer" should "compute `mean` correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val result = priceTSRdd.summarize(Summarizers.mean("price")).first()
    assert(result.getAs[Double]("price_mean") === 3.25)
  }

  it should "ignore null values" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    assertEquals(
      priceTSRdd.summarize(Summarizers.mean("price")),
      insertNullRows(priceTSRdd, "price").summarize(Summarizers.mean("price"))
    )
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllProperties)(Summarizers.mean("x1"))
    summarizerPropertyTest(AllProperties)(Summarizers.mean("x2"))
  }
}
Example 15
Source File: ExtremeSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.rdd.function.summarize.summarizer.Summarizer
import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.{ SummarizerFactory, SummarizerSuite }
import com.twosigma.flint.timeseries.{ CSV, Summarizers, TimeSeriesRDD, TimeSeriesSuite }
import org.apache.spark.sql.types.{ DataType, DoubleType, FloatType, IntegerType, LongType, StructType }
import java.util.Random
import org.apache.spark.sql.Row

class ExtremeSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/meansummarizer"

  private def test[T](
    dataType: DataType,
    randValue: Row => Any,
    summarizer: String => SummarizerFactory,
    reduceFn: (T, T) => T,
    inputColumn: String,
    outputColumn: String
  ): Unit = {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType)).addColumns(
      inputColumn -> dataType -> randValue
    )

    val data = priceTSRdd.collect().map { row => row.getAs[T](inputColumn) }
    val trueExtreme = data.reduceLeft[T] { case (x, y) => reduceFn(x, y) }

    val result = priceTSRdd.summarize(summarizer(inputColumn))

    val extreme = result.first().getAs[T](outputColumn)
    val outputType = result.schema(outputColumn).dataType

    assert(outputType == dataType, s"$outputType")
    assert(trueExtreme === extreme, s"extreme: $extreme, trueExtreme: $trueExtreme, data: ${data.toSeq}")
  }

  "MaxSummarizer" should "compute double max correctly" in {
    val rand = new Random()
    test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute long max correctly" in {
    val rand = new Random()
    test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute float max correctly" in {
    val rand = new Random()
    test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute int max correctly" in {
    val rand = new Random()
    test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.max, math.max, "x", "x_max")
  }

  "MinSummarizer" should "compute double min correctly" in {
    val rand = new Random()
    test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute long min correctly" in {
    val rand = new Random()
    test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute float min correctly" in {
    val rand = new Random()
    test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute int min correctly" in {
    val rand = new Random()
    test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllProperties)(Summarizers.max("x1"))
    summarizerPropertyTest(AllProperties)(Summarizers.min("x2"))
  }

  it should "ignore null values" in {
    val input = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val inputWithNull = insertNullRows(input, "price")
    assertEquals(
      input.summarize(Summarizers.min("price")),
      inputWithNull.summarize(Summarizers.min("price"))
    )
  }
}
Example 16
Source File: GeometricMeanSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer.subtractable

import com.twosigma.flint.timeseries.{ Summarizers, Windows }
import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import org.apache.spark.sql.types.{ DoubleType, IntegerType }

class GeometricMeanSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/geometricmeansummarizer"

  "GeometricMeanSummarizer" should "compute geometric mean correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema(
      "id" -> IntegerType,
      "price" -> DoubleType,
      "priceWithZero" -> DoubleType,
      "priceWithNegatives" -> DoubleType
    ))
    val results = priceTSRdd.summarize(Summarizers.geometricMean("price"), Seq("id")).collect()
    assert(results.find(_.getAs[Int]("id") == 3).head.getAs[Double]("price_geometricMean") === 2.621877636494)
    assert(results.find(_.getAs[Int]("id") == 7).head.getAs[Double]("price_geometricMean") === 2.667168275340)
  }

  it should "compute geometric mean with a zero correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema(
      "id" -> IntegerType,
      "price" -> DoubleType,
      "priceWithZero" -> DoubleType,
      "priceWithNegatives" -> DoubleType
    ))
    var results = priceTSRdd.summarize(Summarizers.geometricMean("priceWithZero")).collect()
    assert(results.head.getAs[Double]("priceWithZero_geometricMean") === 0.0)

    // Test that having a zero exit the window still computes correctly.
    results = priceTSRdd.coalesce(1).summarizeWindows(
      Windows.pastAbsoluteTime("50 ns"),
      Summarizers.geometricMean("priceWithZero")
    ).collect()
    assert(results.head.getAs[Double]("priceWithZero_geometricMean") === 0.0)
    assert(results.last.getAs[Double]("priceWithZero_geometricMean") === 5.220043408524)
  }

  it should "compute geometric mean with negative values correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema(
      "id" -> IntegerType,
      "price" -> DoubleType,
      "priceWithZero" -> DoubleType,
      "priceWithNegatives" -> DoubleType
    ))
    val results = priceTSRdd.summarize(Summarizers.geometricMean("priceWithNegatives"), Seq("id")).collect()
    assert(results.find(_.getAs[Int]("id") == 3).head.getAs[Double]("priceWithNegatives_geometricMean")
      === -2.621877636494)
    assert(results.find(_.getAs[Int]("id") == 7).head.getAs[Double]("priceWithNegatives_geometricMean")
      === 2.667168275340)
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllPropertiesAndSubtractable)(Summarizers.geometricMean("x1"))
  }
}
Example 17
Source File: DotProductSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer.subtractable

import com.twosigma.flint.timeseries.Summarizers
import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import org.apache.spark.sql.types.{ DoubleType, IntegerType }

class DotProductSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/dotproductsummarizer"

  "DotProductSummarizer" should "compute dot product correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val results = priceTSRdd.summarize(Summarizers.dotProduct("price", "price"), Seq("id")).collect()
    assert(results.find(_.getAs[Int]("id") == 3).head.getAs[Double]("price_price_dotProduct") === 72.25)
    assert(results.find(_.getAs[Int]("id") == 7).head.getAs[Double]("price_price_dotProduct") === 90.25)
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllPropertiesAndSubtractable)(Summarizers.dotProduct("x1", "x2"))
  }
}
Example 18
Source File: ProductSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer.subtractable

import com.twosigma.flint.timeseries.{ Summarizers, Windows }
import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import org.apache.spark.sql.types.{ DoubleType, IntegerType }

class ProductSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/productsummarizer"

  "ProductSummarizer" should "compute product correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema(
      "id" -> IntegerType,
      "price" -> DoubleType,
      "priceWithZero" -> DoubleType,
      "priceWithNegatives" -> DoubleType
    ))
    val results = priceTSRdd.summarize(Summarizers.product("price"), Seq("id")).collect()
    assert(results.find(_.getAs[Int]("id") == 3).head.getAs[Double]("price_product") === 324.84375)
    assert(results.find(_.getAs[Int]("id") == 7).head.getAs[Double]("price_product") === 360.0)
  }

  it should "compute product with a zero correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema(
      "id" -> IntegerType,
      "price" -> DoubleType,
      "priceWithZero" -> DoubleType,
      "priceWithNegatives" -> DoubleType
    ))
    var results = priceTSRdd.summarize(Summarizers.product("priceWithZero")).collect()
    assert(results.head.getAs[Double]("priceWithZero_product") === 0.0)

    // Test that having a zero exit the window still computes correctly.
    results = priceTSRdd.coalesce(1).summarizeWindows(
      Windows.pastAbsoluteTime("50 ns"),
      Summarizers.product("priceWithZero")
    ).collect()
    assert(results.head.getAs[Double]("priceWithZero_product") === 0.0)
    assert(results.last.getAs[Double]("priceWithZero_product") === 742.5)
  }

  it should "compute product with negative values correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema(
      "id" -> IntegerType,
      "price" -> DoubleType,
      "priceWithZero" -> DoubleType,
      "priceWithNegatives" -> DoubleType
    ))
    val results = priceTSRdd.summarize(Summarizers.product("priceWithNegatives"), Seq("id")).collect()
    assert(results.find(_.getAs[Int]("id") == 3).head.getAs[Double]("priceWithNegatives_product") === -324.84375)
    assert(results.find(_.getAs[Int]("id") == 7).head.getAs[Double]("priceWithNegatives_product") === 360.0)
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllPropertiesAndSubtractable)(Summarizers.product("x1"))
  }
}
Example 19
Source File: StandardizedMomentSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer.subtractable

import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import com.twosigma.flint.timeseries.Summarizers
import org.apache.spark.sql.types.{ DoubleType, IntegerType }

class StandardizedMomentSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/standardizedmomentsummarizer"

  "SkewnessSummarizer" should "compute skewness correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val results = priceTSRdd.summarize(Summarizers.skewness("price"))
    assert(results.collect().head.getAs[Double]("price_skewness") === 0.0)
  }

  it should "ignore null values" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    assertEquals(
      priceTSRdd.summarize(Summarizers.skewness("price")),
      insertNullRows(priceTSRdd, "price").summarize(Summarizers.skewness("price"))
    )
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllPropertiesAndSubtractable)(Summarizers.skewness("x1"))
  }

  "KurtosisSummarizer" should "compute kurtosis correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val results = priceTSRdd.summarize(Summarizers.kurtosis("price"))
    assert(results.collect().head.getAs[Double]("price_kurtosis") === -1.2167832167832167)
  }

  it should "ignore null values" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    assertEquals(
      priceTSRdd.summarize(Summarizers.kurtosis("price")),
      insertNullRows(priceTSRdd, "price").summarize(Summarizers.kurtosis("price"))
    )
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllPropertiesAndSubtractable)(Summarizers.kurtosis("x1"))
  }
}
Example 20
Source File: ZScoreSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer.subtractable

import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import com.twosigma.flint.timeseries.Summarizers
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ DoubleType, IntegerType }

class ZScoreSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/zscoresummarizer"

  "ZScoreSummarizer" should "compute in-sample `zScore` correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val expectedSchema = Schema("price_zScore" -> DoubleType)
    val expectedResults = Array[Row](new GenericRowWithSchema(Array(0L, 1.5254255396193801), expectedSchema))
    val results = priceTSRdd.summarize(Summarizers.zScore("price", true))
    assert(results.schema == expectedSchema)
    assert(results.collect().deep == expectedResults.deep)
  }

  it should "compute out-of-sample `zScore` correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val expectedSchema = Schema("price_zScore" -> DoubleType)
    val expectedResults = Array[Row](new GenericRowWithSchema(Array(0L, 1.8090680674665818), expectedSchema))
    val results = priceTSRdd.summarize(Summarizers.zScore("price", false))
    assert(results.schema == expectedSchema)
    assert(results.collect().deep == expectedResults.deep)
  }

  it should "ignore null values" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    assertEquals(
      priceTSRdd.summarize(Summarizers.zScore("price", true)),
      insertNullRows(priceTSRdd, "price").summarize(Summarizers.zScore("price", true))
    )
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllPropertiesAndSubtractable)(Summarizers.zScore("x1", true))
    summarizerPropertyTest(AllPropertiesAndSubtractable)(Summarizers.zScore("x2", false))
  }
}
Example 21
Source File: StandardDeviationSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import com.twosigma.flint.timeseries.Summarizers
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ DoubleType, IntegerType }

class StandardDeviationSummarizerSpec extends SummarizerSuite {

  // It is by intention to reuse the files
  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/meansummarizer"

  "StandardDeviationSummarizer" should "compute `stddev` correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType)).addColumns(
      "price2" -> DoubleType -> { r: Row => r.getAs[Double]("price") },
      "price3" -> DoubleType -> { r: Row => -r.getAs[Double]("price") },
      "price4" -> DoubleType -> { r: Row => r.getAs[Double]("price") * 2 },
      "price5" -> DoubleType -> { r: Row => 0d }
    )
    val result = priceTSRdd.summarize(Summarizers.stddev("price")).first()
    assert(result.getAs[Double]("price_stddev") === 1.802775638)
  }

  it should "ignore null values" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    assertEquals(
      priceTSRdd.summarize(Summarizers.stddev("price")),
      insertNullRows(priceTSRdd, "price").summarize(Summarizers.stddev("price"))
    )
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllProperties)(Summarizers.stddev("x1"))
  }
}
Example 22
Source File: PredicateSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.timeseries.{ Summarizers, TimeSeriesRDD }
import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ DoubleType, IntegerType }

class PredicateSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/meansummarizer"

  var priceTSRdd: TimeSeriesRDD = _

  private lazy val init = {
    priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
  }

  "PredicateSummarizer" should "return the same results as filtering TSRDD first" in {
    init
    val summarizer = Summarizers.compose(Summarizers.mean("price"), Summarizers.stddev("price"))
    val predicate: Int => Boolean = id => id == 3
    val resultWithPredicate = priceTSRdd.summarize(summarizer.where(predicate)("id")).first()

    val filteredTSRDD = priceTSRdd.keepRows { row: Row => row.getAs[Int]("id") == 3 }
    val filteredResults = filteredTSRDD.summarize(summarizer).first()

    assert(resultWithPredicate.getAs[Double]("price_mean") === filteredResults.getAs[Double]("price_mean"))
    assert(resultWithPredicate.getAs[Double]("price_stddev") === filteredResults.getAs[Double]("price_stddev"))

    assertEquals(
      priceTSRdd.summarize(summarizer.where(predicate)("id")),
      insertNullRows(priceTSRdd, "price").summarize(summarizer.where(predicate)("id"))
    )
  }

  it should "pass summarizer property test" in {
    val predicate: Double => Boolean = num => num > 0
    summarizerPropertyTest(AllProperties)(Summarizers.sum("x1").where(predicate)("x2"))
  }
}
Example 23
Source File: VarianceSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import com.twosigma.flint.timeseries.Summarizers
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ DoubleType, IntegerType }

class VarianceSummarizerSpec extends SummarizerSuite {

  // It is by intention to reuse the files
  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/meansummarizer"

  "StandardDeviationSummarizer" should "compute `stddev` correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType)).addColumns(
      "price2" -> DoubleType -> { r: Row => r.getAs[Double]("price") },
      "price3" -> DoubleType -> { r: Row => -r.getAs[Double]("price") },
      "price4" -> DoubleType -> { r: Row => r.getAs[Double]("price") * 2 },
      "price5" -> DoubleType -> { r: Row => 0d }
    )
    val result = priceTSRdd.summarize(Summarizers.variance("price")).first()
    assert(result.getAs[Double]("price_variance") === 3.250000000)
  }

  it should "ignore null values" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    assertEquals(
      priceTSRdd.summarize(Summarizers.variance("price")),
      insertNullRows(priceTSRdd, "price").summarize(Summarizers.variance("price"))
    )
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllProperties)(Summarizers.variance("x1"))
  }
}
Example 24
Source File: CovarianceSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import com.twosigma.flint.timeseries.{ Summarizers, TimeSeriesRDD, TimeSeriesSuite }
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ DoubleType, IntegerType }

class CovarianceSummarizerSpec extends SummarizerSuite {

  // It is by intention to reuse the files from correlation summarizer
  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/correlationsummarizer"

  private var priceTSRdd: TimeSeriesRDD = null
  private var forecastTSRdd: TimeSeriesRDD = null
  private var input: TimeSeriesRDD = null

  private lazy val init: Unit = {
    priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    forecastTSRdd = fromCSV("Forecast.csv", Schema("id" -> IntegerType, "forecast" -> DoubleType))
    input = priceTSRdd.leftJoin(forecastTSRdd, key = Seq("id")).addColumns(
      "price2" -> DoubleType -> { r: Row => r.getAs[Double]("price") },
      "price3" -> DoubleType -> { r: Row => -r.getAs[Double]("price") },
      "price4" -> DoubleType -> { r: Row => r.getAs[Double]("price") * 2 },
      "price5" -> DoubleType -> { r: Row => 0d }
    )
  }

  "CovarianceSummarizer" should "`computeCovariance` correctly" in {
    init
    var results = input.summarize(Summarizers.covariance("price", "price2"), Seq("id")).collect()
    assert(results.find(_.getAs[Int]("id") == 7).head.getAs[Double]("price_price2_covariance") === 3.368055556)
    assert(results.find(_.getAs[Int]("id") == 3).head.getAs[Double]("price_price2_covariance") === 2.534722222)

    results = input.summarize(Summarizers.covariance("price", "price3"), Seq("id")).collect()
    assert(results.find(_.getAs[Int]("id") == 7).head.getAs[Double]("price_price3_covariance") === -3.368055556)
    assert(results.find(_.getAs[Int]("id") == 3).head.getAs[Double]("price_price3_covariance") === -2.534722222)

    results = input.summarize(Summarizers.covariance("price", "price4"), Seq("id")).collect()
    assert(results.find(_.getAs[Int]("id") == 7).head.getAs[Double]("price_price4_covariance") === 6.736111111)
    assert(results.find(_.getAs[Int]("id") == 3).head.getAs[Double]("price_price4_covariance") === 5.069444444)

    results = input.summarize(Summarizers.covariance("price", "price5"), Seq("id")).collect()
    assert(results.find(_.getAs[Int]("id") == 7).head.getAs[Double]("price_price5_covariance") === 0d)
    assert(results.find(_.getAs[Int]("id") == 3).head.getAs[Double]("price_price5_covariance") === 0d)

    results = input.summarize(Summarizers.covariance("price", "forecast"), Seq("id")).collect()
    assert(results.find(_.getAs[Int]("id") == 7).head.getAs[Double]("price_forecast_covariance") === -0.190277778)
    assert(results.find(_.getAs[Int]("id") == 3).head.getAs[Double]("price_forecast_covariance") === -3.783333333)
  }

  it should "ignore null values" in {
    init
    val inputWithNull = insertNullRows(input, "price", "forecast")
    assertEquals(
      inputWithNull.summarize(Summarizers.covariance("price", "forecast")),
      input.summarize(Summarizers.covariance("price", "forecast"))
    )
    assertEquals(
      inputWithNull.summarize(Summarizers.covariance("price", "forecast"), Seq("id")),
      input.summarize(Summarizers.covariance("price", "forecast"), Seq("id"))
    )
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllProperties)(Summarizers.covariance("x1", "x2"))
    summarizerPropertyTest(AllProperties)(Summarizers.covariance("x0", "x3"))
  }
}
Example 25
Source File: SummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries

import com.twosigma.flint.timeseries.row.Schema
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ DoubleType, IntegerType }

class SummarizerSpec extends TimeSeriesSuite {

  "SummarizerFactory" should "support alias." in {
    withResource("/timeseries/csv/Price.csv") { source =>
      val expectedSchema = Schema("C1" -> IntegerType, "C2" -> DoubleType)
      val timeseriesRdd = CSV.from(sqlContext, "file://" + source, sorted = true, schema = expectedSchema)
      assert(timeseriesRdd.schema == expectedSchema)

      val result: Row = timeseriesRdd.summarize(Summarizers.count().prefix("alias")).first()
      assert(result.getAs[Long]("alias_count") == timeseriesRdd.count())
    }
  }
}
Example 26
Source File: SummarizeIntervalsSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries import com.twosigma.flint.timeseries.row.Schema import org.apache.spark.sql.types.{ DoubleType, LongType, IntegerType } class SummarizeIntervalsSpec extends MultiPartitionSuite with TimeSeriesTestData with TimeTypeSuite { override val defaultResourceDir: String = "/timeseries/summarizeintervals" "SummarizeInterval" should "pass `SummarizeSingleColumn` test." in { withAllTimeType { val volumeTSRdd = fromCSV( "Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType) ) volumeTSRdd.toDF.show() val clockTSRdd = fromCSV("Clock.csv", Schema()) val resultTSRdd = fromCSV("SummarizeSingleColumn.results", Schema("volume_sum" -> DoubleType)) def test(rdd: TimeSeriesRDD): Unit = { val summarizedVolumeTSRdd = rdd.summarizeIntervals(clockTSRdd, Summarizers.sum("volume")) summarizedVolumeTSRdd.toDF.show() assert(summarizedVolumeTSRdd.collect().deep == resultTSRdd.collect().deep) } withPartitionStrategy(volumeTSRdd)(DEFAULT)(test) } } it should "pass `SummarizeSingleColumnPerKey` test, i.e. with additional a single key." in { withAllTimeType { val volumeTSRdd = fromCSV( "Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType) ) val clockTSRdd = fromCSV("Clock.csv", Schema()) val resultTSRdd = fromCSV( "SummarizeSingleColumnPerKey.results", Schema("id" -> IntegerType, "volume_sum" -> DoubleType) ) val result2TSRdd = fromCSV( "SummarizeV2PerKey.results", Schema("id" -> IntegerType, "v2_sum" -> DoubleType) ) def test(rdd: TimeSeriesRDD): Unit = { val summarizedVolumeTSRdd = rdd.summarizeIntervals(clockTSRdd, Summarizers.sum("volume"), Seq("id")) assertEquals(summarizedVolumeTSRdd, resultTSRdd) val summarizedV2TSRdd = rdd.summarizeIntervals(clockTSRdd, Summarizers.sum("v2"), Seq("id")) assertEquals(summarizedV2TSRdd, result2TSRdd) } withPartitionStrategy(volumeTSRdd)(DEFAULT)(test) } } it should "pass `SummarizeSingleColumnPerSeqOfKeys` test, i.e. with additional a sequence of keys." in { withAllTimeType { val volumeTSRdd = fromCSV( "VolumeWithIndustryGroup.csv", Schema("id" -> IntegerType, "group" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType) ) val clockTSRdd = fromCSV("Clock.csv", Schema()) val resultTSRdd = fromCSV( "SummarizeSingleColumnPerSeqOfKeys.results", Schema("id" -> IntegerType, "group" -> IntegerType, "volume_sum" -> DoubleType) ) def test(rdd: TimeSeriesRDD): Unit = { val summarizedVolumeTSRdd = rdd.summarizeIntervals( clockTSRdd, Summarizers.sum("volume"), Seq("id", "group") ) assertEquals(summarizedVolumeTSRdd, resultTSRdd) } withPartitionStrategy(volumeTSRdd)(DEFAULT)(test) } } }
Example 27
Source File: MergeSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries import com.twosigma.flint.timeseries.row.Schema import org.apache.spark.sql.types.{ DoubleType, IntegerType } class MergeSpec extends MultiPartitionSuite with TimeSeriesTestData { override val defaultResourceDir: String = "/timeseries/merge" "Merge" should "pass `Merge` test." in { val resultsTSRdd = fromCSV("Merge.results", Schema("id" -> IntegerType, "price" -> DoubleType)) def test(rdd1: TimeSeriesRDD, rdd2: TimeSeriesRDD): Unit = { val mergedTSRdd = rdd1.merge(rdd2) assert(resultsTSRdd.schema == mergedTSRdd.schema) assert(resultsTSRdd.collect().deep == mergedTSRdd.collect().deep) } { val priceTSRdd1 = fromCSV("Price1.csv", Schema("id" -> IntegerType, "price" -> DoubleType)) val priceTSRdd2 = fromCSV("Price2.csv", Schema("id" -> IntegerType, "price" -> DoubleType)) withPartitionStrategy(priceTSRdd1, priceTSRdd2)(DEFAULT)(test) } } it should "pass generated cycle data test" in { val testData1 = cycleData1 val testData2 = cycleData2 def merge(rdd1: TimeSeriesRDD, rdd2: TimeSeriesRDD): TimeSeriesRDD = { rdd1.merge(rdd2) } withPartitionStrategyCompare(testData1, testData2)(ALL)(merge) } }
Example 28
Source File: Preprocess.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.BitCoin import java.io.{ BufferedWriter, File, FileWriter } import org.apache.spark.sql.types.{ DoubleType, IntegerType, StructField, StructType } import org.apache.spark.sql.{ DataFrame, Row, SparkSession } import scala.collection.mutable.ListBuffer object Preprocess { //how many of first rows are omitted val dropFirstCount: Int = 612000 def rollingWindow(data: DataFrame, window: Int, xFilename: String, yFilename: String): Unit = { var i = 0 val xWriter = new BufferedWriter(new FileWriter(new File(xFilename))) val yWriter = new BufferedWriter(new FileWriter(new File(yFilename))) val zippedData = data.rdd.zipWithIndex().collect() System.gc() val dataStratified = zippedData.drop(dropFirstCount) //todo slice fisrt 614K while (i < (dataStratified.length - window)) { val x = dataStratified .slice(i, i + window) .map(r => r._1.getAs[Double]("Delta")).toList val y = dataStratified.apply(i + window)._1.getAs[Integer]("label") val stringToWrite = x.mkString(",") xWriter.write(stringToWrite + "\n") yWriter.write(y + "\n") i += 1 if (i % 10 == 0) { xWriter.flush() yWriter.flush() } } xWriter.close() yWriter.close() } def main(args: Array[String]): Unit = { //todo modify these variables to match desirable files val priceDataFileName: String = "C:/Users/admin-karim/Desktop/bitstampUSD_1-min_data_2012-01-01_to_2017-10-20.csv/bitstampUSD_1-min_data_2012-01-01_to_2017-10-20.csv" val outputDataFilePath: String = "output/scala_test_x.csv" val outputLabelFilePath: String = "output/scala_test_y.csv" val spark = SparkSession .builder() .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/") .appName("Bitcoin Preprocessing") .getOrCreate() val data = spark.read.format("com.databricks.spark.csv").option("header", "true").load(priceDataFileName) data.show(10) println((data.count(), data.columns.size)) val dataWithDelta = data.withColumn("Delta", data("Close") - data("Open")) import org.apache.spark.sql.functions._ import spark.sqlContext.implicits._ val dataWithLabels = dataWithDelta.withColumn("label", when($"Close" - $"Open" > 0, 1).otherwise(0)) rollingWindow(dataWithLabels, 22, outputDataFilePath, outputLabelFilePath) spark.stop() } }
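The example above relies on Spark's implicit numeric coercion when it subtracts the string-typed CSV columns; a small variation (not the author's code) makes the DoubleType casts explicit before deriving Delta and the label:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{ col, when }
import org.apache.spark.sql.types.DoubleType

val spark = SparkSession.builder().master("local[*]").appName("delta-sketch").getOrCreate()

// "prices.csv" is a hypothetical stand-in for the minute-data file used above.
val raw = spark.read.option("header", "true").csv("prices.csv")

// Cast the string columns to DoubleType explicitly, then derive Delta and the label.
val typed = raw
  .withColumn("Open", col("Open").cast(DoubleType))
  .withColumn("Close", col("Close").cast(DoubleType))
val withDelta = typed.withColumn("Delta", col("Close") - col("Open"))
val withLabel = withDelta.withColumn("label", when(col("Delta") > 0, 1).otherwise(0))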
Example 29
Source File: HttpStreamServerClientTest.scala From spark-http-stream with BSD 2-Clause "Simplified" License | 5 votes |
import org.apache.spark.SparkConf import org.apache.spark.serializer.KryoSerializer import org.apache.spark.sql.Row import org.apache.spark.sql.execution.streaming.http.HttpStreamClient import org.junit.Assert import org.junit.Test import org.apache.spark.sql.types.LongType import org.apache.spark.sql.types.IntegerType import org.apache.spark.sql.types.DoubleType import org.apache.spark.sql.types.BooleanType import org.apache.spark.sql.types.FloatType import org.apache.spark.sql.types.StringType import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.StructType import org.apache.spark.sql.types.StructField import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.ByteType import org.apache.spark.sql.execution.streaming.http.HttpStreamServer import org.apache.spark.sql.execution.streaming.http.StreamPrinter import org.apache.spark.sql.execution.streaming.http.HttpStreamServerSideException class HttpStreamServerClientTest { val ROWS1 = Array(Row("hello1", 1, true, 0.1f, 0.1d, 1L, '1'.toByte), Row("hello2", 2, false, 0.2f, 0.2d, 2L, '2'.toByte), Row("hello3", 3, true, 0.3f, 0.3d, 3L, '3'.toByte)); val ROWS2 = Array(Row("hello"), Row("world"), Row("bye"), Row("world")); @Test def testHttpStreamIO() { //starts a http server val kryoSerializer = new KryoSerializer(new SparkConf()); val server = HttpStreamServer.start("/xxxx", 8080); val spark = SparkSession.builder.appName("testHttpTextSink").master("local[4]") .getOrCreate(); spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/"); val sqlContext = spark.sqlContext; import spark.implicits._ //add a local message buffer to server, with 2 topics registered server.withBuffer() .addListener(new StreamPrinter()) .createTopic[(String, Int, Boolean, Float, Double, Long, Byte)]("topic-1") .createTopic[String]("topic-2"); val client = HttpStreamClient.connect("http://localhost:8080/xxxx"); //tests schema of topics val schema1 = client.fetchSchema("topic-1"); Assert.assertArrayEquals(Array[Object](StringType, IntegerType, BooleanType, FloatType, DoubleType, LongType, ByteType), schema1.fields.map(_.dataType).asInstanceOf[Array[Object]]); val schema2 = client.fetchSchema("topic-2"); Assert.assertArrayEquals(Array[Object](StringType), schema2.fields.map(_.dataType).asInstanceOf[Array[Object]]); //prepare to consume messages val sid1 = client.subscribe("topic-1")._1; val sid2 = client.subscribe("topic-2")._1; //produces some data client.sendRows("topic-1", 1, ROWS1); val sid4 = client.subscribe("topic-1")._1; val sid5 = client.subscribe("topic-2")._1; client.sendRows("topic-2", 1, ROWS2); //consumes data val fetched = client.fetchStream(sid1).map(_.originalRow); Assert.assertArrayEquals(ROWS1.asInstanceOf[Array[Object]], fetched.asInstanceOf[Array[Object]]); //it is empty now Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid1).map(_.originalRow).asInstanceOf[Array[Object]]); Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid2).map(_.originalRow).asInstanceOf[Array[Object]]); Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid4).map(_.originalRow).asInstanceOf[Array[Object]]); Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]); Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]); client.unsubscribe(sid4); try { client.fetchStream(sid4); //exception should be thrown, because 
// subscriber id is invalidated
Assert.assertTrue(false); } catch { case e: Throwable ⇒ e.printStackTrace(); Assert.assertEquals(classOf[HttpStreamServerSideException], e.getClass); } server.stop(); } }
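For reference, this is the schema the test expects back from fetchSchema("topic-1"), written out explicitly; the field names below are placeholders, since only the data types are asserted above:

import org.apache.spark.sql.types._

// Seven fields whose types mirror the tuples sent to "topic-1" (String, Int, Boolean,
// Float, Double, Long, Byte). The names f1..f7 are purely illustrative.
val topic1Schema = StructType(Seq(
  StructField("f1", StringType),
  StructField("f2", IntegerType),
  StructField("f3", BooleanType),
  StructField("f4", FloatType),
  StructField("f5", DoubleType),
  StructField("f6", LongType),
  StructField("f7", ByteType)))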
Example 30
Source File: MultinomialLogisticRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType} class MultinomialLogisticRegressionParitySpec extends SparkParityBase { val labels = Seq(0.0, 1.0, 2.0, 0.0, 1.0, 2.0) val ages = Seq(15, 30, 40, 50, 15, 80) val heights = Seq(175, 190, 155, 160, 170, 180) val weights = Seq(67, 100, 57, 56, 56, 88) val rows = spark.sparkContext.parallelize(Seq.tabulate(6) { i => Row(labels(i), ages(i), heights(i), weights(i)) }) val schema = new StructType().add("label", DoubleType, nullable = false) .add("age", IntegerType, nullable = false) .add("height", IntegerType, nullable = false) .add("weight", IntegerType, nullable = false) override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema) override val sparkTransformer: Transformer = new Pipeline().setStages(Array( new VectorAssembler(). setInputCols(Array("age", "height", "weight")). setOutputCol("features"), new LogisticRegressionModel(uid = "logr", coefficientMatrix = Matrices.dense(3, 3, Array(-1.3920551604166562, -0.13119545493644366, 1.5232506153530998, 0.3129112131192873, -0.21959056436528473, -0.09332064875400257, -0.24696506013528507, 0.6122879917796569, -0.36532293164437174)), interceptVector = Vectors.dense(0.4965574044951358, -2.1486146169780063, 1.6520572124828703), numClasses = 3, isMultinomial = true))).fit(dataset) }
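With the values defined in the spec above in scope, applying the fitted pipeline is a one-liner; the selected output columns are the Spark ML defaults and are an assumption here, not something the parity test itself checks:

// Score the training DataFrame with the fitted pipeline (VectorAssembler + model).
val scored = sparkTransformer.transform(dataset)
scored.select("label", "probability", "prediction").show()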
Example 31
Source File: SparkTransformBuilderSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.spark import java.util.UUID import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.types.{DoubleType, StructType} import SparkSupport._ import ml.combust.mleap.core.{Model, types} import ml.combust.mleap.core.types.{NodeShape, ScalarType, StructField} import ml.combust.mleap.runtime.frame.{FrameBuilder, Transformer} import org.scalatest.FunSpec import scala.collection.JavaConverters._ import scala.util.Try case class MyTransformer() extends Transformer { override val uid: String = UUID.randomUUID().toString override def transform[TB <: FrameBuilder[TB]](builder: TB): Try[TB] = { builder.withColumns(Seq("output1", "output2"), "input") { (input: Double) => (input + 23, input.toString) } } override val shape: NodeShape = NodeShape().withStandardInput("input"). withOutput("output1", "output1").withOutput("output2", "output2") override val model: Model = new Model { override def inputSchema: types.StructType = types.StructType("input" -> ScalarType.Double).get override def outputSchema: types.StructType = types.StructType("output1" -> ScalarType.Double, "output2" -> ScalarType.String).get } } class SparkTransformBuilderSpec extends FunSpec { describe("transformer with multiple outputs") { it("works with Spark as well") { val spark = SparkSession.builder(). appName("Spark/MLeap Parity Tests"). master("local[2]"). getOrCreate() val schema = new StructType(). add("input", DoubleType) val data = Seq(Row(45.7d)).asJava val dataset = spark.createDataFrame(data, schema) val transformer = MyTransformer() val outputDataset = transformer.sparkTransform(dataset).collect() assert(outputDataset.head.getDouble(1) == 68.7) assert(outputDataset.head.getString(2) == "45.7") } } describe("input/output schema") { it("has the correct inputs and outputs") { val transformer = MyTransformer() assert(transformer.schema.fields == Seq(StructField("input", types.ScalarType.Double), StructField("output1", types.ScalarType.Double), StructField("output2", types.ScalarType.String))) } } }
Example 32
Source File: MathUnary.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation} import org.apache.hadoop.fs.Path import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType} import org.apache.spark.sql.functions.udf private val className = classOf[MathUnary].getName override def load(path: String): MathUnary = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath).select("operation").head() val operation = data.getAs[String](0) val model = MathUnaryModel(UnaryOperation.forName(operation)) val transformer = new MathUnary(metadata.uid, model) metadata.getAndSetParams(transformer) transformer } } }
Example 33
Source File: ImputerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature import org.apache.spark.ml.Transformer import org.apache.spark.ml.mleap.feature.Imputer import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ import org.apache.spark.sql.types.{DoubleType, StructType} import scala.util.Random class ImputerParitySpec extends SparkParityBase { def randomRow(): Row = { if(Random.nextBoolean()) { if(Random.nextBoolean()) { Row(23.4) } else { Row(Random.nextDouble()) } } else { Row(33.2) } } val rows = spark.sparkContext.parallelize(Seq.tabulate(100) { i => randomRow() }) val schema = new StructType().add("mv", DoubleType, nullable = true) override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema) override val sparkTransformer: Transformer = new Imputer(uid = "imputer"). setInputCol("mv"). setOutputCol("mv_imputed"). setMissingValue(23.4). setStrategy("mean").fit(dataset) }
Example 34
Source File: DebugRowOpsSuite.scala From tensorframes with Apache License 2.0 | 5 votes |
package org.tensorframes import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DoubleType, StructType} import org.scalatest.FunSuite import org.tensorframes.impl.{DebugRowOpsImpl, ScalarDoubleType} import org.tensorframes.dsl._ class DebugRowOpsSuite extends FunSuite with TensorFramesTestSparkContext with GraphScoping with Logging { lazy val sql = sqlContext import ColumnInformation.structField import Shape.Unknown testGraph("Simple identity") { val rows = Array(Row(1.0)) val input = StructType(Array(structField("x", ScalarDoubleType, Shape(Unknown)))) val p2 = placeholder[Double](1) named "x" val out = identity(p2) named "y" val outputSchema = StructType(Array(structField("y", ScalarDoubleType, Shape(Unknown)))) val (g, _) = TestUtilities.analyzeGraph(out) logDebug(g.toString) val res = DebugRowOpsImpl.performMap(rows, input, Array("x" -> 0), g, outputSchema) assert(res === Array(Row(1.0, 1.0))) } testGraph("Simple add") { val rows = Array(Row(1.0)) val input = StructType(Array(structField("x", ScalarDoubleType, Shape(Unknown)))) val p2 = placeholder[Double](1) named "x" val out = p2 + p2 named "y" val outputSchema = StructType(Array(structField("y", ScalarDoubleType, Shape(Unknown)))) val (g, _) = TestUtilities.analyzeGraph(out) logDebug(g.toString) val res = DebugRowOpsImpl.performMap(rows, input, Array("x" -> 0), g, outputSchema) assert(res === Array(Row(2.0, 1.0))) } }
Example 35
Source File: ExtraOperationsSuite.scala From tensorframes with Apache License 2.0 | 5 votes |
package org.tensorframes import org.apache.spark.sql.types.{DoubleType, IntegerType} import org.scalatest.FunSuite import org.tensorframes.impl.{ScalarDoubleType, ScalarIntType} class ExtraOperationsSuite extends FunSuite with TensorFramesTestSparkContext with Logging { lazy val sql = sqlContext import ExtraOperations._ import sql.implicits._ import Shape.Unknown test("simple test for doubles") { val df = Seq(Tuple1(0.0)).toDF("a") val di = ExtraOperations.explainDetailed(df) val Seq(c1) = di.cols val Some(s) = c1.stf assert(s.dataType === ScalarDoubleType) assert(s.shape === Shape(Unknown)) logDebug(df.toString() + "->" + di.toString) } test("simple test for integers") { val df = Seq(Tuple1(0)).toDF("a") val di = explainDetailed(df) val Seq(c1) = di.cols val Some(s) = c1.stf assert(s.dataType === ScalarIntType) assert(s.shape === Shape(Unknown)) logDebug(df.toString() + "->" + di.toString) } test("test for arrays") { val df = Seq((0.0, Seq(1.0), Seq(Seq(1.0)))).toDF("a", "b", "c") val di = explainDetailed(df) logDebug(df.toString() + "->" + di.toString) val Seq(c1, c2, c3) = di.cols val Some(s1) = c1.stf assert(s1.dataType === ScalarDoubleType) assert(s1.shape === Shape(Unknown)) val Some(s2) = c2.stf assert(s2.dataType === ScalarDoubleType) assert(s2.shape === Shape(Unknown, Unknown)) val Some(s3) = c3.stf assert(s3.dataType === ScalarDoubleType) assert(s3.shape === Shape(Unknown, Unknown, Unknown)) } test("simple analysis") { val df = Seq(Tuple1(0.0)).toDF("a") val df2 = analyze(df) val di = explainDetailed(df2) logDebug(df.toString() + "->" + di.toString) val Seq(c1) = di.cols val Some(s) = c1.stf assert(s.dataType === ScalarDoubleType) assert(s.shape === Shape(1)) // There is only one partition } test("simple analysis with multiple partitions of different sizes") { val df = Seq.fill(10)(0.0).map(Tuple1.apply).toDF("a").repartition(3) val df2 = analyze(df) val di = explainDetailed(df2) logDebug(df.toString() + "->" + di.toString) val Seq(c1) = di.cols val Some(s) = c1.stf assert(s.dataType === ScalarDoubleType) assert(s.shape === Shape(Unknown)) // There is only one partition } test("simple analysis with variable sizes") { val df = Seq( (0.0, Seq(0.0)), (1.0, Seq(1.0, 1.0))).toDF("a", "b") val df2 = analyze(df) val di = explainDetailed(df2) logDebug(df.toString() + "->" + di.toString) val Seq(c1, c2) = di.cols val Some(s2) = c2.stf assert(s2.dataType === ScalarDoubleType) assert(s2.shape === Shape(2, Unknown)) // There is only one partition } test("2nd order analysis") { val df = Seq( (0.0, Seq(0.0, 0.0)), (1.0, Seq(1.0, 1.0)), (2.0, Seq(2.0, 2.0))).toDF("a", "b") val df2 = analyze(df) val di = explainDetailed(df2) logDebug(df.toString() + "->" + di.toString) val Seq(c1, c2) = di.cols val Some(s2) = c2.stf assert(s2.dataType === ScalarDoubleType) assert(s2.shape === Shape(3, 2)) // There is only one partition } }
Example 36
Source File: SlicingSuite.scala From tensorframes with Apache License 2.0 | 5 votes |
package org.tensorframes import org.scalatest.FunSuite import org.tensorframes.dsl.GraphScoping import org.tensorframes.impl.DebugRowOps import org.tensorframes.{ dsl => tf } import org.tensorframes.dsl.Implicits._ import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DoubleType, IntegerType} class SlicingSuite extends FunSuite with TensorFramesTestSparkContext with Logging with GraphScoping { lazy val sql = sqlContext import Shape.Unknown val ops = new DebugRowOps test("2D - 1") { val df = make1(Seq(Seq(1.0, 2.0), Seq(3.0, 4.0)), "x") val x = df.block("x") // val y =
} }
Example 37
Source File: BuildAndTeardownData.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.testfixtures import java.io.File import com.holdenkarau.spark.testing.Utils import com.ibm.sparktc.sparkbench.utils.SaveModes import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk import com.ibm.sparktc.sparkbench.workload.ml.KMeansWorkload import org.apache.spark.mllib.util.KMeansDataGenerator import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DoubleType, StructField, StructType} class BuildAndTeardownData(dirname: String = System.currentTimeMillis.toString) { val prefix = "/tmp/spark-bench-scalatest/" + dirname val sparkBenchTestFolder = s"$prefix/spark-bench-test" val kmeansFile = s"$sparkBenchTestFolder/kmeans-data.parquet" val sparkBenchDemoFolder = s"$prefix/spark-bench-demo" val spark = SparkSessionProvider.spark def createFolders(): Unit = { val fileSeq = Seq(new File(sparkBenchTestFolder), new File(sparkBenchDemoFolder)) fileSeq.foreach(folder => folder.mkdirs()) } def deleteFolders(): Unit = { Utils.deleteRecursively(new File(prefix)) } def generateKMeansData(rows: Int, cols: Int, outputFile: String): Unit = { val data: RDD[Array[Double]] = KMeansDataGenerator.generateKMeansRDD( spark.sparkContext, rows, KMeansWorkload.numOfClusters, cols, KMeansWorkload.scaling, KMeansWorkload.numOfPartitions ) val schemaString = data.first().indices.map(_.toString).mkString(" ") val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, DoubleType, nullable = false)) val schema = StructType(fields) val rowRDD = data.map(arr => Row(arr:_*)) val df = spark.createDataFrame(rowRDD, schema) writeToDisk(outputFile, SaveModes.overwrite, df, spark) } }
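A hypothetical usage of this fixture, with arbitrary row and column counts; every call below appears in the class above:

val fixture = new BuildAndTeardownData("doubletype-demo")
fixture.createFolders()

// Writes a parquet file of DoubleType columns under the fixture's test folder.
fixture.generateKMeansData(rows = 1000, cols = 5, outputFile = fixture.kmeansFile)

// ... run the workload or test that needs the generated data ...

fixture.deleteFolders()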
Example 38
Source File: KMeansWorkloadTest.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.workload.ml import java.io.File import com.holdenkarau.spark.testing.Utils import com.ibm.sparktc.sparkbench.testfixtures.SparkSessionProvider import com.ibm.sparktc.sparkbench.utils.SaveModes import com.ibm.sparktc.sparkbench.utils.SparkFuncs.{load, writeToDisk} import org.apache.spark.mllib.util.KMeansDataGenerator import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers} class KMeansWorkloadTest extends FlatSpec with Matchers with BeforeAndAfterEach { private val spark = SparkSessionProvider.spark private val fileName = s"/tmp/spark-bench-scalatest/kmeans-${java.util.UUID.randomUUID.toString}.csv" override def afterEach() { Utils.deleteRecursively(new File(fileName)) } def makeDataFrame(): DataFrame = { val data: RDD[Array[Double]] = KMeansDataGenerator.generateKMeansRDD( spark.sparkContext, 1, 1, 1, KMeansWorkload.scaling, KMeansWorkload.numOfPartitions ) val schemaString = data.first().indices.map(_.toString).mkString(" ") val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, DoubleType, nullable = false)) val schema = StructType(fields) val rowRDD = data.map(arr => Row(arr: _*)) spark.createDataFrame(rowRDD, schema) } "reconcileSchema" should "handle a StringType schema and turn it into a DoubleType Schema" in { val df2Disk = makeDataFrame() writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv")) val conf = Map("name" -> "kmeans", "input" -> fileName) val work = KMeansWorkload(conf) val df = load(spark, fileName) val ddf = work.reconcileSchema(df) ddf.schema.head.dataType shouldBe DoubleType } "The load function" should "parse the DataFrame it's given into an RDD[Vector]" in { val df = makeDataFrame() val conf = Map("name" -> "kmeans", "input" -> "") val work = KMeansWorkload(conf) val ddf = work.reconcileSchema(df) val (_, rdd) = work.loadToCache(ddf, spark) rdd.first() } it should "work even when we've pulled the data from disk" in { val df2Disk = makeDataFrame() writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv")) val conf = Map("name" -> "kmeans", "input" -> fileName) val work = KMeansWorkload(conf) val df = load(spark, fileName) val ddf = work.reconcileSchema(df) val (_, rdd) = work.loadToCache(ddf, spark) rdd.first() } "doWorkload" should "work" in { val df2Disk = makeDataFrame() writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv")) val conf = Map("name" -> "kmeans", "input" -> fileName) val work = KMeansWorkload(conf) val df = load(spark, fileName) val ddf = work.reconcileSchema(df) work.doWorkload(Some(ddf), spark) } }
Example 39
Source File: BinaryClassificationEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
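Typical usage of this evaluator, assuming a predictions DataFrame produced by a binary classifier with a numeric label column and a rawPrediction column (vector or DoubleType, as the schema check above allows):

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

val evaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
  .setMetricName("areaUnderROC")

// predictions is assumed to be the DataFrame returned by a fitted classifier's transform().
val auc = evaluator.evaluate(predictions)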
Example 40
Source File: MulticlassClassificationEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
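Typical usage, assuming a predictions DataFrame whose prediction column is DoubleType and whose label column is numeric, as the schema checks above require:

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

// predictions is assumed to come from a fitted multiclass classifier.
val accuracy = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")
  .evaluate(predictions)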
Example 41
Source File: RegressionEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
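Typical usage; both columns are cast to DoubleType inside evaluate(), so FloatType predictions and integer labels are also accepted. The predictions DataFrame is assumed to come from a fitted regressor:

import org.apache.spark.ml.evaluation.RegressionEvaluator

val rmse = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("rmse")
  .evaluate(predictions)   // predictions: DataFrame (assumed)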
Example 42
Source File: XGBoost.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import eleflow.uberdata.models.UberXGBOOSTModel import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.scala.DMatrix import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{ArrayType, DoubleType, StructField, StructType} import scala.reflect.ClassTag class XGBoost[I](override val uid: String, val models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))])( implicit kt: ClassTag[I], ord: Ordering[I] = null) extends ForecastBaseModel[XGBoostSmallModel[I]] with HasInputCol with HasOutputCol with DefaultParamsWritable with HasFeaturesCol with HasNFutures with HasGroupByCol { def this( models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))] )(implicit kt: ClassTag[I], ord: Ordering[I] ) = this(Identifiable.randomUID("xgboost"), models) override def transform(dataSet: Dataset[_]): DataFrame = { val schema = dataSet.schema val predSchema = transformSchema(schema) val joined = models.join(dataSet.rdd.map{case (r: Row) => (r.getAs[I]($(groupByCol).get), r)}) val predictions = joined.map { case (id, ((bestModel, metrics), row)) => val features = row.getAs[Array[org.apache.spark.ml.linalg.Vector]]( IUberdataForecastUtil.FEATURES_COL_NAME ) val label = DataTransformer.toFloat(row.getAs($(featuresCol))) val labelPoint = features.map { vec => val array = vec.toArray.map(_.toFloat) LabeledPoint(label, null, array) } val matrix = new DMatrix(labelPoint.toIterator) val (ownFeaturesPrediction, forecast) = bestModel.boosterInstance .predict(matrix) .flatMap(_.map(_.toDouble)) .splitAt(features.length) Row( row.toSeq :+ Vectors .dense(forecast) :+ SupportedAlgorithm.XGBoostAlgorithm.toString :+ bestModel.params .map(f => f._1 -> f._2.toString) :+ Vectors.dense(ownFeaturesPrediction): _* ) } dataSet.sqlContext.createDataFrame(predictions, predSchema) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { schema.add(StructField($(outputCol), ArrayType(DoubleType))) } override def copy(extra: ParamMap): XGBoostSmallModel[I] = defaultCopy(extra) }
Example 43
Source File: S2CellTransformer.scala From spark-ext with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import com.google.common.geometry.{S2LatLng, S2CellId} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} class S2CellTransformer(override val uid: String) extends Transformer { def this() = this(Identifiable.randomUID("S2CellTransformer")) // Input/Output column names val latCol: Param[String] = new Param[String](this, "latCol", "latitude column") val lonCol: Param[String] = new Param[String](this, "lonCol", "longitude column") val cellCol: Param[String] = new Param[String](this, "cellCol", "S2 Cell Id column") val level: Param[Int] = new IntParam(this, "level", "S2 Level [0, 30]", (i: Int) => ParamValidators.gtEq(0)(i) && ParamValidators.ltEq(30)(i)) // Default parameters setDefault( latCol -> "lat", lonCol -> "lon", cellCol -> "cell", level -> 10 ) def getLatCol: String = $(latCol) def getLonCol: String = $(lonCol) def getCellCol: String = $(cellCol) def getLevel: Int = $(level) def setLatCol(value: String): this.type = set(latCol, value) def setLonCol(value: String): this.type = set(lonCol, value) def setCellCol(value: String): this.type = set(cellCol, value) def setLevel(value: Int): this.type = set(level, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val currentLevel = $(level) val t = udf { (lat: Double, lon: Double) => val cellId = S2CellId.fromLatLng(S2LatLng.fromDegrees(lat, lon)) cellId.parent(currentLevel).toToken } val metadata = outputSchema($(cellCol)).metadata dataset.select(col("*"), t(col($(latCol)), col($(lonCol))).as($(cellCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val latColumnName = $(latCol) val latDataType = schema(latColumnName).dataType require(latDataType == DoubleType, s"The latitude column $latColumnName must be Double type, " + s"but got $latDataType.") val lonColumnName = $(lonCol) val lonDataType = schema(lonColumnName).dataType require(lonDataType == DoubleType, s"The longitude column $lonColumnName must be Double type, " + s"but got $lonDataType.") val inputFields = schema.fields val outputColName = $(cellCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") val attr = NominalAttribute.defaultAttr.withName($(cellCol)) val outputFields = inputFields :+ attr.toStructField() StructType(outputFields) } override def copy(extra: ParamMap): S2CellTransformer = defaultCopy(extra) }
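A small usage sketch for the transformer above; the coordinates, column names and level are illustrative, and sqlContext is assumed to be available (for example from a SparkSession):

import org.apache.spark.ml.feature.S2CellTransformer
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ DoubleType, StructField, StructType }

// Both coordinate columns must be DoubleType, as transformSchema enforces above.
val schema = StructType(Seq(
  StructField("lat", DoubleType, nullable = false),
  StructField("lon", DoubleType, nullable = false)))
val df = sqlContext.createDataFrame(
  sqlContext.sparkContext.parallelize(Seq(Row(37.7749, -122.4194))), schema)

val withCells = new S2CellTransformer()
  .setLatCol("lat")
  .setLonCol("lon")
  .setCellCol("cell")
  .setLevel(12)
  .transform(df)   // adds a nominal "cell" column with the level-12 S2 cell token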
Example 44
Source File: udfs.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information.
package com.microsoft.ml.spark.stages import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.Column import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.DoubleType import scala.collection.mutable
//scalastyle:off
object udfs { def get_value_at(colName: String, i: Int): Column = { udf({ vec: org.apache.spark.ml.linalg.Vector => vec(i) }, DoubleType)(col(colName)) } val to_vector: UserDefinedFunction = udf({ arr: Seq[Double] => Vectors.dense(arr.toArray) }, VectorType) def to_vector(colName: String): Column = to_vector(col(colName)) }
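A hypothetical usage of the two helpers above; df is assumed to be a DataFrame with a Vector column named "features" and an array-of-double column named "arr":

import com.microsoft.ml.spark.stages.udfs

// Extract element 0 of the vector column as a DoubleType column.
val withFirst = df.withColumn("firstFeature", udfs.get_value_at("features", 0))

// Convert an ArrayType(DoubleType) column into a dense Vector column.
val withVec = df.withColumn("featureVec", udfs.to_vector("arr"))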
Example 45
Source File: PartitionConsolidatorSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.flaky import com.microsoft.ml.spark.core.test.base.TimeLimitedFlaky import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import com.microsoft.ml.spark.io.http.PartitionConsolidator import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.types.{DoubleType, StructType} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.scalatest.Assertion class PartitionConsolidatorSuite extends TransformerFuzzing[PartitionConsolidator] with TimeLimitedFlaky { import session.implicits._ override val numCores: Option[Int] = Some(2) lazy val df: DataFrame = (1 to 1000).toDF("values") override val sortInDataframeEquality: Boolean = true override def testObjects(): Seq[TestObject[PartitionConsolidator]] = Seq( new TestObject(new PartitionConsolidator(), df)) override def reader: MLReadable[_] = PartitionConsolidator def getPartitionDist(df: DataFrame): List[Int] = { df.rdd.mapPartitions(it => Iterator(it.length)).collect().toList } //TODO figure out what is causing the issue on the build server override def testSerialization(): Unit = {} override def testExperiments(): Unit = {} def basicTest(df: DataFrame): Assertion = { val pd1 = getPartitionDist(df) val newDF = new PartitionConsolidator().transform(df) val pd2 = getPartitionDist(newDF) assert(pd1.sum === pd2.sum) assert(pd2.max >= pd1.max) assert(pd1.length === pd2.length) } test("basic functionality") { basicTest(df) } test("works with more partitions than cores") { basicTest(df.repartition(12)) } test("overheads") { val baseDF = (1 to 1000).toDF("values").cache() println(baseDF.count()) def getDF: Dataset[Row] = baseDF.map { x => Thread.sleep(10); x }( RowEncoder(new StructType().add("values", DoubleType))) val t1 = getTime(3)( getDF.foreach(_ => ()))._2 val t2 = getTime(3)( new PartitionConsolidator().transform(getDF).foreach(_ => ()))._2 println(t2.toDouble / t1.toDouble) assert(t2.toDouble / t1.toDouble < 3.0) } test("works with more partitions than cores2") { basicTest(df.repartition(100)) } test("work with 1 partition") { basicTest(df.repartition(1)) } }
Example 46
Source File: BinaryClassificationEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
Example 47
Source File: MulticlassClassificationEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 48
Source File: RegressionEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
Example 49
Source File: Binarizer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.BinaryAttribute import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) val td = $(threshold) val binarizer = udf { in: Double => if (in > td) 1.0 else 0.0 } val outputColName = $(outputCol) val metadata = BinaryAttribute.defaultAttr.withName(outputColName).toMetadata() dataset.select(col("*"), binarizer(col($(inputCol))).as(outputColName, metadata)) } override def transformSchema(schema: StructType): StructType = { SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType) val inputFields = schema.fields val outputColName = $(outputCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") val attr = BinaryAttribute.defaultAttr.withName(outputColName) val outputFields = inputFields :+ attr.toStructField() StructType(outputFields) } override def copy(extra: ParamMap): Binarizer = defaultCopy(extra) }
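Typical usage of the Binarizer above; setInputCol and setThreshold come from the standard Spark ML Binarizer API and are simply not visible in the truncated snippet, and the column names and threshold here are illustrative:

import org.apache.spark.ml.feature.Binarizer

// df is assumed to have a DoubleType "score" column; values above 0.5 become 1.0,
// the rest become 0.0, in a new "score_binary" column.
val binarized = new Binarizer()
  .setInputCol("score")
  .setOutputCol("score_binary")
  .setThreshold(0.5)
  .transform(df)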
Example 50
Source File: BinaryClassificationEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.DoubleType def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol)) .map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() case other => throw new IllegalArgumentException(s"Does not support metric $other.") } metrics.unpersist() metric } override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) }
Example 51
Source File: RegressionEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.DoubleType def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) val predictionAndLabels = dataset.select($(predictionCol), $(labelCol)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => -metrics.rootMeanSquaredError case "mse" => -metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => -metrics.meanAbsoluteError } metric } override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) }
Example 52
Source File: GBTClassificationModel.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.models import org.apache.spark.ml.classification.{GBTClassificationModel => SparkGBTClassificationModel, GBTClassifier => SparkGBTClassifier} import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import ai.deepsense.commons.utils.Logging import ai.deepsense.deeplang.doperables.report.CommonTablesGenerators.SparkSummaryEntry import ai.deepsense.deeplang.doperables.report.{CommonTablesGenerators, Report} import ai.deepsense.deeplang.doperables.spark.wrappers.params.common.PredictorParams import ai.deepsense.deeplang.doperables.stringindexingwrapper.StringIndexingWrapperModel import ai.deepsense.deeplang.doperables.{LoadableWithFallback, SparkModelWrapper} import ai.deepsense.deeplang.params.Param import ai.deepsense.sparkutils.ML class GBTClassificationModel(vanilaModel: VanillaGBTClassificationModel) extends StringIndexingWrapperModel[SparkGBTClassificationModel, SparkGBTClassifier](vanilaModel) { def this() = this(new VanillaGBTClassificationModel()) } class VanillaGBTClassificationModel() extends SparkModelWrapper[SparkGBTClassificationModel, SparkGBTClassifier] with LoadableWithFallback[SparkGBTClassificationModel, SparkGBTClassifier] with PredictorParams with Logging { override protected def applyTransformSchema(schema: StructType): Option[StructType] = { val predictionColumnName = $(predictionColumn) Some(StructType(schema.fields :+ StructField(predictionColumnName, DoubleType))) } override val params: Array[Param[_]] = Array(featuresColumn, predictionColumn) override def report(extended: Boolean = true): Report = { val summary = List( SparkSummaryEntry( name = "number of features", value = sparkModel.numFeatures, description = "Number of features the model was trained on.")) super.report(extended) .withReportName( s"${this.getClass.getSimpleName} with ${sparkModel.numTrees} trees") .withAdditionalTable(CommonTablesGenerators.modelSummary(summary)) .withAdditionalTable( CommonTablesGenerators.decisionTree( sparkModel.treeWeights, sparkModel.trees), 2) } override protected def transformerName: String = classOf[GBTClassificationModel].getSimpleName override def tryToLoadModel(path: String): Option[SparkGBTClassificationModel] = { ML.ModelLoading.GBTClassification(path) } }
Example 53
Source File: RandomForestClassificationModel.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.models import org.apache.spark.ml.classification.{RandomForestClassificationModel => SparkRandomForestClassificationModel, RandomForestClassifier => SparkRandomForestClassifier} import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import ai.deepsense.deeplang.doperables.report.CommonTablesGenerators.SparkSummaryEntry import ai.deepsense.deeplang.doperables.report.{CommonTablesGenerators, Report} import ai.deepsense.deeplang.doperables.spark.wrappers.params.common.ProbabilisticClassifierParams import ai.deepsense.deeplang.doperables.stringindexingwrapper.StringIndexingWrapperModel import ai.deepsense.deeplang.doperables.{LoadableWithFallback, SparkModelWrapper} import ai.deepsense.deeplang.params.Param import ai.deepsense.sparkutils.ML class RandomForestClassificationModel( vanillaModel: VanillaRandomForestClassificationModel) extends StringIndexingWrapperModel[ SparkRandomForestClassificationModel, SparkRandomForestClassifier](vanillaModel) { def this() = this(new VanillaRandomForestClassificationModel()) } class VanillaRandomForestClassificationModel extends SparkModelWrapper[ SparkRandomForestClassificationModel, SparkRandomForestClassifier] with LoadableWithFallback[ SparkRandomForestClassificationModel, SparkRandomForestClassifier] with ProbabilisticClassifierParams { override protected def applyTransformSchema(schema: StructType): Option[StructType] = { val predictionColumnName = $(predictionColumn) val probabilityColumnName = $(probabilityColumn) val rawPredictionColumnName = $(rawPredictionColumn) Some(StructType(schema.fields ++ Seq( StructField(predictionColumnName, DoubleType), StructField(probabilityColumnName, new ai.deepsense.sparkutils.Linalg.VectorUDT), StructField(rawPredictionColumnName, new ai.deepsense.sparkutils.Linalg.VectorUDT) ))) } override val params: Array[Param[_]] = Array( featuresColumn, predictionColumn, probabilityColumn, rawPredictionColumn) // thresholds override def report(extended: Boolean = true): Report = { val treeWeight = SparkSummaryEntry( name = "tree weights", value = sparkModel.treeWeights, description = "Weights for each tree." ) super.report(extended) .withAdditionalTable(CommonTablesGenerators.modelSummary(List(treeWeight))) } override protected def transformerName: String = classOf[RandomForestClassificationModel].getSimpleName override def tryToLoadModel(path: String): Option[SparkRandomForestClassificationModel] = { ML.ModelLoading.randomForestClassification(path) } }
Example 54
Source File: UnionIntegSpec.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperations import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} import ai.deepsense.deeplang.doperables.dataframe.DataFrame import ai.deepsense.deeplang.doperations.exceptions.SchemaMismatchException import ai.deepsense.deeplang.inference.{InferContext, InferenceWarnings} import ai.deepsense.deeplang.{DKnowledge, DeeplangIntegTestSupport} class UnionIntegSpec extends DeeplangIntegTestSupport { import DeeplangIntegTestSupport._ val schema1 = StructType(List( StructField("column1", DoubleType), StructField("column2", DoubleType))) val rows1_1 = Seq( Row(1.0, 2.0), Row(2.0, 3.0) ) "Union" should { "return a union of two DataFrames" in { val rows1_2 = Seq( Row(2.0, 4.0), Row(4.0, 6.0) ) val df1 = createDataFrame(rows1_1, schema1) val df2 = createDataFrame(rows1_2, schema1) val merged = Union() .executeUntyped(Vector(df1, df2))(executionContext) .head.asInstanceOf[DataFrame] assertDataFramesEqual( merged, createDataFrame(rows1_1 ++ rows1_2, schema1)) } "throw for mismatching types in DataFrames" in { val schema2 = StructType(List( StructField("column1", StringType), StructField("column2", DoubleType))) val rows2_1 = Seq( Row("a", 1.0), Row("b", 1.0) ) val df1 = createDataFrame(rows1_1, schema1) val df2 = createDataFrame(rows2_1, schema2) a [SchemaMismatchException] should be thrownBy { Union().executeUntyped(Vector(df1, df2))(executionContext) } } "throw for mismatching column names in DataFrames" in { val schema2 = StructType(List( StructField("column1", DoubleType), StructField("different_column_name", DoubleType))) val rows2_1 = Seq( Row(1.1, 1.0), Row(1.1, 1.0) ) val df1 = createDataFrame(rows1_1, schema1) val df2 = createDataFrame(rows2_1, schema2) a [SchemaMismatchException] should be thrownBy { Union().executeUntyped(Vector(df1, df2))(executionContext) } } } it should { "propagate schema when both schemas match" in { val structType = StructType(Seq( StructField("x", DoubleType), StructField("y", DoubleType))) val knowledgeDF1 = DKnowledge(DataFrame.forInference(structType)) val knowledgeDF2 = DKnowledge(DataFrame.forInference(structType)) Union().inferKnowledgeUntyped(Vector(knowledgeDF1, knowledgeDF2))(mock[InferContext]) shouldBe (Vector(knowledgeDF1), InferenceWarnings()) } "generate error when schemas don't match" in { val structType1 = StructType(Seq( StructField("x", DoubleType))) val structType2 = StructType(Seq( StructField("y", DoubleType))) val knowledgeDF1 = DKnowledge(DataFrame.forInference(structType1)) val knowledgeDF2 = DKnowledge(DataFrame.forInference(structType2)) an [SchemaMismatchException] shouldBe thrownBy( Union().inferKnowledgeUntyped(Vector(knowledgeDF1, knowledgeDF2))(mock[InferContext])) } } }
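For comparison only, the same union of two DataFrames with identical DoubleType schemas can be expressed with plain Spark SQL (this is not the Deepsense Union operation tested above, which adds the schema-mismatch checks):

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ DoubleType, StructField, StructType }

val schema = StructType(List(
  StructField("column1", DoubleType),
  StructField("column2", DoubleType)))

// spark is assumed to be an existing SparkSession.
val df1 = spark.createDataFrame(spark.sparkContext.parallelize(Seq(Row(1.0, 2.0))), schema)
val df2 = spark.createDataFrame(spark.sparkContext.parallelize(Seq(Row(2.0, 4.0))), schema)

// Columns are matched by position, so both schemas must line up exactly.
val merged = df1.union(df2)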
Example 55
Source File: DataFrameReportPerformanceSpec.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.dataframe import java.sql.Timestamp import java.text.{DateFormat, SimpleDateFormat} import java.util.TimeZone import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DoubleType, StructField, StructType, TimestampType} import org.scalatest.{BeforeAndAfter, Ignore} import ai.deepsense.commons.utils.{DoubleUtils, Logging} import ai.deepsense.deeplang.{TestFiles, DeeplangIntegTestSupport} // It's ignored because it does not have got assertions, it only prints report generation time. @Ignore class DataFrameReportPerformanceSpec extends DeeplangIntegTestSupport with BeforeAndAfter with TestFiles with Logging { val testFile = absoluteTestsDirPath.pathWithoutScheme + "/demand_without_header.csv" "DataFrame" should { "generate report" when { "DataFrame has 17K of rows" in { val numberOfTries = 10 var results: Seq[Double] = Seq() for (i <- 1 to numberOfTries) { val dataFrame: DataFrame = demandDataFrame() val start = System.nanoTime() val report = dataFrame.report() val end = System.nanoTime() val time1: Double = (end - start).toDouble / 1000000000.0 results = results :+ time1 logger.debug("Report generation time: {}", DoubleUtils.double2String(time1)) } logger.debug( "Mean report generation time: {}", DoubleUtils.double2String(results.fold(0D)(_ + _) / numberOfTries.toDouble)) } } } private def demandDataFrame(): DataFrame = { val rddString: RDD[String] = executionContext.sparkContext.textFile(testFile) val data: RDD[Row] = rddString.map(DataFrameHelpers.demandString2Row) executionContext.dataFrameBuilder.buildDataFrame(demandSchema, data) } private def demandSchema: StructType = StructType(Seq( StructField("datetime", TimestampType), StructField("log_count", DoubleType), StructField("workingday", DoubleType), StructField("holiday", DoubleType), StructField("season2", DoubleType), StructField("season3", DoubleType), StructField("season4", DoubleType))) private def timestamp(s: String): Timestamp = { val format: DateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") format.setTimeZone(TimeZone.getTimeZone("UTC")) new Timestamp(format.parse(s).getTime) } } private object DataFrameHelpers { def demandString2Row(s: String): Row = { val split = s.split(",") Row( timestamp(split(0)), split(1).toDouble, split(2).toDouble, split(3).toDouble, split(4).toDouble, split(5).toDouble, split(6).toDouble ) } private def timestamp(s: String): Timestamp = { val format: DateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") format.setTimeZone(TimeZone.getTimeZone("UTC")) new Timestamp(format.parse(s).getTime) } }
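The timing pattern used above (System.nanoTime around the measured call) can be factored into a small helper; a sketch independent of the Seahorse test-support classes:

// times a block and returns the result together with the elapsed seconds
def timed[T](block: => T): (T, Double) = {
  val start = System.nanoTime()
  val result = block
  (result, (System.nanoTime() - start) / 1e9)
}

val (_, seconds) = timed { Thread.sleep(100) }
println(f"elapsed: $seconds%.3f s")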
Example 56
Source File: AbstractEvaluatorSmokeTest.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} import ai.deepsense.deeplang.doperables.dataframe.DataFrame import ai.deepsense.deeplang.params.ParamPair import ai.deepsense.deeplang.{DKnowledge, DeeplangIntegTestSupport} import ai.deepsense.sparkutils.Linalg.Vectors abstract class AbstractEvaluatorSmokeTest extends DeeplangIntegTestSupport { def className: String val evaluator: Evaluator val evaluatorParams: Seq[ParamPair[_]] val inputDataFrameSchema = StructType(Seq( StructField("s", StringType), StructField("prediction", DoubleType), StructField("rawPrediction", new ai.deepsense.sparkutils.Linalg.VectorUDT), StructField("label", DoubleType) )) val inputDataFrame: DataFrame = { val rowSeq = Seq( Row("aAa bBb cCc dDd eEe f", 1.0, Vectors.dense(2.1, 2.2, 2.3), 3.0), Row("das99213 99721 8i!#@!", 4.0, Vectors.dense(5.1, 5.2, 5.3), 6.0) ) createDataFrame(rowSeq, inputDataFrameSchema) } def setUpStubs(): Unit = () className should { "successfully run _evaluate()" in { setUpStubs() evaluator.set(evaluatorParams: _*)._evaluate(executionContext, inputDataFrame) } "successfully run _infer()" in { evaluator.set(evaluatorParams: _*)._infer(DKnowledge(inputDataFrame)) } "successfully run report" in { evaluator.set(evaluatorParams: _*).report() } } }
Example 57
Source File: BinarizerSmokeTest.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{DataType, DoubleType} import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection class BinarizerSmokeTest extends AbstractTransformerWrapperSmokeTest[Binarizer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: Binarizer = { val inPlace = NoInPlaceChoice() .setOutputColumn("binarizerOutput") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("d")) .setInPlace(inPlace) val binarizer = new Binarizer() binarizer.set( binarizer.singleOrMultiChoiceParam -> single, binarizer.threshold -> 0.5) } override def testValues: Seq[(Any, Any)] = { val inputNumbers = Seq(0.2, 0.5, 1.8) val outputNumbers = Seq(0.0, 0.0, 1.0) inputNumbers.zip(outputNumbers) } override def inputType: DataType = DoubleType override def outputType: DataType = DoubleType }
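The wrapper under test delegates to Spark ML's Binarizer; a minimal sketch of the underlying Spark API, assuming a SparkSession named spark and a hypothetical input column d:

import org.apache.spark.ml.feature.Binarizer
import spark.implicits._

val df = Seq(0.2, 0.5, 1.8).toDF("d")

// values strictly greater than the threshold become 1.0, the rest 0.0
val binarizer = new Binarizer()
  .setInputCol("d")
  .setOutputCol("binarizerOutput")
  .setThreshold(0.5)

binarizer.transform(df).show()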
Example 58
Source File: OneHotEncoderSmokeTest.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers import ai.deepsense.sparkutils.Linalg.Vectors import org.apache.spark.sql.types.{DataType, DoubleType} import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection class OneHotEncoderSmokeTest extends AbstractTransformerWrapperSmokeTest[OneHotEncoder] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: OneHotEncoder = { val inPlace = NoInPlaceChoice() .setOutputColumn("oneHotEncoderOutput") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("d")) .setInPlace(inPlace) val oneHotEncoder = new OneHotEncoder() oneHotEncoder.set( oneHotEncoder.singleOrMultiChoiceParam -> single, oneHotEncoder.dropLast -> false) } override def testValues: Seq[(Any, Any)] = { val inputNumbers = Seq(0.0, 1.0) val outputNumbers = Seq(Vectors.dense(1.0, 0.0), Vectors.dense(0.0, 1.0)) inputNumbers.zip(outputNumbers) } override def inputType: DataType = DoubleType override def outputType: DataType = new ai.deepsense.sparkutils.Linalg.VectorUDT }
Example 59
Source File: GBTClassifierSmokeTest.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.estimators import org.apache.spark.sql.types.{DoubleType, Metadata, StructType} import ai.deepsense.deeplang.doperables.dataframe.DataFrame import ai.deepsense.deeplang.doperables.spark.wrappers.params.common.ClassificationImpurity import ai.deepsense.deeplang.params.ParamPair import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection import ai.deepsense.deeplang.utils.DataFrameUtils class GBTClassifierSmokeTest extends AbstractEstimatorModelWrapperSmokeTest { override def className: String = "GBTClassifier" override val estimator = new GBTClassifier() private val labelColumnName = "myRating" import estimator.vanillaGBTClassifier._ override val estimatorParams: Seq[ParamPair[_]] = Seq( featuresColumn -> NameSingleColumnSelection("myFeatures"), impurity -> ClassificationImpurity.Entropy(), labelColumn -> NameSingleColumnSelection(labelColumnName), lossType -> GBTClassifier.Logistic(), maxBins -> 2.0, maxDepth -> 6.0, maxIterations -> 10.0, minInfoGain -> 0.0, minInstancesPerNode -> 1, predictionColumn -> "prediction", seed -> 100.0, stepSize -> 0.11, subsamplingRate -> 0.999 ) override def assertTransformedDF(dataFrame: DataFrame): Unit = { val possibleValues = DataFrameUtils.collectValues(dataFrame, labelColumnName) val actualValues = DataFrameUtils.collectValues(dataFrame, "prediction") actualValues.diff(possibleValues) shouldBe empty } override def assertTransformedSchema(schema: StructType): Unit = { val predictionColumn = schema.fields.last predictionColumn.name shouldBe "prediction" predictionColumn.dataType shouldBe DoubleType predictionColumn.metadata shouldBe Metadata.empty } }
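The estimator wrapped here is Spark ML's GBTClassifier; a minimal sketch of fitting it directly on toy data, assuming a SparkSession named spark (parameter values chosen only for illustration):

import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.linalg.Vectors
import spark.implicits._

val training = Seq(
  (0.0, Vectors.dense(0.0, 1.0)),
  (1.0, Vectors.dense(1.0, 0.0)),
  (0.0, Vectors.dense(0.1, 0.9)),
  (1.0, Vectors.dense(0.9, 0.2))
).toDF("label", "features")

val gbt = new GBTClassifier()
  .setLabelCol("label")
  .setFeaturesCol("features")
  .setMaxIter(10)
  .setMaxDepth(3)
  .setStepSize(0.11)

// fit produces a GBTClassificationModel; transform appends a "prediction" column
gbt.fit(training).transform(training).select("label", "prediction").show()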
Example 60
Source File: ReportContentTestFactory.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.reportlib.model.factory import ai.deepsense.reportlib.model.{ReportType, ReportContent} import org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType} trait ReportContentTestFactory { import ReportContentTestFactory._ def testReport: ReportContent = ReportContent( reportName, reportType, Seq(TableTestFactory.testEmptyTable), Map(ReportContentTestFactory.categoricalDistName -> DistributionTestFactory.testCategoricalDistribution( ReportContentTestFactory.categoricalDistName), ReportContentTestFactory.continuousDistName -> DistributionTestFactory.testContinuousDistribution( ReportContentTestFactory.continuousDistName) ) ) } object ReportContentTestFactory extends ReportContentTestFactory { val continuousDistName = "continuousDistributionName" val categoricalDistName = "categoricalDistributionName" val reportName = "TestReportContentName" val reportType = ReportType.Empty val someReport: ReportContent = ReportContent("empty", ReportType.Empty) }
Example 61
Source File: ArrangePostprocessor.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.postprocessors import com.typesafe.config.Config import it.agilelab.bigdata.DataQuality.checks.CheckResult import it.agilelab.bigdata.DataQuality.metrics.MetricResult import it.agilelab.bigdata.DataQuality.sources.HdfsFile import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig import it.agilelab.bigdata.DataQuality.utils import it.agilelab.bigdata.DataQuality.utils.DQSettings import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter} import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, NumericType} import org.apache.spark.sql.{Column, DataFrame, SQLContext} import scala.collection.JavaConversions._ final class ArrangePostprocessor(config: Config, settings: DQSettings) extends BasicPostprocessor(config, settings) { private case class ColumnSelector(name: String, tipo: Option[String] = None, format: Option[String] = None, precision: Option[Integer] = None) { def toColumn()(implicit df: DataFrame): Column = { val dataType: Option[NumericType with Product with Serializable] = tipo.getOrElse("").toUpperCase match { case "DOUBLE" => Some(DoubleType) case "INT" => Some(IntegerType) case "LONG" => Some(LongType) case _ => None } import org.apache.spark.sql.functions.format_number import org.apache.spark.sql.functions.format_string (dataType, precision, format) match { case (Some(dt), None, None) => df(name).cast(dt) case(Some(dt), None, Some(f)) => format_string(f, df(name).cast(dt)).alias(name) case (Some(dt), Some(p),None) => format_number(df(name).cast(dt), p).alias(name) case (None, Some(p), None) => format_number(df(name), p).alias(name) case (None, None, Some(f)) => format_string(f, df(name)).alias(name) case _ => df(name) } } } private val vs = config.getString("source") private val target: HdfsTargetConfig = { val conf = config.getConfig("saveTo") utils.parseTargetConfig(conf)(settings).get } private val columns: Seq[ColumnSelector] = config.getAnyRefList("columnOrder").map { case x: String => ColumnSelector(x) case x: java.util.HashMap[_, String] => { val (name, v) = x.head.asInstanceOf[String Tuple2 _] v match { case v: String => ColumnSelector(name, Option(v)) case v: java.util.HashMap[String, _] => { val k = v.head._1 val f = v.head._2 f match { case f: Integer => ColumnSelector(name, Option(k), None, Option(f)) case f: String => ColumnSelector(name, Option(k), Option(f)) } } } } } override def process(vsRef: Set[HdfsFile], metRes: Seq[MetricResult], chkRes: Seq[CheckResult])( implicit fs: FileSystem, sqlContext: SQLContext, settings: DQSettings): HdfsFile = { val reqVS: HdfsFile = vsRef.filter(vr => vr.id == vs).head implicit val df: DataFrame = HdfsReader.load(reqVS, settings.ref_date).head val arrangeDF = df.select(columns.map(_.toColumn): _*) HdfsWriter.saveVirtualSource(arrangeDF, target, settings.refDateString)( fs, sqlContext.sparkContext) new HdfsFile(target) } }
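Independently of the DataQuality configuration machinery, the column arrangement itself comes down to casting and formatting with standard Spark functions; a minimal sketch with hypothetical column names, assuming a SparkSession named spark:

import org.apache.spark.sql.functions.{col, format_number, format_string}
import org.apache.spark.sql.types.DoubleType
import spark.implicits._

val df = Seq(("a", "1.23456"), ("b", "7.8")).toDF("id", "amount")

// cast to DoubleType, then either round to a fixed precision or apply a printf-style format
df.select(
  col("id"),
  format_number(col("amount").cast(DoubleType), 2).alias("amount_2dp"),
  format_string("%.4f", col("amount").cast(DoubleType)).alias("amount_fmt")
).show()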
Example 62
Source File: UnaryEstimatorTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.base.unary import com.salesforce.op.UID import com.salesforce.op.features.Feature import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestSparkContext} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{DoubleType, MetadataBuilder, StructField, StructType} import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class UnaryEstimatorTest extends OpEstimatorSpec[Real, UnaryModel[Real, Real], UnaryEstimator[Real, Real]] { val expectedResult = Seq(0.0, 0.8, 0.4, 0.2, 1.0).map(_.toReal) } class MinMaxNormEstimator(uid: String = UID[MinMaxNormEstimator]) extends UnaryEstimator[Real, Real](operationName = "minMaxNorm", uid = uid) { def fitFn(dataset: Dataset[Real#Value]): UnaryModel[Real, Real] = { val grouped = dataset.groupBy() val maxVal = grouped.max().first().getDouble(0) val minVal = grouped.min().first().getDouble(0) new MinMaxNormEstimatorModel(min = minVal, max = maxVal, operationName = operationName, uid = uid) } } final class MinMaxNormEstimatorModel private[op](val min: Double, val max: Double, operationName: String, uid: String) extends UnaryModel[Real, Real](operationName = operationName, uid = uid) { def transformFn: Real => Real = _.v.map(v => (v - min) / (max - min)).toReal }
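The MinMaxNormEstimator above learns the column's min and max and rescales values into [0, 1]; the same computation can be sketched with plain DataFrame aggregation (an illustration only, not the TransmogrifAI API), assuming a SparkSession named spark:

import org.apache.spark.sql.functions.{col, max, min}
import spark.implicits._

val df = Seq(1.0, 5.0, 3.0, 2.0, 6.0).toDF("x")

// "fit": collect the min and max of the column
val stats = df.agg(min("x"), max("x")).first()
val (lo, hi) = (stats.getDouble(0), stats.getDouble(1))

// "transform": rescale into [0, 1] -> 0.0, 0.8, 0.4, 0.2, 1.0 for this data
df.select(((col("x") - lo) / (hi - lo)).alias("xNorm")).show()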
Example 63
Source File: Binarizer.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.BinaryAttribute import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) val td = $(threshold) val binarizer = udf { in: Double => if (in > td) 1.0 else 0.0 } val outputColName = $(outputCol) val metadata = BinaryAttribute.defaultAttr.withName(outputColName).toMetadata() dataset.select(col("*"), binarizer(col($(inputCol))).as(outputColName, metadata)) } override def transformSchema(schema: StructType): StructType = { SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType) val inputFields = schema.fields val outputColName = $(outputCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") val attr = BinaryAttribute.defaultAttr.withName(outputColName) val outputFields = inputFields :+ attr.toStructField() StructType(outputFields) } override def copy(extra: ParamMap): Binarizer = defaultCopy(extra) }
Example 64
Source File: BinaryClassificationEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.DoubleType def setLabelCol(value: String): this.type = set(labelCol, value) //ROC曲线下面积 setDefault(metricName -> "areaUnderROC") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol)) .map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { //ROC曲线下面积为1.0时表示一个完美的分类器 case "areaUnderROC" => metrics.areaUnderROC() //准确率与召回率 case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true//ROC曲线下面积为1.0时表示一个完美的分类器,0.5则表示一个随机的性能 case "areaUnderPR" => true //准确率与召回率 } override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) }
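A minimal sketch of how an evaluator like this is typically driven from user code, assuming a SparkSession named spark and a toy logistic regression to produce the rawPrediction column (shown against the DataFrame-based ML API; details vary slightly across Spark versions):

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
import spark.implicits._

val data = Seq(
  (0.0, Vectors.dense(0.0, 1.1)),
  (1.0, Vectors.dense(2.0, 1.0)),
  (0.0, Vectors.dense(0.1, 1.3)),
  (1.0, Vectors.dense(1.8, 0.9))
).toDF("label", "features")

val predictions = new LogisticRegression().fit(data).transform(data)

val evaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
  .setMetricName("areaUnderROC")

println(s"AUC = ${evaluator.evaluate(predictions)}")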
Example 65
Source File: MulticlassClassificationEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{SchemaUtils, Identifiable} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.types.DoubleType def setLabelCol(value: String): this.type = set(labelCol, value) //F1-Measure是根据准确率Precision和召回率Recall二者给出的一个综合的评价指标 setDefault(metricName -> "f1") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) val predictionAndLabels = dataset.select($(predictionCol), $(labelCol)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { //F1-Measure是根据准确率Precision和召回率Recall二者给出的一个综合的评价指标 case "f1" => metrics.weightedFMeasure case "precision" => metrics.precision//准确率 case "recall" => metrics.recall//召回率 case "weightedPrecision" => metrics.weightedPrecision//加权准确率 case "weightedRecall" => metrics.weightedRecall//加权召回率 } metric } override def isLargerBetter: Boolean = $(metricName) match { case "f1" => true//F1-Measure是根据准确率Precision和召回率Recall二者给出的一个综合的评价指标 case "precision" => true//准确率 case "recall" => true//召回率 case "weightedPrecision" => true//加权准确率 case "weightedRecall" => true//加权召回率 } override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) }
Example 66
Source File: RegressionEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.DoubleType def setLabelCol(value: String): this.type = set(labelCol, value) //默认均方根误差 setDefault(metricName -> "rmse") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) val predictionAndLabels = dataset.select($(predictionCol), $(labelCol)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { //均方根误差 case "rmse" => metrics.rootMeanSquaredError //均方差 case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 //平均绝对误差 case "mae" => metrics.meanAbsoluteError } metric } override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false//均方根误差 case "mse" => false//均方差 case "r2" => true//平方系统 case "mae" => false//平均绝对误差 } override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) }
Example 67
Source File: randomExpressions.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.types.{DataType, DoubleType} import org.apache.spark.util.Utils import org.apache.spark.util.random.XORShiftRandom case class Randn(seed: Long) extends RDG { override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian() def this() = this(Utils.random.nextLong()) def this(seed: Expression) = this(seed match { case IntegerLiteral(s) => s case _ => throw new AnalysisException("Input argument to rand must be an integer literal.") }) override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val rngTerm = ctx.freshName("rng") val className = classOf[XORShiftRandom].getName ctx.addMutableState(className, rngTerm, s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());") ev.isNull = "false" s""" final ${ctx.javaType(dataType)} ${ev.primitive} = $rngTerm.nextGaussian(); """ } }
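Randn is the expression behind the user-facing randn SQL function; a minimal sketch of generating random columns through the DataFrame API, assuming a SparkSession named spark:

import org.apache.spark.sql.functions.{rand, randn}
import spark.implicits._

val df = Seq(1, 2, 3).toDF("id")

// rand: uniform in [0, 1); randn: standard normal; a fixed seed makes the run reproducible
df.select(
  $"id",
  rand(42L).alias("uniform"),
  randn(42L).alias("gaussian")
).show()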
Example 68
Source File: SemiJoinSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.sql.{SQLConf, DataFrame, Row} import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys import org.apache.spark.sql.catalyst.plans.Inner import org.apache.spark.sql.catalyst.plans.logical.Join import org.apache.spark.sql.catalyst.expressions.{And, LessThan, Expression} import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan, SparkPlanTest} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType} //半连接测试套件 class SemiJoinSuite extends SparkPlanTest with SharedSQLContext { private lazy val left = ctx.createDataFrame( ctx.sparkContext.parallelize(Seq( Row(1, 2.0), Row(1, 2.0), Row(2, 1.0), Row(2, 1.0), Row(3, 3.0), Row(null, null), Row(null, 5.0), Row(6, null) )), new StructType().add("a", IntegerType).add("b", DoubleType)) private lazy val right = ctx.createDataFrame( ctx.sparkContext.parallelize(Seq( Row(2, 3.0), Row(2, 3.0), Row(3, 2.0), Row(4, 1.0), Row(null, null), Row(null, 5.0), Row(6, null) )), new StructType().add("c", IntegerType).add("d", DoubleType)) private lazy val condition = { And((left.col("a") === right.col("c")).expr, LessThan(left.col("b").expr, right.col("d").expr)) } // Note: the input dataframes and expression must be evaluated lazily because // the SQLContext should be used only within a test to keep SQL tests stable private def testLeftSemiJoin( testName: String, leftRows: => DataFrame, rightRows: => DataFrame, condition: => Expression, expectedAnswer: Seq[Product]): Unit = { def extractJoinParts(): Option[ExtractEquiJoinKeys.ReturnType] = { val join = Join(leftRows.logicalPlan, rightRows.logicalPlan, Inner, Some(condition)) ExtractEquiJoinKeys.unapply(join) } test(s"$testName using LeftSemiJoinHash") { extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) => withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => EnsureRequirements(left.sqlContext).apply( LeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition)), expectedAnswer.map(Row.fromTuple), sortAnswers = true) } } } test(s"$testName using BroadcastLeftSemiJoinHash") { extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) => withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => BroadcastLeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition), expectedAnswer.map(Row.fromTuple), sortAnswers = true) } } } test(s"$testName using LeftSemiJoinBNL") { withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => LeftSemiJoinBNL(left, right, Some(condition)), expectedAnswer.map(Row.fromTuple), sortAnswers = true) } } } //测试左半连接 testLeftSemiJoin( "basic test", left, right, condition, Seq( (2, 1.0), (2, 1.0) ) ) }
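Outside the planner internals exercised by this suite, a left semi join is available directly from the DataFrame API; a minimal sketch with the same kind of join condition, assuming a SparkSession named spark:

import spark.implicits._

val left = Seq((1, 2.0), (2, 1.0), (3, 3.0)).toDF("a", "b")
val right = Seq((2, 3.0), (3, 2.0), (4, 1.0)).toDF("c", "d")

// keeps rows of `left` that have at least one match in `right` under the condition
left.join(right, left("a") === right("c") && left("b") < right("d"), "leftsemi").show()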
Example 69
Source File: RegressionEvaluator.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl class RegressionEvaluator(override val uid: String) extends Evaluator[RegressionEvaluator](uid) { val throughOrigin = new BooleanParam(this, "throughOrigin", "True if the regression is through the origin. For example, in " + "linear regression, it will be true without fitting intercept.") def setThroughOrigin(value: Boolean): this.type = set(throughOrigin, value) def getThroughOrigin: Boolean = $(throughOrigin) def this() = this(Identifiable.randomUID("regressionEvaluator")) override def transform(dataset: Dataset[_]): DataFrame = { try { val predictions: RDD[(Double, Double)] = dataset.select($(predictionCol), $(labelCol)) .rdd.map { case Row(score: Double, label: Double) => (score, label) } val metrics = Try(new RegressionMetrics(predictions)) val rows = metrics.toOption.map(m => Seq( "r2" -> m.r2, "rmse" -> m.rootMeanSquaredError, "explainedVariance" -> m.explainedVariance, "meanAbsoluteError" -> m.meanAbsoluteError, "meanSquaredError" -> m.meanSquaredError ).map(Row.fromTuple)).getOrElse(Seq()) SparkSqlUtils.reflectionLock.synchronized( dataset.sqlContext.createDataFrame( dataset.sparkSession.sparkContext.parallelize(rows, 1), transformSchema(dataset.schema))) } catch { // Most probably evaluation dataset is empty case e: Exception => logWarning("Failed to calculate metrics due to " + e.getMessage) SparkSqlUtils.reflectionLock.synchronized( dataset.sqlContext.createDataFrame( dataset.sparkSession.sparkContext.emptyRDD[Row], transformSchema(dataset.schema))) } } override def copy(extra: ParamMap): RegressionEvaluator = { copyValues(new RegressionEvaluator(), extra) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { new StructType() .add("metric", StringType, nullable = false) .add("value", DoubleType, nullable = false) } }
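The evaluator above is a thin wrapper over RegressionMetrics from spark.mllib; a minimal sketch of that metrics class on its own, assuming a SparkSession named spark and toy (prediction, label) pairs:

import org.apache.spark.mllib.evaluation.RegressionMetrics

val predictionAndLabels = spark.sparkContext.parallelize(Seq(
  (2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)))

val metrics = new RegressionMetrics(predictionAndLabels)
println(s"RMSE = ${metrics.rootMeanSquaredError}")
println(s"MAE  = ${metrics.meanAbsoluteError}")
println(s"R2   = ${metrics.r2}")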
Example 70
Source File: VectorExplodeSpec.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl import odkl.analysis.spark.TestEnv import odkl.analysis.spark.util.SQLOperations import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.sql.{functions, Row} import org.apache.spark.sql.types.{StructType, StructField, DoubleType} import org.scalatest.FlatSpec class VectorExplodeSpec extends FlatSpec with TestEnv with org.scalatest.Matchers with SQLOperations with WithModels with HasMetricsBlock { case class Point(id: Int, vector: Vector, mean: Vector) lazy val data = sqlc.createDataFrame(Seq( Point(1, Vectors.dense(1.0, 3.0), Vectors.dense(10.0, 30.0)), Point(2, Vectors.dense(2.0, 4.0), Vectors.sparse(2, Array(1), Array(20.0))) )) lazy val withMetadata = data.withColumn( "vector", data("vector").as("vector", new AttributeGroup("vector", Array[Attribute]( NumericAttribute.defaultAttr.withName("fixed"), NumericAttribute.defaultAttr.withName("var") )).toMetadata())) .withColumn( "mean", data("mean").as("mean", new AttributeGroup("vector", Array[Attribute]( NumericAttribute.defaultAttr.withName("fixed"), NumericAttribute.defaultAttr.withName("var") )).toMetadata())) lazy val explode = new VectorExplode().transform(withMetadata) "Explode " should " add data" in { val result = explode.orderBy("id", "value").collect() result(0).getInt(0) should be(1) result(0).getString(1) should be("fixed") result(0).getDouble(2) should be(1.0) result(0).getDouble(3) should be(10.0) result(1).getInt(0) should be(1) result(1).getString(1) should be("var") result(1).getDouble(2) should be(3.0) result(1).getDouble(3) should be(30.0) result(2).getInt(0) should be(2) result(2).getString(1) should be("fixed") result(2).getDouble(2) should be(2.0) result(2).isNullAt(3) should be(true) result(3).getInt(0) should be(2) result(3).getString(1) should be("var") result(3).getDouble(2) should be(4.0) result(3).getDouble(3) should be(20.0) } "Explode " should " create schema" in { val fields = explode.schema.fields fields(0).name should be("id") fields(1).name should be("value") fields(2).name should be("vector") fields(3).name should be("mean") } }
Example 71
Source File: EWStatsTransformerSpec.scala From pravda-ml with Apache License 2.0 | 5 votes |
package odkl.analysis.spark.texts import odkl.analysis.spark.TestEnv import org.apache.spark.ml.odkl.texts.EWStatsTransformer import org.apache.spark.ml.odkl.texts.EWStatsTransformer.EWStruct import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} import org.scalatest.FlatSpec class EWStatsTransformerSpec extends FlatSpec with TestEnv with org.scalatest.Matchers { import sqlc.implicits._ case class dummyCase(Term: String, sig: Double, ewma: Double, ewmvar: Double) case class ewStruct(sig: Double, ewma: Double, ewmvar: Double) extends Serializable "CorrectEWFreqStatsTransformer" should "count existing and non-existing today words" in { val oldData = Seq(Seq("a", 0.0, 0.1, 0.01), Seq("b", 0.0, 0.2, 0.02), Seq("c", 0.0, 0.3, 0.015)) val oldDF = sqlc.createDataFrame(sc.parallelize(oldData).map(f => { Row.fromSeq(f) }), new StructType().add("term", StringType) .add("sig", DoubleType).add("ewma", DoubleType).add("ewmvar", DoubleType)) val rddRes = oldDF.rdd. map { case Row(term, sig, ewma, ewmvar) => Row(term, Row(sig, ewma, ewmvar)) } val schemaRes = StructType( StructField("term", StringType, false) :: StructField("ewStruct", StructType( StructField("sig", DoubleType, false) :: StructField("ewma", DoubleType, false) :: StructField("ewmvar", DoubleType, false) :: Nil ), true) :: Nil ) val modernOldDF = sqlc.createDataFrame(rddRes, schemaRes) .withColumnRenamed("ewStruct", "old_EWStruct").withColumnRenamed("term", "old_Term") oldDF.collect() val fTransformer = new EWStatsTransformer() .setAlpha(0.7) .setBeta(0.055) .setInputFreqColName("Freq") .setInputTermColName("Term") .setOldEWStructColName("old_EWStruct") .setNewEWStructColName("EWStruct") .setOldTermColName("old_Term") val schema = new StructType().add("Term", StringType).add("Freq", DoubleType) val inDF = sqlc.createDataFrame( sc.parallelize(Seq(("a", 0.2), ("b", 0.1), ("d", 0.05))) .map(f => { Row.fromSeq(Seq(f._1, f._2)) }), schema) val joined = inDF.join(modernOldDF, $"Term" === $"old_Term", "outer") val outDF = fTransformer.transform(joined) val ans: Array[Row] = outDF.sort("Term").collect() assertResult(4)(ans.size) } "CorrectEWStatsTransformer" should "count EWStats correct" in { val mathTransformFun: (String, Double, Double, Double) => EWStruct = EWStatsTransformer.termEWStatsComputing(_:String,_:Double,_:Double,_:Double,0.7,0.005) val input = ("test", 0.01, 0.006, 0.003) val expected = (0.0669, 0.0088, 0.0009) val real = mathTransformFun(input._1, input._2, input._3, input._4) val realRounded = (Math.round(real.sig * 10000D) / 10000D, Math.round(real.ewma * 10000D) / 10000D, Math.round(real.ewmvar * 10000D) / 10000D) assertResult(expected)(realRounded) } }
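The transformer keeps an exponentially weighted mean and variance per term; one common incremental formulation of that update is sketched below (an illustration only, not necessarily the exact formula implemented by EWStatsTransformer):

// one exponentially weighted update for a new observation x, given the previous
// mean (ewma), variance (ewmvar) and a smoothing factor alpha in (0, 1]
def ewUpdate(x: Double, ewma: Double, ewmvar: Double, alpha: Double): (Double, Double) = {
  val delta = x - ewma
  val newEwma = ewma + alpha * delta
  val newEwmvar = (1.0 - alpha) * (ewmvar + alpha * delta * delta)
  (newEwma, newEwmvar)
}

// fold a short series of term frequencies through the update
val series = Seq(0.2, 0.1, 0.05)
val (ewma, ewmvar) = series.foldLeft((0.0, 0.0)) {
  case ((m, v), x) => ewUpdate(x, m, v, alpha = 0.7)
}
println(s"ewma = $ewma, ewmvar = $ewmvar")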
Example 72
Source File: KLLCheckExample.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.examples import ExampleUtils.{itemsAsDataframe, withSpark} import com.amazon.deequ.VerificationSuite import com.amazon.deequ.analyzers.KLLParameters import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus} import com.amazon.deequ.constraints.ConstraintStatus import org.apache.spark.sql.types.DoubleType private[examples] object KLLCheckExample extends App { withSpark { session => val data = itemsAsDataframe(session, Item(1, "Thingy A", "awesome thing.", "high", 0), Item(2, "Thingy B", "available at http://thingb.com", null, 0), Item(3, null, null, "low", 5), Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10), Item(5, "Thingy E", null, "high", 12)) val newData = data.select(data("numViews").cast(DoubleType).as("numViews")) val verificationResult = VerificationSuite() .onData(newData) .addCheck( Check(CheckLevel.Error, "integrity checks") // we expect 5 records .hasSize(_ == 5) // we expect the maximum of tips to be not more than 10 .hasMax("numViews", _ <= 10) // we expect the sketch size to be at least 16 .kllSketchSatisfies("numViews", _.parameters(1) >= 16, kllParameters = Option(KLLParameters(2, 0.64, 2)))) .run() if (verificationResult.status == CheckStatus.Success) { println("The data passed the test, everything is fine!") } else { println("We found errors in the data, the following constraints were not satisfied:\n") val resultsForAllConstraints = verificationResult.checkResults .flatMap { case (_, checkResult) => checkResult.constraintResults } resultsForAllConstraints .filter { _.status != ConstraintStatus.Success } .foreach { result => println(s"${result.constraint} failed: ${result.message.get}") } } } }
Example 73
Source File: Mean.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.functions.{count, sum} import org.apache.spark.sql.types.{DoubleType, StructType, LongType} import Analyzers._ case class MeanState(sum: Double, count: Long) extends DoubleValuedState[MeanState] { override def sum(other: MeanState): MeanState = { MeanState(sum + other.sum, count + other.count) } override def metricValue(): Double = { if (count == 0L) Double.NaN else sum / count } } case class Mean(column: String, where: Option[String] = None) extends StandardScanShareableAnalyzer[MeanState]("Mean", column) with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { sum(conditionalSelection(column, where)).cast(DoubleType) :: count(conditionalSelection(column, where)).cast(LongType) :: Nil } override def fromAggregationResult(result: Row, offset: Int): Option[MeanState] = { ifNoNullsIn(result, offset, howMany = 2) { _ => MeanState(result.getDouble(offset), result.getLong(offset + 1)) } } override protected def additionalPreconditions(): Seq[StructType => Unit] = { hasColumn(column) :: isNumeric(column) :: Nil } override def filterCondition: Option[String] = where }
Example 74
Source File: UniqueValueRatio.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Analyzers.COUNT_COL import com.amazon.deequ.metrics.DoubleMetric import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.functions.{col, count, lit, sum} import org.apache.spark.sql.types.DoubleType case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None) extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns) with FilterableAnalyzer { override def aggregationFunctions(numRows: Long): Seq[Column] = { sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) :: count("*") :: Nil } override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = { val numUniqueValues = result.getDouble(offset) val numDistinctValues = result.getLong(offset + 1).toDouble toSuccessMetric(numUniqueValues / numDistinctValues) } override def filterCondition: Option[String] = where } object UniqueValueRatio { def apply(column: String): UniqueValueRatio = { new UniqueValueRatio(column :: Nil) } def apply(column: String, where: Option[String]): UniqueValueRatio = { new UniqueValueRatio(column :: Nil, where) } }
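The same ratio can be reproduced with plain DataFrame aggregation: count the values that occur exactly once and divide by the number of distinct values. A minimal sketch, assuming a SparkSession named spark:

import org.apache.spark.sql.functions.{col, count, lit, sum}
import org.apache.spark.sql.types.DoubleType
import spark.implicits._

val df = Seq("a", "a", "b", "c", "c", "d").toDF("value")

val frequencies = df.groupBy("value").agg(count("*").alias("cnt"))

// 2 values occur exactly once (b, d) out of 4 distinct values -> ratio 0.5
val ratio = frequencies
  .agg(
    sum(col("cnt").equalTo(lit(1)).cast(DoubleType)).alias("unique"),
    count("*").cast(DoubleType).alias("distinct"))
  .select(col("unique") / col("distinct"))
  .first()
  .getDouble(0)

println(s"unique value ratio = $ratio")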
Example 75
Source File: Maximum.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.functions.max import org.apache.spark.sql.types.{DoubleType, StructType} import Analyzers._ case class MaxState(maxValue: Double) extends DoubleValuedState[MaxState] { override def sum(other: MaxState): MaxState = { MaxState(math.max(maxValue, other.maxValue)) } override def metricValue(): Double = { maxValue } } case class Maximum(column: String, where: Option[String] = None) extends StandardScanShareableAnalyzer[MaxState]("Maximum", column) with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { max(conditionalSelection(column, where)).cast(DoubleType) :: Nil } override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] = { ifNoNullsIn(result, offset) { _ => MaxState(result.getDouble(offset)) } } override protected def additionalPreconditions(): Seq[StructType => Unit] = { hasColumn(column) :: isNumeric(column) :: Nil } override def filterCondition: Option[String] = where }
Example 76
Source File: MaxLength.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Analyzers._ import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString} import org.apache.spark.sql.functions.{length, max} import org.apache.spark.sql.types.{DoubleType, StructType} import org.apache.spark.sql.{Column, Row} case class MaxLength(column: String, where: Option[String] = None) extends StandardScanShareableAnalyzer[MaxState]("MaxLength", column) with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { max(length(conditionalSelection(column, where))).cast(DoubleType) :: Nil } override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] = { ifNoNullsIn(result, offset) { _ => MaxState(result.getDouble(offset)) } } override protected def additionalPreconditions(): Seq[StructType => Unit] = { hasColumn(column):: isString(column) :: Nil } override def filterCondition: Option[String] = where }
Example 77
Source File: Sum.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} import org.apache.spark.sql.functions.sum import org.apache.spark.sql.types.{DoubleType, StructType} import org.apache.spark.sql.{Column, Row} import Analyzers._ case class SumState(sum: Double) extends DoubleValuedState[SumState] { override def sum(other: SumState): SumState = { SumState(sum + other.sum) } override def metricValue(): Double = { sum } } case class Sum(column: String, where: Option[String] = None) extends StandardScanShareableAnalyzer[SumState]("Sum", column) with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { sum(conditionalSelection(column, where)).cast(DoubleType) :: Nil } override def fromAggregationResult(result: Row, offset: Int): Option[SumState] = { ifNoNullsIn(result, offset) { _ => SumState(result.getDouble(offset)) } } override protected def additionalPreconditions(): Seq[StructType => Unit] = { hasColumn(column) :: isNumeric(column) :: Nil } override def filterCondition: Option[String] = where }
Example 78
Source File: Uniqueness.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Analyzers.COUNT_COL import org.apache.spark.sql.Column import org.apache.spark.sql.functions.{col, lit, sum} import org.apache.spark.sql.types.DoubleType case class Uniqueness(columns: Seq[String], where: Option[String] = None) extends ScanShareableFrequencyBasedAnalyzer("Uniqueness", columns) with FilterableAnalyzer { override def aggregationFunctions(numRows: Long): Seq[Column] = { (sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) / numRows) :: Nil } override def filterCondition: Option[String] = where } object Uniqueness { def apply(column: String): Uniqueness = { new Uniqueness(column :: Nil) } def apply(column: String, where: Option[String]): Uniqueness = { new Uniqueness(column :: Nil, where) } }
Example 79
Source File: MinLength.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Analyzers._ import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString} import org.apache.spark.sql.functions.{length, min} import org.apache.spark.sql.types.{DoubleType, StructType} import org.apache.spark.sql.{Column, Row} case class MinLength(column: String, where: Option[String] = None) extends StandardScanShareableAnalyzer[MinState]("MinLength", column) with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { min(length(conditionalSelection(column, where))).cast(DoubleType) :: Nil } override def fromAggregationResult(result: Row, offset: Int): Option[MinState] = { ifNoNullsIn(result, offset) { _ => MinState(result.getDouble(offset)) } } override protected def additionalPreconditions(): Seq[StructType => Unit] = { hasColumn(column) :: isString(column) :: Nil } override def filterCondition: Option[String] = where }
Example 80
Source File: Distinctness.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Analyzers.COUNT_COL import org.apache.spark.sql.functions.{col, sum} import org.apache.spark.sql.types.DoubleType import org.apache.spark.sql.Column case class Distinctness(columns: Seq[String], where: Option[String] = None) extends ScanShareableFrequencyBasedAnalyzer("Distinctness", columns) with FilterableAnalyzer { override def aggregationFunctions(numRows: Long): Seq[Column] = { (sum(col(COUNT_COL).geq(1).cast(DoubleType)) / numRows) :: Nil } override def filterCondition: Option[String] = where } object Distinctness { def apply(column: String): Distinctness = { new Distinctness(column :: Nil) } }
Example 81
Source File: Minimum.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.functions.min import org.apache.spark.sql.types.{DoubleType, StructType} import Analyzers._ case class MinState(minValue: Double) extends DoubleValuedState[MinState] { override def sum(other: MinState): MinState = { MinState(math.min(minValue, other.minValue)) } override def metricValue(): Double = { minValue } } case class Minimum(column: String, where: Option[String] = None) extends StandardScanShareableAnalyzer[MinState]("Minimum", column) with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { min(conditionalSelection(column, where)).cast(DoubleType) :: Nil } override def fromAggregationResult(result: Row, offset: Int): Option[MinState] = { ifNoNullsIn(result, offset) { _ => MinState(result.getDouble(offset)) } } override protected def additionalPreconditions(): Seq[StructType => Unit] = { hasColumn(column) :: isNumeric(column) :: Nil } override def filterCondition: Option[String] = where }
Example 82
Source File: BinaryClassificationEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
Example 83
Source File: MulticlassClassificationEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 84
Source File: RegressionEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
Example 85
Source File: LibSVMResponseRowDeserializer.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers import org.apache.spark.ml.linalg.{SparseVector, SQLDataTypes} import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import com.amazonaws.services.sagemaker.sparksdk.transformation.{ContentTypes, ResponseRowDeserializer} override val accepts: String = ContentTypes.TEXT_LIBSVM private def parseLibSVMRow(record: String): Row = { val items = record.split(' ') val label = items.head.toDouble val (indices, values) = items.tail.filter(_.nonEmpty).map { item => val entry = item.split(':') val index = entry(0).toInt - 1 val value = entry(1).toDouble (index, value) }.unzip Row(label, new SparseVector(dim, indices.toArray, values.toArray)) } override val schema: StructType = StructType( Array( StructField(labelColumnName, DoubleType, nullable = false), StructField(featuresColumnName, SQLDataTypes.VectorType, nullable = false))) }
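For comparison, Spark ships a libsvm data source that parses the same text format into a (label, features) DataFrame; a minimal sketch with a hypothetical file path, assuming a SparkSession named spark:

// each input line looks like: "<label> <index1>:<value1> <index2>:<value2> ..."
val data = spark.read
  .format("libsvm")
  .option("numFeatures", "780")  // optional; inferred from the data when omitted
  .load("data/sample_libsvm_data.txt")

data.printSchema()  // label: double, features: vector
data.show(5)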
Example 86
Source File: SchemaValidators.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers import org.apache.spark.ml.linalg.SQLDataTypes import org.apache.spark.sql.types.{DoubleType, StructType} private[serializers] object SchemaValidators { def labeledSchemaValidator(schema: StructType, labelColumnName: String, featuresColumnName: String): Unit = { if ( !schema.exists(f => f.name == labelColumnName && f.dataType == DoubleType) || !schema.exists(f => f.name == featuresColumnName && f.dataType == SQLDataTypes.VectorType)) { throw new IllegalArgumentException(s"Expecting schema with DoubleType column with name " + s"$labelColumnName and Vector column with name $featuresColumnName. Got ${schema.toString}") } } def unlabeledSchemaValidator(schema: StructType, featuresColumnName: String): Unit = { if (!schema.exists(f => f.name == featuresColumnName && f.dataType == SQLDataTypes.VectorType)) { throw new IllegalArgumentException( s"Expecting schema with Vector column with name" + s" $featuresColumnName. Got ${schema.toString}") } } }
Example 87
Source File: ProtobufRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers import org.scalatest.{FlatSpec, Matchers} import org.scalatest.mock.MockitoSugar import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes} import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} import com.amazonaws.services.sagemaker.sparksdk.protobuf.ProtobufConverter class ProtobufRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar { val labelColumnName = "label" val featuresColumnName = "features" val schema = StructType(Array(StructField(labelColumnName, DoubleType), StructField( featuresColumnName, VectorType))) it should "serialize a dense vector" in { val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema) val rrs = new ProtobufRequestRowSerializer(Some(schema)) val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty) val serialized = rrs.serializeRow(row) val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized) val protobufFromRecordIO = protobufIterator.next assert(!protobufIterator.hasNext) assert(protobuf.equals(protobufFromRecordIO)) } it should "serialize a sparse vector" in { val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray) val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema) val rrs = new ProtobufRequestRowSerializer(Some(schema)) val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty) val serialized = rrs.serializeRow(row) val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized) val protobufFromRecordIO = protobufIterator.next assert(!protobufIterator.hasNext) assert(protobuf.equals(protobufFromRecordIO)) } it should "fail to set schema on invalid features name" in { val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray) val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema) intercept[IllegalArgumentException] { val rrs = new ProtobufRequestRowSerializer(Some(schema), featuresColumnName = "doesNotExist") } } it should "fail on invalid types" in { val schemaWithInvalidFeaturesType = StructType(Array( StructField("label", DoubleType, nullable = false), StructField("features", StringType, nullable = false))) intercept[RuntimeException] { new ProtobufRequestRowSerializer(Some(schemaWithInvalidFeaturesType)) } } it should "validate correct schema" in { val validSchema = StructType(Array( StructField("features", SQLDataTypes.VectorType, nullable = false))) new ProtobufRequestRowSerializer(Some(validSchema)) } }
Example 88
Source File: LibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers import org.scalatest._ import org.scalatest.{FlatSpec, Matchers} import org.scalatest.mock.MockitoSugar import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} import com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers.LibSVMResponseRowDeserializer class LibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar { val schema = new LibSVMResponseRowDeserializer(10).schema "LibSVMRequestRowSerializer" should "serialize sparse vector" in { val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray) val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema) val rrs = new LibSVMRequestRowSerializer(Some(schema)) val serialized = new String(rrs.serializeRow(row)) assert ("1.0 1:-100.0 11:100.1\n" == serialized) } it should "serialize dense vector" in { val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema) val rrs = new LibSVMRequestRowSerializer(Some(schema)) val serialized = new String(rrs.serializeRow(row)) assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized) } it should "ignore other columns" in { val schemaWithExtraColumns = StructType(Array( StructField("name", StringType, nullable = false), StructField("label", DoubleType, nullable = false), StructField("features", SQLDataTypes.VectorType, nullable = false), StructField("favorite activity", StringType, nullable = false))) val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) val row = new GenericRowWithSchema(values = Seq("Elizabeth", 1.0, vec, "Crying").toArray, schema = schemaWithExtraColumns) val rrs = new LibSVMRequestRowSerializer(Some(schemaWithExtraColumns)) val serialized = new String(rrs.serializeRow(row)) assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized) } it should "fail on invalid features column name" in { val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) intercept[RuntimeException] { new LibSVMRequestRowSerializer(Some(schema), featuresColumnName = "i do not exist dear sir!") } } it should "fail on invalid label column name" in { val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) intercept[RuntimeException] { new LibSVMRequestRowSerializer(Some(schema), labelColumnName = "Sir! I must protest! I do not exist!") } } it should "fail on invalid types" in { val schemaWithInvalidLabelType = StructType(Array( StructField("label", StringType, nullable = false), StructField("features", SQLDataTypes.VectorType, nullable = false))) intercept[RuntimeException] { new LibSVMRequestRowSerializer(Some(schemaWithInvalidLabelType)) } val schemaWithInvalidFeaturesType = StructType(Array( StructField("label", DoubleType, nullable = false), StructField("features", StringType, nullable = false))) intercept[RuntimeException] { new LibSVMRequestRowSerializer(Some(schemaWithInvalidFeaturesType)) } } it should "validate correct schema" in { val validSchema = StructType(Array( StructField("label", DoubleType, nullable = false), StructField("features", SQLDataTypes.VectorType, nullable = false))) new LibSVMRequestRowSerializer(Some(validSchema)) } }
Example 89
Source File: Binarizer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.BinaryAttribute import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) val td = $(threshold) val binarizer = udf { in: Double => if (in > td) 1.0 else 0.0 } val outputColName = $(outputCol) val metadata = BinaryAttribute.defaultAttr.withName(outputColName).toMetadata() dataset.select(col("*"), binarizer(col($(inputCol))).as(outputColName, metadata)) } override def transformSchema(schema: StructType): StructType = { SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType) val inputFields = schema.fields val outputColName = $(outputCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") val attr = BinaryAttribute.defaultAttr.withName(outputColName) val outputFields = inputFields :+ attr.toStructField() StructType(outputFields) } override def copy(extra: ParamMap): Binarizer = defaultCopy(extra) } @Since("1.6.0") object Binarizer extends DefaultParamsReadable[Binarizer] { @Since("1.6.0") override def load(path: String): Binarizer = super.load(path) }
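For reference, a minimal usage sketch of the Binarizer transformer, using the SparkSession-based API of later Spark releases; the column names, values, and threshold are illustrative only:

import org.apache.spark.ml.feature.Binarizer
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("binarizer-demo").getOrCreate()
import spark.implicits._

// A DoubleType input column is required, as enforced by transformSchema above.
val df = Seq(0.1, 0.6, 1.8).toDF("score")

val binarizer = new Binarizer()
  .setInputCol("score")
  .setOutputCol("label")
  .setThreshold(0.5)

binarizer.transform(df).show()
// Values strictly greater than the threshold map to 1.0, the rest to 0.0.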
Example 90
Source File: BinaryClassificationEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("1.2.0") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol)) .map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
Example 91
Source File: MulticlassClassificationEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, SchemaUtils, Identifiable} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("1.5.0") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) val predictionAndLabels = dataset.select($(predictionCol), $(labelCol)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "precision" => metrics.precision case "recall" => metrics.recall case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall } metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "f1" => true case "precision" => true case "recall" => true case "weightedPrecision" => true case "weightedRecall" => true } @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 92
Source File: RegressionEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("1.4.0") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema val predictionColName = $(predictionCol) val predictionType = schema($(predictionCol)).dataType require(predictionType == FloatType || predictionType == DoubleType, s"Prediction column $predictionColName must be of type float or double, " + s" but not $predictionType") val labelColName = $(labelCol) val labelType = schema($(labelCol)).dataType require(labelType == FloatType || labelType == DoubleType, s"Label column $labelColName must be of type float or double, but not $labelType") val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
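A short usage sketch of the evaluator; it assumes a placeholder DataFrame named predictions with numeric "prediction" and "label" columns, for example the output of a fitted regression model:

import org.apache.spark.ml.evaluation.RegressionEvaluator

val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("rmse")

val rmse = evaluator.evaluate(predictions)
// isLargerBetter is false for "rmse", so model selection treats smaller values as better.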
Example 93
Source File: LibSVMRelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm

import com.google.common.base.Objects

import org.apache.spark.Logging
import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrameReader, DataFrame, Row, SQLContext}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

@Since("1.6.0")
class DefaultSource extends RelationProvider with DataSourceRegister {

  @Since("1.6.0")
  override def shortName(): String = "libsvm"

  @Since("1.6.0")
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String])
    : BaseRelation = {
    val path = parameters.getOrElse("path",
      throw new IllegalArgumentException("'path' must be specified"))
    val numFeatures = parameters.getOrElse("numFeatures", "-1").toInt
    val vectorType = parameters.getOrElse("vectorType", "sparse")
    new LibSVMRelation(path, numFeatures, vectorType)(sqlContext)
  }
}
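Because shortName() registers this source as "libsvm", callers can load LibSVM files through the generic data source API. A sketch with a placeholder path and option values:

// numFeatures and vectorType map to the parameters read in createRelation above.
val data = sqlContext.read
  .format("libsvm")
  .option("numFeatures", "780")
  .option("vectorType", "sparse")
  .load("data/sample_libsvm_data.txt")

// Resulting schema: label (double), features (vector).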
Example 94
Source File: randomExpressions.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
import org.apache.spark.sql.types.{DataType, DoubleType}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom

case class Randn(seed: Long) extends RDG {
  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  def this() = this(Utils.random.nextLong())

  def this(seed: Expression) = this(seed match {
    case IntegerLiteral(s) => s
    case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
  })

  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
    val rngTerm = ctx.freshName("rng")
    val className = classOf[XORShiftRandom].getName
    ctx.addMutableState(className, rngTerm,
      s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());")
    ev.isNull = "false"
    s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian();
    """
  }
}
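The Randn expression backs the randn() function in org.apache.spark.sql.functions. A small sketch of typical use, assuming some DataFrame df:

import org.apache.spark.sql.functions.randn

// Appends a DoubleType column of samples drawn from N(0, 1); the seed argument is optional.
val withNoise = df.withColumn("noise", randn(42L))
// SQL equivalent: SELECT *, randn(42) AS noise FROM table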
Example 95
Source File: SemiJoinSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.sql.{SQLConf, DataFrame, Row} import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys import org.apache.spark.sql.catalyst.plans.Inner import org.apache.spark.sql.catalyst.plans.logical.Join import org.apache.spark.sql.catalyst.expressions.{And, LessThan, Expression} import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan, SparkPlanTest} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType} class SemiJoinSuite extends SparkPlanTest with SharedSQLContext { private lazy val left = sqlContext.createDataFrame( sparkContext.parallelize(Seq( Row(1, 2.0), Row(1, 2.0), Row(2, 1.0), Row(2, 1.0), Row(3, 3.0), Row(null, null), Row(null, 5.0), Row(6, null) )), new StructType().add("a", IntegerType).add("b", DoubleType)) private lazy val right = sqlContext.createDataFrame( sparkContext.parallelize(Seq( Row(2, 3.0), Row(2, 3.0), Row(3, 2.0), Row(4, 1.0), Row(null, null), Row(null, 5.0), Row(6, null) )), new StructType().add("c", IntegerType).add("d", DoubleType)) private lazy val condition = { And((left.col("a") === right.col("c")).expr, LessThan(left.col("b").expr, right.col("d").expr)) } // Note: the input dataframes and expression must be evaluated lazily because // the SQLContext should be used only within a test to keep SQL tests stable private def testLeftSemiJoin( testName: String, leftRows: => DataFrame, rightRows: => DataFrame, condition: => Expression, expectedAnswer: Seq[Product]): Unit = { def extractJoinParts(): Option[ExtractEquiJoinKeys.ReturnType] = { val join = Join(leftRows.logicalPlan, rightRows.logicalPlan, Inner, Some(condition)) ExtractEquiJoinKeys.unapply(join) } test(s"$testName using LeftSemiJoinHash") { extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) => withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => EnsureRequirements(left.sqlContext).apply( LeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition)), expectedAnswer.map(Row.fromTuple), sortAnswers = true) } } } test(s"$testName using BroadcastLeftSemiJoinHash") { extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) => withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => BroadcastLeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition), expectedAnswer.map(Row.fromTuple), sortAnswers = true) } } } test(s"$testName using LeftSemiJoinBNL") { withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => LeftSemiJoinBNL(left, right, Some(condition)), expectedAnswer.map(Row.fromTuple), sortAnswers = true) } } } testLeftSemiJoin( "basic test", left, right, condition, Seq( (2, 1.0), (2, 1.0) ) ) }
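The physical operators exercised here back what the DataFrame API exposes as a left semi join, which keeps only the left-side rows that have at least one match on the right. A sketch over the same left, right, and condition as in the suite:

// Left semi join: no columns from the right side appear in the output.
val semi = left.join(right, left("a") === right("c") && left("b") < right("d"), "leftsemi")
semi.show()
// For the test data this yields the two (2, 1.0) rows, matching expectedAnswer.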
Example 96
Source File: IrisKMeansClusteringSpec.scala From spark-spec with MIT License | 5 votes |
package com.github.mrpowers.spark.spec.ml.clustering import com.github.mrpowers.spark.daria.sql.SparkSessionExt._ import com.github.mrpowers.spark.fast.tests.ColumnComparer import com.github.mrpowers.spark.spec.SparkSessionTestWrapper import org.apache.spark.ml.evaluation.ClusteringEvaluator import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType import org.scalatest.FunSpec class IrisKMeansClusteringSpec extends FunSpec with SparkSessionTestWrapper with ColumnComparer { describe("withVectorizedFeatures") { it("converts all the features to a vector without blowing up") { val df = spark.createDF( List( (5.1, 3.5, 1.4, 0.2) ), List( ("SepalLengthCm", DoubleType, true), ("SepalWidthCm", DoubleType, true), ("PetalLengthCm", DoubleType, true), ("PetalWidthCm", DoubleType, true) ) ).transform(IrisKMeansClustering.withVectorizedFeatures()) df.show() df.printSchema() } } describe("model") { it("prints the cluster centers") { println("Cluster Centers: ") IrisKMeansClustering.model().clusterCenters.foreach(println) } it("trains a KMeans Clustering model that's Silhouette with squared euclidean distance above 0.70 percent") { val trainData: DataFrame = IrisKMeansClustering.trainingDF .transform(IrisKMeansClustering.withVectorizedFeatures()) .select("features") val testData: DataFrame = IrisKMeansClustering.testDF .transform(IrisKMeansClustering.withVectorizedFeatures()) .select("features") val predictions: DataFrame = IrisKMeansClustering .model() .transform(testData) .select( col("features"), col("prediction") ) val res = new ClusteringEvaluator() .evaluate(predictions) assert(res >= 0.60) } } }
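The helpers referenced above (withVectorizedFeatures, model) are project-specific, but they plausibly wrap the standard VectorAssembler-plus-KMeans flow. A hypothetical sketch with the same column names; irisDF, k = 3, and the seed are assumptions, not the project's actual values:

import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.feature.VectorAssembler

// Assemble the four DoubleType measurement columns into a single "features" vector.
val assembler = new VectorAssembler()
  .setInputCols(Array("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"))
  .setOutputCol("features")

val features = assembler.transform(irisDF).select("features")
val model = new KMeans().setK(3).setSeed(1L).fit(features)
model.clusterCenters.foreach(println)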
Example 97
Source File: DatasetUtil.scala From sona with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata} import org.apache.spark.sql.{Column, DataFrame, Dataset} object DatasetUtil { def withColumns[T](ds: Dataset[T], colNames: Seq[String], cols: Seq[Column], metadata: Seq[Metadata]): DataFrame = { require(colNames.size == cols.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of columns: ${cols.size}") require(colNames.size == metadata.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of metadata elements: ${metadata.size}") val sparkSession = ds.sparkSession val queryExecution = ds.queryExecution val resolver = sparkSession.sessionState.analyzer.resolver val output = queryExecution.analyzed.output checkColumnNameDuplication(colNames, "in given column names", sparkSession.sessionState.conf.caseSensitiveAnalysis) val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) => colName -> col.as(colName, metadata) }.toMap val replacedAndExistingColumns = output.map { field => columnMap.find { case (colName, _) => resolver(field.name, colName) } match { case Some((colName: String, col: Column)) => col.as(colName) case _ => new Column(field) } } val newColumns = columnMap.filter { case (colName, col) => !output.exists(f => resolver(f.name, colName)) }.map { case (colName, col) => col.as(colName) } ds.select(replacedAndExistingColumns ++ newColumns: _*) } def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = { withColumns(ds, Seq(colName), Seq(col), Seq(metadata)) } private def checkColumnNameDuplication(columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = { val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase) if (names.distinct.length != names.length) { val duplicateColumns = names.groupBy(identity).collect { case (x, ys) if ys.length > 1 => s"`$x`" } throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}") } } /** * Cast a column in a Dataset to Vector type. * * The supported data types of the input column are * - Vector * - float/double type Array. * * Note: The returned column does not have Metadata. * * @param dataset input DataFrame * @param colName column name. * @return Vector column */ def columnToVector(dataset: Dataset[_], colName: String): Column = { val columnDataType = dataset.schema(colName).dataType columnDataType match { case _: VectorUDT => col(colName) case fdt: ArrayType => val transferUDF = fdt.elementType match { case _: FloatType => udf(f = (vector: Seq[Float]) => { val inputArray = Array.fill[Double](vector.size)(0.0) vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble) Vectors.dense(inputArray) }) case _: DoubleType => udf((vector: Seq[Double]) => { Vectors.dense(vector.toArray) }) case other => throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector") } transferUDF(col(colName)) case other => throw new IllegalArgumentException(s"$other column cannot be cast to Vector") } } }
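A brief usage sketch of columnToVector, assuming a DataFrame df with an Array[Double] column named "features_array"; the returned Column can be attached with withColumn:

// Casts a float/double array column (or passes a Vector column through) to Vector type.
val withVector = df.withColumn("features_vec", DatasetUtil.columnToVector(df, "features_array"))

As noted in the method's documentation, the resulting column carries no Metadata.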
Example 98
Source File: RegressionEvaluator.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.evaluation import com.tencent.angel.sona.ml.evaluation.evaluating.RegressionSummaryImpl import com.tencent.angel.sona.ml.param.{Param, ParamMap, ParamValidators} import com.tencent.angel.sona.ml.param.shared.{HasLabelCol, HasPredictionCol} import com.tencent.angel.sona.ml.util._ import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{DoubleType, FloatType} import org.apache.spark.sql.util.SONASchemaUtils /** * :: Experimental :: * Evaluator for regression, which expects two input columns: prediction and label. */ final class RegressionEvaluator(override val uid: String) extends Evaluator with HasPredictionCol with HasLabelCol with DefaultParamsWritable { def this() = this(Identifiable.randomUID("regEval")) /** * Param for metric name in evaluation. Supports: * - `"rmse"` (default): root mean squared error * - `"mse"`: mean squared error * - `"r2"`: R^2^ metric * - `"mae"`: mean absolute error * * @group param */ val metricName: Param[String] = { val allowedParams = ParamValidators.inArray(Array("mse", "rmse", "r2", "mae")) new Param(this, "metricName", "metric name in evaluation (mse|rmse|r2|mae)", allowedParams) } def getMetricName: String = $(metricName) def setMetricName(value: String): this.type = set(metricName, value) def setPredictionCol(value: String): this.type = set(predictionCol, value) def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SONASchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SONASchemaUtils.checkNumericType(schema, $(labelCol)) val summary = new RegressionSummaryImpl(dataset.toDF(), $(predictionCol), $(labelCol)) val metrics = summary.regMetrics val metric = $(metricName) match { case "rmse" => summary.rmse case "mse" => summary.mse case "r2" => summary.r2 case "mae" => summary.absDiff } metric } override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { override def load(path: String): RegressionEvaluator = super.load(path) }
Example 99
Source File: Predictor.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.common import com.tencent.angel.mlcore.conf.{MLCoreConf, SharedConf} import com.tencent.angel.ml.math2.utils.{DataBlock, LabeledData} import org.apache.spark.broadcast.Broadcast import com.tencent.angel.sona.ml.common.MathImplicits._ import com.tencent.angel.sona.core.{AngelGraphModel, ExecutorContext} import com.tencent.angel.sona.data.LocalMemoryDataBlock import org.apache.spark.linalg import org.apache.spark.linalg.Vectors import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import org.apache.spark.sql.{Row, SPKSQLUtils} import scala.collection.mutable.ListBuffer class Predictor(bcValue: Broadcast[ExecutorContext], featIdx: Int, predictionCol: String, probabilityCol: String, bcConf: Broadcast[SharedConf]) extends Serializable { @transient private lazy val executorContext: ExecutorContext = { bcValue.value } @transient private lazy implicit val dim: Long = { executorContext.conf.getLong(MLCoreConf.ML_FEATURE_INDEX_RANGE) } @transient private lazy val appendedSchema: StructType = if (probabilityCol.nonEmpty) { new StructType(Array[StructField](StructField(probabilityCol, DoubleType), StructField(predictionCol, DoubleType))) } else { new StructType(Array[StructField](StructField(predictionCol, DoubleType))) } def predictRDD(data: Iterator[Row]): Iterator[Row] = { val localModel = executorContext.borrowModel(bcConf.value) val batchSize = 1024 val storage = new LocalMemoryDataBlock(batchSize, batchSize * 1024 * 1024) var count = 0 val cachedRows: Array[Row] = new Array[Row](batchSize) val result: ListBuffer[Row] = ListBuffer[Row]() data.foreach { case row if count != 0 && count % batchSize == 0 => predictInternal(localModel, storage, cachedRows, result) storage.clean() storage.put(new LabeledData(row.get(featIdx).asInstanceOf[linalg.Vector], 0.0)) cachedRows(count % batchSize) = row count += 1 case row => storage.put(new LabeledData(row.get(featIdx).asInstanceOf[linalg.Vector], 0.0)) cachedRows(count % batchSize) = row count += 1 } predictInternal(localModel, storage, cachedRows, result) executorContext.returnModel(localModel) result.toIterator } private def predictInternal(model: AngelGraphModel, storage: DataBlock[LabeledData], cachedRows: Array[Row], result: ListBuffer[Row]): Unit = { val predicted = model.predict(storage) if (appendedSchema.length == 1) { predicted.zipWithIndex.foreach { case (res, idx) => result.append(SPKSQLUtils.append(cachedRows(idx), appendedSchema, res.pred)) } } else { predicted.zipWithIndex.foreach { case (res, idx) => result.append(SPKSQLUtils.append(cachedRows(idx), appendedSchema, res.proba, res.predLabel)) } } } def predictRaw(features: linalg.Vector): linalg.Vector = { val localModel = executorContext.borrowModel(bcConf.value) val res = localModel.predict(new LabeledData(features, 0.0)) executorContext.returnModel(localModel) Vectors.dense(res.pred, -res.pred) } }
Example 100
Source File: MomentAggState.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.sql.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

import io.projectglow.common.GlowLogging

  def toInternalRow(row: InternalRow, offset: Int = 0): InternalRow = {
    row.update(offset, if (count > 0) mean else null)
    row.update(offset + 1, if (count > 0) Math.sqrt(m2 / (count - 1)) else null)
    row.update(offset + 2, if (count > 0) min else null)
    row.update(offset + 3, if (count > 0) max else null)
    row
  }

  def toInternalRow: InternalRow = {
    toInternalRow(new GenericInternalRow(4))
  }
}

object MomentAggState extends GlowLogging {
  val schema = StructType(
    Seq(
      StructField("mean", DoubleType),
      StructField("stdDev", DoubleType),
      StructField("min", DoubleType),
      StructField("max", DoubleType)
    )
  )

  def merge(s1: MomentAggState, s2: MomentAggState): MomentAggState = {
    if (s1.count == 0) {
      return s2
    } else if (s2.count == 0) {
      return s1
    }

    val newState = MomentAggState()
    newState.count = s1.count + s2.count
    val delta = s2.mean - s1.mean
    val deltaN = delta / newState.count
    newState.mean = s1.mean + deltaN * s2.count

    // higher order moments computed according to:
    // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics
    newState.m2 = s1.m2 + s2.m2 + delta * deltaN * s1.count * s2.count
    newState.min = Math.min(s1.min, s2.min)
    newState.max = Math.max(s1.max, s2.max)
    newState
  }
}
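merge() combines two partial summaries with the pairwise update from the parallel-variance algorithm cited in the code. A small worked sketch, building the partial states by hand; it assumes the count, mean, m2, min, and max fields are publicly assignable vars with defaults, as the merge body suggests:

// s1 summarises Seq(1.0, 2.0, 3.0); s2 summarises Seq(10.0, 20.0).
val s1 = MomentAggState()
s1.count = 3; s1.mean = 2.0; s1.m2 = 2.0; s1.min = 1.0; s1.max = 3.0
val s2 = MomentAggState()
s2.count = 2; s2.mean = 15.0; s2.m2 = 50.0; s2.min = 10.0; s2.max = 20.0

val merged = MomentAggState.merge(s1, s2)
// count = 5, mean = 2.0 + (13.0 / 5) * 2 = 7.2
// m2 = 2.0 + 50.0 + 13.0 * 2.6 * 3 * 2 = 254.8, so sample variance = 254.8 / 4 = 63.7,
// identical to a single pass over Seq(1.0, 2.0, 3.0, 10.0, 20.0).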
Example 101
Source File: QuadTreeIndexedRelation.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.index import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.{DoubleType, IntegerType} import org.apache.spark.storage.StorageLevel import org.apache.spark.sql.simba.partitioner.QuadTreePartitioner import org.apache.spark.sql.simba.spatial.Point private[simba] case class QuadTreeIndexedRelation(output: Seq[Attribute], child: SparkPlan, table_name: Option[String], column_keys: List[Attribute], index_name: String)(var _indexedRDD: IndexedRDD = null, var global_index: QuadTree = null) extends IndexedRelation with MultiInstanceRelation { private def checkKeys: Boolean = { for (i <- column_keys.indices) if (!(column_keys(i).dataType.isInstanceOf[DoubleType] || column_keys(i).dataType.isInstanceOf[IntegerType])) { return false } true } require(checkKeys) if (_indexedRDD == null) { buildIndex() } private[simba] def buildIndex(): Unit = { val numShufflePartitions = simbaSession.sessionState.simbaConf.indexPartitions val sampleRate = simbaSession.sessionState.simbaConf.sampleRate val tranferThreshold = simbaSession.sessionState.simbaConf.transferThreshold val dataRDD = child.execute().map(row => { val now = column_keys.map(x => BindReferences.bindReference(x, child.output).eval(row).asInstanceOf[Number].doubleValue() ).toArray (new Point(now), row) }) val dimension = column_keys.length val (partitionedRDD, _, global_qtree) = QuadTreePartitioner(dataRDD, dimension, numShufflePartitions, sampleRate, tranferThreshold) val indexed = partitionedRDD.mapPartitions { iter => val data = iter.toArray val index: QuadTree = if (data.length > 0) QuadTree(data.map(_._1).zipWithIndex) else null Array(IPartition(data.map(_._2), index)).iterator }.persist(StorageLevel.MEMORY_AND_DISK_SER) indexed.setName(table_name.map(name => s"$name $index_name").getOrElse(child.toString)) _indexedRDD = indexed global_index = global_qtree } override def newInstance(): IndexedRelation = { new QuadTreeIndexedRelation(output.map(_.newInstance()), child, table_name, column_keys, index_name)(_indexedRDD) .asInstanceOf[this.type] } override def withOutput(new_output: Seq[Attribute]): IndexedRelation = { new QuadTreeIndexedRelation(new_output, child, table_name, column_keys, index_name)(_indexedRDD, global_index) } }
Example 102
Source File: GBTClassificationModel.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.models import org.apache.spark.ml.classification.{GBTClassificationModel => SparkGBTClassificationModel, GBTClassifier => SparkGBTClassifier} import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import io.deepsense.commons.utils.Logging import io.deepsense.deeplang.doperables.report.CommonTablesGenerators.SparkSummaryEntry import io.deepsense.deeplang.doperables.report.{CommonTablesGenerators, Report} import io.deepsense.deeplang.doperables.spark.wrappers.params.common.PredictorParams import io.deepsense.deeplang.doperables.stringindexingwrapper.StringIndexingWrapperModel import io.deepsense.deeplang.doperables.{LoadableWithFallback, SparkModelWrapper} import io.deepsense.deeplang.params.Param import io.deepsense.sparkutils.ML class GBTClassificationModel(vanilaModel: VanillaGBTClassificationModel) extends StringIndexingWrapperModel[SparkGBTClassificationModel, SparkGBTClassifier](vanilaModel) { def this() = this(new VanillaGBTClassificationModel()) } class VanillaGBTClassificationModel() extends SparkModelWrapper[SparkGBTClassificationModel, SparkGBTClassifier] with LoadableWithFallback[SparkGBTClassificationModel, SparkGBTClassifier] with PredictorParams with Logging { override private[deeplang] def _transformSchema(schema: StructType): Option[StructType] = { val predictionColumnName = $(predictionColumn) Some(StructType(schema.fields :+ StructField(predictionColumnName, DoubleType))) } override val params: Array[Param[_]] = Array(featuresColumn, predictionColumn) override def report: Report = { val summary = List( SparkSummaryEntry( name = "number of features", value = sparkModel.numFeatures, description = "Number of features the model was trained on.")) super.report .withReportName( s"${this.getClass.getSimpleName} with ${sparkModel.numTrees} trees") .withAdditionalTable(CommonTablesGenerators.modelSummary(summary)) .withAdditionalTable( CommonTablesGenerators.decisionTree( sparkModel.treeWeights, sparkModel.trees), 2) } override protected def transformerName: String = classOf[GBTClassificationModel].getSimpleName override def tryToLoadModel(path: String): Option[SparkGBTClassificationModel] = { ML.ModelLoading.GBTClassification(path) } }
Example 103
Source File: RandomForestClassificationModel.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.models import org.apache.spark.ml.classification.{RandomForestClassificationModel => SparkRandomForestClassificationModel, RandomForestClassifier => SparkRandomForestClassifier} import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import io.deepsense.deeplang.doperables.report.CommonTablesGenerators.SparkSummaryEntry import io.deepsense.deeplang.doperables.report.{CommonTablesGenerators, Report} import io.deepsense.deeplang.doperables.spark.wrappers.params.common.ProbabilisticClassifierParams import io.deepsense.deeplang.doperables.stringindexingwrapper.StringIndexingWrapperModel import io.deepsense.deeplang.doperables.{LoadableWithFallback, SparkModelWrapper} import io.deepsense.deeplang.params.Param import io.deepsense.sparkutils.ML class RandomForestClassificationModel( vanillaModel: VanillaRandomForestClassificationModel) extends StringIndexingWrapperModel[ SparkRandomForestClassificationModel, SparkRandomForestClassifier](vanillaModel) { def this() = this(new VanillaRandomForestClassificationModel()) } class VanillaRandomForestClassificationModel extends SparkModelWrapper[ SparkRandomForestClassificationModel, SparkRandomForestClassifier] with LoadableWithFallback[ SparkRandomForestClassificationModel, SparkRandomForestClassifier] with ProbabilisticClassifierParams { override private[deeplang] def _transformSchema(schema: StructType): Option[StructType] = { val predictionColumnName = $(predictionColumn) val probabilityColumnName = $(probabilityColumn) val rawPredictionColumnName = $(rawPredictionColumn) Some(StructType(schema.fields ++ Seq( StructField(predictionColumnName, DoubleType), StructField(probabilityColumnName, new io.deepsense.sparkutils.Linalg.VectorUDT), StructField(rawPredictionColumnName, new io.deepsense.sparkutils.Linalg.VectorUDT) ))) } override val params: Array[Param[_]] = Array( featuresColumn, predictionColumn, probabilityColumn, rawPredictionColumn) // thresholds override def report: Report = { val treeWeight = SparkSummaryEntry( name = "tree weights", value = sparkModel.treeWeights, description = "Weights for each tree." ) super.report .withAdditionalTable(CommonTablesGenerators.modelSummary(List(treeWeight))) } override protected def transformerName: String = classOf[RandomForestClassificationModel].getSimpleName override def tryToLoadModel(path: String): Option[SparkRandomForestClassificationModel] = { ML.ModelLoading.randomForestClassification(path) } }
Example 104
Source File: UnionIntegSpec.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperations import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} import io.deepsense.deeplang.doperables.dataframe.DataFrame import io.deepsense.deeplang.doperations.exceptions.SchemaMismatchException import io.deepsense.deeplang.inference.{InferContext, InferenceWarnings} import io.deepsense.deeplang.{DKnowledge, DeeplangIntegTestSupport} class UnionIntegSpec extends DeeplangIntegTestSupport { import DeeplangIntegTestSupport._ val schema1 = StructType(List( StructField("column1", DoubleType), StructField("column2", DoubleType))) val rows1_1 = Seq( Row(1.0, 2.0), Row(2.0, 3.0) ) "Union" should { "return a union of two DataFrames" in { val rows1_2 = Seq( Row(2.0, 4.0), Row(4.0, 6.0) ) val df1 = createDataFrame(rows1_1, schema1) val df2 = createDataFrame(rows1_2, schema1) val merged = Union() .executeUntyped(Vector(df1, df2))(executionContext) .head.asInstanceOf[DataFrame] assertDataFramesEqual( merged, createDataFrame(rows1_1 ++ rows1_2, schema1)) } "throw for mismatching types in DataFrames" in { val schema2 = StructType(List( StructField("column1", StringType), StructField("column2", DoubleType))) val rows2_1 = Seq( Row("a", 1.0), Row("b", 1.0) ) val df1 = createDataFrame(rows1_1, schema1) val df2 = createDataFrame(rows2_1, schema2) a [SchemaMismatchException] should be thrownBy { Union().executeUntyped(Vector(df1, df2))(executionContext) } } "throw for mismatching column names in DataFrames" in { val schema2 = StructType(List( StructField("column1", DoubleType), StructField("different_column_name", DoubleType))) val rows2_1 = Seq( Row(1.1, 1.0), Row(1.1, 1.0) ) val df1 = createDataFrame(rows1_1, schema1) val df2 = createDataFrame(rows2_1, schema2) a [SchemaMismatchException] should be thrownBy { Union().executeUntyped(Vector(df1, df2))(executionContext) } } } it should { "propagate schema when both schemas match" in { val structType = StructType(Seq( StructField("x", DoubleType), StructField("y", DoubleType))) val knowledgeDF1 = DKnowledge(DataFrame.forInference(structType)) val knowledgeDF2 = DKnowledge(DataFrame.forInference(structType)) Union().inferKnowledgeUntyped(Vector(knowledgeDF1, knowledgeDF2))(mock[InferContext]) shouldBe (Vector(knowledgeDF1), InferenceWarnings()) } "generate error when schemas don't match" in { val structType1 = StructType(Seq( StructField("x", DoubleType))) val structType2 = StructType(Seq( StructField("y", DoubleType))) val knowledgeDF1 = DKnowledge(DataFrame.forInference(structType1)) val knowledgeDF2 = DKnowledge(DataFrame.forInference(structType2)) an [SchemaMismatchException] shouldBe thrownBy( Union().inferKnowledgeUntyped(Vector(knowledgeDF1, knowledgeDF2))(mock[InferContext])) } } }
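For comparison, the plain Spark equivalent of the operation under test; df1 and df2 are placeholders for two DataFrames with identical schemas:

// Union is positional: both inputs must have the same number, order, and types of columns.
val merged = df1.union(df2)        // Spark 2.x and later
// val merged = df1.unionAll(df2)  // the Spark 1.x name for the same operation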
Example 105
Source File: DataFrameReportPerformanceSpec.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.dataframe import java.sql.Timestamp import java.text.{DateFormat, SimpleDateFormat} import java.util.TimeZone import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DoubleType, StructField, StructType, TimestampType} import org.scalatest.{BeforeAndAfter, Ignore} import io.deepsense.commons.utils.{DoubleUtils, Logging} import io.deepsense.deeplang.{TestFiles, DeeplangIntegTestSupport} // It's ignored because it does not have got assertions, it only prints report generation time. @Ignore class DataFrameReportPerformanceSpec extends DeeplangIntegTestSupport with BeforeAndAfter with TestFiles with Logging { val testFile = absoluteTestsDirPath.pathWithoutScheme + "/demand_without_header.csv" "DataFrame" should { "generate report" when { "DataFrame has 17K of rows" in { val numberOfTries = 10 var results: Seq[Double] = Seq() for (i <- 1 to numberOfTries) { val dataFrame: DataFrame = demandDataFrame() val start = System.nanoTime() val report = dataFrame.report val end = System.nanoTime() val time1: Double = (end - start).toDouble / 1000000000.0 results = results :+ time1 logger.debug("Report generation time: {}", DoubleUtils.double2String(time1)) } logger.debug( "Mean report generation time: {}", DoubleUtils.double2String(results.fold(0D)(_ + _) / numberOfTries.toDouble)) } } } private def demandDataFrame(): DataFrame = { val rddString: RDD[String] = executionContext.sparkContext.textFile(testFile) val data: RDD[Row] = rddString.map(DataFrameHelpers.demandString2Row) executionContext.dataFrameBuilder.buildDataFrame(demandSchema, data) } private def demandSchema: StructType = StructType(Seq( StructField("datetime", TimestampType), StructField("log_count", DoubleType), StructField("workingday", DoubleType), StructField("holiday", DoubleType), StructField("season2", DoubleType), StructField("season3", DoubleType), StructField("season4", DoubleType))) private def timestamp(s: String): Timestamp = { val format: DateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") format.setTimeZone(TimeZone.getTimeZone("UTC")) new Timestamp(format.parse(s).getTime) } } private object DataFrameHelpers { def demandString2Row(s: String): Row = { val split = s.split(",") Row( timestamp(split(0)), split(1).toDouble, split(2).toDouble, split(3).toDouble, split(4).toDouble, split(5).toDouble, split(6).toDouble ) } private def timestamp(s: String): Timestamp = { val format: DateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") format.setTimeZone(TimeZone.getTimeZone("UTC")) new Timestamp(format.parse(s).getTime) } }
Example 106
Source File: AbstractEvaluatorSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} import io.deepsense.deeplang.doperables.dataframe.DataFrame import io.deepsense.deeplang.params.ParamPair import io.deepsense.deeplang.{DKnowledge, DeeplangIntegTestSupport} import io.deepsense.sparkutils.Linalg.Vectors abstract class AbstractEvaluatorSmokeTest extends DeeplangIntegTestSupport { def className: String val evaluator: Evaluator val evaluatorParams: Seq[ParamPair[_]] val inputDataFrameSchema = StructType(Seq( StructField("s", StringType), StructField("prediction", DoubleType), StructField("rawPrediction", new io.deepsense.sparkutils.Linalg.VectorUDT), StructField("label", DoubleType) )) val inputDataFrame: DataFrame = { val rowSeq = Seq( Row("aAa bBb cCc dDd eEe f", 1.0, Vectors.dense(2.1, 2.2, 2.3), 3.0), Row("das99213 99721 8i!#@!", 4.0, Vectors.dense(5.1, 5.2, 5.3), 6.0) ) createDataFrame(rowSeq, inputDataFrameSchema) } def setUpStubs(): Unit = () className should { "successfully run _evaluate()" in { setUpStubs() evaluator.set(evaluatorParams: _*)._evaluate(executionContext, inputDataFrame) } "successfully run _infer()" in { evaluator.set(evaluatorParams: _*)._infer(DKnowledge(inputDataFrame)) } "successfully run report" in { evaluator.set(evaluatorParams: _*).report } } }
Example 107
Source File: BinarizerSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{DataType, DoubleType} import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class BinarizerSmokeTest extends AbstractTransformerWrapperSmokeTest[Binarizer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: Binarizer = { val inPlace = NoInPlaceChoice() .setOutputColumn("binarizerOutput") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("d")) .setInPlace(inPlace) val binarizer = new Binarizer() binarizer.set( binarizer.singleOrMultiChoiceParam -> single, binarizer.threshold -> 0.5) } override def testValues: Seq[(Any, Any)] = { val inputNumbers = Seq(0.2, 0.5, 1.8) val outputNumbers = Seq(0.0, 0.0, 1.0) inputNumbers.zip(outputNumbers) } override def inputType: DataType = DoubleType override def outputType: DataType = DoubleType }
Example 108
Source File: OneHotEncoderSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import io.deepsense.sparkutils.Linalg.Vectors import org.apache.spark.sql.types.{DataType, DoubleType} import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class OneHotEncoderSmokeTest extends AbstractTransformerWrapperSmokeTest[OneHotEncoder] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: OneHotEncoder = { val inPlace = NoInPlaceChoice() .setOutputColumn("oneHotEncoderOutput") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("d")) .setInPlace(inPlace) val oneHotEncoder = new OneHotEncoder() oneHotEncoder.set( oneHotEncoder.singleOrMultiChoiceParam -> single, oneHotEncoder.dropLast -> false) } override def testValues: Seq[(Any, Any)] = { val inputNumbers = Seq(0.0, 1.0) val outputNumbers = Seq(Vectors.dense(1.0, 0.0), Vectors.dense(0.0, 1.0)) inputNumbers.zip(outputNumbers) } override def inputType: DataType = DoubleType override def outputType: DataType = new io.deepsense.sparkutils.Linalg.VectorUDT }
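A hypothetical usage sketch of the underlying Spark transformer on a plain DataFrame. Note the version difference: before Spark 3.0 OneHotEncoder is a pure Transformer, as assumed here; from 3.0 onward it is an estimator that must be fit before transforming.

import org.apache.spark.ml.feature.OneHotEncoder

// A DoubleType category index becomes an indicator vector;
// dropLast = false keeps a slot for every category, as in the test above.
val encoder = new OneHotEncoder()
  .setInputCol("d")
  .setOutputCol("oneHotEncoderOutput")
  .setDropLast(false)

val encoded = encoder.transform(indexedDF)   // indexedDF is a placeholder input
// 0.0 -> (1.0, 0.0), 1.0 -> (0.0, 1.0), matching testValues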
Example 109
Source File: GBTClassifierSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.estimators import org.apache.spark.sql.types.{DoubleType, Metadata, StructType} import io.deepsense.deeplang.doperables.dataframe.DataFrame import io.deepsense.deeplang.doperables.spark.wrappers.params.common.ClassificationImpurity import io.deepsense.deeplang.params.ParamPair import io.deepsense.deeplang.params.selections.NameSingleColumnSelection import io.deepsense.deeplang.utils.DataFrameUtils class GBTClassifierSmokeTest extends AbstractEstimatorModelWrapperSmokeTest { override def className: String = "GBTClassifier" override val estimator = new GBTClassifier() private val labelColumnName = "myRating" import estimator.vanillaGBTClassifier._ override val estimatorParams: Seq[ParamPair[_]] = Seq( featuresColumn -> NameSingleColumnSelection("myFeatures"), impurity -> ClassificationImpurity.Entropy(), labelColumn -> NameSingleColumnSelection(labelColumnName), lossType -> GBTClassifier.Logistic(), maxBins -> 2.0, maxDepth -> 6.0, maxIterations -> 10.0, minInfoGain -> 0.0, minInstancesPerNode -> 1, predictionColumn -> "prediction", seed -> 100.0, stepSize -> 0.11, subsamplingRate -> 0.999 ) override def assertTransformedDF(dataFrame: DataFrame): Unit = { val possibleValues = DataFrameUtils.collectValues(dataFrame, labelColumnName) val actualValues = DataFrameUtils.collectValues(dataFrame, "prediction") actualValues.diff(possibleValues) shouldBe empty } override def assertTransformedSchema(schema: StructType): Unit = { val predictionColumn = schema.fields.last predictionColumn.name shouldBe "prediction" predictionColumn.dataType shouldBe DoubleType predictionColumn.metadata shouldBe Metadata.empty } }
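The wrapper parameters above map onto the plain spark.ml estimator roughly as follows; trainDF with a "myFeatures" vector column and a "myRating" double label is an assumption, and the impurity setting is left at its default here:

import org.apache.spark.ml.classification.GBTClassifier

val gbt = new GBTClassifier()
  .setFeaturesCol("myFeatures")
  .setLabelCol("myRating")
  .setPredictionCol("prediction")
  .setLossType("logistic")
  .setMaxBins(2)
  .setMaxDepth(6)
  .setMaxIter(10)
  .setMinInfoGain(0.0)
  .setMinInstancesPerNode(1)
  .setStepSize(0.11)
  .setSubsamplingRate(0.999)
  .setSeed(100L)

val model = gbt.fit(trainDF)
// Predictions land in a DoubleType "prediction" column, as the schema assertion above expects.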
Example 110
Source File: ReportContentTestFactory.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.reportlib.model.factory

import io.deepsense.reportlib.model.{ReportType, ReportContent}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}

trait ReportContentTestFactory {

  import ReportContentTestFactory._

  def testReport: ReportContent = ReportContent(
    reportName,
    reportType,
    Seq(TableTestFactory.testEmptyTable),
    Map(
      ReportContentTestFactory.categoricalDistName ->
        DistributionTestFactory.testCategoricalDistribution(
          ReportContentTestFactory.categoricalDistName),
      ReportContentTestFactory.continuousDistName ->
        DistributionTestFactory.testContinuousDistribution(
          ReportContentTestFactory.continuousDistName)
    )
  )
}

object ReportContentTestFactory extends ReportContentTestFactory {
  val continuousDistName = "continuousDistributionName"
  val categoricalDistName = "categoricalDistributionName"
  val reportName = "TestReportContentName"
  val reportType = ReportType.Empty

  val someReport: ReportContent = ReportContent("empty", ReportType.Empty)
}