org.apache.spark.sql.types._ Scala Examples
The following examples show how to use org.apache.spark.sql.types._.
Example 1
Source File: MiscStatement.scala From spark-snowflake with Apache License 2.0 | 5 votes |
package net.snowflake.spark.snowflake.pushdowns.querygeneration

import net.snowflake.spark.snowflake.{
  ConstantString,
  EmptySnowflakeSQLStatement,
  IntVariable,
  SnowflakeSQLStatement
}
import org.apache.spark.sql.catalyst.expressions.{
  Alias,
  Ascending,
  Attribute,
  Cast,
  DenseRank,
  Descending,
  Expression,
  If,
  In,
  InSet,
  Literal,
  MakeDecimal,
  PercentRank,
  Rank,
  ScalarSubquery,
  ShiftLeft,
  ShiftRight,
  SortOrder,
  UnscaledValue,
  WindowExpression,
  WindowSpecDefinition
}
import org.apache.spark.sql.types.{Decimal, _}

  /**
   * Returns the SQL type name that corresponds to a Spark SQL data type.
   *
   * Presumably used when emitting a CAST in pushed-down SQL (inferred from
   * the method and package names; the call sites are not visible here).
   *
   * @param t the Spark SQL data type to translate
   * @return `Some(typeName)` for string, binary, date, timestamp, decimal,
   *         integer/long, float and double types; `None` for any other type
   */
  private[querygeneration] final def getCastType(t: DataType): Option[String] =
    t match {
      case StringType    => Some("VARCHAR")
      case BinaryType    => Some("BINARY")
      case DateType      => Some("DATE")
      case TimestampType => Some("TIMESTAMP")
      // Decimal carries its own precision and scale into the type name.
      case d: DecimalType => Some(s"DECIMAL(${d.precision}, ${d.scale})")
      case IntegerType | LongType => Some("NUMBER")
      case FloatType     => Some("FLOAT")
      case DoubleType    => Some("DOUBLE")
      // Unsupported type: signalled with None rather than a null sentinel.
      case _             => None
    }
// NOTE(review): this brace closes an enclosing definition whose header is not
// part of this excerpt.
}
Example 2
Source File: KustoResponseDeserializer.scala From azure-kusto-spark with Apache License 2.0 | 5 votes |
// KustoResponseDeserializer: converts a KustoResultSetTable into a Spark schema and rows.
//
// - Companion `apply` simply constructs a KustoResponseDeserializer for the given result.
// - KustoSchema pairs the Spark StructType with the set of column names that are string-casted.
// - getValueTransformer picks a per-column conversion from the lower-cased Kusto type name:
//     "datetime" -> java.sql.Timestamp built from the Joda DateTime millis,
//     "long"     -> Int is widened to Long, anything else is cast to Long,
//     "decimal"  -> BigDecimal parsed from the value cast to String,
//     "string", "int64", "timespan", "sbyte", "double", "int", "int32", "bool", "real" -> unchanged,
//     any other type name -> value.toString.
// - getSchemaFromKustoResult returns an empty StructType and empty set when there are no columns;
//   otherwise each column type is looked up in DataTypeMapping.kustoTypeToSparkTypeMap (falling back
//   to StringType) and the string-casted set is the names of columns whose type equals "TimeSpan",
//   ignoring case.
// - toRows returns a java.util.List[Row] of GenericRowWithSchema using schema.sparkSchema; null
//   cells are left as null, every other cell goes through its column's transformer.
//
// NOTE(review): this snippet looks damaged by text extraction and will not compile as shown:
// `package` and several `import` keywords have no path, and `StructType( =>`, `= =>` and
// `row.toArray() column =>` are missing the expression that should precede the lambda. The whole
// definition is also collapsed onto two physical lines, so each `//` comment swallows the rest of
// its line. Restore from the upstream file before building.
package import java.sql.Timestamp import java.util import{KustoResultColumn, KustoResultSetTable, Results} import import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{StructType, _} import org.joda.time.DateTime import scala.collection.JavaConverters._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer object KustoResponseDeserializer { def apply(kustoResult: KustoResultSetTable): KustoResponseDeserializer = new KustoResponseDeserializer(kustoResult) } // Timespan columns are casted to strings in kusto side. A simple test to compare the translation to a Duration string // in the format of timespan resulted in less performance. One way was using a new expression that extends UnaryExpression, // second was by a udf function, both were less performant. case class KustoSchema(sparkSchema: StructType, toStringCastedColumns: Set[String]) class KustoResponseDeserializer(val kustoResult: KustoResultSetTable) { val schema: KustoSchema = getSchemaFromKustoResult private def getValueTransformer(valueType: String): Any => Any = { valueType.toLowerCase() match { case "string" => value: Any => value case "int64" => value: Any => value case "datetime" => value: Any => new Timestamp(new DateTime(value).getMillis) case "timespan" => value: Any => value case "sbyte" => value: Any => value case "long" => value: Any => value match { case i: Int => i.toLong case _ => value.asInstanceOf[Long] } case "double" => value: Any => value case "decimal" => value: Any => BigDecimal(value.asInstanceOf[String]) case "int" => value: Any => value case "int32" => value: Any => value case "bool" => value: Any => value case "real" => value: Any => value case _ => value: Any => value.toString } } private def getSchemaFromKustoResult: KustoSchema = { if (kustoResult.getColumns.isEmpty) { KustoSchema(StructType(List()), Set()) } else { val columns = kustoResult.getColumns KustoSchema(StructType( => 
StructField(col.getColumnName, DataTypeMapping.kustoTypeToSparkTypeMap.getOrElse(col.getColumnType.toLowerCase, StringType)))), columns.filter(c => c.getColumnType.equalsIgnoreCase("TimeSpan")).map(c => c.getColumnName).toSet) } } def getSchema: KustoSchema = { schema } def toRows: java.util.List[Row] = { val columnInOrder = kustoResult.getColumns val value: util.ArrayList[Row] = new util.ArrayList[Row](kustoResult.count()) // Calculate the transformer function for each column to use later by order val valueTransformers: mutable.Seq[Any => Any] = => getValueTransformer(col.getColumnType)) kustoResult.getData.asScala.foreach(row => { val genericRow = row.toArray() column => { if (column._1 == null) null else valueTransformers(column._2)(column._1) }) value.add(new GenericRowWithSchema(genericRow, schema.sparkSchema)) }) value } // private def getOrderedColumnName = { // val columnInOrder = ArrayBuffer.fill(kustoResult.getColumnNameToIndex.size()){ "" } // kustoResult.getColumns.foreach((columnIndexPair: KustoResultColumn) => columnInOrder(columnIndexPair.) = columnIndexPair._1) // columnInOrder // } }
Example 3
Source File: XGBoostBigModel.scala From uberdata with Apache License 2.0 | 5 votes |
// XGBoostBigModel: forecast model wrapper around a sequence of (ParamMap, XGBoostModel) pairs.
//
// - setLabelcol / setIdcol set the labelCol / idCol params and return this instance.
// - copy builds a new XGBoostBigModel with the same uid and models; the `extra` ParamMap is not used.
// - transform calls predict, keys each input row by its id converted with DataTransformer.toFloat
//   together with the features vector, joins that with the prediction, and emits rows of
//   (id, features, XGBoostAlgorithm name, prediction) using the schema from transformSchema.
// - predict builds labeled points from the id and features columns, caches them, and predicts with
//   the model of `models.head` only.
// - transformSchema ignores its argument and returns getPredictionSchema: the id column (FloatType),
//   the features column (VectorUDT), the algorithm column (StringType) and "prediction" (FloatType).
//
// NOTE(review): this snippet looks damaged by text extraction and will not compile as shown:
// `package` and several `import` keywords have no path, `val features = { case ...` is missing the
// collection/map call in front of the block, and `labelPredict(, booster = model)` is missing its
// first argument. Restore from the upstream file before building.
package import com.cloudera.sparkts.models.UberXGBoostModel import eleflow.uberdata.IUberdataForecastUtil import import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.spark.XGBoostModel import ml.dmlc.xgboost4j.LabeledPoint import org.apache.spark.annotation.DeveloperApi import{VectorUDT, Vector => SparkVector} import import{LabeledPoint => SparkLabeledPoint} import{HasIdCol, HasLabelCol} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, _} class XGBoostBigModel[I](val uid: String, val models: Seq[(ParamMap, XGBoostModel)]) extends ForecastBaseModel[XGBoostBigModel[I]] with HasLabelCol with HasIdCol { def setLabelcol(label: String): this.type = set(labelCol, label) def setIdcol(id: String): this.type = set(idCol, id) override def copy(extra: ParamMap): XGBoostBigModel[I] = new XGBoostBigModel[I](uid, models) override def transform(dataSet: Dataset[_]): DataFrame = { val prediction = predict(dataSet) val rows = dataSet.rdd .map { case (row: Row) => (DataTransformer.toFloat(row.getAs($(idCol))), row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME) ) } .join(prediction) .map { case (id, (features, predictValue)) => Row(id, features, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue) } dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema)) } protected def predict(dataSet: Dataset[_]) = { val features = { case (row: Row) => val features = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME) val id = row.getAs[I]($(idCol)) SparkLabeledPoint(DataTransformer.toFloat(id), features) }.cache val (_, model) = models.head UberXGBoostModel.labelPredict(, booster = model) } @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(getPredictionSchema) protected def getPredictionSchema: Array[StructField] = { Array( StructField($(idCol), FloatType), 
StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT), StructField(IUberdataForecastUtil.ALGORITHM, StringType), StructField("prediction", FloatType) ) } }
Example 4
Source File: XGBoostBigModelTimeSeries.scala From uberdata with Apache License 2.0 | 5 votes |
// NOTE(review): the package/import prefix below is reproduced verbatim; its `package` keyword and
// several `import` keywords have no path, which looks like extraction damage. Restore it from the
// upstream file before building.
package import java.sql.Timestamp import eleflow.uberdata.IUberdataForecastUtil import import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.spark.XGBoostModel import org.apache.spark.annotation.DeveloperApi import{VectorUDT, Vector => SparkVector} import import import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, _}

/**
 * Variant of [[XGBoostBigModel]] whose output rows also carry a timestamp column.
 *
 * The time column name is held in the `timeCol` param as an `Option`; it is
 * read with `.get`, so it has to be set (see [[setTimecol]]) before
 * [[transform]] or [[transformSchema]] is called.
 */
class XGBoostBigModelTimeSeries[I](
    override val uid: String,
    override val models: Seq[(ParamMap, XGBoostModel)])
    extends XGBoostBigModel[I](uid, models)
    with HasTimeCol {

  /** Stores `time` (wrapped in `Some`) as the time column name and returns this instance. */
  def setTimecol(time: String): this.type = set(timeCol, Some(time))

  /**
   * Joins every input row with its prediction and returns a DataFrame of
   * (id, features, time, algorithm name, prediction).
   */
  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)

    // Key each row by its float id; the value keeps the features and the timestamp.
    val keyedInput = dataSet.rdd.map { case (row: Row) =>
      val key = DataTransformer.toFloat(row.getAs($(idCol)))
      val featureVector = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
      val timestamp = row.getAs[java.sql.Timestamp]($(timeCol).get)
      (key, (featureVector, timestamp))
    }

    val outputRows = keyedInput.join(prediction).map {
      case (id, ((features, time), predictValue)) =>
        Row(id, features, time, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
    }

    dataSet.sqlContext.createDataFrame(outputRows, transformSchema(dataSet.schema))
  }

  /** Output schema; the incoming `schema` argument is not consulted. */
  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    val outputFields = Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField($(timeCol).get, TimestampType),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    )
    StructType(outputFields)
  }
}
Example 5
Source File: DruidPlannerHelper.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.druid

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan}
import org.apache.spark.sql.types.{DecimalType, _}
import org.sparklinedata.druid.DruidOperatorAttribute

/** Expression and attribute lookup helpers shared by the Druid planner. */
trait DruidPlannerHelper {

  /**
   * Finds the aggregate expression of `agg` that `e` refers to and strips its alias.
   *
   * An aggregate expression matches when it equals `e`, when `e` is an
   * attribute reference with the same expression id, or when it is an alias
   * whose child equals `e`.
   *
   * @return the matched expression (the alias child when it is an `Alias`),
   *         or `None` when nothing matches
   */
  def unalias(e: Expression, agg: Aggregate): Option[Expression] = {
    val matched = agg.aggregateExpressions.find { candidate =>
      if (candidate == e) {
        true
      } else {
        (candidate, e) match {
          case (_, ref: AttributeReference) if ref.exprId == candidate.exprId => true
          case (Alias(child, _), _) if child == e => true
          case _ => false
        }
      }
    }
    matched.map {
      case Alias(child, _) => child
      case other => other
    }
  }

  /** Returns the first `AttributeReference` found in the expression tree of `e`, if any. */
  def findAttribute(e: Expression): Option[AttributeReference] =
    e.find(_.isInstanceOf[AttributeReference]).collect {
      case ref: AttributeReference => ref
    }

  /**
   * Locates the attribute referenced by `e` in the output of `plan`.
   *
   * @return `e` paired with the attribute reference and its index in
   *         `plan.output`, or `None` when `e` has no attribute reference or
   *         no output attribute shares its expression id
   */
  def positionOfAttribute(e: Expression,
                          plan: LogicalPlan): Option[(Expression, (AttributeReference, Int))] =
    findAttribute(e).flatMap { ref =>
      plan.output.zipWithIndex
        .find { case (outAttr, _) => outAttr.exprId == ref.exprId }
        .map { case (_, position) => (e, (ref, position)) }
    }

  /**
   * Same lookup as [[positionOfAttribute]], but returns the expression id of
   * the attribute reference paired with its index in `plan.output`.
   */
  def exprIdToAttribute(e: Expression, plan: LogicalPlan): Option[(ExprId, Int)] =
    findAttribute(e).flatMap { ref =>
      plan.output.zipWithIndex
        .find { case (outAttr, _) => outAttr.exprId == ref.exprId }
        .map { case (_, position) => (ref.exprId, position) }
    }

  /** Grouping and aggregate expressions collected for a plan, with lookup maps. */
  case class GroupingInfo(
      gEs: Seq[Expression],
      expandOpGExps: Seq[Expression],
      aEs: Seq[NamedExpression],
      expandOpProjection: Seq[Expression],
      aEExprIdToPos: Map[ExprId, Int],
      aEToLiteralExpr: Map[Expression, Expression] = Map())

  /** True when `NumericType` accepts `dt`. */
  def isNumericType(dt: DataType): Boolean = NumericType.acceptsType(dt)
}
Example 6
Source File: RawCsvRDDToDataframe.scala From seahorse with Apache License 2.0 | 5 votes |
// RawCsvRDDToDataframe: turns an RDD of raw CSV lines into a DataFrame whose columns are all strings.
//
// - parse builds CSV options with MapToCsvOptions, creates a reader with SparkCsvReader.create, and
//   parses the first usable line (see findFirstLine). When csvOptions.headerFlag is set, a header
//   cell is used as the column name unless it is null, empty or equal to csvOptions.nullValue, in
//   which case "_c<index>" is used; without the flag every column is named "_c<index>". Every field
//   is declared as a nullable StringType. Rows are tokenized by tokenRdd, parsed with
//   CSVRelation.csvParser and wrapped via Dataset.ofRows over a LogicalRDD.
// - tokenRdd looks up the first line only when options.headerFlag is set (otherwise passes null) and
//   delegates to SparkCsvReader.univocityTokenizer.
// - findFirstLine returns the first line that is non-blank after trim and, when a comment marker is
//   configured, does not start with it.
//
// NOTE(review): this snippet looks damaged by text extraction and will not compile as shown: the
// blocks `{ { case (value, index) => ...` and `{ fieldName => ...` are missing the collection and
// method call that should precede them. The definition is also collapsed onto one physical line, so
// the `//` TODO comment swallows the rest of the line. Restore from the upstream file before building.
package org.apache.spark.sql.execution.datasources.csv import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.execution.LogicalRDD import org.apache.spark.sql.types.{StructType, _} object RawCsvRDDToDataframe { def parse( rdd: RDD[String], sparkSession: SparkSession, options: Map[String, String]): DataFrame = { val csvOptions = MapToCsvOptions(options, sparkSession.sessionState.conf) val csvReader = SparkCsvReader.create(csvOptions) val firstLine = findFirstLine(csvOptions, rdd) val firstRow = csvReader.parseLine(firstLine) val header = if (csvOptions.headerFlag) { { case (value, index) => if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value } } else { { case (value, index) => s"_c$index" } } val parsedRdd = tokenRdd(rdd, header, csvOptions) // TODO Migrate to Spark's schema inferencer eventually // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions) val schema = { val schemaFields = { fieldName => StructField(fieldName.toString, StringType, nullable = true) } StructType(schemaFields) } val ignoreMalformedRows = 0 val internalRows = parsedRdd.flatMap { row => val parser = CSVRelation.csvParser(schema, header, csvOptions) parser(row, ignoreMalformedRows) } Dataset.ofRows( sparkSession, LogicalRDD( schema.toAttributes, internalRows)(sparkSession)) } private def tokenRdd( rdd: RDD[String], header: Array[String], options: CSVOptions): RDD[Array[String]] = { // Make sure firstLine is materialized before sending to executors val firstLine = if (options.headerFlag) findFirstLine(options, rdd) else null SparkCsvReader.univocityTokenizer(rdd, header, firstLine, options) } private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = { if (options.isCommentSet) { val comment = options.comment.toString rdd.filter { line => line.trim.nonEmpty && !line.startsWith(comment) }.first() } else { rdd.filter { line => line.trim.nonEmpty }.first() } } }
Example 7
Source File: RawCsvRDDToDataframe.scala From seahorse with Apache License 2.0 | 5 votes |
// RawCsvRDDToDataframe: turns an RDD of raw CSV lines into a DataFrame whose columns are all strings.
//
// - parse builds CSVOptions from the option map and the session-local time zone, creates a univocity
//   CsvParser from csvOptions.asParserSettings, and parses the first usable line (see findFirstLine).
//   When csvOptions.headerFlag is set, a header cell is used as the column name unless it is null,
//   empty or equal to csvOptions.nullValue, in which case "_c<index>" is used; without the flag every
//   column is named "_c<index>". Every field is declared as a nullable StringType.
// - With headerFlag set, the element at index 0 of rdd.zipWithIndex() is dropped before parsing.
// - Blank lines are filtered out; each remaining line is parsed with a new UnivocityParser, and a
//   line whose parse throws is dropped because the result goes through Try(...).toOption.
// - findFirstLine returns the first line that is non-blank after trim and, when a comment marker is
//   configured, does not start with it.
//
// NOTE(review): this snippet looks damaged by text extraction and will not compile as shown: the
// blocks `{ { case (value, index) => ...` and `{ fieldName => ...` are missing the collection and
// method call that should precede them. The definition is also collapsed onto one physical line, so
// the `//` TODO comment swallows the rest of the line. Restore from the upstream file before building.
package org.apache.spark.sql.execution.datasources.csv import scala.util.Try import com.univocity.parsers.csv.CsvParser import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.execution.LogicalRDD import org.apache.spark.sql.types.{StructType, _} object RawCsvRDDToDataframe { def parse( rdd: RDD[String], sparkSession: SparkSession, options: Map[String, String]): DataFrame = { val csvOptions = new CSVOptions(options, sparkSession.sessionState.conf.sessionLocalTimeZone) val csvReader = new CsvParser(csvOptions.asParserSettings) val firstLine = findFirstLine(csvOptions, rdd) val firstRow = csvReader.parseLine(firstLine) val header = if (csvOptions.headerFlag) { { case (value, index) => if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value } } else { { case (value, index) => s"_c$index" } } // TODO Migrate to Spark's schema inferencer eventually // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions) val schema = { val schemaFields = { fieldName => StructField(fieldName.toString, StringType, nullable = true) } StructType(schemaFields) } val withoutHeader = if (csvOptions.headerFlag) { rdd.zipWithIndex() .filter { case (row, index) => index != 0 } .map { case (row, index) => row } } else { rdd } val internalRows = withoutHeader.filter(row => row.trim.nonEmpty) .flatMap { row => val univocityParser = new UnivocityParser(schema, csvOptions) Try(univocityParser.parse(row)).toOption } Dataset.ofRows( sparkSession, LogicalRDD( schema.toAttributes, internalRows)(sparkSession)) } private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = { if (options.isCommentSet) { val comment = options.comment.toString rdd.filter { line => line.trim.nonEmpty && !line.startsWith(comment) }.first() } else { rdd.filter { line => line.trim.nonEmpty }.first() } } }
Example 8
Source File: CustomSchemaTest.scala From spark-sftp with Apache License 2.0 | 5 votes |
package com.springml.spark.sftp

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, _}
import org.scalatest.{BeforeAndAfterEach, FunSuite}

/**
 * Checks that a caller-supplied schema is the schema reported by
 * `DatasetRelation` for CSV and JSON inputs.
 */
class CustomSchemaTest extends FunSuite with BeforeAndAfterEach {
  // Assigned in beforeEach(), hence a var with a default initializer.
  var ss: SparkSession = _

  val csvTypesMap = Map(
    "ProposalId" -> IntegerType,
    "OpportunityId" -> StringType,
    "Clicks" -> LongType,
    "Impressions" -> LongType
  )

  val jsonTypesMap = Map(
    "name" -> StringType,
    "age" -> IntegerType
  )

  /** Obtains (or creates) a local SparkSession before every test. */
  override def beforeEach(): Unit = {
    ss = SparkSession.builder().master("local").appName("Custom Schema Test").getOrCreate()
  }

  /** Asserts that `field` has the data type registered under its name in `typeMap`. */
  private def validateTypes(field : StructField, typeMap : Map[String, DataType]) = {
    val expectedType = typeMap(field.name)
    assert(expectedType == field.dataType)
  }

  /**
   * Builds one nullable StructField per entry of `typeMap`, in the map's iteration order.
   *
   * The previous version copied the fields into a zero-length array with
   * `copyToArray`, which copies nothing, so it always returned an empty array
   * and the tests below compared against an empty schema.
   */
  private def columnArray(typeMap : Map[String, DataType]) : Array[StructField] =
    typeMap.map { case (name, dataType) => StructField(name, dataType, nullable = true) }.toArray

  /** Reads `resource` as `fileType` with a schema built from `typeMap` and checks the relation's schema. */
  private def assertCustomSchema(resource : String,
                                 fileType : String,
                                 typeMap : Map[String, DataType]) : Unit = {
    val columnStruct = columnArray(typeMap)
    val expectedSchema = StructType(columnStruct)

    val fileLocation = getClass.getResource(resource).getPath
    val dsr = DatasetRelation(fileLocation, fileType, "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext)
    // The scan is still triggered as before; its result is not inspected.
    dsr.buildScan()

    assert(dsr.schema.fields.length == columnStruct.length)
    dsr.schema.fields.foreach(s => validateTypes(s, typeMap))
  }

  test ("Read CSV with custom schema") {
    assertCustomSchema("/sample.csv", "csv", csvTypesMap)
  }

  test ("Read Json with custom schema") {
    assertCustomSchema("/people.json", "json", jsonTypesMap)
  }
}
Example 9
Source File: KafkaSink.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
package kafka

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.types.{StringType, _}
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper

/** Streaming sinks for song aggregations: one writing to Kafka, one to the console. */
object KafkaSink extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  /**
   * Starts a streaming query that serializes every row to a JSON string in a
   * single "value" column and writes it to the Kafka topic "test" in update
   * output mode.
   *
   * @param staticInputDS the aggregations to publish
   * @return the started streaming query
   */
  def writeStream(staticInputDS: Dataset[SimpleSongAggregation]) : StreamingQuery = {
    log.warn("Writing to Kafka")

    // All columns packed into one struct, rendered as JSON, exposed as "value".
    val jsonValue = to_json(struct($"*")).cast(StringType).alias("value")

    val kafkaWriter = staticInputDS
      .select(jsonValue)
      .writeStream
      .outputMode("update")
      .format("kafka")
      .option("kafka.bootstrap.servers", KafkaService.bootstrapServers)
      .queryName("Kafka - Count number of broadcasts for a title/artist by radio")
      .option("topic", "test")

    kafkaWriter.start()
  }

  /** Starts a streaming query named "Debug Stream Kafka" that writes the dataset to the console. */
  def debugStream(staticKafkaInputDS: Dataset[SimpleSongAggregationKafka]) = {
    val consoleWriter = staticKafkaInputDS.writeStream
      .queryName("Debug Stream Kafka")
      .format("console")

    consoleWriter.start()
  }
}
Example 10
Source File: KafkaSource.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
// read: opens a Kafka streaming source and decodes each record into a SimpleSongAggregationKafka.
//
// - Parameters: `startingOption` (default "startingOffsets") is used as an option key and
//   `partitionsAndOffsets` (default "earliest") as its value.
// - Logs "Reading from Kafka" at warn level, then reads format "kafka" from bootstrap server
//   "localhost:9092", subscribed to KafkaService.topicName, with "failOnDataLoss" set to false.
// - Adds a column named KafkaService.radioStructureName holding from_json of the record `value`
//   cast to string, parsed with KafkaService.schemaOutput, and converts the result to
//   Dataset[SimpleSongAggregationKafka].
// - Drops records whose radioCount is null (the inline TODO marks this as a stop-gap for bad JSON).
//
// NOTE(review): this snippet looks damaged by text extraction: two `.option("", ...)` calls have
// empty keys, and `log`, `spark` and the `$"..."` interpolator are used although no enclosing
// object header or `import spark.implicits._` is visible before `def read`, while a closing `}`
// for it remains at the end. The definition is also collapsed onto one physical line, so the first
// `//` comment swallows the rest of the line. Restore from the upstream file before building.
package kafka import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{struct, to_json, _} import _root_.log.LazyLogger import org.apache.spark.sql.types.{StringType, _} import radio.{SimpleSongAggregation, SimpleSongAggregationKafka} import spark.SparkHelper def read(startingOption: String = "startingOffsets", partitionsAndOffsets: String = "earliest") : Dataset[SimpleSongAggregationKafka] = { log.warn("Reading from Kafka") spark .readStream .format("kafka") .option("kafka.bootstrap.servers", "localhost:9092") .option("subscribe", KafkaService.topicName) .option("", false) // Cannot be set to true in Spark Strucutured Streaming .option("", "Structured-Streaming-Examples") .option("failOnDataLoss", false) // when starting a fresh kafka (default location is temporary (/tmp) and cassandra is not (var/lib)), we have saved different offsets in Cassandra than real offsets in kafka (that contains nothing) .option(startingOption, partitionsAndOffsets) //this only applies when a new query is started and that resuming will always pick up from where the query left off .load() .withColumn(KafkaService.radioStructureName, // nested structure with our json from_json($"value".cast(StringType), KafkaService.schemaOutput) //From binary to JSON object ).as[SimpleSongAggregationKafka] .filter(_.radioCount != null) //TODO find a better way to filter bad json } }
Example 11
Source File: RawCsvRDDToDataframe.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
// RawCsvRDDToDataframe: turns an RDD of raw CSV lines into a DataFrame whose columns are all strings.
//
// - parse builds CSVOptions from the option map, creates a LineCsvReader, and parses the first usable
//   line (see findFirstLine). When csvOptions.headerFlag is set, a header cell is used as the column
//   name unless it is null, empty or equal to csvOptions.nullValue, in which case "_c<index>" is
//   used; without the flag every column is named "_c<index>". Every field is declared as a nullable
//   StringType. Rows are tokenized by tokenRdd, parsed with CSVRelation.csvParser, and wrapped via
//   Dataset.ofRows over a LogicalRDD using the SparkSession obtained from sparkSQLSession.
// - tokenRdd looks up the first line only when options.headerFlag is set (otherwise passes null) and
//   delegates to CSVRelation.univocityTokenizer.
// - findFirstLine returns the first line that is non-blank after trim and, when a comment marker is
//   configured, does not start with it.
//
// NOTE(review): this snippet looks damaged by text extraction and will not compile as shown: the
// blocks `{ { case (value, index) => ...` and `{ fieldName => ...` are missing the collection and
// method call that should precede them. The definition is also collapsed onto two physical lines,
// so the `//` TODO comment swallows the rest of its line. Restore from the upstream file before
// building.
package org.apache.spark.sql.execution.datasources.csv import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.execution.LogicalRDD import org.apache.spark.sql.types.{StructType, _} import io.deepsense.sparkutils.SparkSQLSession object RawCsvRDDToDataframe { def parse( rdd: RDD[String], sparkSQLSession: SparkSQLSession, options: Map[String, String]): DataFrame = { val csvOptions = new CSVOptions(options) val lineCsvReader = new LineCsvReader(csvOptions) val firstLine = findFirstLine(csvOptions, rdd) val firstRow = lineCsvReader.parseLine(firstLine) val header = if (csvOptions.headerFlag) { { case (value, index) => if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value } } else { { case (value, index) => s"_c$index" } } val parsedRdd = tokenRdd(rdd, csvOptions, header) // TODO Migrate to Spark's schema inferencer eventually // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions) val schema = { val schemaFields = { fieldName => StructField(fieldName.toString, StringType, nullable = true) } StructType(schemaFields) } val ignoreMalformedRows = 0 val internalRows = parsedRdd.flatMap { row => val parser = CSVRelation.csvParser(schema, header, csvOptions) parser(row, ignoreMalformedRows) } val sparkSession = sparkSQLSession.getSparkSession Dataset.ofRows( sparkSession, LogicalRDD( schema.toAttributes, internalRows)(sparkSession)) } private def tokenRdd( rdd: RDD[String], options: CSVOptions, header: Array[String]): RDD[Array[String]] = { // Make sure firstLine is materialized before sending to executors val firstLine = if (options.headerFlag) findFirstLine(options, rdd) else null CSVRelation.univocityTokenizer(rdd, header, firstLine, options) } private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = { if (options.isCommentSet) { val comment = options.comment.toString rdd.filter { line => line.trim.nonEmpty && !line.startsWith(comment) }.first() } else { rdd.filter { line 
=> line.trim.nonEmpty }.first() } } }