org.apache.spark.sql.types._ Scala Examples
The following examples show how to use org.apache.spark.sql.types._.
Each example is taken from an open source project; the source file, project, and license are noted above the code.
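Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of what org.apache.spark.sql.types provides: programmatic schema construction with StructType / StructField and the atomic data types, applied to a DataFrame. The session and sample data are placeholders.

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types._

// Minimal sketch; the session and data are placeholders, not project code.
val spark = SparkSession.builder().master("local[*]").appName("types-demo").getOrCreate()

val schema = StructType(Seq(
  StructField("id", LongType, nullable = false),
  StructField("name", StringType, nullable = true),
  StructField("score", DoubleType, nullable = true),
  StructField("created", TimestampType, nullable = true)
))

val rows = spark.sparkContext.parallelize(Seq(
  Row(1L, "alice", 3.5, java.sql.Timestamp.valueOf("2020-01-01 00:00:00")),
  Row(2L, "bob", null, null)
))

val df = spark.createDataFrame(rows, schema)
df.printSchema()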
Example 1
Source File: MiscStatement.scala From spark-snowflake with Apache License 2.0
package net.snowflake.spark.snowflake.pushdowns.querygeneration

import net.snowflake.spark.snowflake.{
  ConstantString,
  EmptySnowflakeSQLStatement,
  IntVariable,
  SnowflakeSQLStatement
}
import org.apache.spark.sql.catalyst.expressions.{
  Alias, Ascending, Attribute, Cast, DenseRank, Descending, Expression, If,
  In, InSet, Literal, MakeDecimal, PercentRank, Rank, ScalarSubquery,
  ShiftLeft, ShiftRight, SortOrder, UnscaledValue, WindowExpression,
  WindowSpecDefinition
}
import org.apache.spark.sql.types.{Decimal, _}

// The enclosing object is not part of the excerpt; it is assumed here so the
// braces balance (the file name suggests MiscStatement).
private[querygeneration] object MiscStatement {

  private[querygeneration] final def getCastType(t: DataType): Option[String] =
    Option(t match {
      case StringType => "VARCHAR"
      case BinaryType => "BINARY"
      case DateType => "DATE"
      case TimestampType => "TIMESTAMP"
      case d: DecimalType => "DECIMAL(" + d.precision + ", " + d.scale + ")"
      case IntegerType | LongType => "NUMBER"
      case FloatType => "FLOAT"
      case DoubleType => "DOUBLE"
      case _ => null
    })
}
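As a quick illustration (not from the project), the mapping above resolves as shown below. Since getCastType is private[querygeneration], this is written as expected input/output pairs rather than a call site; BooleanType has no case, so the match yields null and Option(null) becomes None.

import org.apache.spark.sql.types._

// Expected results of getCastType for a few inputs, per the match above.
val expected: Seq[(DataType, Option[String])] = Seq(
  StringType         -> Some("VARCHAR"),
  DecimalType(10, 2) -> Some("DECIMAL(10, 2)"),
  IntegerType        -> Some("NUMBER"),
  BooleanType        -> None // falls through to the default case; Option(null) == None
)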
Example 2
Source File: KustoResponseDeserializer.scala From azure-kusto-spark with Apache License 2.0
package com.microsoft.kusto.spark.datasource

import java.sql.Timestamp
import java.util

import com.microsoft.azure.kusto.data.{KustoResultColumn, KustoResultSetTable, Results}
import com.microsoft.kusto.spark.utils.DataTypeMapping
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StructType, _}
import org.joda.time.DateTime

import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

object KustoResponseDeserializer {
  def apply(kustoResult: KustoResultSetTable): KustoResponseDeserializer =
    new KustoResponseDeserializer(kustoResult)
}

// Timespan columns are cast to strings on the Kusto side. A simple test that compared this
// translation to a Duration string in timespan format performed worse. One alternative was a new
// expression extending UnaryExpression, another was a UDF; both were less performant.
case class KustoSchema(sparkSchema: StructType, toStringCastedColumns: Set[String])

class KustoResponseDeserializer(val kustoResult: KustoResultSetTable) {
  val schema: KustoSchema = getSchemaFromKustoResult

  private def getValueTransformer(valueType: String): Any => Any = {
    valueType.toLowerCase() match {
      case "string" => value: Any => value
      case "int64" => value: Any => value
      case "datetime" => value: Any => new Timestamp(new DateTime(value).getMillis)
      case "timespan" => value: Any => value
      case "sbyte" => value: Any => value
      case "long" => value: Any => value match {
          case i: Int => i.toLong
          case _ => value.asInstanceOf[Long]
        }
      case "double" => value: Any => value
      case "decimal" => value: Any => BigDecimal(value.asInstanceOf[String])
      case "int" => value: Any => value
      case "int32" => value: Any => value
      case "bool" => value: Any => value
      case "real" => value: Any => value
      case _ => value: Any => value.toString
    }
  }

  private def getSchemaFromKustoResult: KustoSchema = {
    if (kustoResult.getColumns.isEmpty) {
      KustoSchema(StructType(List()), Set())
    } else {
      val columns = kustoResult.getColumns

      KustoSchema(
        StructType(columns.map(col =>
          StructField(col.getColumnName,
            DataTypeMapping.kustoTypeToSparkTypeMap.getOrElse(col.getColumnType.toLowerCase, StringType)))),
        columns.filter(c => c.getColumnType.equalsIgnoreCase("TimeSpan")).map(c => c.getColumnName).toSet)
    }
  }

  def getSchema: KustoSchema = { schema }

  def toRows: java.util.List[Row] = {
    val columnInOrder = kustoResult.getColumns
    val value: util.ArrayList[Row] = new util.ArrayList[Row](kustoResult.count())

    // Calculate the transformer function for each column to use later by order
    val valueTransformers: mutable.Seq[Any => Any] =
      columnInOrder.map(col => getValueTransformer(col.getColumnType))
    kustoResult.getData.asScala.foreach(row => {
      val genericRow = row.toArray().zipWithIndex.map(
        column => {
          if (column._1 == null) null else valueTransformers(column._2)(column._1)
        })
      value.add(new GenericRowWithSchema(genericRow, schema.sparkSchema))
    })

    value
  }

  //  private def getOrderedColumnName = {
  //    val columnInOrder = ArrayBuffer.fill(kustoResult.getColumnNameToIndex.size()){ "" }
  //    kustoResult.getColumns.foreach((columnIndexPair: KustoResultColumn) => columnInOrder(columnIndexPair.) = columnIndexPair._1)
  //    columnInOrder
  //  }
}
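A hedged sketch of how this deserializer might be driven end to end, assuming a KustoResultSetTable obtained from the Kusto Java client and an active SparkSession; the names resultSet and spark are placeholders, not part of the excerpt.

// Illustrative only: `resultSet` and `spark` are assumed to exist in the caller.
val deserializer = KustoResponseDeserializer(resultSet)
val kustoSchema  = deserializer.getSchema   // KustoSchema(sparkSchema, toStringCastedColumns)
val rows         = deserializer.toRows      // java.util.List[Row]

val df = spark.createDataFrame(rows, kustoSchema.sparkSchema)
df.printSchema()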
Example 3
Source File: XGBoostBigModel.scala From uberdata with Apache License 2.0
package org.apache.spark.ml

import com.cloudera.sparkts.models.UberXGBoostModel
import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.feature.{LabeledPoint => SparkLabeledPoint}
import org.apache.spark.ml.param.shared.{HasIdCol, HasLabelCol}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}

class XGBoostBigModel[I](val uid: String, val models: Seq[(ParamMap, XGBoostModel)])
    extends ForecastBaseModel[XGBoostBigModel[I]]
    with HasLabelCol
    with HasIdCol {

  def setLabelcol(label: String): this.type = set(labelCol, label)

  def setIdcol(id: String): this.type = set(idCol, id)

  override def copy(extra: ParamMap): XGBoostBigModel[I] = new XGBoostBigModel[I](uid, models)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
           row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME))
      }
      .join(prediction)
      .map {
        case (id, (features, predictValue)) =>
          Row(id, features, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }

  protected def predict(dataSet: Dataset[_]) = {
    val features = dataSet.rdd.map {
      case (row: Row) =>
        val features = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
        val id = row.getAs[I]($(idCol))
        SparkLabeledPoint(DataTransformer.toFloat(id), features)
    }.cache
    val (_, model) = models.head
    UberXGBoostModel.labelPredict(features.map(_.features.toDense), booster = model)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = StructType(getPredictionSchema)

  protected def getPredictionSchema: Array[StructField] = {
    Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    )
  }
}
Example 4
Source File: XGBoostBigModelTimeSeries.scala From uberdata with Apache License 2.0
package org.apache.spark.ml

import java.sql.Timestamp

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasTimeCol
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}

class XGBoostBigModelTimeSeries[I](override val uid: String,
                                   override val models: Seq[(ParamMap, XGBoostModel)])
    extends XGBoostBigModel[I](uid, models) with HasTimeCol {

  def setTimecol(time: String): this.type = set(timeCol, Some(time))

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
           (row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME),
            row.getAs[java.sql.Timestamp]($(timeCol).get)))
      }
      .join(prediction)
      .map {
        case (id, ((features, time), predictValue)) =>
          Row(id, features, time, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField($(timeCol).get, TimestampType),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    ))
}
Example 5
Source File: DruidPlannerHelper.scala From spark-druid-olap with Apache License 2.0
package org.apache.spark.sql.sources.druid

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan}
import org.apache.spark.sql.types.{DecimalType, _}
import org.sparklinedata.druid.DruidOperatorAttribute

trait DruidPlannerHelper {

  def unalias(e: Expression, agg: Aggregate): Option[Expression] = {
    agg.aggregateExpressions.find { aE =>
      (aE, e) match {
        case _ if aE == e => true
        case (_, e: AttributeReference) if e.exprId == aE.exprId => true
        case (Alias(child, _), e) if child == e => true
        case _ => false
      }
    }.map {
      case Alias(child, _) => child
      case x => x
    }
  }

  def findAttribute(e: Expression): Option[AttributeReference] = {
    e.find(_.isInstanceOf[AttributeReference]).map(_.asInstanceOf[AttributeReference])
  }

  def positionOfAttribute(e: Expression,
                          plan: LogicalPlan): Option[(Expression, (AttributeReference, Int))] = {
    for (aR <- findAttribute(e);
         attr <- plan.output.zipWithIndex.find(t => t._1.exprId == aR.exprId))
      yield (e, (aR, attr._2))
  }

  def exprIdToAttribute(e: Expression, plan: LogicalPlan): Option[(ExprId, Int)] = {
    for (aR <- findAttribute(e);
         attr <- plan.output.zipWithIndex.find(t => t._1.exprId == aR.exprId))
      yield (aR.exprId, attr._2)
  }

  case class GroupingInfo(gEs: Seq[Expression],
                          expandOpGExps: Seq[Expression],
                          aEs: Seq[NamedExpression],
                          expandOpProjection: Seq[Expression],
                          aEExprIdToPos: Map[ExprId, Int],
                          aEToLiteralExpr: Map[Expression, Expression] = Map())

  def isNumericType(dt: DataType): Boolean = NumericType.acceptsType(dt)
}
Example 6
Source File: RawCsvRDDToDataframe.scala From seahorse with Apache License 2.0
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{StructType, _}

object RawCsvRDDToDataframe {

  def parse(
      rdd: RDD[String],
      sparkSession: SparkSession,
      options: Map[String, String]): DataFrame = {
    val csvOptions = MapToCsvOptions(options, sparkSession.sessionState.conf)
    val csvReader = SparkCsvReader.create(csvOptions)
    val firstLine = findFirstLine(csvOptions, rdd)
    val firstRow = csvReader.parseLine(firstLine)

    val header = if (csvOptions.headerFlag) {
      firstRow.zipWithIndex.map { case (value, index) =>
        if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value
      }
    } else {
      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    }

    val parsedRdd = tokenRdd(rdd, header, csvOptions)

    // TODO Migrate to Spark's schema inferencer eventually
    // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions)
    val schema = {
      val schemaFields = header.map { fieldName =>
        StructField(fieldName.toString, StringType, nullable = true)
      }
      StructType(schemaFields)
    }

    val ignoreMalformedRows = 0
    val internalRows = parsedRdd.flatMap { row =>
      val parser = CSVRelation.csvParser(schema, header, csvOptions)
      parser(row, ignoreMalformedRows)
    }

    Dataset.ofRows(
      sparkSession,
      LogicalRDD(
        schema.toAttributes,
        internalRows)(sparkSession))
  }

  private def tokenRdd(
      rdd: RDD[String],
      header: Array[String],
      options: CSVOptions): RDD[Array[String]] = {
    // Make sure firstLine is materialized before sending to executors
    val firstLine = if (options.headerFlag) findFirstLine(options, rdd) else null
    SparkCsvReader.univocityTokenizer(rdd, header, firstLine, options)
  }

  private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = {
    if (options.isCommentSet) {
      val comment = options.comment.toString
      rdd.filter { line =>
        line.trim.nonEmpty && !line.startsWith(comment)
      }.first()
    } else {
      rdd.filter { line =>
        line.trim.nonEmpty
      }.first()
    }
  }
}
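A hedged usage sketch for the helper above, assuming an active SparkSession, a small RDD of raw CSV lines, and the Seahorse helpers (SparkCsvReader, MapToCsvOptions) on the classpath; the option keys mirror the csvOptions fields referenced in the code but are assumptions, not project defaults.

// Illustrative only: data and option values are placeholders.
val rawLines = sparkSession.sparkContext.parallelize(Seq(
  "id,name,score",
  "1,alice,3.5",
  "2,bob,4.0"
))

val df = RawCsvRDDToDataframe.parse(
  rawLines,
  sparkSession,
  Map("header" -> "true", "nullValue" -> ""))

df.printSchema() // every column comes back as nullable StringType; this helper does no type inference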
Example 7
Source File: RawCsvRDDToDataframe.scala From seahorse with Apache License 2.0
package org.apache.spark.sql.execution.datasources.csv

import scala.util.Try

import com.univocity.parsers.csv.CsvParser
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{StructType, _}

object RawCsvRDDToDataframe {

  def parse(
      rdd: RDD[String],
      sparkSession: SparkSession,
      options: Map[String, String]): DataFrame = {
    val csvOptions = new CSVOptions(options, sparkSession.sessionState.conf.sessionLocalTimeZone)
    val csvReader = new CsvParser(csvOptions.asParserSettings)
    val firstLine = findFirstLine(csvOptions, rdd)
    val firstRow = csvReader.parseLine(firstLine)

    val header = if (csvOptions.headerFlag) {
      firstRow.zipWithIndex.map { case (value, index) =>
        if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value
      }
    } else {
      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    }

    // TODO Migrate to Spark's schema inferencer eventually
    // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions)
    val schema = {
      val schemaFields = header.map { fieldName =>
        StructField(fieldName.toString, StringType, nullable = true)
      }
      StructType(schemaFields)
    }

    val withoutHeader = if (csvOptions.headerFlag) {
      rdd.zipWithIndex()
        .filter { case (row, index) => index != 0 }
        .map { case (row, index) => row }
    } else {
      rdd
    }

    val internalRows = withoutHeader.filter(row => row.trim.nonEmpty)
      .flatMap { row =>
        val univocityParser = new UnivocityParser(schema, csvOptions)
        Try(univocityParser.parse(row)).toOption
      }

    Dataset.ofRows(
      sparkSession,
      LogicalRDD(
        schema.toAttributes,
        internalRows)(sparkSession))
  }

  private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = {
    if (options.isCommentSet) {
      val comment = options.comment.toString
      rdd.filter { line =>
        line.trim.nonEmpty && !line.startsWith(comment)
      }.first()
    } else {
      rdd.filter { line =>
        line.trim.nonEmpty
      }.first()
    }
  }
}
Example 8
Source File: CustomSchemaTest.scala From spark-sftp with Apache License 2.0
package com.springml.spark.sftp

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, _}
import org.scalatest.{BeforeAndAfterEach, FunSuite}

class CustomSchemaTest extends FunSuite with BeforeAndAfterEach {
  var ss: SparkSession = _

  val csvTypesMap = Map("ProposalId" -> IntegerType,
    "OpportunityId" -> StringType,
    "Clicks" -> LongType,
    "Impressions" -> LongType
  )

  val jsonTypesMap = Map("name" -> StringType,
    "age" -> IntegerType
  )

  override def beforeEach() {
    ss = SparkSession.builder().master("local").appName("Custom Schema Test").getOrCreate()
  }

  private def validateTypes(field: StructField, typeMap: Map[String, DataType]) = {
    val expectedType = typeMap(field.name)
    assert(expectedType == field.dataType)
  }

  private def columnArray(typeMap: Map[String, DataType]): Array[StructField] = {
    val columns = typeMap.map(x => new StructField(x._1, x._2, true))
    // The original copied into an empty Array[StructField](), which copies nothing;
    // materializing the collection directly keeps all fields.
    columns.toArray
  }

  test("Read CSV with custom schema") {
    val columnStruct = columnArray(csvTypesMap)
    val expectedSchema = StructType(columnStruct)

    val fileLocation = getClass.getResource("/sample.csv").getPath

    val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\",
      "false", null, expectedSchema, ss.sqlContext)
    val rdd = dsr.buildScan()

    assert(dsr.schema.fields.length == columnStruct.length)
    dsr.schema.fields.foreach(s => validateTypes(s, csvTypesMap))
  }

  test("Read Json with custom schema") {
    val columnStruct = columnArray(jsonTypesMap)
    val expectedSchema = StructType(columnStruct)

    val fileLocation = getClass.getResource("/people.json").getPath

    val dsr = DatasetRelation(fileLocation, "json", "false", "true", ",", "\"", "\\",
      "false", null, expectedSchema, ss.sqlContext)
    val rdd = dsr.buildScan()

    assert(dsr.schema.fields.length == columnStruct.length)
    dsr.schema.fields.foreach(s => validateTypes(s, jsonTypesMap))
  }
}
Example 9
Source File: KafkaSink.scala From Spark-Structured-Streaming-Examples with Apache License 2.0
package kafka

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.types.{StringType, _}
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper

object KafkaSink extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  def writeStream(staticInputDS: Dataset[SimpleSongAggregation]): StreamingQuery = {
    log.warn("Writing to Kafka")
    staticInputDS
      .select(to_json(struct($"*")).cast(StringType).alias("value"))
      .writeStream
      .outputMode("update")
      .format("kafka")
      .option("kafka.bootstrap.servers", KafkaService.bootstrapServers)
      .queryName("Kafka - Count number of broadcasts for a title/artist by radio")
      .option("topic", "test")
      .start()
  }

  def debugStream(staticKafkaInputDS: Dataset[SimpleSongAggregationKafka]) = {
    staticKafkaInputDS
      .writeStream
      .queryName("Debug Stream Kafka")
      .format("console")
      .start()
  }
}
Example 10
Source File: KafkaSource.scala From Spark-Structured-Streaming-Examples with Apache License 2.0
package kafka

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.types.{StringType, _}
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper

// The object declaration was dropped from this excerpt; it is assumed here (mirroring KafkaSink
// above) so that the braces balance and $"value" / .as[...] have a session and its implicits.
object KafkaSource extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  def read(startingOption: String = "startingOffsets",
           partitionsAndOffsets: String = "earliest"): Dataset[SimpleSongAggregationKafka] = {
    log.warn("Reading from Kafka")

    spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", KafkaService.topicName)
      .option("enable.auto.commit", false) // Cannot be set to true in Spark Structured Streaming https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#kafka-specific-configurations
      .option("group.id", "Structured-Streaming-Examples")
      .option("failOnDataLoss", false) // when starting a fresh kafka (default location is temporary (/tmp) and cassandra is not (var/lib)), we have saved different offsets in Cassandra than real offsets in kafka (that contains nothing)
      .option(startingOption, partitionsAndOffsets) // this only applies when a new query is started; resuming always picks up where the query left off
      .load()
      .withColumn(KafkaService.radioStructureName, // nested structure with our json
        from_json($"value".cast(StringType), KafkaService.schemaOutput) // from binary to JSON object
      ).as[SimpleSongAggregationKafka]
      .filter(_.radioCount != null) // TODO find a better way to filter bad json
  }
}
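The schema passed to from_json above, KafkaService.schemaOutput, is not part of the excerpt. A hedged sketch of what such a schema could look like, with hypothetical field names, shows the org.apache.spark.sql.types API this page is about.

import org.apache.spark.sql.types._

// Hypothetical: the real field names live in KafkaService / SimpleSongAggregation,
// which are outside this excerpt.
val schemaOutput: StructType = StructType(Seq(
  StructField("radio", StringType, nullable = true),
  StructField("artist", StringType, nullable = true),
  StructField("title", StringType, nullable = true),
  StructField("count", LongType, nullable = true)
))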
Example 11
Source File: RawCsvRDDToDataframe.scala From seahorse-workflow-executor with Apache License 2.0
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{StructType, _}

import io.deepsense.sparkutils.SparkSQLSession

object RawCsvRDDToDataframe {

  def parse(
      rdd: RDD[String],
      sparkSQLSession: SparkSQLSession,
      options: Map[String, String]): DataFrame = {
    val csvOptions = new CSVOptions(options)
    val lineCsvReader = new LineCsvReader(csvOptions)
    val firstLine = findFirstLine(csvOptions, rdd)
    val firstRow = lineCsvReader.parseLine(firstLine)

    val header = if (csvOptions.headerFlag) {
      firstRow.zipWithIndex.map { case (value, index) =>
        if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value
      }
    } else {
      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    }

    val parsedRdd = tokenRdd(rdd, csvOptions, header)

    // TODO Migrate to Spark's schema inferencer eventually
    // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions)
    val schema = {
      val schemaFields = header.map { fieldName =>
        StructField(fieldName.toString, StringType, nullable = true)
      }
      StructType(schemaFields)
    }

    val ignoreMalformedRows = 0
    val internalRows = parsedRdd.flatMap { row =>
      val parser = CSVRelation.csvParser(schema, header, csvOptions)
      parser(row, ignoreMalformedRows)
    }

    val sparkSession = sparkSQLSession.getSparkSession
    Dataset.ofRows(
      sparkSession,
      LogicalRDD(
        schema.toAttributes,
        internalRows)(sparkSession))
  }

  private def tokenRdd(
      rdd: RDD[String],
      options: CSVOptions,
      header: Array[String]): RDD[Array[String]] = {
    // Make sure firstLine is materialized before sending to executors
    val firstLine = if (options.headerFlag) findFirstLine(options, rdd) else null
    CSVRelation.univocityTokenizer(rdd, header, firstLine, options)
  }

  private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = {
    if (options.isCommentSet) {
      val comment = options.comment.toString
      rdd.filter { line =>
        line.trim.nonEmpty && !line.startsWith(comment)
      }.first()
    } else {
      rdd.filter { line =>
        line.trim.nonEmpty
      }.first()
    }
  }
}