org.apache.spark.sql.types Scala Examples
The following examples show how to use org.apache.spark.sql.types.
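Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of what the package provides: StructField and StructType describe a schema programmatically, and the schema can then be attached to an RDD of Rows. All names in it are illustrative.

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}

object TypesQuickstart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("types-quickstart").getOrCreate()

    // Build a schema programmatically from StructField/StructType.
    val schema = StructType(Seq(
      StructField("id", IntegerType, nullable = false),
      StructField("name", StringType, nullable = true),
      StructField("tags", ArrayType(StringType), nullable = true)
    ))

    // Attach the schema to an RDD of Rows to obtain a typed DataFrame.
    val rows = spark.sparkContext.parallelize(Seq(
      Row(1, "alpha", Seq("a", "b")),
      Row(2, "beta", Seq("c"))
    ))
    val df = spark.createDataFrame(rows, schema)
    df.printSchema()

    spark.stop()
  }
}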
Example 1
Source File: cogroup.scala From spark-tools with Apache License 2.0
package io.univalence.plumbus

import org.apache.spark.Partitioner
import org.apache.spark.rdd.{ CoGroupedRDD, RDD }
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ ArrayType, StructField }
import org.apache.spark.sql.{ types, DataFrame, Dataset, Encoder, KeyValueGroupedDataset, Row }

import scala.reflect.ClassTag
import scala.util.Try

object cogroup {
  implicit class KVGD[K, A](val kvgd: KeyValueGroupedDataset[K, A]) {
    def cogroup[B](right: KeyValueGroupedDataset[K, B]): Dataset[(K, Seq[A], Seq[B])] =
      //Use SparkAddOn ?
      ???
  }

  def apply[A, B, K](left: Dataset[A], right: Dataset[B])(keyLeft: A => K, keyRight: B => K)(
    implicit encA: Encoder[A],
    encB: Encoder[B],
    encC: Encoder[K],
    enc: Encoder[(K, Seq[A], Seq[B])],
    ca: ClassTag[A],
    ck: ClassTag[K],
    cb: ClassTag[B]
  ): Dataset[(K, Seq[A], Seq[B])] =
    left.sparkSession.implicits
      .rddToDatasetHolder(
        RDD
          .rddToPairRDDFunctions(left.rdd.keyBy(keyLeft))
          .cogroup(right.rdd.keyBy(keyRight))
          .map({ case (k, (ia, ib)) => (k, ia.toSeq, ib.toSeq) })
      )
      .toDS

  def cogroupDf(group: DataFrame, namedSubGroup: (String, DataFrame)*)(
    byKey: String,
    partitioner: Partitioner = Partitioner.defaultPartitioner(group.rdd, namedSubGroup.map(_._2.rdd): _*)
  ): Try[DataFrame] =
    Try {
      val subGroup: Seq[DataFrame]  = namedSubGroup.map(_._2)
      val allFrames: Seq[DataFrame] = group +: subGroup
      val allFramesKeyed: Seq[RDD[(String, Row)]] = allFrames.map(df => {
        val idx = df.columns.indexOf(byKey)
        df.rdd.keyBy(_.get(idx).toString)
      })

      val cogroupRdd: CoGroupedRDD[String] = new CoGroupedRDD[String](allFramesKeyed, partitioner)

      val rowRdd: RDD[Row] = cogroupRdd.map(x => {
        val rows: Array[Seq[Row]] = x._2.asInstanceOf[Array[Iterable[Row]]].map(_.toSeq)
        val seq = rows.head.head.toSeq ++ rows.tail
        new GenericRowWithSchema(seq.toArray, null).asInstanceOf[Row]
      })

      val schema =
        types.StructType(
          group.schema.fields
            ++ namedSubGroup.map { case (name, df) => StructField(name, ArrayType(df.schema)) }
        )

      group.sparkSession.createDataFrame(rowRdd, schema)
    }
}
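For context, a hypothetical caller of the helpers above might look like the following sketch. The Client and Order case classes and their fields are invented for illustration, and the sketch assumes the plumbus object shown above is on the classpath.

import io.univalence.plumbus.cogroup
import org.apache.spark.sql.SparkSession

// Hypothetical domain types, not part of the plumbus project.
case class Client(clientId: String, name: String)
case class Order(orderId: String, clientId: String, amount: Double)

object CogroupUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("cogroup-usage").getOrCreate()
    import spark.implicits._

    val clients = Seq(Client("c1", "Ada"), Client("c2", "Grace")).toDS()
    val orders  = Seq(Order("o1", "c1", 10.0), Order("o2", "c1", 5.0)).toDS()

    // Group both datasets by client id; each output row carries the key plus
    // the matching clients and orders as sequences.
    val grouped = cogroup(clients, orders)(_.clientId, _.clientId)
    grouped.show(truncate = false)

    spark.stop()
  }
}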
Example 2
Source File: TimeType.scala From flint with Apache License 2.0
package com.twosigma.flint.timeseries.time.types

import com.twosigma.flint.FlintConf
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.{ SQLContext, SparkSession, types }

trait TimeType {
  // Conversions between the internal Spark SQL representation of a time column
  // and nanoseconds since epoch.
  def internalToNanos(value: Long): Long
  def nanosToInternal(nanos: Long): Long
  def roundDownPrecision(nanosSinceEpoch: Long): Long
}

object TimeType {
  case object LongType extends TimeType {
    override def internalToNanos(value: Long): Long = value
    override def nanosToInternal(nanos: Long): Long = nanos
    override def roundDownPrecision(nanos: Long): Long = nanos
  }

  // Spark SQL represents timestamps as microseconds internally
  case object TimestampType extends TimeType {
    override def internalToNanos(value: Long): Long = value * 1000
    override def nanosToInternal(nanos: Long): Long = nanos / 1000
    override def roundDownPrecision(nanos: Long): Long = nanos - nanos % 1000
  }

  def apply(timeType: String): TimeType =
    timeType match {
      case "long"      => LongType
      case "timestamp" => TimestampType
      case _ =>
        throw new IllegalArgumentException(
          s"Unsupported time type: ${timeType}. Only `long` and `timestamp` are supported."
        )
    }

  def apply(sqlType: types.DataType): TimeType =
    sqlType match {
      case types.LongType      => LongType
      case types.TimestampType => TimestampType
      case _                   => throw new IllegalArgumentException(s"Unsupported time type: ${sqlType}")
    }

  def get(sparkSession: SparkSession): TimeType =
    TimeType(sparkSession.conf.get(
      FlintConf.TIME_TYPE_CONF,
      FlintConf.TIME_TYPE_DEFAULT
    ))
}
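A small usage sketch for the resolver above, assuming flint is on the classpath and using illustrative values: a TimeType is resolved from the Spark SQL type of the time column, and a nanosecond timestamp is rounded down to the precision that type can represent.

import com.twosigma.flint.timeseries.time.types.TimeType
import org.apache.spark.sql.types.{LongType, TimestampType}

object TimeTypeUsage extends App {
  // Resolve a TimeType from the Spark SQL type of a time column.
  val longTime = TimeType(LongType)       // nanoseconds stored directly in a long column
  val tsTime   = TimeType(TimestampType)  // Spark timestamps, microseconds internally

  val nanos = 1234567L
  // The timestamp-backed type only keeps microsecond precision,
  // so rounding down drops the trailing nanoseconds.
  println(longTime.roundDownPrecision(nanos)) // 1234567
  println(tsTime.roundDownPrecision(nanos))   // 1234000
}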
Example 3
Source File: SparkLeapFrame.scala From mleap with Apache License 2.0
package ml.combust.mleap.spark

import ml.combust.mleap.core.types.{StructField, StructType}
import ml.combust.mleap.runtime.frame.{FrameBuilder, Row, RowUtil}
import ml.combust.mleap.runtime.function.{Selector, UserDefinedFunction}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql
import org.apache.spark.sql.mleap.TypeConverters
import org.apache.spark.sql.{DataFrame, SQLContext, types}

import scala.util.Try

case class SparkLeapFrame(schema: StructType,
                          dataset: RDD[Row],
                          sqlContext: SQLContext) extends FrameBuilder[SparkLeapFrame] {
  override def withColumn(output: String, inputs: Selector *)
                         (udf: UserDefinedFunction): Try[SparkLeapFrame] = {
    RowUtil.createRowSelectors(schema, inputs: _*)(udf).flatMap { rowSelectors =>
      val field = StructField(output, udf.outputTypes.head)

      schema.withField(field).map { schema2 =>
        val dataset2 = dataset.map { row =>
          row.withValue(rowSelectors: _*)(udf)
        }
        copy(schema = schema2, dataset = dataset2)
      }
    }
  }

  override def withColumns(outputs: Seq[String], inputs: Selector*)
                          (udf: UserDefinedFunction): Try[SparkLeapFrame] = {
    RowUtil.createRowSelectors(schema, inputs: _*)(udf).flatMap { rowSelectors =>
      val fields = outputs.zip(udf.outputTypes).map {
        case (name, dt) => StructField(name, dt)
      }

      schema.withFields(fields).map { schema2 =>
        val dataset2 = dataset.map { row =>
          row.withValues(rowSelectors: _*)(udf)
        }
        copy(schema = schema2, dataset = dataset2)
      }
    }
  }

  override def select(fieldNames: String *): Try[SparkLeapFrame] = {
    for (indices <- schema.indicesOf(fieldNames: _*);
         schema2 <- schema.selectIndices(indices: _*)) yield {
      val dataset2 = dataset.map(row => row.selectIndices(indices: _*))
      copy(schema = schema2, dataset = dataset2)
    }
  }

  override def drop(names: String*): Try[SparkLeapFrame] = {
    for (indices <- schema.indicesOf(names: _*);
         schema2 <- schema.dropIndices(indices: _*)) yield {
      val dataset2 = dataset.map(row => row.dropIndices(indices: _*))
      copy(schema = schema2, dataset = dataset2)
    }
  }

  override def filter(selectors: Selector*)
                     (udf: UserDefinedFunction): Try[SparkLeapFrame] = {
    RowUtil.createRowSelectors(schema, selectors: _*)(udf).map { rowSelectors =>
      val dataset2 = dataset.filter(row => row.shouldFilter(rowSelectors: _*)(udf))
      copy(schema = schema, dataset = dataset2)
    }
  }

  def toSpark: DataFrame = {
    val spec = schema.fields.map(TypeConverters.mleapToSparkConverter)
    val fields = spec.map(_._1)
    val converters = spec.map(_._2)
    val sparkSchema = new types.StructType(fields.toArray)
    val data = dataset.map { r =>
      val values = r.zip(converters).map {
        case (v, c) => c(v)
      }
      sql.Row(values.toSeq: _*)
    }

    sqlContext.createDataFrame(data, sparkSchema)
  }
}
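The toSpark method above ultimately relies on the standard Spark pattern of pairing an RDD of sql.Row with a programmatically built types.StructType. A standalone sketch of just that pattern, independent of mleap and with made-up field names, looks like this:

import org.apache.spark.sql.{Row, SparkSession, types}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField}

object RowsToDataFrame {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("rows-to-df").getOrCreate()

    // Build the Spark schema from an array of fields, as toSpark does with
    // the converted mleap fields.
    val fields = Array(
      StructField("feature", DoubleType),
      StructField("label", StringType)
    )
    val sparkSchema = new types.StructType(fields)

    // Convert each record into a sql.Row and pair it with the schema.
    val data = spark.sparkContext.parallelize(Seq(Row(0.5, "a"), Row(1.5, "b")))
    spark.createDataFrame(data, sparkSchema).show()

    spark.stop()
  }
}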
Example 4
Source File: LeapFrameToSpark.scala From mleap with Apache License 2.0
package org.apache.spark.ml.mleap.converter

import com.truecar.mleap.core.linalg.Vector
import com.truecar.mleap.runtime.types.StructType
import com.truecar.mleap.spark.{SparkLeapFrame, MleapSparkSupport}
import org.apache.spark.sql.{types, Row, DataFrame, SQLContext}
import MleapSparkSupport._

trait LeapFrameToSpark[T] {
  def toSpark(t: T)(implicit sqlContext: SQLContext): DataFrame
}

case class LeapFrameToSparkWrapper[T: LeapFrameToSpark](t: T) {
  def toSpark(implicit sqlContext: SQLContext): DataFrame = {
    implicitly[LeapFrameToSpark[T]].toSpark(t)
  }
}

object LeapFrameToSpark {
  implicit object SparkLeapFrameToSpark extends LeapFrameToSpark[SparkLeapFrame] {
    override def toSpark(t: SparkLeapFrame)
                        (implicit sqlContext: SQLContext): DataFrame = {
      val outputNames = t.schema.fields.map(_.name).toSet -- t.sparkSchema.fields.map(_.name).toSet
      val outputs = outputNames.map { name =>
        (t.schema(name), t.schema.indexOf(name))
      }.toArray.sortBy(_._2)
      val (outputFields, outputIndices) = outputs.unzip
      val outputMleapSchema = StructTypeToSpark(StructType(outputFields)).toSpark
      val outputSchema = types.StructType(t.sparkSchema.fields ++ outputMleapSchema.fields)

      val rows = t.dataset.rdd.map {
        case (mleapRow, sparkValues) =>
          val mleapData = outputIndices.map { index =>
            mleapRow.get(index) match {
              case value: Vector => value.toSpark
              case value => value
            }
          }

          Row(sparkValues ++ mleapData: _*)
      }

      sqlContext.createDataFrame(rows, outputSchema)
    }
  }
}
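The Spark-side step worth noting above is the schema merge: the converted mleap output fields are appended to the existing Spark fields to form the schema of the final DataFrame. In isolation, and with illustrative field names, that merge looks like the following sketch.

import org.apache.spark.sql.types
import org.apache.spark.sql.types.{DoubleType, StringType, StructField}

object SchemaMerge extends App {
  // Schema of the original Spark columns.
  val sparkSchema = types.StructType(Seq(
    StructField("id", StringType),
    StructField("features", DoubleType)
  ))

  // Schema of the extra output columns computed on the mleap side.
  val outputSchema = types.StructType(Seq(
    StructField("prediction", DoubleType)
  ))

  // Same pattern as in toSpark: append the converted output fields to the
  // existing Spark fields.
  val merged = types.StructType(sparkSchema.fields ++ outputSchema.fields)
  merged.printTreeString()
}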