org.apache.spark.sql.catalyst.expressions.GenericRow Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.expressions.GenericRow.
Each example is drawn from an open-source project; the originating project and license are noted in the header above each example.
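As a quick orientation before the project examples, here is a minimal sketch, not taken from any of the projects below (the column names and values are invented), showing how a GenericRow is built and how it differs from a GenericRowWithSchema:

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object GenericRowSketch {
  def main(args: Array[String]): Unit = {
    // A GenericRow is just an ordered array of values with no attached schema.
    val row: Row = new GenericRow(Array[Any]("alice", 42))

    // Positional access works without a schema...
    println(row.getString(0)) // alice
    println(row.getInt(1))    // 42

    // ...but name-based access needs a schema, e.g. via GenericRowWithSchema.
    val schema = StructType(
      StructField("name", StringType) ::
      StructField("age", IntegerType) :: Nil)
    val rowWithSchema: Row = new GenericRowWithSchema(Array[Any]("alice", 42), schema)
    println(rowWithSchema.getAs[Int]("age")) // 42
  }
}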
Example 1
Source File: ConfigurableDataGeneratorMain.scala From Spark.TableStatsExample with Apache License 2.0
package com.cloudera.sa.examples.tablestats

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType}
import org.apache.spark.{SparkContext, SparkConf}

import scala.collection.mutable
import scala.util.Random

object ConfigurableDataGeneratorMain {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("ConfigurableDataGeneratorMain <outputPath> <numberOfColumns> <numberOfRecords> <numberOfPartitions> <local>")
      return
    }

    val outputPath = args(0)
    val numberOfColumns = args(1).toInt
    val numberOfRecords = args(2).toInt
    val numberOfPartitions = args(3).toInt
    val runLocal = (args.length == 5 && args(4).equals("L"))

    var sc: SparkContext = null
    if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      sc = new SparkContext("local", "test", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("ConfigurableDataGeneratorMain")
      sc = new SparkContext(sparkConfig)
    }

    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    // Part A: one seed value per output partition
    val rowRDD = sc.parallelize((0 until numberOfPartitions).map(i => i), numberOfPartitions)

    // Part B: expand each seed into its share of randomly populated GenericRows
    val megaDataRDD = rowRDD.flatMap( r => {
      val random = new Random()
      val dataRange = (0 until numberOfRecords / numberOfPartitions).iterator
      dataRange.map[Row]( x => {
        val values = new mutable.ArrayBuffer[Any]
        for (i <- 0 until numberOfColumns) {
          if (i % 2 == 0) {
            values.+=(random.nextInt(100).toLong)
          } else {
            values.+=(random.nextInt(100).toString)
          }
        }
        new GenericRow(values.toArray)
      })
    })

    // Part C: alternate long and string columns to match the generated values
    val schema = StructType(
      (0 until numberOfColumns).map( i => {
        if (i % 2 == 0) {
          StructField("longColumn_" + i, LongType, true)
        } else {
          StructField("stringColumn_" + i, StringType, true)
        }
      })
    )

    val df = sqlContext.createDataFrame(megaDataRDD, schema)
    df.saveAsParquetFile(outputPath)

    // Part D: shut down
    sc.stop()
  }
}
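This example targets Spark 1.x: SQLContext and DataFrame.saveAsParquetFile were deprecated and later removed. For reference, a minimal sketch of the same create-and-write step against the Spark 2.x+ API; reusing megaDataRDD, schema, and outputPath from the example above is an assumption:

import org.apache.spark.sql.SparkSession

// Sketch only: SparkSession replaces SQLContext, and the DataFrameWriter
// API replaces saveAsParquetFile.
val spark = SparkSession.builder().appName("ConfigurableDataGeneratorMain").getOrCreate()
val df = spark.createDataFrame(megaDataRDD, schema) // RDD[Row] and StructType as built above
df.write.parquet(outputPath)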
Example 2
Source File: HailKryoRegistrator.scala From hail with MIT License
package is.hail.kryo

import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.serializers.JavaSerializer
import is.hail.annotations.{Region, UnsafeIndexedSeq, UnsafeRow}
import is.hail.utils.{Interval, SerializableHadoopConfiguration}
import is.hail.variant.Locus
import org.apache.spark.serializer.KryoRegistrator
import org.apache.spark.sql.catalyst.expressions.GenericRow

class HailKryoRegistrator extends KryoRegistrator {
  override def registerClasses(kryo: Kryo) {
    kryo.register(classOf[SerializableHadoopConfiguration], new JavaSerializer())
    kryo.register(classOf[UnsafeRow])
    kryo.register(classOf[GenericRow])
    kryo.register(classOf[Locus])
    kryo.register(classOf[Interval])
    kryo.register(classOf[UnsafeIndexedSeq])
    kryo.register(classOf[Region])
  }
}
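On its own, a registrator does nothing unless Spark is configured to use Kryo and told where the registrator lives. A minimal configuration sketch (the application name is illustrative, not from the Hail project):

import org.apache.spark.SparkConf

// Sketch: switch the serializer to Kryo and point it at the registrator above.
val conf = new SparkConf()
  .setAppName("hail-example") // illustrative name
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryo.registrator", "is.hail.kryo.HailKryoRegistrator")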
Example 3
Source File: RowTest.scala From iolap with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types._
import org.scalatest.{Matchers, FunSpec}

class RowTest extends FunSpec with Matchers {

  val schema = StructType(
    StructField("col1", StringType) ::
    StructField("col2", StringType) ::
    StructField("col3", IntegerType) :: Nil)
  val values = Array("value1", "value2", 1)

  val sampleRow: Row = new GenericRowWithSchema(values, schema)
  val noSchemaRow: Row = new GenericRow(values)

  describe("Row (without schema)") {
    it("throws an exception when accessing by fieldName") {
      intercept[UnsupportedOperationException] {
        noSchemaRow.fieldIndex("col1")
      }
      intercept[UnsupportedOperationException] {
        noSchemaRow.getAs("col1")
      }
    }
  }

  describe("Row (with schema)") {
    it("fieldIndex(name) returns field index") {
      sampleRow.fieldIndex("col1") shouldBe 0
      sampleRow.fieldIndex("col3") shouldBe 2
    }

    it("getAs[T] retrieves a value by fieldname") {
      sampleRow.getAs[String]("col1") shouldBe "value1"
      sampleRow.getAs[Int]("col3") shouldBe 1
    }

    it("Accessing non existent field throws an exception") {
      intercept[IllegalArgumentException] {
        sampleRow.getAs[String]("non_existent")
      }
    }

    it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") {
      val expected = Map(
        "col1" -> "value1",
        "col2" -> "value2"
      )
      sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected
    }
  }
}
Example 4
Source File: RowTest.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types._
import org.scalatest.{Matchers, FunSpec}

class RowTest extends FunSpec with Matchers {

  val schema = StructType(
    StructField("col1", StringType) ::
    StructField("col2", StringType) ::
    StructField("col3", IntegerType) :: Nil) // the list ends with Nil
  val values = Array("value1", "value2", 1)

  val sampleRow: Row = new GenericRowWithSchema(values, schema)
  val noSchemaRow: Row = new GenericRow(values)

  // Row (without schema)
  describe("Row (without schema)") {
    // throws an exception when accessing by fieldName
    it("throws an exception when accessing by fieldName") {
      intercept[UnsupportedOperationException] {
        noSchemaRow.fieldIndex("col1")
      }
      intercept[UnsupportedOperationException] {
        noSchemaRow.getAs("col1")
      }
    }
  }

  // Row (with schema)
  describe("Row (with schema)") {
    // fieldIndex(name) returns the field index
    it("fieldIndex(name) returns field index") {
      sampleRow.fieldIndex("col1") shouldBe 0
      sampleRow.fieldIndex("col3") shouldBe 2
    }

    // getAs[T] retrieves a value by field name
    it("getAs[T] retrieves a value by fieldname") {
      sampleRow.getAs[String]("col1") shouldBe "value1"
      sampleRow.getAs[Int]("col3") shouldBe 1
    }

    // accessing a non-existent field throws an exception
    it("Accessing non existent field throws an exception") {
      intercept[IllegalArgumentException] {
        sampleRow.getAs[String]("non_existent")
      }
    }

    // getValuesMap() retrieves the values of multiple fields as a Map(field -> value)
    it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") {
      val expected = Map(
        "col1" -> "value1",
        "col2" -> "value2"
      )
      sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected
    }
  }

  // row equality
  describe("row equals") {
    val externalRow = Row(1, 2)
    val externalRow2 = Row(1, 2)
    val internalRow = InternalRow(1, 2)
    val internalRow2 = InternalRow(1, 2)

    // equality check for external rows
    it("equality check for external rows") {
      externalRow shouldEqual externalRow2
    }

    // equality check for internal rows
    it("equality check for internal rows") {
      internalRow shouldEqual internalRow2
    }
  }
}
Example 5
Source File: RowTest.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types._
import org.scalatest.{Matchers, FunSpec}

class RowTest extends FunSpec with Matchers {

  val schema = StructType(
    StructField("col1", StringType) ::
    StructField("col2", StringType) ::
    StructField("col3", IntegerType) :: Nil)
  val values = Array("value1", "value2", 1)
  val valuesWithoutCol3 = Array[Any](null, "value2", null)

  val sampleRow: Row = new GenericRowWithSchema(values, schema)
  val sampleRowWithoutCol3: Row = new GenericRowWithSchema(valuesWithoutCol3, schema)
  val noSchemaRow: Row = new GenericRow(values)

  describe("Row (without schema)") {
    it("throws an exception when accessing by fieldName") {
      intercept[UnsupportedOperationException] {
        noSchemaRow.fieldIndex("col1")
      }
      intercept[UnsupportedOperationException] {
        noSchemaRow.getAs("col1")
      }
    }
  }

  describe("Row (with schema)") {
    it("fieldIndex(name) returns field index") {
      sampleRow.fieldIndex("col1") shouldBe 0
      sampleRow.fieldIndex("col3") shouldBe 2
    }

    it("getAs[T] retrieves a value by fieldname") {
      sampleRow.getAs[String]("col1") shouldBe "value1"
      sampleRow.getAs[Int]("col3") shouldBe 1
    }

    it("Accessing non existent field throws an exception") {
      intercept[IllegalArgumentException] {
        sampleRow.getAs[String]("non_existent")
      }
    }

    it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") {
      val expected = Map(
        "col1" -> "value1",
        "col2" -> "value2"
      )
      sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected
    }

    it("getValuesMap() retrieves null value on non AnyVal Type") {
      val expected = Map(
        "col1" -> null,
        "col2" -> "value2"
      )
      sampleRowWithoutCol3.getValuesMap[String](List("col1", "col2")) shouldBe expected
    }

    it("getAs() on type extending AnyVal throws an exception when accessing field that is null") {
      intercept[NullPointerException] {
        sampleRowWithoutCol3.getInt(sampleRowWithoutCol3.fieldIndex("col3"))
      }
    }

    it("getAs() on type extending AnyVal does not throw exception when value is null") {
      sampleRowWithoutCol3.getAs[String](sampleRowWithoutCol3.fieldIndex("col1")) shouldBe null
    }
  }

  describe("row equals") {
    val externalRow = Row(1, 2)
    val externalRow2 = Row(1, 2)
    val internalRow = InternalRow(1, 2)
    val internalRow2 = InternalRow(1, 2)

    it("equality check for external rows") {
      externalRow shouldEqual externalRow2
    }

    it("equality check for internal rows") {
      internalRow shouldEqual internalRow2
    }
  }
}
Example 6
Source File: SparkAvroDecoder.scala From cloudflow with Apache License 2.0
package cloudflow.spark.avro

import org.apache.log4j.Logger

import java.io.ByteArrayOutputStream

import scala.reflect.runtime.universe._

import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord }
import org.apache.avro.io.{ DecoderFactory, EncoderFactory }
import org.apache.spark.sql.{ Dataset, Encoder, Row }
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.StructType
import org.apache.avro.Schema

import cloudflow.spark.sql.SQLImplicits._

case class EncodedKV(key: String, value: Array[Byte])

case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) {

  val encoder: Encoder[T] = implicitly[Encoder[T]]
  val sqlSchema: StructType = encoder.schema
  val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema)
  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)
  @transient lazy val rowConverter = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema)
  @transient lazy val datumReader = new GenericDatumReader[GenericRecord](_avroSchema)
  @transient lazy val decoder = DecoderFactory.get

  def decode(bytes: Array[Byte]): Row = {
    val binaryDecoder = decoder.binaryDecoder(bytes, null)
    val record = datumReader.read(null, binaryDecoder)
    rowConverter(record).asInstanceOf[GenericRow]
  }
}

case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) {

  @transient lazy val log = Logger.getLogger(getClass.getName)

  val BufferSize = 5 * 1024 // 5 Kb

  val encoder = implicitly[Encoder[T]]
  val sqlSchema = encoder.schema
  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)

  val recordName = "topLevelRecord" // ???
  val recordNamespace = "recordNamespace" // ???

  @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace)

  // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage
  def rowToBytes(row: Row): Array[Byte] = {
    val genRecord = converter(row).asInstanceOf[GenericRecord]
    if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord")
    val datumWriter = new GenericDatumWriter[GenericRecord](_avroSchema)
    val avroEncoder = EncoderFactory.get
    val byteArrOS = new ByteArrayOutputStream(BufferSize)
    val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null)
    datumWriter.write(genRecord, binaryEncoder)
    binaryEncoder.flush()
    byteArrOS.toByteArray
  }

  def encode(dataset: Dataset[T]): Dataset[Array[Byte]] =
    dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]]

  // Note to self: I'm not sure how heavy this chain of transformations is
  def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = {
    val encoder = encoderFor[T]
    implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind()
    dataset.map { value ⇒
      val key = keyFun(value)
      val internalRow = encoder.toRow(value)
      val row = rowEncoder.fromRow(internalRow)
      val bytes = rowToBytes(row)
      EncodedKV(key, bytes)
    }
  }
}
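A hedged usage sketch of the decoder above. The Data case class, its Avro schema, and the source of the byte array are all invented for illustration, and an implicit Encoder[Data] is assumed to be in scope (for example via the project's SQLImplicits or spark.implicits._):

// Illustrative only: any case class with a matching Avro record schema would do.
case class Data(id: Long, name: String)

val dataAvroSchema: String =
  """{"type":"record","name":"Data","fields":[
    |{"name":"id","type":"long"},
    |{"name":"name","type":"string"}]}""".stripMargin

val decoder = SparkAvroDecoder[Data](dataAvroSchema)

// someAvroBytes: Array[Byte] holding one Avro-encoded record (e.g. a Kafka message value).
val row: Row = decoder.decode(someAvroBytes)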
Example 7
Source File: SPKSQLUtils.scala From sona with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types.{StructType, UDTRegistration}

object SPKSQLUtils {

  def append(row: Row, fields: StructType, values: Any*): Row = {
    row match {
      case r: GenericRowWithSchema =>
        val newValues = new Array[Any](r.length + values.length)
        val rLength: Int = r.length
        (0 until rLength).foreach(idx => newValues(idx) = r(idx))
        values.zipWithIndex.foreach { case (value, idx) =>
          newValues(idx + rLength) = value
        }

        val newSchema = if (r.schema != null) {
          // StructType is immutable, so build the extended schema from the combined
          // field arrays rather than calling add() and discarding its result.
          StructType(r.schema.fields ++ fields.fields)
        } else {
          null.asInstanceOf[StructType]
        }
        new GenericRowWithSchema(newValues, newSchema)

      case r: GenericRow =>
        val newValues = new Array[Any](r.length + values.length)
        val rLength: Int = r.length
        (0 until rLength).foreach(idx => newValues(idx) = r(idx))
        values.zipWithIndex.foreach { case (value, idx) =>
          newValues(idx + rLength) = value
        }

        new GenericRow(newValues)

      case _ =>
        throw new Exception("Row Error!")
    }
  }

  def registerUDT(): Unit = synchronized {
    UDTRegistration.register("org.apache.spark.linalg.Vector", "org.apache.spark.linalg.VectorUDT")
    UDTRegistration.register("org.apache.spark.linalg.DenseVector", "org.apache.spark.linalg.VectorUDT")
    UDTRegistration.register("org.apache.spark.linalg.SparseVector", "org.apache.spark.linalg.VectorUDT")
    UDTRegistration.register("org.apache.spark.linalg.Matrix", "org.apache.spark.linalg.MatrixUDT")
    UDTRegistration.register("org.apache.spark.linalg.DenseMatrix", "org.apache.spark.linalg.MatrixUDT")
    UDTRegistration.register("org.apache.spark.linalg.SparseMatrix", "org.apache.spark.linalg.MatrixUDT")
  }
}
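A short usage sketch of append; the field names and values are invented for illustration:

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Start from a one-column row that carries a schema...
val baseSchema = StructType(StructField("name", StringType) :: Nil)
val base: Row = new GenericRowWithSchema(Array[Any]("alice"), baseSchema)

// ...and append one more field together with its value.
val extraFields = StructType(StructField("age", IntegerType) :: Nil)
val extended: Row = SPKSQLUtils.append(base, extraFields, 42)

println(extended.getString(0)) // alice
println(extended.get(1))       // 42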
Example 8
Source File: LocalWriteSuite.scala From ecosystem with Apache License 2.0
package org.tensorflow.spark.datasources.tfrecords

import java.nio.file.Files
import java.nio.file.Paths

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types._
import org.apache.commons.io.FileUtils

class LocalWriteSuite extends SharedSparkSessionSuite {

  val testRows: Array[Row] = Array(
    new GenericRow(Array[Any](11, 1, 23L, 10.0F, 14.0, List(1.0, 3.0), "r1")),
    new GenericRow(Array[Any](21, 2, 24L, 12.0F, 15.0, List(2.0, 3.0), "r2")),
    new GenericRow(Array[Any](31, 3, 25L, 14.0F, 16.0, List(3.0, 3.0), "r3")))
  val schema = StructType(List(StructField("id", IntegerType),
                               StructField("IntegerTypeLabel", IntegerType),
                               StructField("LongTypeLabel", LongType),
                               StructField("FloatTypeLabel", FloatType),
                               StructField("DoubleTypeLabel", DoubleType),
                               StructField("VectorLabel", ArrayType(DoubleType, true)),
                               StructField("name", StringType)))

  "Propagate" should {
    "write data locally" in {
      // Create a dataframe with 2 partitions
      val rdd = spark.sparkContext.parallelize(testRows, numSlices = 2)
      val df = spark.createDataFrame(rdd, schema)

      // Write the partitions onto the local hard drive. Since it is going to be the
      // local file system, the partitions will be written in the same directory of the
      // same machine.
      // In a distributed setting though, two different machines would each hold a single
      // partition.
      val localPath = Files.createTempDirectory("spark-connector-propagate").toAbsolutePath.toString
      val savePath = localPath + "/testResult"
      df.write.format("tfrecords")
        .option("recordType", "Example")
        .option("writeLocality", "local")
        .save(savePath)

      // Read again this directory, this time using the Hadoop file readers, it should
      // return the same data.
      // This only works in this test and does not hold in general, because the partitions
      // will be written on the workers. Everything runs locally for tests.
      val df2 = spark.read.format("tfrecords").option("recordType", "Example")
        .load(savePath).sort("id").select("id", "IntegerTypeLabel", "LongTypeLabel",
          "FloatTypeLabel", "DoubleTypeLabel", "VectorLabel", "name") // Correct column order.

      assert(df2.collect().toSeq === testRows.toSeq)
    }
  }
}
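For test data like this, the explicit new GenericRow(...) calls are interchangeable with the public Row factory methods, which build a GenericRow under the hood; a brief sketch using the same values as the first two test rows above:

import org.apache.spark.sql.Row

// Same row contents built through the public API instead of the catalyst class.
val r1: Row = Row(11, 1, 23L, 10.0F, 14.0, List(1.0, 3.0), "r1")
val r2: Row = Row.fromSeq(Seq(21, 2, 24L, 12.0F, 15.0, List(2.0, 3.0), "r2"))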