org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
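Before diving into the project examples, here is a minimal, self-contained sketch of the pattern they all share: pairing an array of values with a StructType so the resulting Row supports name-based access. The column names and values below are illustrative only and do not come from any of the projects listed.

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object GenericRowWithSchemaSketch {
  def main(args: Array[String]): Unit = {
    // Schema describing the two columns of the row.
    val schema = StructType(Seq(
      StructField("name", StringType),
      StructField("age", IntegerType)))

    // GenericRowWithSchema simply pairs the values with the schema.
    val row: Row = new GenericRowWithSchema(Array("alice", 42), schema)

    // With a schema attached, fields can be accessed by name.
    println(row.getAs[String]("name")) // alice
    println(row.fieldIndex("age"))     // 1
  }
}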
Example 1
Source File: RowTest.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types._
import org.scalatest.{Matchers, FunSpec}

class RowTest extends FunSpec with Matchers {

  val schema = StructType(
    StructField("col1", StringType) ::
    StructField("col2", StringType) ::
    StructField("col3", IntegerType) :: Nil)
  val values = Array("value1", "value2", 1)

  val sampleRow: Row = new GenericRowWithSchema(values, schema)
  val noSchemaRow: Row = new GenericRow(values)

  describe("Row (without schema)") {
    it("throws an exception when accessing by fieldName") {
      intercept[UnsupportedOperationException] {
        noSchemaRow.fieldIndex("col1")
      }
      intercept[UnsupportedOperationException] {
        noSchemaRow.getAs("col1")
      }
    }
  }

  describe("Row (with schema)") {
    it("fieldIndex(name) returns field index") {
      sampleRow.fieldIndex("col1") shouldBe 0
      sampleRow.fieldIndex("col3") shouldBe 2
    }

    it("getAs[T] retrieves a value by fieldname") {
      sampleRow.getAs[String]("col1") shouldBe "value1"
      sampleRow.getAs[Int]("col3") shouldBe 1
    }

    it("Accessing non existent field throws an exception") {
      intercept[IllegalArgumentException] {
        sampleRow.getAs[String]("non_existent")
      }
    }

    it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") {
      val expected = Map(
        "col1" -> "value1",
        "col2" -> "value2"
      )
      sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected
    }
  }
}
Example 2
Source File: SPKSQLUtils.scala From sona with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types.{StructType, UDTRegistration}

object SPKSQLUtils {

  def append(row: Row, fields: StructType, values: Any*): Row = {
    row match {
      case r: GenericRowWithSchema =>
        val newValues = new Array[Any](r.length + values.length)
        val rLength: Int = r.length
        (0 until rLength).foreach(idx => newValues(idx) = r(idx))
        values.zipWithIndex.foreach { case (value, idx) =>
          newValues(idx + rLength) = value
        }
        val newSchema = if (r.schema != null) {
          val schemaTemp = StructType(r.schema)
          fields.foreach(field => schemaTemp.add(field))
          schemaTemp
        } else {
          null.asInstanceOf[StructType]
        }
        new GenericRowWithSchema(newValues, newSchema)
      case r: GenericRow =>
        val newValues = new Array[Any](r.length + values.length)
        val rLength: Int = r.length
        (0 until rLength).foreach(idx => newValues(idx) = r(idx))
        values.zipWithIndex.foreach { case (value, idx) =>
          newValues(idx + rLength) = value
        }
        new GenericRow(newValues)
      case _ =>
        throw new Exception("Row Error!")
    }
  }

  def registerUDT(): Unit = synchronized {
    UDTRegistration.register("org.apache.spark.linalg.Vector", "org.apache.spark.linalg.VectorUDT")
    UDTRegistration.register("org.apache.spark.linalg.DenseVector", "org.apache.spark.linalg.VectorUDT")
    UDTRegistration.register("org.apache.spark.linalg.SparseVector", "org.apache.spark.linalg.VectorUDT")
    UDTRegistration.register("org.apache.spark.linalg.Matrix", "org.apache.spark.linalg.MatrixUDT")
    UDTRegistration.register("org.apache.spark.linalg.DenseMatrix", "org.apache.spark.linalg.MatrixUDT")
    UDTRegistration.register("org.apache.spark.linalg.SparseMatrix", "org.apache.spark.linalg.MatrixUDT")
  }
}
Example 3
Source File: GDBRowIterator.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType

class GDBRowIterator(indexIter: Iterator[IndexInfo], dataBuffer: DataBuffer, fields: Array[Field], schema: StructType)
  extends Iterator[Row] with Serializable {

  val numFieldsWithNullAllowed = fields.count(_.nullable)
  val nullValueMasks = new Array[Byte]((numFieldsWithNullAllowed / 8.0).ceil.toInt)

  def hasNext() = indexIter.hasNext

  def next() = {
    val index = indexIter.next()
    val numBytes = dataBuffer.seek(index.seek).readBytes(4).getInt
    val byteBuffer = dataBuffer.readBytes(numBytes)
    0 until nullValueMasks.length foreach (nullValueMasks(_) = byteBuffer.get)
    var bit = 0
    val values = fields.map(field => {
      if (field.nullable) {
        val i = bit >> 3
        val m = 1 << (bit & 7)
        bit += 1
        if ((nullValueMasks(i) & m) == 0) {
          field.readValue(byteBuffer, index.objectID)
        } else {
          null // TODO - Do not like null here - but...it is nullable !
        }
      } else {
        field.readValue(byteBuffer, index.objectID)
      }
    })
    new GenericRowWithSchema(values, schema)
  }
}
Example 4
Source File: Executor.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark

import java.time.{LocalDate, LocalDateTime, OffsetTime, ZoneOffset, ZonedDateTime}
import java.util
import java.sql.Timestamp

import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.StructType
import org.neo4j.spark.dataframe.CypherTypes
import org.neo4j.spark.utils.{Neo4jSessionAwareIterator, Neo4jUtils}

import scala.collection.JavaConverters._

object Executor {

  def convert(value: AnyRef): Any = value match {
    case it: util.Collection[_] => it.toArray()
    case m: java.util.Map[_, _] => m.asScala
    case _ => Neo4jUtils.convert(value)
  }

  def toJava(parameters: Map[String, Any]): java.util.Map[String, Object] = {
    parameters.mapValues(toJava).asJava
  }

  private def toJava(x: Any): AnyRef = x match {
    case y: Seq[_] => y.asJava
    case _ => x.asInstanceOf[AnyRef]
  }

  val EMPTY = Array.empty[Any]
  val EMPTY_RESULT = new CypherResult(new StructType(), Iterator.empty)

  class CypherResult(val schema: StructType, val rows: Iterator[Array[Any]]) {
    def sparkRows: Iterator[Row] = rows.map(row => new GenericRowWithSchema(row, schema))

    def fields = schema.fieldNames
  }

  def execute(sc: SparkContext, query: String, parameters: Map[String, AnyRef]): CypherResult = {
    execute(Neo4jConfig(sc.getConf), query, parameters)
  }

  private def rows(result: Iterator[_]) = {
    var i = 0
    while (result.hasNext) i = i + 1
    i
  }

  def execute(config: Neo4jConfig, query: String, parameters: Map[String, Any], write: Boolean = false): CypherResult = {
    val result = new Neo4jSessionAwareIterator(config, query, toJava(parameters), write)
    if (!result.hasNext) {
      return EMPTY_RESULT
    }
    val peek = result.peek()
    val keyCount = peek.size()
    if (keyCount == 0) {
      return new CypherResult(new StructType(), Array.fill[Array[Any]](rows(result))(EMPTY).toIterator)
    }
    val keys = peek.keys().asScala
    val fields = keys.map(k => (k, peek.get(k).`type`())).map(keyType => CypherTypes.field(keyType))
    val schema = StructType(fields)
    val it = result.map(record => {
      val row = new Array[Any](keyCount)
      var i = 0
      while (i < keyCount) {
        val value = convert(record.get(i).asObject())
        row.update(i, value)
        i = i + 1
      }
      row
    })
    new CypherResult(schema, it)
  }
}
Example 5
Source File: ObjectMapper.scala From infinispan-spark with Apache License 2.0 | 5 votes |
package org.infinispan.spark.sql

import java.beans.Introspector

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GenericRowWithSchema}
import org.apache.spark.sql.types.StructType

object ObjectMapper {

  def forBean(schema: StructType, beanClass: Class[_]): (AnyRef, Array[String]) => Row = {
    val beanInfo = Introspector.getBeanInfo(beanClass)
    val attrs = schema.fields.map(f => AttributeReference(f.name, f.dataType, f.nullable)())
    val extractors = beanInfo.getPropertyDescriptors.filterNot(_.getName == "class").map(_.getReadMethod)
    val methodsToConverts = extractors.zip(attrs).map { case (e, attr) =>
      (e, CatalystTypeConverters.createToCatalystConverter(attr.dataType))
    }
    (from: Any, columns: Array[String]) => {
      if (columns.nonEmpty) {
        from match {
          case _: Array[_] => new GenericRowWithSchema(from.asInstanceOf[Array[Any]], schema)
          case f: Any =>
            val rowSchema = StructType(Array(schema(columns.head)))
            new GenericRowWithSchema(Array(f), rowSchema)
        }
      } else {
        new GenericRowWithSchema(methodsToConverts.map { case (e, convert) =>
          val invoke: AnyRef = e.invoke(from)
          convert(invoke)
        }, schema)
      }
    }
  }
}
Example 6
Source File: RowTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types._
import org.scalatest.{Matchers, FunSpec}

class RowTest extends FunSpec with Matchers {

  val schema = StructType(
    StructField("col1", StringType) ::
    StructField("col2", StringType) ::
    StructField("col3", IntegerType) :: Nil)
  val values = Array("value1", "value2", 1)
  val valuesWithoutCol3 = Array[Any](null, "value2", null)

  val sampleRow: Row = new GenericRowWithSchema(values, schema)
  val sampleRowWithoutCol3: Row = new GenericRowWithSchema(valuesWithoutCol3, schema)
  val noSchemaRow: Row = new GenericRow(values)

  describe("Row (without schema)") {
    it("throws an exception when accessing by fieldName") {
      intercept[UnsupportedOperationException] {
        noSchemaRow.fieldIndex("col1")
      }
      intercept[UnsupportedOperationException] {
        noSchemaRow.getAs("col1")
      }
    }
  }

  describe("Row (with schema)") {
    it("fieldIndex(name) returns field index") {
      sampleRow.fieldIndex("col1") shouldBe 0
      sampleRow.fieldIndex("col3") shouldBe 2
    }

    it("getAs[T] retrieves a value by fieldname") {
      sampleRow.getAs[String]("col1") shouldBe "value1"
      sampleRow.getAs[Int]("col3") shouldBe 1
    }

    it("Accessing non existent field throws an exception") {
      intercept[IllegalArgumentException] {
        sampleRow.getAs[String]("non_existent")
      }
    }

    it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") {
      val expected = Map(
        "col1" -> "value1",
        "col2" -> "value2"
      )
      sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected
    }

    it("getValuesMap() retrieves null value on non AnyVal Type") {
      val expected = Map(
        "col1" -> null,
        "col2" -> "value2"
      )
      sampleRowWithoutCol3.getValuesMap[String](List("col1", "col2")) shouldBe expected
    }

    it("getAs() on type extending AnyVal throws an exception when accessing field that is null") {
      intercept[NullPointerException] {
        sampleRowWithoutCol3.getInt(sampleRowWithoutCol3.fieldIndex("col3"))
      }
    }

    it("getAs() on type extending AnyVal does not throw exception when value is null") {
      sampleRowWithoutCol3.getAs[String](sampleRowWithoutCol3.fieldIndex("col1")) shouldBe null
    }
  }

  describe("row equals") {
    val externalRow = Row(1, 2)
    val externalRow2 = Row(1, 2)
    val internalRow = InternalRow(1, 2)
    val internalRow2 = InternalRow(1, 2)

    it("equality check for external rows") {
      externalRow shouldEqual externalRow2
    }

    it("equality check for internal rows") {
      internalRow shouldEqual internalRow2
    }
  }
}
Example 7
Source File: UnlabeledCSVRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package unit.com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.serializers.UnlabeledCSVRequestRowSerializer

class UnlabeledCSVRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {
  val schema: StructType = StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  it should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    val sparseString = "-100.0," + "0.0," * 9 + "100.1," + "0.0," * 88 + "0.0\n"
    assert (sparseString == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("10.0,-100.0,2.0\n" == serialized)
  }
}
Example 8
Source File: LibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest._
import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers.LibSVMResponseRowDeserializer

class LibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {
  val schema = new LibSVMResponseRowDeserializer(10).schema

  "LibSVMRequestRowSerializer" should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert ("1.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "ignore other columns" in {
    val schemaWithExtraColumns = StructType(Array(
      StructField("name", StringType, nullable = false),
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false),
      StructField("favorite activity", StringType, nullable = false)))
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq("Elizabeth", 1.0, vec, "Crying").toArray, schema = schemaWithExtraColumns)
    val rrs = new LibSVMRequestRowSerializer(Some(schemaWithExtraColumns))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema), featuresColumnName = "i do not exist dear sir!")
    }
  }

  it should "fail on invalid label column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema), labelColumnName = "Sir! I must protest! I do not exist!")
    }
  }

  it should "fail on invalid types" in {
    val schemaWithInvalidLabelType = StructType(Array(
      StructField("label", StringType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidLabelType))
    }
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new LibSVMRequestRowSerializer(Some(validSchema))
  }
}
Example 9
Source File: UnlabeledLibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class UnlabeledLibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {
  val schema = StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  "UnlabeledLibSVMRequestRowSerializer" should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert ("0.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert("0.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer(featuresColumnName = "mangoes are not features")
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }

  it should "fail on invalid features type" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, "FEATURESSSSSZ!1!").toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    val rrs = new UnlabeledLibSVMRequestRowSerializer(Some(validSchema))
  }

  it should "fail to validate incorrect schema" in {
    val invalidSchema = StructType(Array(
      StructField("features", StringType, nullable = false)))
    intercept[IllegalArgumentException] {
      new UnlabeledLibSVMRequestRowSerializer(Some(invalidSchema))
    }
  }
}
Example 10
Source File: ProtobufRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.protobuf.ProtobufConverter

class ProtobufRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {
  val labelColumnName = "label"
  val featuresColumnName = "features"
  val schema = StructType(Array(StructField(labelColumnName, DoubleType), StructField(featuresColumnName, VectorType)))

  it should "serialize a dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next
    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "serialize a sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next
    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "fail to set schema on invalid features name" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    intercept[IllegalArgumentException] {
      val rrs = new ProtobufRequestRowSerializer(Some(schema), featuresColumnName = "doesNotExist")
    }
  }

  it should "fail on invalid types" in {
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new ProtobufRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new ProtobufRequestRowSerializer(Some(validSchema))
  }
}
Example 11
Source File: VectorExplode.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl

import odkl.analysis.spark.util.collection.OpenHashMap
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.odkl.SparkSqlUtils
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, functions}

class VectorExplode(override val uid: String) extends Transformer with DefaultParamsWritable {

  val valueCol = new Param[String](this, "valueCol", "Name of the column to store value name.")

  def setValueCol(value: String): this.type = set(valueCol, value)

  setDefault(valueCol -> "value")

  def this() = this(Identifiable.randomUID("vectorExplode"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val vectors: Array[StructField] = dataset.schema.fields.filter(_.dataType.isInstanceOf[VectorUDT])

    val resultSchema = StructType(Seq(
      StructField($(valueCol), StringType, nullable = false)) ++
      vectors.map(f => StructField(f.name, DoubleType, nullable = true))
    )

    val arraySize = resultSchema.size - 1

    val names: Array[Map[Int, String]] = vectors.map(
      f => {
        AttributeGroup.fromStructField(f).attributes
          .map(attributes => attributes.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
          .getOrElse(Map())
      })

    val maxCapacity = names.map(_.size).max

    val explodeVectors: (Row => Array[Row]) = (r: Row) => {
      val accumulator = new OpenHashMap[String, Array[Double]](maxCapacity)

      for (i <- 0 until r.length) {
        val vector = r.getAs[Vector](i)

        vector.foreachActive((index, value) => {
          val name = names(i).getOrElse(index, s"${vectors(i).name}_$index")

          accumulator.changeValue(
            name,
            Array.tabulate(arraySize) { ind => if (i == ind) value else Double.NaN },
            v => { v(i) = value; v })
        })
      }

      accumulator.map(x => new GenericRowWithSchema(
        (Seq(x._1) ++ x._2.toSeq.map(v => if (v.isNaN) null else v)).toArray,
        resultSchema)).toArray
    }

    val vectorsStruct = functions.struct(vectors.map(f => dataset(f.name)): _*)
    val explodeUDF = SparkSqlUtils.customUDF(explodeVectors, ArrayType(resultSchema), Some(Seq(vectorsStruct.expr.dataType)))
    val expression = functions.explode(explodeUDF(vectorsStruct))

    dataset
      .withColumn(uid, expression)
      .select(
        dataset.schema.fields.filterNot(_.dataType.isInstanceOf[VectorUDT]).map(f => dataset(f.name)) ++
          resultSchema.fields.map(f => functions.expr(s"$uid.${f.name}").as(f.name)): _*)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields.map(x => x.dataType match {
      case vector: VectorUDT => StructField(x.name, typeFromVector(x))
      case _ => x
    }))

  def typeFromVector(field: StructField): StructType = {
    val attributes = AttributeGroup.fromStructField(field)
    StructType(attributes.attributes
      .map(_.map(a => a.name.getOrElse(s"_${a.index.get}")))
      .getOrElse(Array.tabulate(attributes.size) { i => s"_$i" })
      .map(name => StructField(name, DoubleType, nullable = false)))
  }
}
Example 12
Source File: RowTest.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types._
import org.scalatest.{Matchers, FunSpec}

class RowTest extends FunSpec with Matchers {

  val schema = StructType(
    StructField("col1", StringType) ::
    StructField("col2", StringType) ::
    StructField("col3", IntegerType) :: Nil) // the list is terminated with Nil
  val values = Array("value1", "value2", 1)

  val sampleRow: Row = new GenericRowWithSchema(values, schema)
  val noSchemaRow: Row = new GenericRow(values)

  // Row (without schema)
  describe("Row (without schema)") {
    // throws an exception when accessed by fieldName
    it("throws an exception when accessing by fieldName") {
      intercept[UnsupportedOperationException] {
        noSchemaRow.fieldIndex("col1")
      }
      intercept[UnsupportedOperationException] {
        noSchemaRow.getAs("col1")
      }
    }
  }

  // Row (with schema)
  describe("Row (with schema)") {
    // fieldIndex(name) returns the field index
    it("fieldIndex(name) returns field index") {
      sampleRow.fieldIndex("col1") shouldBe 0
      sampleRow.fieldIndex("col3") shouldBe 2
    }

    // getAs[T] retrieves a value by field name
    it("getAs[T] retrieves a value by fieldname") {
      sampleRow.getAs[String]("col1") shouldBe "value1"
      sampleRow.getAs[Int]("col3") shouldBe 1
    }

    // accessing a non-existent field throws an exception
    it("Accessing non existent field throws an exception") {
      intercept[IllegalArgumentException] {
        sampleRow.getAs[String]("non_existent")
      }
    }

    // getValuesMap() retrieves the values of multiple fields as a Map(field -> value)
    it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") {
      val expected = Map(
        "col1" -> "value1",
        "col2" -> "value2"
      )
      sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected
    }
  }

  // row equality
  describe("row equals") {
    val externalRow = Row(1, 2)
    val externalRow2 = Row(1, 2)
    val internalRow = InternalRow(1, 2)
    val internalRow2 = InternalRow(1, 2)

    // equality check for external rows
    it("equality check for external rows") {
      externalRow shouldEqual externalRow2
    }

    // equality check for internal rows
    it("equality check for internal rows") {
      internalRow shouldEqual internalRow2
    }
  }
}
Example 13
Source File: cogroup.scala From spark-tools with Apache License 2.0 | 5 votes |
package io.univalence.plumbus

import org.apache.spark.Partitioner
import org.apache.spark.rdd.{ CoGroupedRDD, RDD }
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ ArrayType, StructField }
import org.apache.spark.sql.{ types, DataFrame, Dataset, Encoder, KeyValueGroupedDataset, Row }

import scala.reflect.ClassTag
import scala.util.Try

object cogroup {

  implicit class KVGD[K, A](val kvgd: KeyValueGroupedDataset[K, A]) {
    def cogroup[B](right: KeyValueGroupedDataset[K, B]): Dataset[(K, Seq[A], Seq[B])] =
      // Use SparkAddOn ?
      ???
  }

  def apply[A, B, K](left: Dataset[A], right: Dataset[B])(keyLeft: A => K, keyRight: B => K)(
    implicit
    encA: Encoder[A],
    encB: Encoder[B],
    encC: Encoder[K],
    enc: Encoder[(K, Seq[A], Seq[B])],
    ca: ClassTag[A],
    ck: ClassTag[K],
    cb: ClassTag[B]
  ): Dataset[(K, Seq[A], Seq[B])] =
    left.sparkSession.implicits
      .rddToDatasetHolder(
        RDD
          .rddToPairRDDFunctions(left.rdd.keyBy(keyLeft))
          .cogroup(right.rdd.keyBy(keyRight))
          .map({ case (k, (ia, ib)) => (k, ia.toSeq, ib.toSeq) })
      )
      .toDS

  def cogroupDf(group: DataFrame, namedSubGroup: (String, DataFrame)*)(
    byKey: String,
    partitioner: Partitioner = Partitioner.defaultPartitioner(group.rdd, namedSubGroup.map(_._2.rdd): _*)
  ): Try[DataFrame] =
    Try {
      val subGroup: Seq[DataFrame]  = namedSubGroup.map(_._2)
      val allFrames: Seq[DataFrame] = group +: subGroup
      val allFramesKeyed: Seq[RDD[(String, Row)]] = allFrames.map(df => {
        val idx = df.columns.indexOf(byKey)
        df.rdd.keyBy(_.get(idx).toString)
      })

      val cogroupRdd: CoGroupedRDD[String] = new CoGroupedRDD[String](allFramesKeyed, partitioner)

      val rowRdd: RDD[Row] =
        cogroupRdd.map(x => {
          val rows: Array[Seq[Row]] = x._2.asInstanceOf[Array[Iterable[Row]]].map(_.toSeq)
          val seq = rows.head.head.toSeq ++ rows.tail

          new GenericRowWithSchema(seq.toArray, null).asInstanceOf[Row]
        })

      val schema =
        types.StructType(
          group.schema.fields
            ++ namedSubGroup.map { case (name, df) => StructField(name, ArrayType(df.schema)) }
        )

      group.sparkSession.createDataFrame(rowRdd, schema)
    }
}
Example 14
Source File: MergeProjection.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.mutation.merge

import java.sql.{Date, Timestamp}

import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, GenericRowWithSchema, InterpretedMutableProjection, Projection}
import org.apache.spark.sql.catalyst.util.DateTimeUtils

case class MergeProjection(
    @transient tableCols: Seq[String],
    @transient statusCol: String,
    @transient ds: Dataset[Row],
    @transient rltn: CarbonDatasourceHadoopRelation,
    @transient sparkSession: SparkSession,
    @transient mergeAction: MergeAction) {

  private val cutOffDate = Integer.MAX_VALUE >> 1

  val isUpdate = mergeAction.isInstanceOf[UpdateAction]
  val isDelete = mergeAction.isInstanceOf[DeleteAction]

  def apply(row: GenericRowWithSchema): InternalRow = {
    // TODO we can avoid these multiple conversions if this is added as a SparkPlan node.
    val values = row.values.map {
      case s: String => org.apache.spark.unsafe.types.UTF8String.fromString(s)
      case d: java.math.BigDecimal => org.apache.spark.sql.types.Decimal.apply(d)
      case b: Array[Byte] => org.apache.spark.unsafe.types.UTF8String.fromBytes(b)
      case d: Date => DateTimeUtils.fromJavaDate(d)
      case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t)
      case value => value
    }

    projection(new GenericInternalRow(values)).asInstanceOf[GenericInternalRow]
  }

  val (projection, output) = generateProjection

  private def generateProjection: (Projection, Array[Expression]) = {
    val existingDsOutput = rltn.carbonRelation.schema.toAttributes
    val colsMap = mergeAction match {
      case UpdateAction(updateMap) => updateMap
      case InsertAction(insertMap) => insertMap
      case _ => null
    }
    if (colsMap != null) {
      val output = new Array[Expression](tableCols.length)
      val expecOutput = new Array[Expression](tableCols.length)
      colsMap.foreach { case (k, v) =>
        val tableIndex = tableCols.indexOf(k.toString().toLowerCase)
        if (tableIndex < 0) {
          throw new CarbonMergeDataSetException(s"Mapping is wrong $colsMap")
        }
        output(tableIndex) = v.expr.transform {
          case a: Attribute if !a.resolved =>
            ds.queryExecution.analyzed.resolveQuoted(a.name, sparkSession.sessionState.analyzer.resolver).get
        }
        expecOutput(tableIndex) =
          existingDsOutput.find(_.name.equalsIgnoreCase(tableCols(tableIndex))).get
      }
      if (output.contains(null)) {
        throw new CarbonMergeDataSetException(s"Not all columns are mapped")
      }
      (new InterpretedMutableProjection(output ++ Seq(
        ds.queryExecution.analyzed.resolveQuoted(statusCol, sparkSession.sessionState.analyzer.resolver).get),
        ds.queryExecution.analyzed.output), expecOutput)
    } else {
      (null, null)
    }
  }
}
Example 15
Source File: KustoResponseDeserializer.scala From azure-kusto-spark with Apache License 2.0 | 5 votes |
package com.microsoft.kusto.spark.datasource

import java.sql.Timestamp
import java.util

import com.microsoft.azure.kusto.data.{KustoResultColumn, KustoResultSetTable, Results}
import com.microsoft.kusto.spark.utils.DataTypeMapping
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StructType, _}
import org.joda.time.DateTime

import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

object KustoResponseDeserializer {
  def apply(kustoResult: KustoResultSetTable): KustoResponseDeserializer = new KustoResponseDeserializer(kustoResult)
}

// Timespan columns are casted to strings in kusto side. A simple test to compare the translation to a Duration string
// in the format of timespan resulted in less performance. One way was using a new expression that extends UnaryExpression,
// second was by a udf function, both were less performant.
case class KustoSchema(sparkSchema: StructType, toStringCastedColumns: Set[String])

class KustoResponseDeserializer(val kustoResult: KustoResultSetTable) {
  val schema: KustoSchema = getSchemaFromKustoResult

  private def getValueTransformer(valueType: String): Any => Any = {
    valueType.toLowerCase() match {
      case "string" => value: Any => value
      case "int64" => value: Any => value
      case "datetime" => value: Any => new Timestamp(new DateTime(value).getMillis)
      case "timespan" => value: Any => value
      case "sbyte" => value: Any => value
      case "long" => value: Any => value match {
        case i: Int => i.toLong
        case _ => value.asInstanceOf[Long]
      }
      case "double" => value: Any => value
      case "decimal" => value: Any => BigDecimal(value.asInstanceOf[String])
      case "int" => value: Any => value
      case "int32" => value: Any => value
      case "bool" => value: Any => value
      case "real" => value: Any => value
      case _ => value: Any => value.toString
    }
  }

  private def getSchemaFromKustoResult: KustoSchema = {
    if (kustoResult.getColumns.isEmpty) {
      KustoSchema(StructType(List()), Set())
    } else {
      val columns = kustoResult.getColumns

      KustoSchema(
        StructType(columns.map(col =>
          StructField(col.getColumnName,
            DataTypeMapping.kustoTypeToSparkTypeMap.getOrElse(col.getColumnType.toLowerCase, StringType)))),
        columns.filter(c => c.getColumnType.equalsIgnoreCase("TimeSpan")).map(c => c.getColumnName).toSet)
    }
  }

  def getSchema: KustoSchema = {
    schema
  }

  def toRows: java.util.List[Row] = {
    val columnInOrder = kustoResult.getColumns
    val value: util.ArrayList[Row] = new util.ArrayList[Row](kustoResult.count())

    // Calculate the transformer function for each column to use later by order
    val valueTransformers: mutable.Seq[Any => Any] = columnInOrder.map(col => getValueTransformer(col.getColumnType))
    kustoResult.getData.asScala.foreach(row => {
      val genericRow = row.toArray().zipWithIndex.map(
        column => {
          if (column._1 == null) null else valueTransformers(column._2)(column._1)
        })
      value.add(new GenericRowWithSchema(genericRow, schema.sparkSchema))
    })

    value
  }

  //  private def getOrderedColumnName = {
  //    val columnInOrder = ArrayBuffer.fill(kustoResult.getColumnNameToIndex.size()){ "" }
  //    kustoResult.getColumns.foreach((columnIndexPair: KustoResultColumn) => columnInOrder(columnIndexPair.) = columnIndexPair._1)
  //    columnInOrder
  //  }
}
Example 16
Source File: HttpStreamServerClientTest.scala From spark-http-stream with BSD 2-Clause "Simplified" License | 5 votes |
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.streaming.http.HttpStreamClient
import org.junit.Assert
import org.junit.Test
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.BooleanType
import org.apache.spark.sql.types.FloatType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.ByteType
import org.apache.spark.sql.execution.streaming.http.HttpStreamServer
import org.apache.spark.sql.execution.streaming.http.StreamPrinter
import org.apache.spark.sql.execution.streaming.http.HttpStreamServerSideException

class HttpStreamServerClientTest {
  val ROWS1 = Array(Row("hello1", 1, true, 0.1f, 0.1d, 1L, '1'.toByte),
    Row("hello2", 2, false, 0.2f, 0.2d, 2L, '2'.toByte),
    Row("hello3", 3, true, 0.3f, 0.3d, 3L, '3'.toByte));

  val ROWS2 = Array(Row("hello"), Row("world"), Row("bye"), Row("world"));

  @Test
  def testHttpStreamIO() {
    // starts a http server
    val kryoSerializer = new KryoSerializer(new SparkConf());
    val server = HttpStreamServer.start("/xxxx", 8080);

    val spark = SparkSession.builder.appName("testHttpTextSink").master("local[4]")
      .getOrCreate();
    spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/");

    val sqlContext = spark.sqlContext;
    import spark.implicits._

    // add a local message buffer to server, with 2 topics registered
    server.withBuffer()
      .addListener(new StreamPrinter())
      .createTopic[(String, Int, Boolean, Float, Double, Long, Byte)]("topic-1")
      .createTopic[String]("topic-2");

    val client = HttpStreamClient.connect("http://localhost:8080/xxxx");

    // tests schema of topics
    val schema1 = client.fetchSchema("topic-1");
    Assert.assertArrayEquals(Array[Object](StringType, IntegerType, BooleanType, FloatType, DoubleType, LongType, ByteType),
      schema1.fields.map(_.dataType).asInstanceOf[Array[Object]]);

    val schema2 = client.fetchSchema("topic-2");
    Assert.assertArrayEquals(Array[Object](StringType),
      schema2.fields.map(_.dataType).asInstanceOf[Array[Object]]);

    // prepare to consume messages
    val sid1 = client.subscribe("topic-1")._1;
    val sid2 = client.subscribe("topic-2")._1;

    // produces some data
    client.sendRows("topic-1", 1, ROWS1);

    val sid4 = client.subscribe("topic-1")._1;
    val sid5 = client.subscribe("topic-2")._1;

    client.sendRows("topic-2", 1, ROWS2);

    // consumes data
    val fetched = client.fetchStream(sid1).map(_.originalRow);
    Assert.assertArrayEquals(ROWS1.asInstanceOf[Array[Object]], fetched.asInstanceOf[Array[Object]]);
    // it is empty now
    Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid1).map(_.originalRow).asInstanceOf[Array[Object]]);
    Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid2).map(_.originalRow).asInstanceOf[Array[Object]]);
    Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid4).map(_.originalRow).asInstanceOf[Array[Object]]);
    Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);
    Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);

    client.unsubscribe(sid4);
    try {
      client.fetchStream(sid4);
      // an exception should be thrown, because the subscriber id is invalidated
      Assert.assertTrue(false);
    } catch {
      case e: Throwable ⇒
        e.printStackTrace();
        Assert.assertEquals(classOf[HttpStreamServerSideException], e.getClass);
    }

    server.stop();
  }
}
Example 17
Source File: HashRedisPersistence.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package org.apache.spark.sql.redis

import java.util.{List => JList}

import com.redislabs.provider.redis.util.ParseUtils
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types._
import redis.clients.jedis.Pipeline

import scala.collection.JavaConverters._

class HashRedisPersistence extends RedisPersistence[Any] {

  override def save(pipeline: Pipeline, key: String, value: Any, ttl: Int): Unit = {
    val javaValue = value.asInstanceOf[Map[String, String]].asJava
    pipeline.hmset(key, javaValue)
    if (ttl > 0) {
      pipeline.expire(key, ttl)
    }
  }

  override def load(pipeline: Pipeline, key: String, requiredColumns: Seq[String]): Unit = {
    pipeline.hmget(key, requiredColumns: _*)
  }

  override def encodeRow(keyName: String, value: Row): Map[String, String] = {
    val fields = value.schema.fields.map(_.name)
    val kvMap = value.getValuesMap[Any](fields)
    kvMap
      .filter { case (_, v) =>
        // don't store null values
        v != null
      }
      .filter { case (k, _) =>
        // don't store key values
        k != keyName
      }
      .map { case (k, v) =>
        k -> String.valueOf(v)
      }
  }

  override def decodeRow(keyMap: (String, String), value: Any, schema: StructType,
                         requiredColumns: Seq[String]): Row = {
    val scalaValue = value.asInstanceOf[JList[String]].asScala
    val values = requiredColumns.zip(scalaValue)
    val results = values :+ keyMap
    val fieldsValue = ParseUtils.parseFields(results.toMap, schema)
    new GenericRowWithSchema(fieldsValue, schema)
  }
}
Example 18
Source File: BinaryRedisPersistence.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package org.apache.spark.sql.redis

import java.nio.charset.StandardCharsets.UTF_8

import org.apache.commons.lang3.SerializationUtils
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import redis.clients.jedis.Pipeline

class BinaryRedisPersistence extends RedisPersistence[Array[Byte]] {

  override def save(pipeline: Pipeline, key: String, value: Array[Byte], ttl: Int): Unit = {
    val keyBytes = key.getBytes(UTF_8)
    if (ttl > 0) {
      pipeline.setex(keyBytes, ttl, value)
    } else {
      pipeline.set(keyBytes, value)
    }
  }

  override def load(pipeline: Pipeline, key: String, requiredColumns: Seq[String]): Unit =
    pipeline.get(key.getBytes(UTF_8))

  override def encodeRow(keyName: String, value: Row): Array[Byte] = {
    val fields = value.schema.fields.map(_.name)
    val valuesArray = fields.map(f => value.getAs[Any](f))
    SerializationUtils.serialize(valuesArray)
  }

  override def decodeRow(keyMap: (String, String), value: Array[Byte], schema: StructType,
                         requiredColumns: Seq[String]): Row = {
    val valuesArray: Array[Any] = SerializationUtils.deserialize(value)
    new GenericRowWithSchema(valuesArray, schema)
  }
}
Example 19
Source File: ZScoreSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer.subtractable

import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import com.twosigma.flint.timeseries.Summarizers
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ DoubleType, IntegerType }

class ZScoreSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/zscoresummarizer"

  "ZScoreSummarizer" should "compute in-sample `zScore` correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val expectedSchema = Schema("price_zScore" -> DoubleType)
    val expectedResults = Array[Row](new GenericRowWithSchema(Array(0L, 1.5254255396193801), expectedSchema))
    val results = priceTSRdd.summarize(Summarizers.zScore("price", true))
    assert(results.schema == expectedSchema)
    assert(results.collect().deep == expectedResults.deep)
  }

  it should "compute out-of-sample `zScore` correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val expectedSchema = Schema("price_zScore" -> DoubleType)
    val expectedResults = Array[Row](new GenericRowWithSchema(Array(0L, 1.8090680674665818), expectedSchema))
    val results = priceTSRdd.summarize(Summarizers.zScore("price", false))
    assert(results.schema == expectedSchema)
    assert(results.collect().deep == expectedResults.deep)
  }

  it should "ignore null values" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    assertEquals(
      priceTSRdd.summarize(Summarizers.zScore("price", true)),
      insertNullRows(priceTSRdd, "price").summarize(Summarizers.zScore("price", true))
    )
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllPropertiesAndSubtractable)(Summarizers.zScore("x1", true))
    summarizerPropertyTest(AllPropertiesAndSubtractable)(Summarizers.zScore("x2", false))
  }
}
Example 20
Source File: ArrowTestUtils.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries

import com.twosigma.flint.arrow.ArrowUtils
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.ipc.ArrowFileReader
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema

import scala.collection.JavaConverters._

object ArrowTestUtils {
  def fileFormatToRows(bytes: Array[Byte]): Seq[Row] = {
    val allocator = new RootAllocator(Int.MaxValue)
    val channel = new ByteArrayReadableSeekableByteChannel(bytes)
    val reader = new ArrowFileReader(channel, allocator)
    val root = reader.getVectorSchemaRoot
    val schema = ArrowUtils.fromArrowSchema(root.getSchema)

    reader.loadNextBatch()

    val vectors = root.getFieldVectors.asScala
    val rowCount = root.getRowCount
    val columnCount = root.getSchema.getFields.size()

    val values = (0 until rowCount).map { i =>
      (0 until columnCount).map { j =>
        vectors(j).getObject(i)
      }
    }

    val rows = values.map { value =>
      new GenericRowWithSchema(value.toArray, schema)
    }

    reader.close()
    root.close()
    allocator.close()

    rows
  }
}
Example 21
Source File: SummarizeSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries

import com.twosigma.flint.timeseries.row.Schema
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ LongType, IntegerType, DoubleType }

class SummarizeSpec extends MultiPartitionSuite {

  override val defaultResourceDir: String = "/timeseries/summarize"

  it should "`summarize` correctly" in {
    val expectedSchema = Schema("volume_sum" -> DoubleType)
    val expectedResults = Array[Row](new GenericRowWithSchema(Array(0L, 7800.0), expectedSchema))

    def test(rdd: TimeSeriesRDD): Unit = {
      val results = rdd.summarize(Summarizers.sum("volume"))
      assert(results.schema == expectedSchema)
      assert(results.collect().deep == expectedResults.deep)
    }

    {
      val volumeRdd = fromCSV("Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType))
      withPartitionStrategy(volumeRdd)(DEFAULT)(test)
    }
  }

  it should "`summarize` per key correctly" in {
    val expectedSchema = Schema("id" -> IntegerType, "volume_sum" -> DoubleType)
    val expectedResults = Array[Row](
      new GenericRowWithSchema(Array(0L, 7, 4100.0), expectedSchema),
      new GenericRowWithSchema(Array(0L, 3, 3700.0), expectedSchema)
    )

    def test(rdd: TimeSeriesRDD): Unit = {
      val results = rdd.summarize(Summarizers.sum("volume"), Seq("id"))
      assert(results.schema == expectedSchema)
      assert(results.collect().sortBy(_.getAs[Int]("id")).deep == expectedResults.sortBy(_.getAs[Int]("id")).deep)
    }

    {
      val volumeTSRdd = fromCSV("Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType))
      withPartitionStrategy(volumeTSRdd)(DEFAULT)(test)
    }
  }
}
Example 22
Source File: CatalystTypeConvertersWrapper.scala From flint with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.{ CatalystTypeConverters, InternalRow }
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.DataType

object CatalystTypeConvertersWrapper {
  def toCatalystRowConverter(dataType: DataType): Row => InternalRow = {
    CatalystTypeConverters.createToCatalystConverter(dataType)(_).asInstanceOf[InternalRow]
  }

  def toScalaRowConverter(dataType: DataType): InternalRow => GenericRowWithSchema = {
    CatalystTypeConverters.createToScalaConverter(dataType)(_).asInstanceOf[GenericRowWithSchema]
  }

  def toCatalystConverter(dataType: DataType): Any => Any =
    CatalystTypeConverters.createToCatalystConverter(dataType)

  def toScalaConverter(dataType: DataType): Any => Any =
    CatalystTypeConverters.createToScalaConverter(dataType)
}
Example 23
Source File: SparkScoreDoc.scala From spark-lucenerdd with Apache License 2.0 | 5 votes |
package org.zouzias.spark.lucenerdd.models

import org.apache.lucene.document.Document
import org.apache.lucene.index.IndexableField
import org.apache.lucene.search.{IndexSearcher, ScoreDoc}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.inferNumericType
import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.{DocIdField, ScoreField, ShardField}

import scala.collection.JavaConverters._

sealed trait FieldType extends Serializable
object TextType extends FieldType
object IntType extends FieldType
object DoubleType extends FieldType
object LongType extends FieldType
object FloatType extends FieldType

  // Note: this listing is truncated. The SparkScoreDoc case class and the opening of its
  // companion object (which defines DocIdField, ScoreField and ShardField, referenced in the
  // imports above) are omitted here; only this companion-object helper survives.
  private def inferNumericType(num: Number): FieldType = {
    num match {
      case _: java.lang.Double => DoubleType
      case _: java.lang.Long => LongType
      case _: java.lang.Integer => IntType
      case _: java.lang.Float => FloatType
      case _ => TextType
    }
  }
}
Example 24
Source File: ShapeLuceneRDDKryoRegistrator.scala From spark-lucenerdd with Apache License 2.0 | 5 votes |
package org.zouzias.spark.lucenerdd.spatial.shape

import com.twitter.algebird.TopK
import com.twitter.chill.Kryo
import org.apache.spark.SparkConf
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types._
import org.zouzias.spark.lucenerdd.models.{SparkFacetResult, SparkScoreDoc}
import org.zouzias.spark.lucenerdd.spatial.shape.partition.ShapeLuceneRDDPartition

class ShapeLuceneRDDKryoRegistrator extends KryoRegistrator {
  def registerClasses(kryo: Kryo): Unit = {
    kryo.register(classOf[ShapeLuceneRDD[_, _]])
    kryo.register(classOf[ShapeLuceneRDDPartition[_, _]])
    kryo.register(classOf[Number])
    kryo.register(classOf[java.lang.Double])
    kryo.register(classOf[java.lang.Float])
    kryo.register(classOf[java.lang.Integer])
    kryo.register(classOf[java.lang.Long])
    kryo.register(classOf[java.lang.Short])
    kryo.register(classOf[StructType])
    kryo.register(classOf[StructField])
    kryo.register(classOf[IntegerType])
    kryo.register(classOf[IntegerType$])
    kryo.register(classOf[DoubleType])
    kryo.register(classOf[DoubleType$])
    kryo.register(classOf[FloatType])
    kryo.register(classOf[StringType])
    kryo.register(classOf[StringType$])
    kryo.register(classOf[GenericRowWithSchema])
    kryo.register(classOf[Metadata])
    kryo.register(classOf[Object])
    kryo.register(classOf[Array[Object]])
    kryo.register(classOf[Array[Array[Byte]]])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofRef])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofFloat])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofDouble])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofInt])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofLong])
    kryo.register(classOf[Array[String]])
    kryo.register(classOf[Array[Number]])
    kryo.register(classOf[Array[Float]])
    kryo.register(classOf[Array[Int]])
    kryo.register(classOf[Array[Long]])
    kryo.register(classOf[Array[Double]])
    kryo.register(classOf[Array[Boolean]])
    kryo.register(classOf[Array[SparkScoreDoc]])
    kryo.register(classOf[Array[StructType]])
    kryo.register(classOf[Array[StructField]])
    kryo.register(classOf[Range])
    kryo.register(classOf[scala.collection.immutable.Map[String, String]])
    kryo.register(classOf[scala.collection.immutable.Map[String, Number]])
    kryo.register(classOf[scala.collection.immutable.Map$EmptyMap$])
    kryo.register(classOf[scala.collection.immutable.Set$EmptySet$])
    kryo.register(classOf[scala.collection.immutable.Map[_, _]])
    kryo.register(classOf[Array[scala.collection.immutable.Map[_, _]]])
    kryo.register(classOf[SparkFacetResult])
    kryo.register(classOf[SparkScoreDoc])
    kryo.register(classOf[TopK[_]])
    ()
  }
}