org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema Scala Examples

The following examples show how to use org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema. Each example is taken from an open-source project; the source file and project it comes from are noted above each snippet.
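As a quick orientation before the project examples, here is a minimal sketch (not taken from any of the projects below; the schema, column names, and values are made up for illustration) of the pattern they all share: GenericRowWithSchema pairs an Array[Any] of values with a StructType, which is what enables name-based access such as fieldIndex and getAs.

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object GenericRowWithSchemaSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("name", StringType),
      StructField("age", IntegerType)))

    // Pair the raw values with the schema that describes them.
    val row: Row = new GenericRowWithSchema(Array[Any]("alice", 42), schema)

    println(row.fieldIndex("age"))     // 1
    println(row.getAs[String]("name")) // alice
    println(row.getAs[Int]("age"))     // 42
  }
}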
Example 1
Source File: RowTest.scala    From iolap   with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types._
import org.scalatest.{Matchers, FunSpec}

class RowTest extends FunSpec with Matchers {

  val schema = StructType(
    StructField("col1", StringType) ::
    StructField("col2", StringType) ::
    StructField("col3", IntegerType) :: Nil)
  val values = Array("value1", "value2", 1)

  val sampleRow: Row = new GenericRowWithSchema(values, schema)
  val noSchemaRow: Row = new GenericRow(values)

  describe("Row (without schema)") {
    it("throws an exception when accessing by fieldName") {
      intercept[UnsupportedOperationException] {
        noSchemaRow.fieldIndex("col1")
      }
      intercept[UnsupportedOperationException] {
        noSchemaRow.getAs("col1")
      }
    }
  }

  describe("Row (with schema)") {
    it("fieldIndex(name) returns field index") {
      sampleRow.fieldIndex("col1") shouldBe 0
      sampleRow.fieldIndex("col3") shouldBe 2
    }

    it("getAs[T] retrieves a value by fieldname") {
      sampleRow.getAs[String]("col1") shouldBe "value1"
      sampleRow.getAs[Int]("col3") shouldBe 1
    }

    it("Accessing non existent field throws an exception") {
      intercept[IllegalArgumentException] {
        sampleRow.getAs[String]("non_existent")
      }
    }

    it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") {
      val expected = Map(
        "col1" -> "value1",
        "col2" -> "value2"
      )
      sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected
    }
  }
} 
Example 2
Source File: SPKSQLUtils.scala    From sona   with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types.{StructType, UDTRegistration}

object SPKSQLUtils {
  def append(row: Row, fields: StructType, values: Any*): Row = {
    row match {
      case r: GenericRowWithSchema =>
        val newValues = new Array[Any](r.length + values.length)
        val rLength: Int = r.length
        (0 until rLength).foreach(idx => newValues(idx) = r(idx))
        values.zipWithIndex.foreach { case (value, idx) =>
          newValues(idx + rLength) = value
        }

        val newSchema = if (r.schema != null) {
          // StructType.add returns a new StructType rather than mutating in place,
          // so fold the extra fields into a copy of the existing schema.
          fields.foldLeft(StructType(r.schema))((schema, field) => schema.add(field))
        } else {
          null.asInstanceOf[StructType]
        }
        new GenericRowWithSchema(newValues, newSchema)
      case r: GenericRow =>
        val newValues = new Array[Any](r.length + values.length)
        val rLength: Int = r.length
        (0 until rLength).foreach(idx => newValues(idx) = r(idx))
        values.zipWithIndex.foreach { case (value, idx) =>
          newValues(idx + rLength) = value
        }

        new GenericRow(newValues)
      case _ =>
        throw new Exception("Row Error!")
    }
  }

  def registerUDT(): Unit = synchronized{
    UDTRegistration.register("org.apache.spark.linalg.Vector", "org.apache.spark.linalg.VectorUDT")
    UDTRegistration.register("org.apache.spark.linalg.DenseVector", "org.apache.spark.linalg.VectorUDT")
    UDTRegistration.register("org.apache.spark.linalg.SparseVector", "org.apache.spark.linalg.VectorUDT")
    UDTRegistration.register("org.apache.spark.linalg.Matrix", "org.apache.spark.linalg.MatrixUDT")
    UDTRegistration.register("org.apache.spark.linalg.DenseMatrix", "org.apache.spark.linalg.MatrixUDT")
    UDTRegistration.register("org.apache.spark.linalg.SparseMatrix", "org.apache.spark.linalg.MatrixUDT")
  }
} 
Example 3
Source File: GDBRowIterator.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType


class GDBRowIterator(indexIter: Iterator[IndexInfo], dataBuffer: DataBuffer, fields: Array[Field], schema: StructType)
  extends Iterator[Row] with Serializable {

  val numFieldsWithNullAllowed = fields.count(_.nullable)
  val nullValueMasks = new Array[Byte]((numFieldsWithNullAllowed / 8.0).ceil.toInt)

  override def hasNext: Boolean = indexIter.hasNext

  override def next(): Row = {
    val index = indexIter.next()
    val numBytes = dataBuffer.seek(index.seek).readBytes(4).getInt
    val byteBuffer = dataBuffer.readBytes(numBytes)
    0 until nullValueMasks.length foreach (nullValueMasks(_) = byteBuffer.get)
    var bit = 0
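    // Each nullable field consumes one bit of the null mask: bit >> 3 selects the mask byte,
    // 1 << (bit & 7) the bit within it; a set bit means the stored value is null.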
    val values = fields.map(field => {
      if (field.nullable) {
        val i = bit >> 3
        val m = 1 << (bit & 7)
        bit += 1
        if ((nullValueMasks(i) & m) == 0) {
          field.readValue(byteBuffer, index.objectID)
        }
        else {
          null // TODO - Do not like null here - but...it is nullable !
        }
      } else {
        field.readValue(byteBuffer, index.objectID)
      }
    }
    )
    new GenericRowWithSchema(values, schema)
  }
} 
Example 4
Source File: Executor.scala    From neo4j-spark-connector   with Apache License 2.0
package org.neo4j.spark

import java.time.{LocalDate, LocalDateTime, OffsetTime, ZoneOffset, ZonedDateTime}
import java.util
import java.sql.Timestamp

import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.StructType
import org.neo4j.spark.dataframe.CypherTypes
import org.neo4j.spark.utils.{Neo4jSessionAwareIterator, Neo4jUtils}

import scala.collection.JavaConverters._


object Executor {

  def convert(value: AnyRef): Any = value match {
    case it: util.Collection[_] => it.toArray()
    case m: java.util.Map[_,_] => m.asScala
    case _ => Neo4jUtils.convert(value)
  }

  def toJava(parameters: Map[String, Any]): java.util.Map[String, Object] = {
    parameters.mapValues(toJava).asJava
  }

  private def toJava(x: Any): AnyRef = x match {
    case y: Seq[_] => y.asJava
    case _ => x.asInstanceOf[AnyRef]
  }

  val EMPTY = Array.empty[Any]

  val EMPTY_RESULT = new CypherResult(new StructType(), Iterator.empty)

  class CypherResult(val schema: StructType, val rows: Iterator[Array[Any]]) {
    def sparkRows: Iterator[Row] = rows.map(row => new GenericRowWithSchema(row, schema))

    def fields = schema.fieldNames
  }

  def execute(sc: SparkContext, query: String, parameters: Map[String, AnyRef]): CypherResult = {
    execute(Neo4jConfig(sc.getConf), query, parameters)
  }

  private def rows(result: Iterator[_]) = {
    // Count the remaining records, consuming the iterator as it goes.
    var i = 0
    while (result.hasNext) {
      result.next()
      i = i + 1
    }
    i
  }

  def execute(config: Neo4jConfig, query: String, parameters: Map[String, Any], write: Boolean = false): CypherResult = {
    val result = new Neo4jSessionAwareIterator(config, query, toJava(parameters), write)
    if (!result.hasNext) {
      return EMPTY_RESULT
    }
    val peek = result.peek()
    val keyCount = peek.size()
    if (keyCount == 0) {
      return new CypherResult(new StructType(), Array.fill[Array[Any]](rows(result))(EMPTY).toIterator)
    }
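    // Build the Spark schema from the keys and value types of the first (peeked) record.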
    val keys = peek.keys().asScala
    val fields = keys.map(k => (k, peek.get(k).`type`())).map(keyType => CypherTypes.field(keyType))
    val schema = StructType(fields)
    val it = result.map(record => {
      val row = new Array[Any](keyCount)
      var i = 0
      while (i < keyCount) {
        val value = convert(record.get(i).asObject())
        row.update(i, value)
        i = i + 1
      }
      row
    })
    new CypherResult(schema, it)
  }
} 
Example 5
Source File: ObjectMapper.scala    From infinispan-spark   with Apache License 2.0
package org.infinispan.spark.sql

import java.beans.Introspector

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GenericRowWithSchema}
import org.apache.spark.sql.types.StructType


object ObjectMapper {

   def forBean(schema: StructType, beanClass: Class[_]): (AnyRef, Array[String]) => Row = {
      val beanInfo = Introspector.getBeanInfo(beanClass)
      val attrs = schema.fields.map(f => AttributeReference(f.name, f.dataType, f.nullable)())
      val extractors = beanInfo.getPropertyDescriptors.filterNot(_.getName == "class").map(_.getReadMethod)
      val methodsToConverts = extractors.zip(attrs).map { case (e, attr) =>
         (e, CatalystTypeConverters.createToCatalystConverter(attr.dataType))
      }
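      // The returned function builds a Row from either a pre-extracted Array of values,
      // a single value (wrapped with the schema of the first requested column), or, when
      // no columns are requested, the whole bean read through its getters and converted
      // with the Catalyst type converters.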
      (from: Any, columns: Array[String]) => {
         if (columns.nonEmpty) {
            from match {
               case _: Array[_] => new GenericRowWithSchema(from.asInstanceOf[Array[Any]], schema)
               case f: Any =>
                  val rowSchema = StructType(Array(schema(columns.head)))
                  new GenericRowWithSchema(Array(f), rowSchema)
            }
         } else {
            new GenericRowWithSchema(methodsToConverts.map { case (e, convert) =>
               val invoke: AnyRef = e.invoke(from)
               convert(invoke)
            }, schema)

         }
      }
   }

} 
Example 6
Source File: RowTest.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types._
import org.scalatest.{Matchers, FunSpec}

class RowTest extends FunSpec with Matchers {

  val schema = StructType(
    StructField("col1", StringType) ::
    StructField("col2", StringType) ::
    StructField("col3", IntegerType) :: Nil)
  val values = Array("value1", "value2", 1)
  val valuesWithoutCol3 = Array[Any](null, "value2", null)

  val sampleRow: Row = new GenericRowWithSchema(values, schema)
  val sampleRowWithoutCol3: Row = new GenericRowWithSchema(valuesWithoutCol3, schema)
  val noSchemaRow: Row = new GenericRow(values)

  describe("Row (without schema)") {
    it("throws an exception when accessing by fieldName") {
      intercept[UnsupportedOperationException] {
        noSchemaRow.fieldIndex("col1")
      }
      intercept[UnsupportedOperationException] {
        noSchemaRow.getAs("col1")
      }
    }
  }

  describe("Row (with schema)") {
    it("fieldIndex(name) returns field index") {
      sampleRow.fieldIndex("col1") shouldBe 0
      sampleRow.fieldIndex("col3") shouldBe 2
    }

    it("getAs[T] retrieves a value by fieldname") {
      sampleRow.getAs[String]("col1") shouldBe "value1"
      sampleRow.getAs[Int]("col3") shouldBe 1
    }

    it("Accessing non existent field throws an exception") {
      intercept[IllegalArgumentException] {
        sampleRow.getAs[String]("non_existent")
      }
    }

    it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") {
      val expected = Map(
        "col1" -> "value1",
        "col2" -> "value2"
      )
      sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected
    }

    it("getValuesMap() retrieves null value on non AnyVal Type") {
      val expected = Map(
        "col1" -> null,
        "col2" -> "value2"
      )
      sampleRowWithoutCol3.getValuesMap[String](List("col1", "col2")) shouldBe expected
    }

    it("getAs() on type extending AnyVal throws an exception when accessing field that is null") {
      intercept[NullPointerException] {
        sampleRowWithoutCol3.getInt(sampleRowWithoutCol3.fieldIndex("col3"))
      }
    }

    it("getAs() on type extending AnyVal does not throw exception when value is null"){
      sampleRowWithoutCol3.getAs[String](sampleRowWithoutCol3.fieldIndex("col1")) shouldBe null
    }
  }

  describe("row equals") {
    val externalRow = Row(1, 2)
    val externalRow2 = Row(1, 2)
    val internalRow = InternalRow(1, 2)
    val internalRow2 = InternalRow(1, 2)

    it("equality check for external rows") {
      externalRow shouldEqual externalRow2
    }

    it("equality check for internal rows") {
      internalRow shouldEqual internalRow2
    }
  }
} 
Example 7
Source File: UnlabeledCSVRequestRowSerializerTests.scala    From sagemaker-spark   with Apache License 2.0
package unit.com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.serializers.UnlabeledCSVRequestRowSerializer

class UnlabeledCSVRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {
  val schema: StructType =
    StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  it should "serialize sparse vector" in {

    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    val sparseString = "-100.0," + "0.0," * 9 + "100.1," + "0.0," * 88 + "0.0\n"
    assert (sparseString == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("10.0,-100.0,2.0\n" == serialized)
  }
} 
Example 8
Source File: LibSVMRequestRowSerializerTests.scala    From sagemaker-spark   with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest._
import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers.LibSVMResponseRowDeserializer

class LibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {
  val schema = new LibSVMResponseRowDeserializer(10).schema

  "LibSVMRequestRowSerializer" should "serialize sparse vector" in {

    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert ("1.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {

    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "ignore other columns" in {
    val schemaWithExtraColumns = StructType(Array(
      StructField("name", StringType, nullable = false),
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false),
        StructField("favorite activity", StringType, nullable = false)))

    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq("Elizabeth", 1.0, vec, "Crying").toArray,
      schema = schemaWithExtraColumns)

    val rrs = new LibSVMRequestRowSerializer(Some(schemaWithExtraColumns))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema), featuresColumnName = "i do not exist dear sir!")
    }
  }

  it should "fail on invalid label column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema),
        labelColumnName = "Sir! I must protest! I do not exist!")
    }
  }

  it should "fail on invalid types" in {
    val schemaWithInvalidLabelType = StructType(Array(
      StructField("label", StringType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidLabelType))
    }
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new LibSVMRequestRowSerializer(Some(validSchema))
  }
} 
Example 9
Source File: UnlabeledLibSVMRequestRowSerializerTests.scala    From sagemaker-spark   with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class UnlabeledLibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val schema = StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  "UnlabeledLibSVMRequestRowSerializer" should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert ("0.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert("0.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs =
      new UnlabeledLibSVMRequestRowSerializer(featuresColumnName = "mangoes are not features")
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }

  it should "fail on invalid features type" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row =
      new GenericRowWithSchema(values = Seq(1.0, "FEATURESSSSSZ!1!").toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }


  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))

    val rrs = new UnlabeledLibSVMRequestRowSerializer(Some(validSchema))
  }

  it should "fail to validate incorrect schema" in {
    val invalidSchema = StructType(Array(
      StructField("features", StringType, nullable = false)))

    intercept[IllegalArgumentException] {
      new UnlabeledLibSVMRequestRowSerializer(Some(invalidSchema))
    }
  }
} 
Example 10
Source File: ProtobufRequestRowSerializerTests.scala    From sagemaker-spark   with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.protobuf.ProtobufConverter

class ProtobufRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val labelColumnName = "label"
  val featuresColumnName = "features"
  val schema = StructType(Array(StructField(labelColumnName, DoubleType), StructField(
    featuresColumnName, VectorType)))

  it should "serialize a dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "serialize a sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "fail to set schema on invalid features name" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    intercept[IllegalArgumentException] {
      val rrs = new ProtobufRequestRowSerializer(Some(schema), featuresColumnName = "doesNotExist")
    }
  }


  it should "fail on invalid types" in {
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new ProtobufRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new ProtobufRequestRowSerializer(Some(validSchema))
  }
} 
Example 11
Source File: VectorExplode.scala    From pravda-ml   with Apache License 2.0
package org.apache.spark.ml.odkl


import odkl.analysis.spark.util.collection.OpenHashMap
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.odkl.SparkSqlUtils
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, functions}


class VectorExplode(override val uid: String) extends
  Transformer with DefaultParamsWritable {

  val valueCol = new Param[String](this, "valueCol", "Name of the column to store value name.")

  def setValueCol(value: String) : this.type = set(valueCol, value)

  setDefault(valueCol -> "value")


  def this() = this(Identifiable.randomUID("vectorExplode"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val vectors: Array[StructField] = dataset.schema.fields.filter(_.dataType.isInstanceOf[VectorUDT])

    val resultSchema = StructType(Seq(
      StructField($(valueCol), StringType, nullable = false)) ++
      vectors.map(f => StructField(f.name, DoubleType, nullable = true))
    )

    val arraySize = resultSchema.size - 1

    val names: Array[Map[Int, String]] = vectors.map(
      f => {
        AttributeGroup.fromStructField(f).attributes
          .map(attributes => attributes.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
          .getOrElse(Map())
      })

    val maxCapacity = names.map(_.size).max

    val explodeVectors : (Row => Array[Row]) = (r: Row ) => {
      val accumulator = new OpenHashMap[String,Array[Double]](maxCapacity)

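      // Accumulate one entry per distinct feature name; each value is an array with one slot
      // per vector column, holding that column's value for the feature and NaN elsewhere.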
      for(i <- 0 until r.length) {
        val vector = r.getAs[Vector](i)

        vector.foreachActive((index, value) => {
          val name = names(i).getOrElse(index, s"${vectors(i).name}_$index")

          accumulator.changeValue(
            name,
            Array.tabulate(arraySize) {ind => if(i == ind) value else Double.NaN},
            v => {v(i) = value; v})
        })
      }

      accumulator.map(x => new GenericRowWithSchema(
        (Seq(x._1) ++ x._2.toSeq.map(v => if (v.isNaN) null else v)).toArray,
        resultSchema)).toArray
    }

    val vectorsStruct = functions.struct(vectors.map(f => dataset(f.name)): _*)
    val explodeUDF = SparkSqlUtils.customUDF(explodeVectors, ArrayType(resultSchema), Some(Seq(vectorsStruct.expr.dataType)))
        
    val expression = functions.explode(explodeUDF(vectorsStruct))

    dataset
      .withColumn(uid, expression)
      .select(
        dataset.schema.fields.filterNot(_.dataType.isInstanceOf[VectorUDT]).map(f => dataset(f.name)) ++
          resultSchema.fields.map(f => functions.expr(s"$uid.${f.name}").as(f.name)) :_*)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields.map(x =>
      x.dataType match {
        case vector: VectorUDT => StructField(x.name, typeFromVector(x))
        case _ => x
      }
    ))

  def typeFromVector(field: StructField): StructType = {
    val attributes = AttributeGroup.fromStructField(field)
    StructType(attributes.attributes
      .map(_.map(a => a.name.getOrElse(s"_${a.index.get}")))
      .getOrElse(Array.tabulate(attributes.size) { i => s"_$i" })
      .map(name => StructField(name, DoubleType, nullable = false)))
  }
} 
Example 12
Source File: RowTest.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types._
import org.scalatest.{Matchers, FunSpec}

class RowTest extends FunSpec with Matchers {

  val schema = StructType(
    StructField("col1", StringType) ::
    StructField("col2", StringType) ::
    StructField("col3", IntegerType) :: Nil)//列表结尾为Nil
  val values = Array("value1", "value2", 1)

  val sampleRow: Row = new GenericRowWithSchema(values, schema)
  val noSchemaRow: Row = new GenericRow(values)
  // Row (without schema)
  describe("Row (without schema)") {
    // throws an exception when accessing by field name
    it("throws an exception when accessing by fieldName") {
      intercept[UnsupportedOperationException] {
        noSchemaRow.fieldIndex("col1")
      }
      intercept[UnsupportedOperationException] {
        noSchemaRow.getAs("col1")
      }
    }
  }
  // Row (with schema)
  describe("Row (with schema)") {
    // fieldIndex(name) returns the field index
    it("fieldIndex(name) returns field index") {
      sampleRow.fieldIndex("col1") shouldBe 0
      sampleRow.fieldIndex("col3") shouldBe 2
    }
    // getAs[T] retrieves a value by field name
    it("getAs[T] retrieves a value by fieldname") {
      sampleRow.getAs[String]("col1") shouldBe "value1"
      sampleRow.getAs[Int]("col3") shouldBe 1
    }
    // accessing a non-existent field throws an exception
    it("Accessing non existent field throws an exception") {
      intercept[IllegalArgumentException] {
        sampleRow.getAs[String]("non_existent")
      }
    }
    // getValuesMap() retrieves values of multiple fields as a Map(field -> value)
    it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") {
      val expected = Map(
        "col1" -> "value1",
        "col2" -> "value2"
      )
      sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected
    }
  }
  // row equality
  describe("row equals") {
    val externalRow = Row(1, 2)
    val externalRow2 = Row(1, 2)
    val internalRow = InternalRow(1, 2)
    val internalRow2 = InternalRow(1, 2)
    // equality check for external rows
    it("equality check for external rows") {
      externalRow shouldEqual externalRow2
    }
    // equality check for internal rows
    it("equality check for internal rows") {
      internalRow shouldEqual internalRow2
    }
  }
} 
Example 13
Source File: cogroup.scala    From spark-tools   with Apache License 2.0
package io.univalence.plumbus

import org.apache.spark.Partitioner
import org.apache.spark.rdd.{ CoGroupedRDD, RDD }
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ ArrayType, StructField }
import org.apache.spark.sql.{ types, DataFrame, Dataset, Encoder, KeyValueGroupedDataset, Row }

import scala.reflect.ClassTag
import scala.util.Try

object cogroup {

  
  implicit class KVGD[K, A](val kvgd: KeyValueGroupedDataset[K, A]) {
    def cogroup[B](right: KeyValueGroupedDataset[K, B]): Dataset[(K, Seq[A], Seq[B])] =
      //Use SparkAddOn ?
      ???
  }

  def apply[A, B, K](left: Dataset[A], right: Dataset[B])(keyLeft: A => K, keyRight: B => K)(
    implicit encA: Encoder[A],
    encB: Encoder[B],
    encC: Encoder[K],
    enc: Encoder[(K, Seq[A], Seq[B])],
    ca: ClassTag[A],
    ck: ClassTag[K],
    cb: ClassTag[B]
  ): Dataset[(K, Seq[A], Seq[B])] =
    left.sparkSession.implicits
      .rddToDatasetHolder(
        RDD
          .rddToPairRDDFunctions(left.rdd.keyBy(keyLeft))
          .cogroup(right.rdd.keyBy(keyRight))
          .map({ case (k, (ia, ib)) => (k, ia.toSeq, ib.toSeq) })
      )
      .toDS

  def cogroupDf(group: DataFrame, namedSubGroup: (String, DataFrame)*)(
    byKey: String,
    partitioner: Partitioner = Partitioner.defaultPartitioner(group.rdd, namedSubGroup.map(_._2.rdd): _*)
  ): Try[DataFrame] =
    Try {
      val subGroup: Seq[DataFrame]  = namedSubGroup.map(_._2)
      val allFrames: Seq[DataFrame] = group +: subGroup
      val allFramesKeyed: Seq[RDD[(String, Row)]] =
        allFrames.map(df => {
          val idx = df.columns.indexOf(byKey)
          df.rdd.keyBy(_.get(idx).toString)
        })

      val cogroupRdd: CoGroupedRDD[String] = new CoGroupedRDD[String](allFramesKeyed, partitioner)

      val rowRdd: RDD[Row] =
        cogroupRdd.map(x => {
          val rows: Array[Seq[Row]] = x._2.asInstanceOf[Array[Iterable[Row]]].map(_.toSeq)
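          // The output row is the key frame's column values followed by one Seq[Row] per named
          // sub-group; the schema is left null here and attached later by createDataFrame.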
          val seq                   = rows.head.head.toSeq ++ rows.tail

          new GenericRowWithSchema(seq.toArray, null).asInstanceOf[Row]
        })

      val schema =
        types.StructType(
          group.schema.fields
            ++ namedSubGroup.map { case (name, df) => StructField(name, ArrayType(df.schema)) }
        )

      group.sparkSession.createDataFrame(rowRdd, schema)
    }

} 
Example 14
Source File: MergeProjection.scala    From carbondata   with Apache License 2.0
package org.apache.spark.sql.execution.command.mutation.merge

import java.sql.{Date, Timestamp}

import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, GenericRowWithSchema, InterpretedMutableProjection, Projection}
import org.apache.spark.sql.catalyst.util.DateTimeUtils


case class MergeProjection(
    @transient tableCols: Seq[String],
    @transient statusCol : String,
    @transient ds: Dataset[Row],
    @transient rltn: CarbonDatasourceHadoopRelation,
    @transient sparkSession: SparkSession,
    @transient mergeAction: MergeAction) {

  private val cutOffDate = Integer.MAX_VALUE >> 1

  val isUpdate = mergeAction.isInstanceOf[UpdateAction]
  val isDelete = mergeAction.isInstanceOf[DeleteAction]

  def apply(row: GenericRowWithSchema): InternalRow = {
    // TODO we can avoid these multiple conversions if this is added as a SparkPlan node.
    val values = row.values.map {
      case s: String => org.apache.spark.unsafe.types.UTF8String.fromString(s)
      case d: java.math.BigDecimal => org.apache.spark.sql.types.Decimal.apply(d)
      case b: Array[Byte] => org.apache.spark.unsafe.types.UTF8String.fromBytes(b)
      case d: Date => DateTimeUtils.fromJavaDate(d)
      case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t)
      case value => value
    }

    projection(new GenericInternalRow(values)).asInstanceOf[GenericInternalRow]
  }

  val (projection, output) = generateProjection

  private def generateProjection: (Projection, Array[Expression]) = {
    val existingDsOutput = rltn.carbonRelation.schema.toAttributes
    val colsMap = mergeAction match {
      case UpdateAction(updateMap) => updateMap
      case InsertAction(insertMap) => insertMap
      case _ => null
    }
    if (colsMap != null) {
      val output = new Array[Expression](tableCols.length)
      val expecOutput = new Array[Expression](tableCols.length)
      colsMap.foreach { case (k, v) =>
        val tableIndex = tableCols.indexOf(k.toString().toLowerCase)
        if (tableIndex < 0) {
          throw new CarbonMergeDataSetException(s"Mapping is wrong $colsMap")
        }
        output(tableIndex) = v.expr.transform {
          case a: Attribute if !a.resolved =>
            ds.queryExecution.analyzed.resolveQuoted(a.name,
              sparkSession.sessionState.analyzer.resolver).get
        }
        expecOutput(tableIndex) =
          existingDsOutput.find(_.name.equalsIgnoreCase(tableCols(tableIndex))).get
      }
      if (output.contains(null)) {
        throw new CarbonMergeDataSetException(s"Not all columns are mapped")
      }
      (new InterpretedMutableProjection(output++Seq(
        ds.queryExecution.analyzed.resolveQuoted(statusCol,
        sparkSession.sessionState.analyzer.resolver).get),
        ds.queryExecution.analyzed.output), expecOutput)
    } else {
      (null, null)
    }
  }
} 
Example 15
Source File: KustoResponseDeserializer.scala    From azure-kusto-spark   with Apache License 2.0
package com.microsoft.kusto.spark.datasource

import java.sql.Timestamp
import java.util

import com.microsoft.azure.kusto.data.{KustoResultColumn, KustoResultSetTable, Results}
import com.microsoft.kusto.spark.utils.DataTypeMapping
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StructType, _}
import org.joda.time.DateTime

import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

object KustoResponseDeserializer {
  def apply(kustoResult: KustoResultSetTable): KustoResponseDeserializer = new KustoResponseDeserializer(kustoResult)
}

// Timespan columns are cast to strings on the Kusto side. A simple test that translated them into a Duration string
// in the timespan format turned out to be slower. One approach used a new expression extending UnaryExpression,
// another used a UDF; both were less performant.
case class KustoSchema(sparkSchema: StructType, toStringCastedColumns: Set[String])

class KustoResponseDeserializer(val kustoResult: KustoResultSetTable) {
  val schema: KustoSchema = getSchemaFromKustoResult

  private def getValueTransformer(valueType: String): Any => Any = {

    valueType.toLowerCase() match {
      case "string" => value: Any => value
      case "int64" => value: Any => value
      case "datetime" => value: Any => new Timestamp(new DateTime(value).getMillis)
      case "timespan" => value: Any => value
      case "sbyte" => value: Any => value
      case "long" => value: Any => value match {
        case i: Int => i.toLong
        case _ => value.asInstanceOf[Long]
      }
      case "double" => value: Any => value
      case "decimal" => value: Any => BigDecimal(value.asInstanceOf[String])
      case "int" => value: Any => value
      case "int32" => value: Any => value
      case "bool" => value: Any => value
      case "real" => value: Any => value
      case _ => value: Any => value.toString
      }
  }

   private def getSchemaFromKustoResult: KustoSchema = {
    if (kustoResult.getColumns.isEmpty) {
      KustoSchema(StructType(List()), Set())
    } else {
      val columns = kustoResult.getColumns

      KustoSchema(StructType(columns.map(col => StructField(col.getColumnName,
            DataTypeMapping.kustoTypeToSparkTypeMap.getOrElse(col.getColumnType.toLowerCase, StringType)))),
        columns.filter(c => c.getColumnType.equalsIgnoreCase("TimeSpan")).map(c => c.getColumnName).toSet)
    }
  }

  def getSchema: KustoSchema = { schema }

  def toRows: java.util.List[Row] = {
    val columnInOrder = kustoResult.getColumns
    val value: util.ArrayList[Row] = new util.ArrayList[Row](kustoResult.count())

    // Compute a value-transformer function for each column up front; they are applied below by column index.
    val valueTransformers: mutable.Seq[Any => Any] = columnInOrder.map(col => getValueTransformer(col.getColumnType))
    kustoResult.getData.asScala.foreach(row => {
      val genericRow = row.toArray().zipWithIndex.map(
        column => {
          if (column._1 == null) null else valueTransformers(column._2)(column._1)
        })
      value.add(new GenericRowWithSchema(genericRow, schema.sparkSchema))
    })

    value
  }

//  private def getOrderedColumnName = {
//    val columnInOrder = ArrayBuffer.fill(kustoResult.getColumnNameToIndex.size()){ "" }
//    kustoResult.getColumns.foreach((columnIndexPair: KustoResultColumn) => columnInOrder(columnIndexPair.) = columnIndexPair._1)
//    columnInOrder
//  }
} 
Example 16
Source File: HttpStreamServerClientTest.scala    From spark-http-stream   with BSD 2-Clause "Simplified" License
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.streaming.http.HttpStreamClient
import org.junit.Assert
import org.junit.Test
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.BooleanType
import org.apache.spark.sql.types.FloatType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.ByteType
import org.apache.spark.sql.execution.streaming.http.HttpStreamServer
import org.apache.spark.sql.execution.streaming.http.StreamPrinter
import org.apache.spark.sql.execution.streaming.http.HttpStreamServerSideException


class HttpStreamServerClientTest {
	val ROWS1 = Array(Row("hello1", 1, true, 0.1f, 0.1d, 1L, '1'.toByte),
		Row("hello2", 2, false, 0.2f, 0.2d, 2L, '2'.toByte),
		Row("hello3", 3, true, 0.3f, 0.3d, 3L, '3'.toByte));

	val ROWS2 = Array(Row("hello"),
		Row("world"),
		Row("bye"),
		Row("world"));

	@Test
	def testHttpStreamIO() {
		//starts a http server
		val kryoSerializer = new KryoSerializer(new SparkConf());
		val server = HttpStreamServer.start("/xxxx", 8080);

		val spark = SparkSession.builder.appName("testHttpTextSink").master("local[4]")
			.getOrCreate();
		spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/");

		val sqlContext = spark.sqlContext;
		import spark.implicits._
		//add a local message buffer to server, with 2 topics registered
		server.withBuffer()
			.addListener(new StreamPrinter())
			.createTopic[(String, Int, Boolean, Float, Double, Long, Byte)]("topic-1")
			.createTopic[String]("topic-2");

		val client = HttpStreamClient.connect("http://localhost:8080/xxxx");
		//tests schema of topics
		val schema1 = client.fetchSchema("topic-1");
		Assert.assertArrayEquals(Array[Object](StringType, IntegerType, BooleanType, FloatType, DoubleType, LongType, ByteType),
			schema1.fields.map(_.dataType).asInstanceOf[Array[Object]]);

		val schema2 = client.fetchSchema("topic-2");
		Assert.assertArrayEquals(Array[Object](StringType),
			schema2.fields.map(_.dataType).asInstanceOf[Array[Object]]);

		//prepare to consume messages
		val sid1 = client.subscribe("topic-1")._1;
		val sid2 = client.subscribe("topic-2")._1;

		//produces some data
		client.sendRows("topic-1", 1, ROWS1);

		val sid4 = client.subscribe("topic-1")._1;
		val sid5 = client.subscribe("topic-2")._1;

		client.sendRows("topic-2", 1, ROWS2);

		//consumes data
		val fetched = client.fetchStream(sid1).map(_.originalRow);
		Assert.assertArrayEquals(ROWS1.asInstanceOf[Array[Object]], fetched.asInstanceOf[Array[Object]]);
		//it is empty now
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid1).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid2).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid4).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);

		client.unsubscribe(sid4);
		try {
			client.fetchStream(sid4);
			//exception should be thrown, because subscriber id is invalidated
			Assert.assertTrue(false);
		}
		catch {
			case e: Throwable ⇒
				e.printStackTrace();
				Assert.assertEquals(classOf[HttpStreamServerSideException], e.getClass);
		}

		server.stop();
	}
} 
Example 17
Source File: HashRedisPersistence.scala    From spark-redis   with BSD 3-Clause "New" or "Revised" License
package org.apache.spark.sql.redis

import java.util.{List => JList}

import com.redislabs.provider.redis.util.ParseUtils
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types._
import redis.clients.jedis.Pipeline

import scala.collection.JavaConverters._


class HashRedisPersistence extends RedisPersistence[Any] {

  override def save(pipeline: Pipeline, key: String, value: Any, ttl: Int): Unit = {
    val javaValue = value.asInstanceOf[Map[String, String]].asJava
    pipeline.hmset(key, javaValue)
    if (ttl > 0) {
      pipeline.expire(key, ttl)
    }
  }

  override def load(pipeline: Pipeline, key: String, requiredColumns: Seq[String]): Unit = {
    pipeline.hmget(key, requiredColumns: _*)
  }

  override def encodeRow(keyName: String, value: Row): Map[String, String] = {
    val fields = value.schema.fields.map(_.name)
    val kvMap = value.getValuesMap[Any](fields)
    kvMap
      .filter { case (_, v) =>
        // don't store null values
        v != null
      }
      .filter { case (k, _) =>
        // don't store key values
        k != keyName
      }
      .map { case (k, v) =>
        k -> String.valueOf(v)
      }
  }

  override def decodeRow(keyMap: (String, String), value: Any, schema: StructType,
                         requiredColumns: Seq[String]): Row = {
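    // HMGET returns the values in requiredColumns order; zip them with the column names,
    // append the key column, and parse everything into the types declared by the schema.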
    val scalaValue = value.asInstanceOf[JList[String]].asScala
    val values = requiredColumns.zip(scalaValue)
    val results = values :+ keyMap
    val fieldsValue = ParseUtils.parseFields(results.toMap, schema)
    new GenericRowWithSchema(fieldsValue, schema)
  }
} 
Example 18
Source File: BinaryRedisPersistence.scala    From spark-redis   with BSD 3-Clause "New" or "Revised" License
package org.apache.spark.sql.redis

import java.nio.charset.StandardCharsets.UTF_8

import org.apache.commons.lang3.SerializationUtils
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import redis.clients.jedis.Pipeline


class BinaryRedisPersistence extends RedisPersistence[Array[Byte]] {

  override def save(pipeline: Pipeline, key: String, value: Array[Byte], ttl: Int): Unit = {
    val keyBytes = key.getBytes(UTF_8)
    if (ttl > 0) {
      pipeline.setex(keyBytes, ttl, value)
    } else {
      pipeline.set(keyBytes, value)
    }
  }

  override def load(pipeline: Pipeline, key: String, requiredColumns: Seq[String]): Unit =
    pipeline.get(key.getBytes(UTF_8))

  override def encodeRow(keyName: String, value: Row): Array[Byte] = {
    val fields = value.schema.fields.map(_.name)
    val valuesArray = fields.map(f => value.getAs[Any](f))
    SerializationUtils.serialize(valuesArray)
  }

  override def decodeRow(keyMap: (String, String), value: Array[Byte], schema: StructType,
                         requiredColumns: Seq[String]): Row = {
    val valuesArray: Array[Any] = SerializationUtils.deserialize(value)
    new GenericRowWithSchema(valuesArray, schema)
  }
} 
Example 19
Source File: ZScoreSummarizerSpec.scala    From flint   with Apache License 2.0
package com.twosigma.flint.timeseries.summarize.summarizer.subtractable

import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import com.twosigma.flint.timeseries.Summarizers
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ DoubleType, IntegerType }

class ZScoreSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/zscoresummarizer"

  "ZScoreSummarizer" should "compute in-sample `zScore` correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val expectedSchema = Schema("price_zScore" -> DoubleType)
    val expectedResults = Array[Row](new GenericRowWithSchema(Array(0L, 1.5254255396193801), expectedSchema))
    val results = priceTSRdd.summarize(Summarizers.zScore("price", true))
    assert(results.schema == expectedSchema)
    assert(results.collect().deep == expectedResults.deep)
  }

  it should "compute out-of-sample `zScore` correctly" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val expectedSchema = Schema("price_zScore" -> DoubleType)
    val expectedResults = Array[Row](new GenericRowWithSchema(Array(0L, 1.8090680674665818), expectedSchema))
    val results = priceTSRdd.summarize(Summarizers.zScore("price", false))
    assert(results.schema == expectedSchema)
    assert(results.collect().deep == expectedResults.deep)
  }

  it should "ignore null values" in {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    assertEquals(
      priceTSRdd.summarize(Summarizers.zScore("price", true)),
      insertNullRows(priceTSRdd, "price").summarize(Summarizers.zScore("price", true))
    )
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllPropertiesAndSubtractable)(Summarizers.zScore("x1", true))
    summarizerPropertyTest(AllPropertiesAndSubtractable)(Summarizers.zScore("x2", false))
  }
} 
Example 20
Source File: ArrowTestUtils.scala    From flint   with Apache License 2.0
package com.twosigma.flint.timeseries

import com.twosigma.flint.arrow.ArrowUtils
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.ipc.ArrowFileReader
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema

import scala.collection.JavaConverters._

object ArrowTestUtils {
  def fileFormatToRows(bytes: Array[Byte]): Seq[Row] = {
    val allocator = new RootAllocator(Int.MaxValue)
    val channel = new ByteArrayReadableSeekableByteChannel(bytes)
    val reader = new ArrowFileReader(channel, allocator)

    val root = reader.getVectorSchemaRoot
    val schema = ArrowUtils.fromArrowSchema(root.getSchema)
    reader.loadNextBatch()
    val vectors = root.getFieldVectors.asScala

    val rowCount = root.getRowCount
    val columnCount = root.getSchema.getFields.size()

    val values = (0 until rowCount).map { i =>
      (0 until columnCount).map{ j =>
        vectors(j).getObject(i)
      }
    }

    val rows = values.map { value =>
      new GenericRowWithSchema(value.toArray, schema)
    }

    reader.close()
    root.close()
    allocator.close()

    rows
  }
} 
Example 21
Source File: SummarizeSpec.scala    From flint   with Apache License 2.0
package com.twosigma.flint.timeseries

import com.twosigma.flint.timeseries.row.Schema
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ LongType, IntegerType, DoubleType }

class SummarizeSpec extends MultiPartitionSuite {

  override val defaultResourceDir: String = "/timeseries/summarize"

  it should "`summarize` correctly" in {
    val expectedSchema = Schema("volume_sum" -> DoubleType)
    val expectedResults = Array[Row](new GenericRowWithSchema(Array(0L, 7800.0), expectedSchema))

    def test(rdd: TimeSeriesRDD): Unit = {
      val results = rdd.summarize(Summarizers.sum("volume"))
      assert(results.schema == expectedSchema)
      assert(results.collect().deep == expectedResults.deep)
    }

    {
      val volumeRdd = fromCSV("Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType))
      withPartitionStrategy(volumeRdd)(DEFAULT)(test)
    }

  }

  it should "`summarize` per key correctly" in {
    val expectedSchema = Schema("id" -> IntegerType, "volume_sum" -> DoubleType)
    val expectedResults = Array[Row](
      new GenericRowWithSchema(Array(0L, 7, 4100.0), expectedSchema),
      new GenericRowWithSchema(Array(0L, 3, 3700.0), expectedSchema)
    )

    def test(rdd: TimeSeriesRDD): Unit = {
      val results = rdd.summarize(Summarizers.sum("volume"), Seq("id"))
      assert(results.schema == expectedSchema)
      assert(results.collect().sortBy(_.getAs[Int]("id")).deep == expectedResults.sortBy(_.getAs[Int]("id")).deep)
    }

    {
      val volumeTSRdd = fromCSV("Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType))
      withPartitionStrategy(volumeTSRdd)(DEFAULT)(test)
    }
  }
} 
Example 22
Source File: CatalystTypeConvertersWrapper.scala    From flint   with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.{ CatalystTypeConverters, InternalRow }
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.DataType


object CatalystTypeConvertersWrapper {
  def toCatalystRowConverter(dataType: DataType): Row => InternalRow = {
    CatalystTypeConverters.createToCatalystConverter(dataType)(_).asInstanceOf[InternalRow]
  }

  def toScalaRowConverter(dataType: DataType): InternalRow => GenericRowWithSchema = {
    CatalystTypeConverters.createToScalaConverter(dataType)(_).asInstanceOf[GenericRowWithSchema]
  }

  def toCatalystConverter(dataType: DataType): Any => Any =
    CatalystTypeConverters.createToCatalystConverter(dataType)

  def toScalaConverter(dataType: DataType): Any => Any =
    CatalystTypeConverters.createToScalaConverter(dataType)
} 
Example 23
Source File: SparkScoreDoc.scala    From spark-lucenerdd   with Apache License 2.0
package org.zouzias.spark.lucenerdd.models

import org.apache.lucene.document.Document
import org.apache.lucene.index.IndexableField
import org.apache.lucene.search.{IndexSearcher, ScoreDoc}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.inferNumericType
import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.{DocIdField, ScoreField, ShardField}

import scala.collection.JavaConverters._

sealed trait FieldType extends Serializable
object TextType extends FieldType
object IntType extends FieldType
object DoubleType extends FieldType
object LongType extends FieldType
object FloatType extends FieldType



// Companion object of the SparkScoreDoc case class (the case class itself and the
// remaining members of the object are omitted from this excerpt).
object SparkScoreDoc {

  private def inferNumericType(num: Number): FieldType = {
    num match {
      case _: java.lang.Double => DoubleType
      case _: java.lang.Long => LongType
      case _: java.lang.Integer => IntType
      case _: java.lang.Float => FloatType
      case _ => TextType
    }
  }
}
Example 24
Source File: ShapeLuceneRDDKryoRegistrator.scala    From spark-lucenerdd   with Apache License 2.0
package org.zouzias.spark.lucenerdd.spatial.shape

import com.twitter.algebird.TopK
import com.twitter.chill.Kryo
import org.apache.spark.SparkConf
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types._
import org.zouzias.spark.lucenerdd.models.{SparkFacetResult, SparkScoreDoc}
import org.zouzias.spark.lucenerdd.spatial.shape.partition.ShapeLuceneRDDPartition


class ShapeLuceneRDDKryoRegistrator extends KryoRegistrator {
  def registerClasses(kryo: Kryo): Unit = {
    kryo.register(classOf[ShapeLuceneRDD[_, _]])
    kryo.register(classOf[ShapeLuceneRDDPartition[_, _]])
    kryo.register(classOf[Number])
    kryo.register(classOf[java.lang.Double])
    kryo.register(classOf[java.lang.Float])
    kryo.register(classOf[java.lang.Integer])
    kryo.register(classOf[java.lang.Long])
    kryo.register(classOf[java.lang.Short])
    kryo.register(classOf[StructType])
    kryo.register(classOf[StructField])
    kryo.register(classOf[IntegerType])
    kryo.register(classOf[IntegerType$])
    kryo.register(classOf[DoubleType])
    kryo.register(classOf[DoubleType$])
    kryo.register(classOf[FloatType])
    kryo.register(classOf[StringType])
    kryo.register(classOf[StringType$])
    kryo.register(classOf[GenericRowWithSchema])
    kryo.register(classOf[Metadata])
    kryo.register(classOf[Object])
    kryo.register(classOf[Array[Object]])
    kryo.register(classOf[Array[Array[Byte]]])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofRef])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofFloat])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofDouble])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofInt])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofLong])
    kryo.register(classOf[Array[String]])
    kryo.register(classOf[Array[Number]])
    kryo.register(classOf[Array[Float]])
    kryo.register(classOf[Array[Int]])
    kryo.register(classOf[Array[Long]])
    kryo.register(classOf[Array[Double]])
    kryo.register(classOf[Array[Boolean]])
    kryo.register(classOf[Array[SparkScoreDoc]])
    kryo.register(classOf[Array[StructType]])
    kryo.register(classOf[Array[StructField]])
    kryo.register(classOf[Range])
    kryo.register(classOf[scala.collection.immutable.Map[String, String]])
    kryo.register(classOf[scala.collection.immutable.Map[String, Number]])
    kryo.register(classOf[scala.collection.immutable.Map$EmptyMap$])
    kryo.register(classOf[scala.collection.immutable.Set$EmptySet$])
    kryo.register(classOf[scala.collection.immutable.Map[_, _]])
    kryo.register(classOf[Array[scala.collection.immutable.Map[_, _]]])
    kryo.register(classOf[SparkFacetResult])
    kryo.register(classOf[SparkScoreDoc])
    kryo.register(classOf[TopK[_]])

    ()
  }
}

