org.apache.spark.sql.types.StructField Scala Example

Source File: OnErrorSuite.scala From spark-snowflake with Apache License 2.0

6 votes

package net.snowflake.spark.snowflake

import net.snowflake.client.jdbc.SnowflakeSQLException
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME
import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class OnErrorSuite extends IntegrationSuiteBase {
  lazy val table = s"spark_test_table_$randomSuffix"

  lazy val schema = new StructType(
    Array(StructField("var", StringType, nullable = false))
  )

  lazy val df: DataFrame = sparkSession.createDataFrame(
    sc.parallelize(
      Seq(Row("{\"dsadas\nadsa\":12311}"), Row("{\"abc\":334}")) // invalid json key
    ),
    schema
  )

  override def beforeAll(): Unit = {
    super.beforeAll()
    jdbcUpdate(s"create or replace table $table(var variant)")
  }

  override def afterAll(): Unit = {
    jdbcUpdate(s"drop table $table")
    super.afterAll()
  }

  test("continue_on_error off") {

    assertThrows[SnowflakeSQLException] {
      df.write
        .format(SNOWFLAKE_SOURCE_NAME)
        .options(connectorOptionsNoTable)
        .option("dbtable", table)
        .mode(SaveMode.Append)
        .save()
    }
  }

  test("continue_on_error on") {
    df.write
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .option("continue_on_error", "on")
      .option("dbtable", table)
      .mode(SaveMode.Append)
      .save()

    val result = sparkSession.read
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .option("dbtable", table)
      .load()

    assert(result.collect().length == 1)
  }

}

Source File: SnowflakePlan.scala From spark-snowflake with Apache License 2.0

5 votes

package net.snowflake.spark.snowflake.pushdowns

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{StructField, StructType}


case class SnowflakePlan(output: Seq[Attribute], rdd: RDD[InternalRow])
    extends SparkPlan {

  override def children: Seq[SparkPlan] = Nil
  protected override def doExecute(): RDD[InternalRow] = {

    val schema = StructType(
      output.map(attr => StructField(attr.name, attr.dataType, attr.nullable))
    )

    rdd.mapPartitions { iter =>
      val project = UnsafeProjection.create(schema)
      iter.map(project)
    }
  }

}

Source File: cogroup.scala From spark-tools with Apache License 2.0

5 votes

package io.univalence.plumbus

import org.apache.spark.Partitioner
import org.apache.spark.rdd.{ CoGroupedRDD, RDD }
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ ArrayType, StructField }
import org.apache.spark.sql.{ types, DataFrame, Dataset, Encoder, KeyValueGroupedDataset, Row }

import scala.reflect.ClassTag
import scala.util.Try

object cogroup {

  
  implicit class KVGD[K, A](val kvgd: KeyValueGroupedDataset[K, A]) {
    def cogroup[B](right: KeyValueGroupedDataset[K, B]): Dataset[(K, Seq[A], Seq[B])] =
      //Use SparkAddOn ?
      ???
  }

  def apply[A, B, K](left: Dataset[A], right: Dataset[B])(keyLeft: A => K, keyRight: B => K)(
    implicit encA: Encoder[A],
    encB: Encoder[B],
    encC: Encoder[K],
    enc: Encoder[(K, Seq[A], Seq[B])],
    ca: ClassTag[A],
    ck: ClassTag[K],
    cb: ClassTag[B]
  ): Dataset[(K, Seq[A], Seq[B])] =
    left.sparkSession.implicits
      .rddToDatasetHolder(
        RDD
          .rddToPairRDDFunctions(left.rdd.keyBy(keyLeft))
          .cogroup(right.rdd.keyBy(keyRight))
          .map({ case (k, (ia, ib)) => (k, ia.toSeq, ib.toSeq) })
      )
      .toDS

  def cogroupDf(group: DataFrame, namedSubGroup: (String, DataFrame)*)(
    byKey: String,
    partitioner: Partitioner = Partitioner.defaultPartitioner(group.rdd, namedSubGroup.map(_._2.rdd): _*)
  ): Try[DataFrame] =
    Try {
      val subGroup: Seq[DataFrame]  = namedSubGroup.map(_._2)
      val allFrames: Seq[DataFrame] = group +: subGroup
      val allFramesKeyed: Seq[RDD[(String, Row)]] =
        allFrames.map(df => {
          val idx = df.columns.indexOf(byKey)
          df.rdd.keyBy(_.get(idx).toString)
        })

      val cogroupRdd: CoGroupedRDD[String] = new CoGroupedRDD[String](allFramesKeyed, partitioner)

      val rowRdd: RDD[Row] =
        cogroupRdd.map(x => {
          val rows: Array[Seq[Row]] = x._2.asInstanceOf[Array[Iterable[Row]]].map(_.toSeq)
          val seq                   = rows.head.head.toSeq ++ rows.tail

          new GenericRowWithSchema(seq.toArray, null).asInstanceOf[Row]
        })

      val schema =
        types.StructType(
          group.schema.fields
            ++ namedSubGroup.map { case (name, df) => StructField(name, ArrayType(df.schema)) }
        )

      group.sparkSession.createDataFrame(rowRdd, schema)
    }

}

Source File: DataFrameComparisonTest.scala From spark-tools with Apache License 2.0

5 votes

package io.univalence.sparktest

import io.univalence.schema.SchemaComparator.SchemaError
import org.apache.spark.SparkContext
import org.apache.spark.sql.{ Row, SparkSession }
import org.apache.spark.sql.types.{ IntegerType, StructField, StructType }
import org.scalatest.FunSuite

class DataFrameComparisonTest extends FunSuite with SparkTest {

  val sharedSparkSession: SparkSession = ss
  val sc: SparkContext                 = ss.sparkContext

  // TODO : unordered
  ignore("should assertEquals unordered between equal DF") {
    val dfUT       = Seq(1, 2, 3).toDF("id")
    val dfExpected = Seq(3, 2, 1).toDF("id")

    dfUT.assertEquals(dfExpected)
  }

  // TODO : unordered
  ignore("should not assertEquals unordered between DF with different contents") {
    val dfUT       = Seq(1, 2, 3).toDF("id")
    val dfExpected = Seq(2, 1, 4).toDF("id")

    assertThrows[SparkTestError] {
      dfUT.assertEquals(dfExpected)
    }
  }

  test("should assertEquals ordered between equal DF") {
    val dfUT       = Seq(1, 2, 3).toDF("id")
    val dfExpected = Seq(1, 2, 3).toDF("id")

    dfUT.assertEquals(dfExpected)
  }

  test("should not assertEquals ordered between DF with different contents") {
    val dfUT       = Seq(1, 2, 3).toDF("id")
    val dfExpected = Seq(1, 3, 4).toDF("id")

    assertThrows[SparkTestError] {
      dfUT.assertEquals(dfExpected)
    }
  }

  test("should not assertEquals between DF with different schema") {
    val dfUT       = Seq(1, 2, 3).toDF("id")
    val dfExpected = Seq(1, 2, 3).toDF("di")

    assertThrows[SchemaError] {
      dfUT.assertEquals(dfExpected)
    }
  }

  test("assertEquals (DF & Seq) : a DF and a Seq with the same content are equal") {
    val seq = Seq(1, 2, 3)
    val df = ss.createDataFrame(
      sc.parallelize(seq.map(Row(_))),
      StructType(List(StructField("number", IntegerType, nullable = true)))
    )

    df.assertEquals(seq)
  }

  test("assertEquals (DF & Seq) : a DF and a Seq with different content are not equal") {
    val df    = Seq(1, 3, 3).toDF("number")
    val seqEx = Seq(1, 2, 3)

    assertThrows[SparkTestError] {
      df.assertEquals(seqEx)
    }
  }

  test("should assertEquals ordered between equal DF with columns containing special character") {
    val dfUT       = Seq(1, 2, 3).toDF("id.a")
    val dfExpected = Seq(2, 1, 4).toDF("id.a")

    assertThrows[SparkTestError] {
      dfUT.assertEquals(dfExpected)
    }
  }

  

}

Source File: ConfigurableDataGeneratorMain.scala From Spark.TableStatsExample with Apache License 2.0

5 votes

package com.cloudera.sa.examples.tablestats

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType}
import org.apache.spark.{SparkContext, SparkConf}

import scala.collection.mutable
import scala.util.Random



object ConfigurableDataGeneratorMain {
  def main(args: Array[String]): Unit = {

    if (args.length == 0) {
      println("ConfigurableDataGeneratorMain <outputPath> <numberOfColumns> <numberOfRecords> <numberOfPartitions> <local>")
      return
    }

    val outputPath = args(0)
    val numberOfColumns = args(1).toInt
    val numberOfRecords = args(2).toInt
    val numberOfPartitions = args(3).toInt
    val runLocal = (args.length == 5 && args(4).equals("L"))

    var sc: SparkContext = null
    if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      sc = new SparkContext("local", "test", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("ConfigurableDataGeneratorMain")
      sc = new SparkContext(sparkConfig)
    }

    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    //Part A
    val rowRDD = sc.parallelize( (0 until numberOfPartitions).map( i => i), numberOfPartitions)

    //Part B
    val megaDataRDD = rowRDD.flatMap( r => {
      val random = new Random()

      val dataRange = (0 until numberOfRecords/numberOfPartitions).iterator
      dataRange.map[Row]( x => {
        val values = new mutable.ArrayBuffer[Any]
        for (i <- 0 until numberOfColumns) {
          if (i % 2 == 0) {
            values.+=(random.nextInt(100).toLong)
          } else {
            values.+=(random.nextInt(100).toString)
          }
        }
        new GenericRow(values.toArray)
      })
    })

    //Part C
    val schema =
      StructType(
        (0 until numberOfColumns).map( i => {
          if (i % 2 == 0) {
            StructField("longColumn_" + i, LongType, true) }
          else {
            StructField("stringColumn_" + i, StringType, true)
          }
        })
      )
    val df = sqlContext.createDataFrame(megaDataRDD, schema)
    df.saveAsParquetFile(outputPath)

    //Part D
    sc.stop()
  }
}

Source File: TestTableStatsSinglePathMain.scala From Spark.TableStatsExample with Apache License 2.0

5 votes

package com.cloudera.sa.examples.tablestats


import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType}
import org.scalatest.{FunSuite, BeforeAndAfterEach, BeforeAndAfterAll}


class TestTableStatsSinglePathMain extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll{
  test("run table stats on sample data") {

    val sparkConfig = new SparkConf()
    sparkConfig.set("spark.broadcast.compress", "false")
    sparkConfig.set("spark.shuffle.compress", "false")
    sparkConfig.set("spark.shuffle.spill.compress", "false")
    var sc = new SparkContext("local", "test", sparkConfig)
    try {
      val sqlContext = new org.apache.spark.sql.SQLContext(sc)

      val schema =
        StructType(
          Array(
            StructField("id", LongType, true),
            StructField("name", StringType, true),
            StructField("age", LongType, true),
            StructField("gender", StringType, true),
            StructField("height", LongType, true),
            StructField("job_title", StringType, true)
          )
        )

      val rowRDD = sc.parallelize(Array(
        Row(1l, "Name.1", 20l, "M", 6l, "dad"),
        Row(2l, "Name.2", 20l, "F", 5l, "mom"),
        Row(3l, "Name.3", 20l, "F", 5l, "mom"),
        Row(4l, "Name.4", 20l, "M", 5l, "mom"),
        Row(5l, "Name.5", 10l, "M", 4l, "kid"),
        Row(6l, "Name.6", 8l, "M", 3l, "kid")))

      val df = sqlContext.createDataFrame(rowRDD, schema)

      val firstPassStats = TableStatsSinglePathMain.getFirstPassStat(df)

      assertResult(6l)(firstPassStats.columnStatsMap(0).maxLong)
      assertResult(1l)(firstPassStats.columnStatsMap(0).minLong)
      assertResult(21l)(firstPassStats.columnStatsMap(0).sumLong)
      assertResult(3l)(firstPassStats.columnStatsMap(0).avgLong)

      assertResult(2)(firstPassStats.columnStatsMap(3).topNValues.topNCountsForColumnArray.length)

      firstPassStats.columnStatsMap(3).topNValues.topNCountsForColumnArray.foreach { r =>
        if (r._1.equals("M")) {
          assertResult(4l)(r._2)
        } else if (r._1.equals("F")) {
          assertResult(2l)(r._2)
        } else {
          throw new RuntimeException("Unknown gender: " + r._1)
        }
      }
    } finally {
      sc.stop()
    }
  }
}

Source File: MetadataUtils.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.ml.util

import scala.collection.immutable.HashMap

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.types.StructField



  def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
    require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
      + s" to be Vector type, but it was type ${col.dataType} instead.")
    val inputAttr = AttributeGroup.fromStructField(col)
    names.map { name =>
      require(inputAttr.hasAttr(name),
        s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
      inputAttr.getAttr(name).index.get
    }
  }
}

Source File: VectorSlicerSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{StructField, StructType}

class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    val slicer = new VectorSlicer().setInputCol("feature")
    ParamsSuite.checkParams(slicer)
    assert(slicer.getIndices.length === 0)
    assert(slicer.getNames.length === 0)
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {
        slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true))))
      }
    }
  }

  test("feature validity checks") {
    import VectorSlicer._
    assert(validIndices(Array(0, 1, 8, 2)))
    assert(validIndices(Array.empty[Int]))
    assert(!validIndices(Array(-1)))
    assert(!validIndices(Array(1, 2, 1)))

    assert(validNames(Array("a", "b")))
    assert(validNames(Array.empty[String]))
    assert(!validNames(Array("", "b")))
    assert(!validNames(Array("a", "b", "a")))
  }

  test("Test vector slicer") {
    val data = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())
    )

    // Expected after selecting indices 1, 4
    val expected = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]])

    val resultAttrs = Array("f1", "f4").map(defaultAttr.withName)
    val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]])

    val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) }
    val df = spark.createDataFrame(rdd,
      StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField())))

    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    def validateResults(df: DataFrame): Unit = {
      df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 === vec2)
      }
      val resultMetadata = AttributeGroup.fromStructField(df.schema("result"))
      val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected"))
      assert(resultMetadata.numAttributes === expectedMetadata.numAttributes)
      resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) =>
        assert(a === b)
      }
    }

    vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty)
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array(1)).setNames(Array("f4"))
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4"))
    validateResults(vectorSlicer.transform(df))
  }

  test("read/write") {
    val t = new VectorSlicer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setIndices(Array(1, 3))
      .setNames(Array("a", "d"))
    testDefaultReadWrite(t)
  }
}

Source File: SQLTransformerSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.types.{LongType, StructField, StructType}

class SQLTransformerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new SQLTransformer())
  }

  test("transform numeric data") {
    val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
    val sqlTrans = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    val result = sqlTrans.transform(original)
    val resultSchema = sqlTrans.transformSchema(original.schema)
    val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0))
      .toDF("id", "v1", "v2", "v3", "v4")
    assert(result.schema.toString == resultSchema.toString)
    assert(resultSchema == expected.schema)
    assert(result.collect().toSeq == expected.collect().toSeq)
    assert(original.sparkSession.catalog.listTables().count() == 0)
  }

  test("read/write") {
    val t = new SQLTransformer()
      .setStatement("select * from __THIS__")
    testDefaultReadWrite(t)
  }

  test("transformSchema") {
    val df = spark.range(10)
    val outputSchema = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")
      .transformSchema(df.schema)
    val expected = StructType(Seq(StructField("id1", LongType, nullable = false)))
    assert(outputSchema === expected)
  }
}

Source File: MetastoreRelationSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class MetastoreRelationSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
  test("makeCopy and toJSON should work") {
    val table = CatalogTable(
      identifier = TableIdentifier("test", Some("db")),
      tableType = CatalogTableType.VIEW,
      storage = CatalogStorageFormat.empty,
      schema = StructType(StructField("a", IntegerType, true) :: Nil))
    val relation = MetastoreRelation("db", "test")(table, null)

    // No exception should be thrown
    relation.makeCopy(Array("db", "test"))
    // No exception should be thrown
    relation.toJSON
  }

  test("SPARK-17409: Do Not Optimize Query in CTAS (Hive Serde Table) More Than Once") {
    withTable("bar") {
      withTempView("foo") {
        sql("select 0 as id").createOrReplaceTempView("foo")
        // If we optimize the query in CTAS more than once, the following saveAsTable will fail
        // with the error: `GROUP BY position 0 is not in select list (valid range is [1, 1])`
        sql("CREATE TABLE bar AS SELECT * FROM foo group by id")
        checkAnswer(spark.table("bar"), Row(0) :: Nil)
        val tableMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier("bar"))
        assert(tableMetadata.provider == Some("hive"), "the expected table is a Hive serde table")
      }
    }
  }
}

Source File: SparkExecuteStatementOperationSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.hive.thriftserver

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{NullType, StructField, StructType}

class SparkExecuteStatementOperationSuite extends SparkFunSuite {
  test("SPARK-17112 `select null` via JDBC triggers IllegalArgumentException in ThriftServer") {
    val field1 = StructField("NULL", NullType)
    val field2 = StructField("(IF(true, NULL, NULL))", NullType)
    val tableSchema = StructType(Seq(field1, field2))
    val columns = SparkExecuteStatementOperation.getTableSchema(tableSchema).getColumnDescriptors()
    assert(columns.size() == 2)
    assert(columns.get(0).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
    assert(columns.get(1).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
  }
}

Source File: LocalRelation.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.types.{StructField, StructType}

object LocalRelation {
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  def apply(output1: StructField, output: StructField*): LocalRelation = {
    new LocalRelation(StructType(output1 +: output).toAttributes)
  }

  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }

  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }
}

case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil)
  extends LeafNode with analysis.MultiInstanceRelation {

  // A local relation must have resolved output.
  require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.")

  
  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type]
  }

  override protected def stringArgs: Iterator[Any] = {
    if (data.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def sameResult(plan: LogicalPlan): Boolean = {
    plan.canonicalized match {
      case LocalRelation(otherOutput, otherData) =>
        otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data
      case _ => false
    }
  }

  override lazy val statistics =
    Statistics(sizeInBytes = output.map(_.dataType.defaultSize).sum * data.length)

  def toSQL(inlineTableName: String): String = {
    require(data.nonEmpty)
    val types = output.map(_.dataType)
    val rows = data.map { row =>
      val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql }
      cells.mkString("(", ", ", ")")
    }
    "VALUES " + rows.mkString(", ") +
      " AS " + inlineTableName +
      output.map(_.name).mkString("(", ", ", ")")
  }
}

Source File: ResolveInlineTables.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis

import scala.util.control.NonFatal

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Cast
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.types.{StructField, StructType}


  private[analysis] def convert(table: UnresolvedInlineTable): LocalRelation = {
    // For each column, traverse all the values and find a common data type and nullability.
    val fields = table.rows.transpose.zip(table.names).map { case (column, name) =>
      val inputTypes = column.map(_.dataType)
      val tpe = TypeCoercion.findWiderTypeWithoutStringPromotion(inputTypes).getOrElse {
        table.failAnalysis(s"incompatible types found in column $name for inline table")
      }
      StructField(name, tpe, nullable = column.exists(_.nullable))
    }
    val attributes = StructType(fields).toAttributes
    assert(fields.size == table.names.size)

    val newRows: Seq[InternalRow] = table.rows.map { row =>
      InternalRow.fromSeq(row.zipWithIndex.map { case (e, ci) =>
        val targetType = fields(ci).dataType
        try {
          if (e.dataType.sameType(targetType)) {
            e.eval()
          } else {
            Cast(e, targetType).eval()
          }
        } catch {
          case NonFatal(ex) =>
            table.failAnalysis(s"failed to evaluate expression ${e.sql}: ${ex.getMessage}")
        }
      })
    }

    LocalRelation(attributes, newRows)
  }
}

Source File: resources.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}


case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("Results", StringType, nullable = false)() :: Nil
  }
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val jarList = sparkSession.sparkContext.listJars()
    if (jars.nonEmpty) {
      for {
        jarName <- jars.map(f => new Path(f).getName)
        jarPath <- jarList if jarPath.contains(jarName)
      } yield Row(jarPath)
    } else {
      jarList.map(Row(_))
    }
  }
}

Source File: DDLSourceLoadSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}


// please note that the META-INF/services had to be modified for the test directory for this to work
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name") {
    intercept[RuntimeException] {
      spark.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {
    spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("specify full classname with duplicate formats") {
    spark.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("should fail to load ORC without Hive Support") {
    val e = intercept[AnalysisException] {
      spark.read.format("orc").load()
    }
    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
  }
}


class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceTwo extends RelationProvider  with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

Source File: CarbonDataFrameExample.scala From CarbonDataLearning with GNU General Public License v3.0

5 votes

package org.github.xubo245.carbonDataLearning.example

import org.apache.carbondata.examples.util.ExampleUtils
import org.apache.spark.sql.{SaveMode, SparkSession}

object CarbonDataFrameExample {

  def main(args: Array[String]) {
    val spark = ExampleUtils.createCarbonSession("CarbonDataFrameExample")
    exampleBody(spark)
    spark.close()
  }

  def exampleBody(spark : SparkSession): Unit = {
    // Writes Dataframe to CarbonData file:
    import spark.implicits._
    val df = spark.sparkContext.parallelize(1 to 100)
      .map(x => ("a" + x % 10, "b", x))
      .toDF("c1", "c2", "number")

    // Saves dataframe to carbondata file
    df.write
      .format("carbondata")
      .option("tableName", "carbon_df_table")
      .option("partitionColumns", "c1")  // a list of column names
      .mode(SaveMode.Overwrite)
      .save()

    spark.sql(""" SELECT * FROM carbon_df_table """).show()

    spark.sql("SHOW PARTITIONS carbon_df_table").show()

    // Specify schema
    import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
    val customSchema = StructType(Array(
      StructField("c1", StringType),
      StructField("c2", StringType),
      StructField("number", IntegerType)))

    // Reads carbondata to dataframe
    val carbondf = spark.read
      .format("carbondata")
      .schema(customSchema)
      // .option("dbname", "db_name") the system will use "default" as dbname if not set this option
      .option("tableName", "carbon_df_table")
      .load()

    df.write
      .format("csv")
      .option("tableName", "csv_df_table")
      .option("partitionColumns", "c1") // a list of column names
      //      .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
      .mode(SaveMode.Overwrite)
      .csv("/Users/xubo/Desktop/xubo/git/carbondata3/examples/spark2/target/csv/1.csv")

    // Reads carbondata to dataframe
    val carbondf2 = spark.read
      .format("csv")
      .schema(customSchema)
      // .option("dbname", "db_name") the system will use "default" as dbname if not set this option
      .option("tableName", "csv_df_table")

      //      .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
      .load("/Users/xubo/Desktop/xubo/git/carbondata3/examples/spark2/target/csv")

    carbondf2.show()


    // Dataframe operations
    carbondf.printSchema()
    carbondf.select($"c1", $"number" + 10).show()
    carbondf.filter($"number" > 31).show()

    spark.sql("DROP TABLE IF EXISTS carbon_df_table")
  }
}

Source File: TestSFObjectWriter.scala From spark-salesforce with Apache License 2.0

5 votes

package com.springml.spark.salesforce

import org.mockito.Mockito._
import org.mockito.Matchers._
import org.scalatest.mock.MockitoSugar
import org.scalatest.{ FunSuite, BeforeAndAfterEach}
import com.springml.salesforce.wave.api.BulkAPI
import org.apache.spark.{ SparkConf, SparkContext}
import com.springml.salesforce.wave.model.{ JobInfo, BatchInfo}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{ Row, DataFrame, SQLContext}
import org.apache.spark.sql.types.{ StructType, StringType, StructField}

class TestSFObjectWriter extends FunSuite with MockitoSugar with BeforeAndAfterEach {
  val contact = "Contact";
  val jobId = "750B0000000WlhtIAC";
  val batchId = "751B0000000scSHIAY";
  val data = "Id,Description\n003B00000067Rnx,123456\n003B00000067Rnw,7890";

  val bulkAPI = mock[BulkAPI](withSettings().serializable())
  val writer = mock[SFObjectWriter]

  var sparkConf: SparkConf = _
  var sc: SparkContext = _

  override def beforeEach() {
    val jobInfo = new JobInfo
    jobInfo.setId(jobId)
    when(bulkAPI.createJob(contact)).thenReturn(jobInfo)

    val batchInfo = new BatchInfo
    batchInfo.setId(batchId)
    batchInfo.setJobId(jobId)
    when(bulkAPI.addBatch(jobId, data)).thenReturn(batchInfo)

    when(bulkAPI.closeJob(jobId)).thenReturn(jobInfo)
    when(bulkAPI.isCompleted(jobId)).thenReturn(true)

    sparkConf = new SparkConf().setMaster("local").setAppName("Test SF Object Update")
    sc = new SparkContext(sparkConf)
  }

  private def sampleDF() : DataFrame = {
    val rowArray = new Array[Row](2)
    val fieldArray = new Array[String](2)

    fieldArray(0) = "003B00000067Rnx"
    fieldArray(1) = "Desc1"
    rowArray(0) = Row.fromSeq(fieldArray)

    val fieldArray1 = new Array[String](2)
    fieldArray1(0) = "001B00000067Rnx"
    fieldArray1(1) = "Desc2"
    rowArray(1) = Row.fromSeq(fieldArray1)

    val rdd = sc.parallelize(rowArray)
    val schema = StructType(
      StructField("id", StringType, true) ::
      StructField("desc", StringType, true) :: Nil)

    val sqlContext = new SQLContext(sc)
    sqlContext.createDataFrame(rdd, schema)
  }

  test ("Write Object to Salesforce") {
    val df = sampleDF();
    val csvHeader = Utils.csvHeadder(df.schema)
    writer.writeData(df.rdd)
    sc.stop()
  }
}

Source File: SparkScoreDoc.scala From spark-lucenerdd with Apache License 2.0

5 votes

package org.zouzias.spark.lucenerdd.models

import org.apache.lucene.document.Document
import org.apache.lucene.index.IndexableField
import org.apache.lucene.search.{IndexSearcher, ScoreDoc}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.inferNumericType
import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.{DocIdField, ScoreField, ShardField}

import scala.collection.JavaConverters._

sealed trait FieldType extends Serializable
object TextType extends FieldType
object IntType extends FieldType
object DoubleType extends FieldType
object LongType extends FieldType
object FloatType extends FieldType



  private def inferNumericType(num: Number): FieldType = {
    num match {
      case _: java.lang.Double => DoubleType
      case _: java.lang.Long => LongType
      case _: java.lang.Integer => IntType
      case _: java.lang.Float => FloatType
      case _ => TextType
    }
  }
}

Source File: Schema.scala From incubator-s2graph with Apache License 2.0

5 votes

package org.apache.s2graph.s2jobs

import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object Schema {
  
  val GraphElementSchema = StructType(CommonFields ++ Seq(
    StructField("id", StringType, nullable = true),
    StructField("service", StringType, nullable = true),
    StructField("column", StringType, nullable = true),
    StructField("from", StringType, nullable = true),
    StructField("to", StringType, nullable = true),
    StructField("label", StringType, nullable = true),
    StructField("props", StringType, nullable = true)
  ))
}

Source File: ExecutePython.scala From piflow with BSD 2-Clause "Simplified" License

5 votes

package cn.piflow.bundle.script

import java.util
import java.util.UUID

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.util.FileUtil
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import jep.Jep
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.JavaConversions._


class ExecutePython extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Execute python script"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  var script : String = _

  override def setProperties(map: Map[String, Any]): Unit = {
    script = MapUtil.get(map,"script").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val script = new PropertyDescriptor()
      .name("script")
      .displayName("script")
      .description("The code of python")
      .defaultValue("")
      .required(true)

    descriptor = script :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/script/python.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.ScriptGroup)
  }
  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val jep = new Jep()
    val scriptPath = "/tmp/pythonExcutor-"+ UUID.randomUUID() +".py"
    FileUtil.writeFile(script,scriptPath)
    jep.runScript(scriptPath)
  }
}

Source File: ExecutePythonWithDataFrame.scala From piflow with BSD 2-Clause "Simplified" License

5 votes

package cn.piflow.bundle.script

import java.util
import java.util.UUID

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.util.FileUtil
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import jep.Jep
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.JavaConversions._


class ExecutePythonWithDataFrame extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Execute python script with dataframe"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  var script : String = _
  var execFunction : String = _

  override def setProperties(map: Map[String, Any]): Unit = {
    script = MapUtil.get(map,"script").asInstanceOf[String]
    execFunction = MapUtil.get(map,"execFunction").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val script = new PropertyDescriptor()
      .name("script")
      .displayName("script")
      .description("The code of python")
      .defaultValue("")
      .required(true)
    val execFunction = new PropertyDescriptor()
      .name("execFunction")
      .displayName("execFunction")
      .description("The function of python script to be executed.")
      .defaultValue("")
      .required(true)
    descriptor = script :: descriptor
    descriptor = execFunction :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/script/python.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.ScriptGroup)
  }
  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val spark = pec.get[SparkSession]()

    val df = in.read()

    val jep = new Jep()
    val scriptPath = "/tmp/pythonExcutor-"+ UUID.randomUUID() +".py"
    FileUtil.writeFile(script,scriptPath)
    jep.runScript(scriptPath)


    val listInfo = df.toJSON.collectAsList()
    jep.eval(s"result = $execFunction($listInfo)")
    val resultArrayList = jep.getValue("result",new util.ArrayList().getClass)
    println(resultArrayList)


    var resultList = List[Map[String, Any]]()
    val it = resultArrayList.iterator()
    while(it.hasNext){
      val i = it.next().asInstanceOf[java.util.HashMap[String, Any]]
      val item =  mapAsScalaMap(i).toMap[String, Any]
      resultList =  item +: resultList
    }


    val rows = resultList.map( m => Row(m.values.toSeq:_*))
    val header = resultList.head.keys.toList
    val schema = StructType(header.map(fieldName => new StructField(fieldName, StringType, true)))

    val rdd = spark.sparkContext.parallelize(rows)
    val resultDF = spark.createDataFrame(rdd, schema)

    out.write(resultDF)
  }
}

Source File: DataFrameRowParser.scala From piflow with BSD 2-Clause "Simplified" License

5 votes

package cn.piflow.bundle.script

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf._
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

import scala.beans.BeanProperty

class DataFrameRowParser extends ConfigurableStop{

  val authorEmail: String = "[email protected]"
  val description: String = "Create dataframe by schema"
  val inportList: List[String] = List(Port.DefaultPort.toString)
  val outportList: List[String] = List(Port.DefaultPort.toString)

  var schema: String = _
  var separator: String = _

  override def setProperties(map: Map[String, Any]): Unit = {
    schema = MapUtil.get(map,"schema").asInstanceOf[String]
    separator = MapUtil.get(map,"separator").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val schema = new PropertyDescriptor().name("schema").displayName("schema").description("The schema of dataframe").defaultValue("").required(true)
    val separator = new PropertyDescriptor().name("separator").displayName("separator").description("The separator of schema").defaultValue("").required(true)
    descriptor = schema :: descriptor
    descriptor = separator :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/script/DataFrameRowParser.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.ScriptGroup.toString)
  }


  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val inDF = in.read()

    //parse RDD
    val rdd = inDF.rdd.map(row => {
      val fieldArray = row.get(0).asInstanceOf[String].split(",")
      Row.fromSeq(fieldArray.toSeq)
    })

    //parse schema
    val field = schema.split(separator)
    val structFieldArray : Array[StructField] = new Array[StructField](field.size)
    for(i <- 0 to field.size - 1){
      structFieldArray(i) = new StructField(field(i),StringType, nullable = true)
    }
    val schemaStructType = StructType(structFieldArray)

    //create DataFrame
    val df = spark.createDataFrame(rdd,schemaStructType)
    //df.show()
    out.write(df)
  }

}

Source File: WordSpliter.scala From piflow with BSD 2-Clause "Simplified" License

5 votes

package cn.piflow.bundle.nlp

import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode
import com.huaban.analysis.jieba._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

class WordSpliter extends ConfigurableStop {

  val authorEmail: String = "[email protected]"
  val description: String = "Word segmentation"
  val inportList: List[String] = List(Port.AnyPort.toString)
  val outportList: List[String] = List(Port.DefaultPort.toString)

  var path:String = _


  val jiebaSegmenter = new JiebaSegmenter()
  var tokenARR:ArrayBuffer[String]=ArrayBuffer()

  def segmenter(str:String): Unit ={

    var strVar = str
    //delete symbol
    strVar = strVar.replaceAll( "[\\p{P}+~$`^=|<>～｀＄＾＋＝｜＜＞￥×+\\s]" , "");

    val tokens = jiebaSegmenter.process(strVar,SegMode.SEARCH).asScala

    for (token: SegToken <- tokens){

        tokenARR += token.word

    }
  }

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val session: SparkSession = pec.get[SparkSession]()

    //read
    val strDF = session.read.text(path)

    //segmenter
    segmenter(strDF.head().getString(0))

    //write df
    val rows: List[Row] = tokenARR.map(each => {
      var arr:Array[String]=Array(each)
      val row: Row = Row.fromSeq(arr)
      row
    }).toList
    val rowRDD: RDD[Row] = session.sparkContext.makeRDD(rows)
    val schema: StructType = StructType(Array(
      StructField("words",StringType)
    ))
    val df: DataFrame = session.createDataFrame(rowRDD,schema)

    out.write(df)
  }

  def initialize(ctx: ProcessContext): Unit = {

  }

  def setProperties(map : Map[String, Any]) = {
    path = MapUtil.get(map,"path").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val path = new PropertyDescriptor().name("path").displayName("path").description("The path of text file").defaultValue("").required(true)
    descriptor = path :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/nlp/NLP.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.Alg_NLPGroup.toString)
  }

}

Source File: PutIntoSolr.scala From piflow with BSD 2-Clause "Simplified" License

5 votes

package cn.piflow.bundle.solr

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.solr.client.solrj.impl.HttpSolrClient
import org.apache.solr.client.solrj.response.UpdateResponse
import org.apache.solr.common.SolrInputDocument
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.DataFrame


class PutIntoSolr extends ConfigurableStop{

  override val authorEmail: String = "[email protected]"
  override val description: String = "Write data to solr"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var solrURL:String=_
  var SolrCollection:String=_

  override def setProperties(map: Map[String, Any]): Unit = {
    solrURL=MapUtil.get(map,"solrURL").asInstanceOf[String]
    SolrCollection=MapUtil.get(map,"SolrCollection").asInstanceOf[String]
  }

  var client: HttpSolrClient =_
  var doc:SolrInputDocument =_

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val df: DataFrame = in.read()


    val SchemaList: List[StructField] = df.schema.toList
    val length: Int = SchemaList.length
    var url=solrURL+"/"+SolrCollection

    df.collect().foreach(row=>{
      client= new HttpSolrClient.Builder(url).build()
      doc= new SolrInputDocument()
      for(x<-0 until length){
        doc.addField(SchemaList(x).name,row.get(x))
      }
      val update: UpdateResponse = client.add(doc)
      client.commit()
    })
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/solr/PutSolr.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.SolrGroup)
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val solrURL = new PropertyDescriptor()
      .name("solrURL")
      .displayName("SolrURL")
      .description("The url of solr")
      .defaultValue("")
      .required(true)
      .example("http://127.0.0.1:8886/solr")
    descriptor = solrURL :: descriptor

    val SolrCollection = new PropertyDescriptor()
      .name("SolrCollection")
      .displayName("SolrCollection")
      .description("The name of collection")
      .defaultValue("")
      .required(true)
      .example("test")
    descriptor = SolrCollection :: descriptor
    descriptor



  }


}

Source File: PhoneNumberClean.scala From piflow with BSD 2-Clause "Simplified" License

5 votes

package cn.piflow.bundle.clean

import cn.piflow.bundle.util.CleanUtil
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructField

class PhoneNumberClean extends ConfigurableStop{

  val authorEmail: String = "[email protected]"
  val description: String = "Cleaning data in mobile number format"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var columnName:String=_

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val sqlContext=spark.sqlContext
    val dfOld = in.read()
    dfOld.createOrReplaceTempView("thesis")
    sqlContext.udf.register("regexPro",(str:String)=>CleanUtil.processPhonenum(str))
    val structFields: Array[String] = dfOld.schema.fieldNames
    val columnNames = columnName.split(",").toSet
    val sqlNewFieldStr = new StringBuilder
    columnNames.foreach(c=>{
      if (columnNames.contains(c)) {
        sqlNewFieldStr ++= ",regexPro("
        sqlNewFieldStr ++= c
        sqlNewFieldStr ++= ") as "
        sqlNewFieldStr ++= c
        sqlNewFieldStr ++= "_new "
      }
    })

    val sqlText:String="select * " + sqlNewFieldStr + " from thesis"

    val dfNew=sqlContext.sql(sqlText)
    dfNew.createOrReplaceTempView("thesis")
    val schemaStr = new StringBuilder
    structFields.foreach(field => {
      schemaStr ++= field
      if (columnNames.contains(field)) {
        schemaStr ++= "_new as "
        schemaStr ++= field
      }
      schemaStr ++= ","
    })
    val sqlTextNew:String = "select " + schemaStr.substring(0,schemaStr.length -1) + " from thesis"
    val dfNew1=sqlContext.sql(sqlTextNew)

    out.write(dfNew1)
  }



  def initialize(ctx: ProcessContext): Unit = {

  }


  def setProperties(map: Map[String, Any]): Unit = {
    columnName=MapUtil.get(map,key="columnName").asInstanceOf[String]

  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val columnName = new PropertyDescriptor()
      .name("columnName")
      .displayName("Column_Name")
      .description("Column names are what you want to clean,multiple column names are separated by commas")
      .defaultValue("")
      .required(true)
      .example("phonenumber")
    descriptor = columnName :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/clean/PhoneNumberClean.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.CleanGroup)
  }

}

Source File: TitleClean.scala From piflow with BSD 2-Clause "Simplified" License

5 votes

package cn.piflow.bundle.clean

import cn.piflow.bundle.util.CleanUtil
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructField

class TitleClean extends ConfigurableStop{
  val authorEmail: String = "[email protected]"
  val description: String = "Cleaning title data"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var columnName:String=_

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val sqlContext=spark.sqlContext
    val dfOld = in.read()
    dfOld.createOrReplaceTempView("thesis")
    sqlContext.udf.register("regexPro",(str:String)=>CleanUtil.processTitle(str))
    val structFields: Array[String] = dfOld.schema.fieldNames
    val columnNames = columnName.split(",").toSet
    val sqlNewFieldStr = new StringBuilder
    columnNames.foreach(c=>{
      if (columnNames.contains(c)) {
        sqlNewFieldStr ++= ",regexPro("
        sqlNewFieldStr ++= c
        sqlNewFieldStr ++= ") as "
        sqlNewFieldStr ++= c
        sqlNewFieldStr ++= "_new "
      }
    })

    val sqlText:String="select * " + sqlNewFieldStr + " from thesis"

    val dfNew=sqlContext.sql(sqlText)
    dfNew.createOrReplaceTempView("thesis")
    val schemaStr = new StringBuilder
    structFields.foreach(field => {
      schemaStr ++= field
      if (columnNames.contains(field)) {
        schemaStr ++= "_new as "
        schemaStr ++= field
      }
      schemaStr ++= ","
    })
    val sqlTextNew:String = "select " + schemaStr.substring(0,schemaStr.length -1) + " from thesis"
    val dfNew1=sqlContext.sql(sqlTextNew)

    out.write(dfNew1)
  }

  def initialize(ctx: ProcessContext): Unit = {

  }


  def setProperties(map: Map[String, Any]): Unit = {
    columnName=MapUtil.get(map,key="columnName").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()

    val columnName = new PropertyDescriptor()
      .name("columnName")
      .displayName("Column_Name")
      .description("Column names are what you want to clean,multiple column names are separated by commas")
      .defaultValue("")
      .required(true)
      .example("title")
    descriptor = columnName :: descriptor

    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/clean/TitleClean.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.CleanGroup)
  }

}

Source File: TestIndexing.scala From spark-solr with Apache License 2.0

5 votes

package com.lucidworks.spark

import java.util.UUID

import com.lucidworks.spark.util.SolrDataFrameImplicits._
import com.lucidworks.spark.util.{ConfigurationConstants, SolrCloudUtil, SolrQuerySupport, SolrSupport}
import org.apache.spark.sql.functions.{concat, lit}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

class TestIndexing extends TestSuiteBuilder {

  test("Load csv file and index to Solr") {
    val collectionName = "testIndexing-" + UUID.randomUUID().toString
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val csvFileLocation = "src/test/resources/test-data/nyc_yellow_taxi_sample_1k.csv"
      val csvDF = sparkSession.read.format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(csvFileLocation)
      assert(csvDF.count() == 999)

      val solrOpts = Map("zkhost" -> zkHost, "collection" -> collectionName)
      val newDF = csvDF
        .withColumn("pickup_location", concat(csvDF.col("pickup_latitude"), lit(","), csvDF.col("pickup_longitude")))
        .withColumn("dropoff_location", concat(csvDF.col("dropoff_latitude"), lit(","), csvDF.col("dropoff_longitude")))
      newDF.write.option("zkhost", zkHost).option(ConfigurationConstants.GENERATE_UNIQUE_KEY, "true").solr(collectionName)

      // Explicit commit to make sure all docs are visible
      val solrCloudClient = SolrSupport.getCachedCloudClient(zkHost)
      solrCloudClient.commit(collectionName, true, true)

      val solrDF = sparkSession.read.format("solr").options(solrOpts).load()
      solrDF.printSchema()
      assert (solrDF.count() == 999)
      solrDF.take(10)
    } finally {
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }

  test("Solr field types config") {
    val collectionName = "testIndexing-" + UUID.randomUUID().toString
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val csvFileLocation = "src/test/resources/test-data/simple.csv"
      val csvDF = sparkSession.read.format("com.databricks.spark.csv")
          .option("header", "true")
          .option("inferSchema", "true")
          .load(csvFileLocation)
      val solrOpts = Map("zkhost" -> zkHost, "collection" -> collectionName, ConfigurationConstants.SOLR_FIELD_TYPES -> "ntitle:text_en,nrating:string")
      csvDF.write.options(solrOpts).solr(collectionName)

      // Explicit commit to make sure all docs are visible
      val solrCloudClient = SolrSupport.getCachedCloudClient(zkHost)
      solrCloudClient.commit(collectionName, true, true)

      val solrBaseUrl = SolrSupport.getSolrBaseUrl(zkHost)
      val solrUrl = solrBaseUrl + collectionName + "/"

      val fieldTypes = SolrQuerySupport.getFieldTypes(Set.empty, solrUrl, cloudClient, collectionName)
      assert(fieldTypes("nrating").fieldType === "string")
      assert(fieldTypes("ntitle").fieldType === "text_en")
    } finally {
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }


  test("Field additions") {
    val insertSchema = StructType(Array(
      StructField("index_only_field", DataTypes.StringType, nullable = true),
      StructField("store_only_field", DataTypes.BooleanType, nullable = true),
      StructField("a_s", DataTypes.StringType, nullable = true),
      StructField("s_b", DataTypes.StringType, nullable = true)
    ))
    val collection = "testFieldAdditions" + UUID.randomUUID().toString.replace("-", "_")
    try {
      SolrCloudUtil.buildCollection(zkHost, collection, null, 2, cloudClient, sc)
      val opts = Map("zkhost" -> zkHost, "collection" -> collection)

      val solrRelation = new SolrRelation(opts, sparkSession)
      val fieldsToAdd = SolrRelation.getFieldsToAdd(insertSchema, solrRelation.conf, solrRelation.solrVersion, solrRelation.dynamicSuffixes)
      assert(fieldsToAdd.isEmpty)
    } finally {
      SolrCloudUtil.deleteCollection(collection, cluster)
    }
  }

}

Source File: SparkSchemaFnsTest.scala From eel-sdk with Apache License 2.0

5 votes

package io.eels.spark

import io.eels.schema.{DateType, DecimalType, Field, LongType, Precision, Scale, StringType, StructType}
import org.apache.spark.sql.types.StructField
import org.scalatest.{FunSuite, Matchers}

class SparkSchemaFnsTest extends FunSuite with Matchers {

  test("eel schema to spark") {
    val schema = StructType(
      Field("a", StringType),
      Field("b", DecimalType(Precision(14), Scale(4))),
      Field("c", LongType.Signed),
      Field("d", DateType)
    )
    SparkSchemaFns.toSparkSchema(schema) shouldBe
      org.apache.spark.sql.types.StructType(
        Seq(
          StructField("a", org.apache.spark.sql.types.StringType, true),
          StructField("b", org.apache.spark.sql.types.DecimalType(14, 4), true),
          StructField("c", org.apache.spark.sql.types.LongType, true),
          StructField("d", org.apache.spark.sql.types.DateType, true)
        )
      )
  }

  test("spark schema to eel") {
    val schema = org.apache.spark.sql.types.StructType(
      Seq(
        StructField("a", org.apache.spark.sql.types.StringType, true),
        StructField("b", org.apache.spark.sql.types.DecimalType(14, 4), true),
        StructField("c", org.apache.spark.sql.types.LongType, true),
        StructField("d", org.apache.spark.sql.types.DateType, true)
      )
    )
    SparkSchemaFns.fromSparkSchema(schema) shouldBe
      StructType(
        Field("a", StringType),
        Field("b", DecimalType(Precision(14), Scale(4))),
        Field("c", LongType.Signed),
        Field("d", DateType)
      )
  }
}

Source File: StackSummarizerFactory.scala From flint with Apache License 2.0

5 votes

package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.timeseries.summarize._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.GenericArrayData
import org.apache.spark.sql.types.{ ArrayType, StructField, StructType }

case class StackSummarizerFactory(factories: Seq[SummarizerFactory])
  extends SummarizerFactory {

  factories.foreach {
    case factory => require(
      !factory.isInstanceOf[OverlappableSummarizerFactory],
      "Stacking overlappable summarizers are not supported"
    )
  }

  
  override val requiredColumns: ColumnList = factories.map(_.requiredColumns).reduce(_ ++ _)

  def apply(inputSchema: StructType): Summarizer = {
    val summarizers = factories.map(f => f.apply(inputSchema))

    new StackSummarizer(inputSchema, prefixOpt, requiredColumns, summarizers)
  }
}

class StackSummarizer(
  override val inputSchema: StructType,
  override val prefixOpt: Option[String],
  override val requiredColumns: ColumnList,
  summarizers: Seq[Summarizer]
) extends Summarizer with InputAlwaysValid {

  override type T = InternalRow
  override type U = Seq[Any]
  override type V = Seq[InternalRow]

  require(
    summarizers.forall(s => s.outputSchema == summarizers.head.outputSchema),
    s"Summarizers must have identical schemas to be stacked: ${summarizers.map(_.outputSchema).mkString(" vs. ")}"
  )
  override val schema: StructType = StructType(
    StructField(StackSummarizer.stackColumn, ArrayType(summarizers.head.outputSchema))
      :: Nil
  )

  override val summarizer =
    com.twosigma.flint.rdd.function.summarize.summarizer.StackSummarizer(summarizers)

  // Convert the output of `summarizer` to the InternalRow.
  override def fromV(v: V): InternalRow = InternalRow(new GenericArrayData(v))

  // Convert the InternalRow to the type of row expected by the `summarizer`.
  override def toT(r: InternalRow): T = r

}

object StackSummarizer {
  val stackColumn = "stack"
}

Source File: MetadataTransformUtils.scala From automl with Apache License 2.0

5 votes

package org.apache.spark.ml.feature.operator

import org.apache.spark.sql.types.{MetadataBuilder, StructField}

import scala.collection.mutable.ArrayBuffer


  def vectorCartesianTransform(fields: Array[StructField], numFeatures: Int): MetadataBuilder = {
    if (fields.length < 2) {
      throw new IllegalArgumentException("the number of cols in the input DataFrame should be no less than 2")
    }

    var res = Array[String]()
    if (fields.head.metadata.contains(DERIVATION)) {
      res = fields.head.metadata.getStringArray(DERIVATION)
    } else {
      res = createDerivation(numFeatures)
    }

    for (i <- 1 until fields.length) {
      if (fields(i).metadata.contains(DERIVATION)) {
        res = cartesianWithArray(res, fields(i).metadata.getStringArray(DERIVATION))
      } else {
        res = cartesianWithArray(res, createDerivation(numFeatures))
      }
    }

    val metadata = fields.last.metadata
    new MetadataBuilder().withMetadata(metadata).putStringArray(DERIVATION, res)
  }

}

Source File: Surrogate.scala From automl with Apache License 2.0

5 votes

package com.tencent.angel.spark.automl.tuner.surrogate

import com.tencent.angel.spark.automl.tuner.config.ConfigurationSpace
import org.apache.commons.logging.{Log, LogFactory}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

import scala.collection.mutable.ArrayBuffer


  def predict(X: Vector): (Double, Double)

  def stop(): Unit

  def curBest: (Vector, Double) = {
    if (minimize) curMin else curMax
  }

  def curMin: (Vector, Double) = {
    if (preY.isEmpty)
      (null, Double.MaxValue)
    else {
      val maxIdx: Int = preY.zipWithIndex.max._2
      (preX(maxIdx), -preY(maxIdx))
    }
  }

  def curMax: (Vector, Double) = {
    if (preY.isEmpty)
      (null, Double.MinValue)
    else {
      val maxIdx: Int = preY.zipWithIndex.max._2
      (preX(maxIdx), preY(maxIdx))
    }
  }
}

Source File: BasicDataSourceSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.datasource

import com.pingcap.tikv.exception.TiBatchWriteException
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class BasicDataSourceSuite extends BaseDataSourceTest("test_datasource_basic") {
  private val row1 = Row(null, "Hello")
  private val row2 = Row(2, "TiDB")
  private val row3 = Row(3, "Spark")
  private val row4 = Row(4, null)

  private val schema = StructType(
    List(StructField("i", IntegerType), StructField("s", StringType)))

  override def beforeAll(): Unit = {
    super.beforeAll()

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(null, 'Hello'), (2, 'TiDB')")
  }

  test("Test Select") {
    if (!supportBatchWrite) {
      cancel
    }

    testTiDBSelect(Seq(row1, row2))
  }

  test("Test Write Append") {
    if (!supportBatchWrite) {
      cancel
    }

    val data: RDD[Row] = sc.makeRDD(List(row3, row4))
    val df = sqlContext.createDataFrame(data, schema)

    df.write
      .format("tidb")
      .options(tidbOptions)
      .option("database", database)
      .option("table", table)
      .mode("append")
      .save()

    testTiDBSelect(Seq(row1, row2, row3, row4))
  }

  test("Test Write Overwrite") {
    if (!supportBatchWrite) {
      cancel
    }

    val data: RDD[Row] = sc.makeRDD(List(row3, row4))
    val df = sqlContext.createDataFrame(data, schema)

    val caught = intercept[TiBatchWriteException] {
      df.write
        .format("tidb")
        .options(tidbOptions)
        .option("database", database)
        .option("table", table)
        .mode("overwrite")
        .save()
    }

    assert(
      caught.getMessage
        .equals("SaveMode: Overwrite is not supported. TiSpark only support SaveMode.Append."))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}

Source File: UpperCaseColumnNameSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class UpperCaseColumnNameSuite
    extends BaseDataSourceTest("test_datasource_uppser_case_column_name") {

  private val row1 = Row(1, 2)

  private val schema = StructType(
    List(StructField("O_ORDERKEY", IntegerType), StructField("O_CUSTKEY", IntegerType)))

  override def beforeAll(): Unit = {
    super.beforeAll()

    dropTable()
    jdbcUpdate(s"""
                  |CREATE TABLE $dbtable (O_ORDERKEY INTEGER NOT NULL,
                  |                       O_CUSTKEY INTEGER NOT NULL);
       """.stripMargin)
  }

  test("Test insert upper case column name") {
    if (!supportBatchWrite) {
      cancel
    }

    val data: RDD[Row] = sc.makeRDD(List(row1))
    val df = sqlContext.createDataFrame(data, schema)
    df.write
      .format("tidb")
      .options(tidbOptions)
      .option("database", database)
      .option("table", table)
      .mode("append")
      .save()
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}

Source File: CheckUnsupportedSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.datasource

import com.pingcap.tikv.exception.TiBatchWriteException
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class CheckUnsupportedSuite extends BaseDataSourceTest("test_datasource_check_unsupported") {

  override def beforeAll(): Unit =
    super.beforeAll()

  test("Test write to partition table") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()

    tidbStmt.execute("set @@tidb_enable_table_partition = 1")

    jdbcUpdate(
      s"create table $dbtable(i int, s varchar(128)) partition by range(i) (partition p0 values less than maxvalue)")
    jdbcUpdate(s"insert into $dbtable values(null, 'Hello')")

    val row1 = Row(null, "Hello")
    val row2 = Row(2, "TiDB")
    val row3 = Row(3, "Spark")

    val schema = StructType(List(StructField("i", IntegerType), StructField("s", StringType)))

    {
      val caught = intercept[TiBatchWriteException] {
        tidbWrite(List(row2, row3), schema)
      }
      assert(
        caught.getMessage
          .equals("tispark currently does not support write data to partition table!"))
    }

    testTiDBSelect(Seq(row1))
  }

  test("Check Virtual Generated Column") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i INT, c1 INT, c2 INT,  c3 INT AS (c1 + c2))")

    val row1 = Row(1, 2, 3)
    val schema = StructType(
      List(
        StructField("i", IntegerType),
        StructField("c1", IntegerType),
        StructField("c2", IntegerType)))

    val caught = intercept[TiBatchWriteException] {
      tidbWrite(List(row1), schema)
    }
    assert(
      caught.getMessage
        .equals("tispark currently does not support write data to table with generated column!"))

  }

  test("Check Stored Generated Column") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i INT, c1 INT, c2 INT,  c3 INT AS (c1 + c2) STORED)")

    val row1 = Row(1, 2, 3)
    val schema = StructType(
      List(
        StructField("i", IntegerType),
        StructField("c1", IntegerType),
        StructField("c2", IntegerType)))
    val caught = intercept[TiBatchWriteException] {
      tidbWrite(List(row1), schema)
    }
    assert(
      caught.getMessage
        .equals("tispark currently does not support write data to table with generated column!"))

  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}

Source File: TiSparkTypeSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.datasource

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

class TiSparkTypeSuite extends BaseDataSourceTest("type_test") {
  private val row1 = Row(null, "Hello")
  private val row2 = Row(2L, "TiDB")
  private val row3 = Row(3L, "Spark")
  private val row5 = Row(Long.MaxValue, "Duplicate")

  private val schema = StructType(List(StructField("i", LongType), StructField("s", StringType)))
  test("bigint test") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i bigint, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(null, 'Hello'), (2, 'TiDB')")

    tidbWrite(List(row3, row5), schema)
    testTiDBSelect(List(row1, row2, row3, row5))
  }
}

Source File: RegionSplitSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.datasource

import com.pingcap.tikv.TiBatchWriteUtils
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class RegionSplitSuite extends BaseDataSourceTest("region_split_test") {
  private val row1 = Row(1)
  private val row2 = Row(2)
  private val row3 = Row(3)
  private val schema = StructType(List(StructField("a", IntegerType)))

  test("index region split test") {
    if (!supportBatchWrite) {
      cancel
    }

    // do not test this case on tidb which does not support split region
    if (!isEnableSplitRegion) {
      cancel
    }

    dropTable()
    jdbcUpdate(
      s"CREATE TABLE  $dbtable ( `a` int(11), unique index(a)) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin")

    val options = Some(Map("enableRegionSplit" -> "true", "regionSplitNum" -> "3"))

    tidbWrite(List(row1, row2, row3), schema, options)

    val tiTableInfo =
      ti.tiSession.getCatalog.getTable(dbPrefix + database, table)
    val regionsNum = TiBatchWriteUtils
      .getRegionByIndex(ti.tiSession, tiTableInfo, tiTableInfo.getIndices.get(0))
      .size()
    assert(regionsNum == 3)
  }

  test("table region split test") {
    if (!supportBatchWrite) {
      cancel
    }

    // do not test this case on tidb which does not support split region
    if (!isEnableSplitRegion) {
      cancel
    }

    dropTable()
    jdbcUpdate(
      s"CREATE TABLE  $dbtable ( `a` int(11) DEFAULT NULL) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin")

    val options = Some(Map("enableRegionSplit" -> "true", "regionSplitNum" -> "3"))

    tidbWrite(List(row1, row2, row3), schema, options)

    val tiTableInfo =
      ti.tiSession.getCatalog.getTable(dbPrefix + database, table)
    val regionsNum = TiBatchWriteUtils.getRecordRegions(ti.tiSession, tiTableInfo).size()
    assert(regionsNum == 3)
  }
}

Source File: MissingParameterSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class MissingParameterSuite extends BaseDataSourceTest("test_datasource_missing_parameter") {
  private val row1 = Row(null, "Hello")

  private val schema = StructType(
    List(StructField("i", IntegerType), StructField("s", StringType)))

  test("Missing parameter: database") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")

    val caught = intercept[IllegalArgumentException] {
      val rows = row1 :: Nil
      val data: RDD[Row] = sc.makeRDD(rows)
      val df = sqlContext.createDataFrame(data, schema)
      df.write
        .format("tidb")
        .options(tidbOptions)
        .option("table", table)
        .mode("append")
        .save()
    }
    assert(
      caught.getMessage
        .equals("requirement failed: Option 'database' is required."))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}

Source File: ShardRowIDBitsSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.datasource

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class ShardRowIDBitsSuite extends BaseDataSourceTest("test_shard_row_id_bits") {
  private val row1 = Row(1)
  private val row2 = Row(2)
  private val row3 = Row(3)
  private val schema = StructType(List(StructField("a", IntegerType)))

  test("reading and writing a table with shard_row_id_bits") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"CREATE TABLE  $dbtable ( `a` int(11))  SHARD_ROW_ID_BITS = 4")

    jdbcUpdate(s"insert into $dbtable values(null)")

    tidbWrite(List(row1, row2, row3), schema)
    testTiDBSelect(List(Row(null), row1, row2, row3), sortCol = "a")
  }
}

Source File: OnlyOnePkSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class OnlyOnePkSuite extends BaseDataSourceTest("test_datasource_only_one_pk") {
  private val row3 = Row(3)
  private val row4 = Row(4)

  private val schema = StructType(List(StructField("i", IntegerType)))

  override def beforeAll(): Unit = {
    super.beforeAll()

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int primary key)")
  }

  test("Test Write Append") {
    if (!supportBatchWrite) {
      cancel
    }

    val data: RDD[Row] = sc.makeRDD(List(row3, row4))
    val df = sqlContext.createDataFrame(data, schema)

    df.write
      .format("tidb")
      .options(tidbOptions)
      .option("database", database)
      .option("table", table)
      .mode("append")
      .save()

    testTiDBSelect(Seq(row3, row4))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}

Source File: BatchWriteIssueSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark

import com.pingcap.tispark.datasource.BaseDataSourceTest
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class BatchWriteIssueSuite extends BaseDataSourceTest("test_batchwrite_issue") {
  override def beforeAll(): Unit = {
    super.beforeAll()
  }

  test("Combine unique index with null value test") {
    doTestNullValues(s"create table $dbtable(a int, b varchar(64), CONSTRAINT ab UNIQUE (a, b))")
  }

  test("Combine primary key with null value test") {
    doTestNullValues(s"create table $dbtable(a int, b varchar(64), PRIMARY KEY (a, b))")
  }

  test("PK is handler with null value test") {
    doTestNullValues(s"create table $dbtable(a int, b varchar(64), PRIMARY KEY (a))")
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }

  private def doTestNullValues(createTableSQL: String): Unit = {
    if (!supportBatchWrite) {
      cancel
    }
    val schema = StructType(
      List(
        StructField("a", IntegerType),
        StructField("b", StringType),
        StructField("c", StringType)))

    val options = Some(Map("replace" -> "true"))

    dropTable()
    jdbcUpdate(createTableSQL)
    jdbcUpdate(s"alter table $dbtable add column to_delete int")
    jdbcUpdate(s"alter table $dbtable add column c varchar(64) default 'c33'")
    jdbcUpdate(s"alter table $dbtable drop column to_delete")
    jdbcUpdate(s"""
                  |insert into $dbtable values(11, 'c12', null);
                  |insert into $dbtable values(21, 'c22', null);
                  |insert into $dbtable (a, b) values(31, 'c32');
                  |insert into $dbtable values(41, 'c42', 'c43');
                  |
      """.stripMargin)

    assert(queryTiDBViaJDBC(s"select c from $dbtable where a=11").head.head == null)
    assert(queryTiDBViaJDBC(s"select c from $dbtable where a=21").head.head == null)
    assert(
      queryTiDBViaJDBC(s"select c from $dbtable where a=31").head.head.toString.equals("c33"))
    assert(
      queryTiDBViaJDBC(s"select c from $dbtable where a=41").head.head.toString.equals("c43"))

    {
      val row1 = Row(11, "c12", "c13")
      val row3 = Row(31, "c32", null)

      tidbWrite(List(row1, row3), schema, options)

      assert(
        queryTiDBViaJDBC(s"select c from $dbtable where a=11").head.head.toString.equals("c13"))
      assert(queryTiDBViaJDBC(s"select c from $dbtable where a=21").head.head == null)
      assert(queryTiDBViaJDBC(s"select c from $dbtable where a=31").head.head == null)
      assert(
        queryTiDBViaJDBC(s"select c from $dbtable where a=41").head.head.toString.equals("c43"))
    }

    {
      val row1 = Row(11, "c12", "c213")
      val row3 = Row(31, "c32", "tt")
      tidbWrite(List(row1, row3), schema, options)
      assert(
        queryTiDBViaJDBC(s"select c from $dbtable where a=11").head.head.toString.equals("c213"))
      assert(queryTiDBViaJDBC(s"select c from $dbtable where a=21").head.head == null)
      assert(
        queryTiDBViaJDBC(s"select c from $dbtable where a=31").head.head.toString.equals("tt"))
      assert(
        queryTiDBViaJDBC(s"select c from $dbtable where a=41").head.head.toString.equals("c43"))
    }
  }
}

Source File: LockTimeoutSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.ttl

import com.pingcap.tikv.TTLManager
import com.pingcap.tikv.exception.GrpcException
import com.pingcap.tispark.datasource.BaseDataSourceTest
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class LockTimeoutSuite extends BaseDataSourceTest("test_lock_timeout") {
  private val row1 = Row(1, "Hello")

  private val schema = StructType(
    List(StructField("i", IntegerType), StructField("s", StringType)))

  override def beforeAll(): Unit = {
    super.beforeAll()
    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
  }

  test("Test Lock TTL Timeout") {
    if (!supportTTLUpdate) {
      cancel
    }

    val seconds = 1000
    val sleep1 = TTLManager.MANAGED_LOCK_TTL + 10 * seconds
    val sleep2 = TTLManager.MANAGED_LOCK_TTL + 15 * seconds

    val data: RDD[Row] = sc.makeRDD(List(row1))
    val df = sqlContext.createDataFrame(data, schema)

    new Thread(new Runnable {
      override def run(): Unit = {
        Thread.sleep(sleep1)
        queryTiDBViaJDBC(s"select * from $dbtable")
      }
    }).start()

    val grpcException = intercept[GrpcException] {
      df.write
        .format("tidb")
        .options(tidbOptions)
        .option("database", database)
        .option("table", table)
        .option("sleepAfterPrewritePrimaryKey", sleep2)
        .mode("append")
        .save()
    }

    assert(grpcException.getMessage.equals("retry is exhausted."))
    assert(grpcException.getCause.getMessage.startsWith("Txn commit primary key failed"))
    assert(
      grpcException.getCause.getCause.getMessage.startsWith(
        "Key exception occurred and the reason is retryable: \"Txn(Mvcc(TxnLockNotFound"))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}

Source File: RedisStreamProvider.scala From spark-redis with BSD 3-Clause "New" or "Revised" License

5 votes

package org.apache.spark.sql.redis.stream

import com.redislabs.provider.redis.util.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
import org.apache.spark.sql.types.{StringType, StructField, StructType}


class RedisStreamProvider extends DataSourceRegister with StreamSourceProvider with Logging {

  override def shortName(): String = "redis"

  override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType],
                            providerName: String, parameters: Map[String, String]): (String, StructType) = {
    providerName -> schema.getOrElse {
      StructType(Seq(StructField("_id", StringType)))
    }
  }

  override def createSource(sqlContext: SQLContext, metadataPath: String,
                            schema: Option[StructType], providerName: String,
                            parameters: Map[String, String]): Source = {
    val (_, ss) = sourceSchema(sqlContext, schema, providerName, parameters)
    val source = new RedisSource(sqlContext, metadataPath, Some(ss), parameters)
    source.start()
    source
  }
}

Source File: HousePriceDataBusinessLogic.scala From bdd-spark with MIT License

5 votes

import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.json4s._
import org.json4s.jackson.JsonMethods._

object HousePriceDataBusinessLogic {
  import Spark._

  def processHousePrices(housePrices : DataFrame, postcodes : DataFrame) : DataFrame = {
    housePrices.join(postcodes, "Postcode")
  }

  def processHousePricesAndSaveToParquet(housePrices : DataFrame, postcodes : DataFrame, parquetWriter: ParquetWriter) : Unit = {
    parquetWriter.write(housePrices.join(postcodes, "Postcode"), "results.parquet")
  }

  def processDataFromFilesFilterItThenSaveItToParquet(reader: FileReader,
                                                      geoFilename : String,
                                                      priceFilename: String,
                                                      postcodeFileName: String,
                                                      writer: ParquetWriter) : Unit = {
    val joined = loadAndJoin(reader, priceFilename, postcodeFileName)

    // If this was real code, a geoJSON library would be sensible here. Dirty code follows:
    val json = parse(reader.readText(geoFilename))\\ "coordinates"
    val coords = json match {
      case JArray(outer) => outer.map{ case JArray(inner) => inner }
    }
    val points =
      coords
        .map(c => (c(0), c(1)))
        .map{ case (JDouble(long), JDouble(lat)) => (long, lat) }

    val minLat = Math.min(points(0)._2, points(1)._2)
    val maxLat = Math.max(points(0)._2, points(1)._2)
    val minLong = Math.min(points(0)._1, points(1)._1)
    val maxLong = Math.max(points(0)._1, points(1)._1)

    val filtered = joined
      .filter(s"Latitude >= $minLat and Latitude <= $maxLat")
      .filter(s"Longitude >= $minLong and Longitude <= $maxLong")

    writer.write(filtered, "results.parquet")
  }

  def processDataFromFilesAndSaveToParquet(reader: FileReader, priceFilename: String, postcodeFileName: String, writer: ParquetWriter) : Unit = {
    val joined = loadAndJoin(reader, priceFilename, postcodeFileName)

    writer.write(joined, "results.parquet")
  }

  private def loadAndJoin(reader: FileReader, priceFilename: String, postcodeFileName: String): DataFrame = {
    val priceSchema = StructType(Seq(
      StructField("Price", DataTypes.IntegerType),
      StructField("Postcode", DataTypes.StringType),
      StructField("HouseType", DataTypes.StringType)
    ))

    val prices = reader
      .readLinesToRdd(priceFilename)
      .map(_.split(','))
      .map(row => row.map(_.trim()))
      .map(splits => Row(splits(0).toInt, splits(1), splits(2)))

    val priceDf = spark.createDataFrame(prices, priceSchema)

    val postcodeSchema = StructType(Seq(
      StructField("Postcode", DataTypes.StringType),
      StructField("Latitude", DataTypes.DoubleType),
      StructField("Longitude", DataTypes.DoubleType)
    ))

    val postcodes = reader
      .readLinesToRdd(postcodeFileName)
      .map(_.split(','))
      .map(row => row.map(_.trim()))
      .map(splits => Row(splits(0), splits(1).toDouble, splits(2).toDouble))

    val postcodeDf = spark.createDataFrame(postcodes, postcodeSchema)

    val joined = priceDf.join(postcodeDf, "Postcode")
    joined
  }
}

Source File: Preprocess.scala From Scala-Machine-Learning-Projects with MIT License

5 votes

package com.packt.ScalaML.BitCoin

import java.io.{ BufferedWriter, File, FileWriter }
import org.apache.spark.sql.types.{ DoubleType, IntegerType, StructField, StructType }
import org.apache.spark.sql.{ DataFrame, Row, SparkSession }
import scala.collection.mutable.ListBuffer

object Preprocess {
  //how many of first rows are omitted
    val dropFirstCount: Int = 612000

    def rollingWindow(data: DataFrame, window: Int, xFilename: String, yFilename: String): Unit = {
      var i = 0
      val xWriter = new BufferedWriter(new FileWriter(new File(xFilename)))
      val yWriter = new BufferedWriter(new FileWriter(new File(yFilename)))

      val zippedData = data.rdd.zipWithIndex().collect()
      System.gc()
      val dataStratified = zippedData.drop(dropFirstCount) //todo slice fisrt 614K
      while (i < (dataStratified.length - window)) {
        val x = dataStratified
          .slice(i, i + window)
          .map(r => r._1.getAs[Double]("Delta")).toList
        val y = dataStratified.apply(i + window)._1.getAs[Integer]("label")
        val stringToWrite = x.mkString(",")
        xWriter.write(stringToWrite + "\n")
        yWriter.write(y + "\n")

        i += 1
        if (i % 10 == 0) {
          xWriter.flush()
          yWriter.flush()
        }
      }

      xWriter.close()
      yWriter.close()
    }
    
  def main(args: Array[String]): Unit = {
    //todo modify these variables to match desirable files
    val priceDataFileName: String = "C:/Users/admin-karim/Desktop/bitstampUSD_1-min_data_2012-01-01_to_2017-10-20.csv/bitstampUSD_1-min_data_2012-01-01_to_2017-10-20.csv"
    val outputDataFilePath: String = "output/scala_test_x.csv"
    val outputLabelFilePath: String = "output/scala_test_y.csv"

    val spark = SparkSession
      .builder()
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Bitcoin Preprocessing")
      .getOrCreate()

    val data = spark.read.format("com.databricks.spark.csv").option("header", "true").load(priceDataFileName)
    data.show(10)
    println((data.count(), data.columns.size))

    val dataWithDelta = data.withColumn("Delta", data("Close") - data("Open"))

    import org.apache.spark.sql.functions._
    import spark.sqlContext.implicits._

    val dataWithLabels = dataWithDelta.withColumn("label", when($"Close" - $"Open" > 0, 1).otherwise(0))
    rollingWindow(dataWithLabels, 22, outputDataFilePath, outputLabelFilePath)    
    spark.stop()
  }
}

Source File: XSDToSchemaSuite.scala From spark-xml with Apache License 2.0

5 votes

package com.databricks.spark.xml.util

import java.nio.file.Paths

import org.apache.spark.sql.types.{ArrayType, StructField, StructType, StringType}
import org.scalatest.funsuite.AnyFunSuite

class XSDToSchemaSuite extends AnyFunSuite {

  test("Basic parsing") {
    val parsedSchema = XSDToSchema.read(Paths.get("src/test/resources/basket.xsd"))
    val expectedSchema = StructType(Array(
      StructField("basket", StructType(Array(
        StructField("entry", ArrayType(
          StructType(Array(
            StructField("key", StringType),
            StructField("value", StringType)
          )))
        ))
      )))
    )
    assert(expectedSchema === parsedSchema)
  }

}

Source File: OptimizeHiveMetadataOnlyQuerySuite.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfter

import org.apache.spark.metrics.source.HiveCatalogMetrics
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.expressions.NamedExpression
import org.apache.spark.sql.catalyst.plans.logical.{Distinct, Filter, Project, SubqueryAlias}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.internal.SQLConf.OPTIMIZER_METADATA_ONLY
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class OptimizeHiveMetadataOnlyQuerySuite extends QueryTest with TestHiveSingleton
    with BeforeAndAfter with SQLTestUtils {

  import spark.implicits._

  override def beforeAll(): Unit = {
    super.beforeAll()
    sql("CREATE TABLE metadata_only (id bigint, data string) PARTITIONED BY (part int)")
    (0 to 10).foreach(p => sql(s"ALTER TABLE metadata_only ADD PARTITION (part=$p)"))
  }

  override protected def afterAll(): Unit = {
    try {
      sql("DROP TABLE IF EXISTS metadata_only")
    } finally {
      super.afterAll()
    }
  }

  test("SPARK-23877: validate metadata-only query pushes filters to metastore") {
    withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") {
      val startCount = HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount

      // verify the number of matching partitions
      assert(sql("SELECT DISTINCT part FROM metadata_only WHERE part < 5").collect().length === 5)

      // verify that the partition predicate was pushed down to the metastore
      assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - startCount === 5)
    }
  }

  test("SPARK-23877: filter on projected expression") {
    withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") {
      val startCount = HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount

      // verify the matching partitions
      val partitions = spark.internalCreateDataFrame(Distinct(Filter(($"x" < 5).expr,
        Project(Seq(($"part" + 1).as("x").expr.asInstanceOf[NamedExpression]),
          spark.table("metadata_only").logicalPlan.asInstanceOf[SubqueryAlias].child)))
          .queryExecution.toRdd, StructType(Seq(StructField("x", IntegerType))))

      checkAnswer(partitions, Seq(1, 2, 3, 4).toDF("x"))

      // verify that the partition predicate was not pushed down to the metastore
      assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - startCount == 11)
    }
  }
}

Source File: SparkExecuteStatementOperationSuite.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.hive.thriftserver

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, NullType, StringType, StructField, StructType}

class SparkExecuteStatementOperationSuite extends SparkFunSuite {
  test("SPARK-17112 `select null` via JDBC triggers IllegalArgumentException in ThriftServer") {
    val field1 = StructField("NULL", NullType)
    val field2 = StructField("(IF(true, NULL, NULL))", NullType)
    val tableSchema = StructType(Seq(field1, field2))
    val columns = SparkExecuteStatementOperation.getTableSchema(tableSchema).getColumnDescriptors()
    assert(columns.size() == 2)
    assert(columns.get(0).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
    assert(columns.get(1).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
  }

  test("SPARK-20146 Comment should be preserved") {
    val field1 = StructField("column1", StringType).withComment("comment 1")
    val field2 = StructField("column2", IntegerType)
    val tableSchema = StructType(Seq(field1, field2))
    val columns = SparkExecuteStatementOperation.getTableSchema(tableSchema).getColumnDescriptors()
    assert(columns.size() == 2)
    assert(columns.get(0).getType() == org.apache.hive.service.cli.Type.STRING_TYPE)
    assert(columns.get(0).getComment() == "comment 1")
    assert(columns.get(1).getType() == org.apache.hive.service.cli.Type.INT_TYPE)
    assert(columns.get(1).getComment() == "")
  }
}

Source File: LocalRelation.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils
import org.apache.spark.sql.types.{StructField, StructType}

object LocalRelation {
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  def apply(output1: StructField, output: StructField*): LocalRelation = {
    new LocalRelation(StructType(output1 +: output).toAttributes)
  }

  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }

  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }
}


  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data, isStreaming).asInstanceOf[this.type]
  }

  override protected def stringArgs: Iterator[Any] = {
    if (data.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def computeStats(): Statistics =
    Statistics(sizeInBytes = EstimationUtils.getSizePerRow(output) * data.length)

  def toSQL(inlineTableName: String): String = {
    require(data.nonEmpty)
    val types = output.map(_.dataType)
    val rows = data.map { row =>
      val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql }
      cells.mkString("(", ", ", ")")
    }
    "VALUES " + rows.mkString(", ") +
      " AS " + inlineTableName +
      output.map(_.name).mkString("(", ", ", ")")
  }
}

Source File: ResolveInlineTables.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis

import scala.util.control.NonFatal

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{StructField, StructType}


  private[analysis] def convert(table: UnresolvedInlineTable): LocalRelation = {
    // For each column, traverse all the values and find a common data type and nullability.
    val fields = table.rows.transpose.zip(table.names).map { case (column, name) =>
      val inputTypes = column.map(_.dataType)
      val tpe = TypeCoercion.findWiderTypeWithoutStringPromotion(inputTypes).getOrElse {
        table.failAnalysis(s"incompatible types found in column $name for inline table")
      }
      StructField(name, tpe, nullable = column.exists(_.nullable))
    }
    val attributes = StructType(fields).toAttributes
    assert(fields.size == table.names.size)

    val newRows: Seq[InternalRow] = table.rows.map { row =>
      InternalRow.fromSeq(row.zipWithIndex.map { case (e, ci) =>
        val targetType = fields(ci).dataType
        try {
          val castedExpr = if (e.dataType.sameType(targetType)) {
            e
          } else {
            cast(e, targetType)
          }
          castedExpr.eval()
        } catch {
          case NonFatal(ex) =>
            table.failAnalysis(s"failed to evaluate expression ${e.sql}: ${ex.getMessage}", ex)
        }
      })
    }

    LocalRelation(attributes, newRows)
  }
}

Source File: CanonicalizeSuite.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.Range
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class CanonicalizeSuite extends SparkFunSuite {

  test("SPARK-24276: IN expression with different order are semantically equal") {
    val range = Range(1, 1, 1, 1)
    val idAttr = range.output.head

    val in1 = In(idAttr, Seq(Literal(1), Literal(2)))
    val in2 = In(idAttr, Seq(Literal(2), Literal(1)))
    val in3 = In(idAttr, Seq(Literal(1), Literal(2), Literal(3)))

    assert(in1.canonicalized.semanticHash() == in2.canonicalized.semanticHash())
    assert(in1.canonicalized.semanticHash() != in3.canonicalized.semanticHash())

    assert(range.where(in1).sameResult(range.where(in2)))
    assert(!range.where(in1).sameResult(range.where(in3)))

    val arrays1 = In(idAttr, Seq(CreateArray(Seq(Literal(1), Literal(2))),
      CreateArray(Seq(Literal(2), Literal(1)))))
    val arrays2 = In(idAttr, Seq(CreateArray(Seq(Literal(2), Literal(1))),
      CreateArray(Seq(Literal(1), Literal(2)))))
    val arrays3 = In(idAttr, Seq(CreateArray(Seq(Literal(1), Literal(2))),
      CreateArray(Seq(Literal(3), Literal(1)))))

    assert(arrays1.canonicalized.semanticHash() == arrays2.canonicalized.semanticHash())
    assert(arrays1.canonicalized.semanticHash() != arrays3.canonicalized.semanticHash())

    assert(range.where(arrays1).sameResult(range.where(arrays2)))
    assert(!range.where(arrays1).sameResult(range.where(arrays3)))
  }

  test("SPARK-26402: accessing nested fields with different cases in case insensitive mode") {
    val expId = NamedExpression.newExprId
    val qualifier = Seq.empty[String]
    val structType = StructType(
      StructField("a", StructType(StructField("b", IntegerType, false) :: Nil), false) :: Nil)

    // GetStructField with different names are semantically equal
    val fieldA1 = GetStructField(
      AttributeReference("data1", structType, false)(expId, qualifier),
      0, Some("a1"))
    val fieldA2 = GetStructField(
      AttributeReference("data2", structType, false)(expId, qualifier),
      0, Some("a2"))
    assert(fieldA1.semanticEquals(fieldA2))

    val fieldB1 = GetStructField(
      GetStructField(
        AttributeReference("data1", structType, false)(expId, qualifier),
        0, Some("a1")),
      0, Some("b1"))
    val fieldB2 = GetStructField(
      GetStructField(
        AttributeReference("data2", structType, false)(expId, qualifier),
        0, Some("a2")),
      0, Some("b2"))
    assert(fieldB1.semanticEquals(fieldB2))
  }
}

Source File: HadoopFsRelation.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.datasources

import java.util.Locale

import scala.collection.mutable

import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.execution.FileRelation
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister}
import org.apache.spark.sql.types.{StructField, StructType}



case class HadoopFsRelation(
    location: FileIndex,
    partitionSchema: StructType,
    dataSchema: StructType,
    bucketSpec: Option[BucketSpec],
    fileFormat: FileFormat,
    options: Map[String, String])(val sparkSession: SparkSession)
  extends BaseRelation with FileRelation {

  override def sqlContext: SQLContext = sparkSession.sqlContext

  private def getColName(f: StructField): String = {
    if (sparkSession.sessionState.conf.caseSensitiveAnalysis) {
      f.name
    } else {
      f.name.toLowerCase(Locale.ROOT)
    }
  }

  val overlappedPartCols = mutable.Map.empty[String, StructField]
  partitionSchema.foreach { partitionField =>
    if (dataSchema.exists(getColName(_) == getColName(partitionField))) {
      overlappedPartCols += getColName(partitionField) -> partitionField
    }
  }

  // When data and partition schemas have overlapping columns, the output
  // schema respects the order of the data schema for the overlapping columns, and it
  // respects the data types of the partition schema.
  val schema: StructType = {
    StructType(dataSchema.map(f => overlappedPartCols.getOrElse(getColName(f), f)) ++
      partitionSchema.filterNot(f => overlappedPartCols.contains(getColName(f))))
  }

  def partitionSchemaOption: Option[StructType] =
    if (partitionSchema.isEmpty) None else Some(partitionSchema)

  override def toString: String = {
    fileFormat match {
      case source: DataSourceRegister => source.shortName()
      case _ => "HadoopFiles"
    }
  }

  override def sizeInBytes: Long = {
    val compressionFactor = sqlContext.conf.fileCompressionFactor
    (location.sizeInBytes * compressionFactor).toLong
  }


  override def inputFiles: Array[String] = location.inputFiles
}

Source File: EvalPythonExec.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.python

import java.io.File

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.{SparkEnv, TaskContext}
import org.apache.spark.api.python.ChainedPythonFunctions
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{DataType, StructField, StructType}
import org.apache.spark.util.Utils



abstract class EvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan)
  extends SparkPlan {

  def children: Seq[SparkPlan] = child :: Nil

  override def producedAttributes: AttributeSet = AttributeSet(output.drop(child.output.length))

  private def collectFunctions(udf: PythonUDF): (ChainedPythonFunctions, Seq[Expression]) = {
    udf.children match {
      case Seq(u: PythonUDF) =>
        val (chained, children) = collectFunctions(u)
        (ChainedPythonFunctions(chained.funcs ++ Seq(udf.func)), children)
      case children =>
        // There should not be any other UDFs, or the children can't be evaluated directly.
        assert(children.forall(_.find(_.isInstanceOf[PythonUDF]).isEmpty))
        (ChainedPythonFunctions(Seq(udf.func)), udf.children)
    }
  }

  protected def evaluate(
      funcs: Seq[ChainedPythonFunctions],
      argOffsets: Array[Array[Int]],
      iter: Iterator[InternalRow],
      schema: StructType,
      context: TaskContext): Iterator[InternalRow]

  protected override def doExecute(): RDD[InternalRow] = {
    val inputRDD = child.execute().map(_.copy())

    inputRDD.mapPartitions { iter =>
      val context = TaskContext.get()

      // The queue used to buffer input rows so we can drain it to
      // combine input with output from Python.
      val queue = HybridRowQueue(context.taskMemoryManager(),
        new File(Utils.getLocalDir(SparkEnv.get.conf)), child.output.length)
      context.addTaskCompletionListener[Unit] { ctx =>
        queue.close()
      }

      val (pyFuncs, inputs) = udfs.map(collectFunctions).unzip

      // flatten all the arguments
      val allInputs = new ArrayBuffer[Expression]
      val dataTypes = new ArrayBuffer[DataType]
      val argOffsets = inputs.map { input =>
        input.map { e =>
          if (allInputs.exists(_.semanticEquals(e))) {
            allInputs.indexWhere(_.semanticEquals(e))
          } else {
            allInputs += e
            dataTypes += e.dataType
            allInputs.length - 1
          }
        }.toArray
      }.toArray
      val projection = newMutableProjection(allInputs, child.output)
      val schema = StructType(dataTypes.zipWithIndex.map { case (dt, i) =>
        StructField(s"_$i", dt)
      })

      // Add rows to queue to join later with the result.
      val projectedRowIter = iter.map { inputRow =>
        queue.add(inputRow.asInstanceOf[UnsafeRow])
        projection(inputRow)
      }

      val outputRowIterator = evaluate(
        pyFuncs, argOffsets, projectedRowIter, schema, context)

      val joined = new JoinedRow
      val resultProj = UnsafeProjection.create(output, output)

      outputRowIterator.map { outputRow =>
        resultProj(joined(queue.remove(), outputRow))
      }
    }
  }
}

Source File: BatchEvalPythonExec.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.python

import scala.collection.JavaConverters._

import net.razorvine.pickle.{Pickler, Unpickler}

import org.apache.spark.TaskContext
import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{StructField, StructType}


case class BatchEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan)
  extends EvalPythonExec(udfs, output, child) {

  protected override def evaluate(
      funcs: Seq[ChainedPythonFunctions],
      argOffsets: Array[Array[Int]],
      iter: Iterator[InternalRow],
      schema: StructType,
      context: TaskContext): Iterator[InternalRow] = {
    EvaluatePython.registerPicklers()  // register pickler for Row

    val dataTypes = schema.map(_.dataType)
    val needConversion = dataTypes.exists(EvaluatePython.needConversionInPython)

    // enable memo iff we serialize the row with schema (schema and class should be memorized)
    val pickle = new Pickler(needConversion)
    // Input iterator to Python: input rows are grouped so we send them in batches to Python.
    // For each row, add it to the queue.
    val inputIterator = iter.map { row =>
      if (needConversion) {
        EvaluatePython.toJava(row, schema)
      } else {
        // fast path for these types that does not need conversion in Python
        val fields = new Array[Any](row.numFields)
        var i = 0
        while (i < row.numFields) {
          val dt = dataTypes(i)
          fields(i) = EvaluatePython.toJava(row.get(i, dt), dt)
          i += 1
        }
        fields
      }
    }.grouped(100).map(x => pickle.dumps(x.toArray))

    // Output iterator for results from Python.
    val outputIterator = new PythonUDFRunner(funcs, PythonEvalType.SQL_BATCHED_UDF, argOffsets)
      .compute(inputIterator, context.partitionId(), context)

    val unpickle = new Unpickler
    val mutableRow = new GenericInternalRow(1)
    val resultType = if (udfs.length == 1) {
      udfs.head.dataType
    } else {
      StructType(udfs.map(u => StructField("", u.dataType, u.nullable)))
    }

    val fromJava = EvaluatePython.makeFromJava(resultType)

    outputIterator.flatMap { pickedResult =>
      val unpickledBatch = unpickle.loads(pickedResult)
      unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala
    }.map { result =>
      if (udfs.length == 1) {
        // fast path for single UDF
        mutableRow(0) = fromJava(result)
        mutableRow
      } else {
        fromJava(result).asInstanceOf[InternalRow]
      }
    }
  }
}

Source File: resources.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}


case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("Results", StringType, nullable = false)() :: Nil
  }
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val jarList = sparkSession.sparkContext.listJars()
    if (jars.nonEmpty) {
      for {
        jarName <- jars.map(f => new Path(f).getName)
        jarPath <- jarList if jarPath.contains(jarName)
      } yield Row(jarPath)
    } else {
      jarList.map(Row(_))
    }
  }
}

Source File: StreamingGlobalLimitExec.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.streaming

import java.util.concurrent.TimeUnit.NANOSECONDS

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.execution.streaming.state.StateStoreOps
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType}
import org.apache.spark.util.CompletionIterator


case class StreamingGlobalLimitExec(
    streamLimit: Long,
    child: SparkPlan,
    stateInfo: Option[StatefulOperatorStateInfo] = None,
    outputMode: Option[OutputMode] = None)
  extends UnaryExecNode with StateStoreWriter {

  private val keySchema = StructType(Array(StructField("key", NullType)))
  private val valueSchema = StructType(Array(StructField("value", LongType)))

  override protected def doExecute(): RDD[InternalRow] = {
    metrics // force lazy init at driver

    assert(outputMode.isDefined && outputMode.get == InternalOutputModes.Append,
      "StreamingGlobalLimitExec is only valid for streams in Append output mode")

    child.execute().mapPartitionsWithStateStore(
        getStateInfo,
        keySchema,
        valueSchema,
        indexOrdinal = None,
        sqlContext.sessionState,
        Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) =>
      val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null)))
      val numOutputRows = longMetric("numOutputRows")
      val numUpdatedStateRows = longMetric("numUpdatedStateRows")
      val allUpdatesTimeMs = longMetric("allUpdatesTimeMs")
      val commitTimeMs = longMetric("commitTimeMs")
      val updatesStartTimeNs = System.nanoTime

      val preBatchRowCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L)
      var cumulativeRowCount = preBatchRowCount

      val result = iter.filter { r =>
        val x = cumulativeRowCount < streamLimit
        if (x) {
          cumulativeRowCount += 1
        }
        x
      }

      CompletionIterator[InternalRow, Iterator[InternalRow]](result, {
        if (cumulativeRowCount > preBatchRowCount) {
          numUpdatedStateRows += 1
          numOutputRows += cumulativeRowCount - preBatchRowCount
          store.put(key, getValueRow(cumulativeRowCount))
        }
        allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs)
        commitTimeMs += timeTakenMs { store.commit() }
        setStoreMetrics(store)
      })
    }
  }

  override def output: Seq[Attribute] = child.output

  override def outputPartitioning: Partitioning = child.outputPartitioning

  override def requiredChildDistribution: Seq[Distribution] = AllTuples :: Nil

  private def getValueRow(value: Long): UnsafeRow = {
    UnsafeProjection.create(valueSchema)(new GenericInternalRow(Array[Any](value)))
  }
}

Source File: BlockingSource.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}


class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
}

Source File: MockSourceProvider.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.streaming.util

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.StreamSourceProvider
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}


class MockSourceProvider extends StreamSourceProvider {
  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", MockSourceProvider.fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    MockSourceProvider.sourceProviderFunction()
  }
}

object MockSourceProvider {
  // Function to generate sources. May provide multiple sources if the user implements such a
  // function.
  private var sourceProviderFunction: () => Source = _

  final val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  def withMockSources(source: Source, otherSources: Source*)(f: => Unit): Unit = {
    var i = 0
    val sources = source +: otherSources
    sourceProviderFunction = () => {
      val source = sources(i % sources.length)
      i += 1
      source
    }
    try {
      f
    } finally {
      sourceProviderFunction = null
    }
  }
}

Source File: TableColumnsParser.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.parser

import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
import org.apache.spark.sql.catalyst.util.DataTypeParser
import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField}

trait TableColumnsParser
  extends AbstractSparkSQLParser
  with DataTypeParser
  with AnnotationParser {

  protected def commentIndicator: Keyword

  protected lazy val columnName = acceptMatch("column name", {
    case lexical.Identifier(chars) => chars
    case lexical.Keyword(chars) if !sqlReservedWords.contains(chars.toUpperCase) => chars
  })

  
  protected lazy val tableColumns: Parser[Seq[StructField]] =
    "(" ~> repsep(annotatedCol, ",") <~ ")"

  protected lazy val annotatedCol: Parser[StructField] =
    columnName ~ metadata ~ dataType ^^ {
      case name ~ md ~ typ =>
        StructField(name, typ, nullable = true, metadata = toTableMetadata(md))
    } |
    columnName ~ dataType ~ (commentIndicator ~> stringLit).?  ^^ { case name ~ typ ~ cm =>
      val meta = cm match {
        case Some(comment) =>
          new MetadataBuilder().putString(commentIndicator.str.toLowerCase, comment).build()
        case None => Metadata.empty
      }

      StructField(name, typ, nullable = true, meta)
    }
}

Source File: ShowTablesUsingCommand.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.sources.DatasourceCatalog
import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext}
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.types.{StringType, StructField, StructType}


private[sql]
case class ShowTablesUsingCommand(provider: String, options: Map[String, String])
  extends LogicalPlan
  with RunnableCommand {

  override def output: Seq[Attribute] = StructType(
    StructField("TABLE_NAME", StringType, nullable = false) ::
    StructField("IS_TEMPORARY", StringType, nullable = false) ::
    StructField("KIND", StringType, nullable = false) ::
    Nil
  ).toAttributes

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val dataSource: Any = DatasourceResolver.resolverFor(sqlContext).newInstanceOf(provider)

    dataSource match {
      case describableRelation: DatasourceCatalog =>
        describableRelation
          .getRelations(sqlContext, new CaseInsensitiveMap(options))
          .map(relationInfo => Row(
            relationInfo.name,
            relationInfo.isTemporary.toString.toUpperCase,
            relationInfo.kind.toUpperCase))
      case _ =>
        throw new RuntimeException(s"The provided data source $provider does not support " +
        "showing its relations.")
    }
  }
}

Source File: DescribeTableUsingCommand.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.TableIdentifierUtils._
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.sources.{DatasourceCatalog, RelationInfo}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext}


private[sql]
case class DescribeTableUsingCommand(
    name: TableIdentifier,
    provider: String,
    options: Map[String, String])
  extends LogicalPlan
  with RunnableCommand {

  override def output: Seq[Attribute] = StructType(
    StructField("TABLE_NAME", StringType, nullable = false) ::
    StructField("DDL_STMT", StringType, nullable = false) ::
    Nil
  ).toAttributes

  override def run(sqlContext: SQLContext): Seq[Row] = {
    // Convert the table name according to the case-sensitivity settings
    val tableId = name.toSeq
    val resolver = DatasourceResolver.resolverFor(sqlContext)
    val catalog = resolver.newInstanceOfTyped[DatasourceCatalog](provider)

    Seq(catalog
      .getRelation(sqlContext, tableId, new CaseInsensitiveMap(options)) match {
        case None => Row("", "")
        case Some(RelationInfo(relName, _, _, ddl, _)) => Row(
          relName, ddl.getOrElse(""))
    })
  }
}

Source File: dependenciesSystemTable.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis.systables
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.TableDependencyCalculator
import org.apache.spark.sql.sources.{RelationKind, Table}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}

object DependenciesSystemTableProvider extends SystemTableProvider with LocalSpark {
  
  override def execute(): Seq[Row] = {
    val tables = getTables(sqlContext.catalog)
    val dependentsMap = buildDependentsMap(tables)

    def kindOf(tableIdentifier: TableIdentifier): String =
      tables
        .get(tableIdentifier)
        .map(plan => RelationKind.kindOf(plan).getOrElse(Table).name)
        .getOrElse(DependenciesSystemTable.UnknownType)
        .toUpperCase

    dependentsMap.flatMap {
      case (tableIdent, dependents) =>
        val curKind = kindOf(tableIdent)
        dependents.map { dependent =>
          val dependentKind = kindOf(dependent)
          Row(
            tableIdent.database.orNull,
            tableIdent.table,
            curKind,
            dependent.database.orNull,
            dependent.table,
            dependentKind,
            ReferenceDependency.id)
        }
    }.toSeq
  }

  override val schema: StructType = DependenciesSystemTable.schema
}

object DependenciesSystemTable extends SchemaEnumeration {
  val baseSchemaName = Field("BASE_SCHEMA_NAME", StringType, nullable = true)
  val baseObjectName = Field("BASE_OBJECT_NAME", StringType, nullable = false)
  val baseObjectType = Field("BASE_OBJECT_TYPE", StringType, nullable = false)
  val dependentSchemaName = Field("DEPENDENT_SCHEMA_NAME", StringType, nullable = true)
  val dependentObjectName = Field("DEPENDENT_OBJECT_NAME", StringType, nullable = false)
  val dependentObjectType = Field("DEPENDENT_OBJECT_TYPE", StringType, nullable = false)
  val dependencyType = Field("DEPENDENCY_TYPE", IntegerType, nullable = false)

  private[DependenciesSystemTable] val UnknownType = "UNKNOWN"
}

Source File: CaseSensitivityUtils.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.analysis.{Analyzer, Catalog}
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.util.CollectionUtils._

import scala.util.{Failure, Success, Try}


  case class DuplicateFieldsException(
      originalSchema: StructType,
      schema: StructType,
      duplicateFields: Set[String])
    extends RuntimeException(
      s"""Given schema contains duplicate fields after applying case sensitivity rules:
         |${duplicateFields.mkString(", ")}
         |Given schema:
         |$originalSchema
         |After applying case sensitivity rules:
         |$schema
       """.stripMargin)
}

Source File: CollapseExpandSuite.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.catalyst.analysis.CollapseExpandSuite.SqlLikeCatalystSourceRelation
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.sources.sql.SqlLikeRelation
import org.apache.spark.sql.sources.{BaseRelation, CatalystSource, Table}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.util.PlanComparisonUtils._
import org.apache.spark.sql.{GlobalSapSQLContext, Row}
import org.mockito.Matchers._
import org.mockito.Mockito._
import org.scalatest.FunSuite
import org.scalatest.mock.MockitoSugar


class CollapseExpandSuite extends FunSuite with MockitoSugar with GlobalSapSQLContext {
  case object Leaf extends LeafNode {
    override def output: Seq[Attribute] = Seq.empty
  }

  test("Expansion with a single sequence of projections is correctly collapsed") {
    val expand =
      Expand(
        Seq(Seq('a.string, Literal(1))),
        Seq('a.string, 'gid.int),
        Leaf)

    val collapsed = CollapseExpand(expand)
    assertResult(normalizeExprIds(Project(Seq('a.string, Literal(1) as "gid"), Leaf)))(
      normalizeExprIds(collapsed))
  }

  test("Expansion with multiple projections is correctly collapsed") {
    val expand =
      Expand(
        Seq(
          Seq('a.string, Literal(1)),
          Seq('b.string, Literal(1))),
        Seq('a.string, 'gid1.int, 'b.string, 'gid2.int),
        Leaf)

    val collapsed = CollapseExpand(expand)
    assertResult(
      normalizeExprIds(
        Project(Seq(
            'a.string,
            Literal(1) as "gid1",
            'b.string,
            Literal(1) as "gid2"),
          Leaf)))(normalizeExprIds(collapsed))
  }

  test("Expand pushdown integration") {
    val relation = mock[SqlLikeCatalystSourceRelation]
    when(relation.supportsLogicalPlan(any[Expand]))
      .thenReturn(true)
    when(relation.isMultiplePartitionExecution(any[Seq[CatalystSource]]))
      .thenReturn(true)
    when(relation.schema)
      .thenReturn(StructType(StructField("foo", StringType) :: Nil))
    when(relation.relationName)
      .thenReturn("t")
    when(relation.logicalPlanToRDD(any[LogicalPlan]))
      .thenReturn(sc.parallelize(Seq(Row("a", 1), Row("b", 1), Row("a", 1))))

    sqlc.baseRelationToDataFrame(relation).registerTempTable("t")

    val dataFrame = sqlc.sql("SELECT COUNT(DISTINCT foo) FROM t")
    val Seq(Row(ct)) = dataFrame.collect().toSeq

    assertResult(2)(ct)
  }
}

object CollapseExpandSuite {
  abstract class SqlLikeCatalystSourceRelation
    extends BaseRelation
    with Table
    with SqlLikeRelation
    with CatalystSource
}

Source File: ResolveCountDistinctStarSuite.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.Aggregate
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalatest.FunSuite
import org.scalatest.Inside._
import org.scalatest.mock.MockitoSugar
import org.apache.spark.sql.catalyst.dsl.plans.DslLogicalPlan
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, Count}

import scala.collection.mutable.ArrayBuffer

class ResolveCountDistinctStarSuite extends FunSuite with MockitoSugar {
  val persons = new LogicalRelation(new BaseRelation {
    override def sqlContext: SQLContext = mock[SQLContext]
    override def schema: StructType = StructType(Seq(
      StructField("age", IntegerType),
      StructField("name", StringType)
    ))
  })

  test("Count distinct star is resolved correctly") {
    val projection = persons.select(UnresolvedAlias(
      AggregateExpression(Count(UnresolvedStar(None) :: Nil), Complete, true)))
    val stillNotCompletelyResolvedAggregate = SimpleAnalyzer.execute(projection)
    val resolvedAggregate = ResolveCountDistinctStar(SimpleAnalyzer)
                              .apply(stillNotCompletelyResolvedAggregate)
    inside(resolvedAggregate) {
      case Aggregate(Nil,
      ArrayBuffer(Alias(AggregateExpression(Count(expressions), Complete, true), _)), _) =>
        assert(expressions.collect {
          case a:AttributeReference => a.name
        }.toSet == Set("name", "age"))
    }
    assert(resolvedAggregate.resolved)
  }
}

Source File: HiveSuite.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark

import com.sap.spark.{GlobalSparkContext, WithSapHiveContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.SapHiveContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.scalatest.FunSuite


class HiveSuite
  extends FunSuite
  with GlobalSparkContext
  with WithSapHiveContext {

  val schema = StructType(
    StructField("foo", StringType) ::
    StructField("bar", StringType) :: Nil)

  test("NewSession returns a new SapHiveContext") {
    val hiveContext = sqlc.asInstanceOf[SapHiveContext]
    val newHiveContext = hiveContext.newSession()

    assert(newHiveContext.isInstanceOf[SapHiveContext])
    assert(newHiveContext != hiveContext)
  }

  test("NewSession returns a hive context whose catalog is separated to the current one") {
    val newContext = sqlc.newSession()
    val emptyRdd = newContext.createDataFrame(sc.emptyRDD[Row], schema)
    emptyRdd.registerTempTable("foo")

    assert(!sqlc.tableNames().contains("foo"))
    assert(newContext.tableNames().contains("foo"))
  }
}

Source File: DummyRelationUtils.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.util

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.{ColumnName, Row, SQLContext}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.sources.sql.SqlLikeRelation
import org.apache.spark.sql.types.{StructField, StructType}


  case class DummyCatalystSourceRelation(
      schema: StructType,
      isMultiplePartitionExecutionFunc: Option[Seq[CatalystSource] => Boolean] = None,
      supportsLogicalPlanFunc: Option[LogicalPlan => Boolean] = None,
      supportsExpressionFunc: Option[Expression => Boolean] = None,
      logicalPlanToRDDFunc: Option[LogicalPlan => RDD[Row]] = None)
     (@transient implicit val sqlContext: SQLContext)
    extends BaseRelation
    with CatalystSource {

    override def isMultiplePartitionExecution(relations: Seq[CatalystSource]): Boolean =
      isMultiplePartitionExecutionFunc.forall(_.apply(relations))

    override def supportsLogicalPlan(plan: LogicalPlan): Boolean =
      supportsLogicalPlanFunc.forall(_.apply(plan))

    override def supportsExpression(expr: Expression): Boolean =
      supportsExpressionFunc.forall(_.apply(expr))

    override def logicalPlanToRDD(plan: LogicalPlan): RDD[Row] =
      logicalPlanToRDDFunc.getOrElse(
        (plan: LogicalPlan) => new LogicalPlanRDD(plan, sqlContext.sparkContext)).apply(plan)
  }
}

Source File: ExcelRelation.scala From spark-hadoopoffice-ds with Apache License 2.0

5 votes

package org.zuinnote.spark.office.excel

import scala.collection.JavaConversions._

import org.apache.spark.sql.sources.{ BaseRelation, TableScan }
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.ArrayType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.SQLContext

import org.apache.spark.sql._
import org.apache.spark.rdd.RDD

import org.apache.hadoop.conf._
import org.apache.hadoop.mapreduce._

import org.apache.commons.logging.LogFactory
import org.apache.commons.logging.Log

import org.zuinnote.hadoop.office.format.common.dao._
import org.zuinnote.hadoop.office.format.mapreduce._

import org.zuinnote.spark.office.excel.util.ExcelFile


  override def buildScan: RDD[Row] = {
    // read ExcelRows
    val excelRowsRDD = ExcelFile.load(sqlContext, location, hadoopParams)
    // map to schema
    val schemaFields = schema.fields
    excelRowsRDD.flatMap(excelKeyValueTuple => {
      // map the Excel row data structure to a Spark SQL schema
      val rowArray = new Array[Any](excelKeyValueTuple._2.get.length)
      var i = 0;
      for (x <- excelKeyValueTuple._2.get) { // parse through the SpreadSheetCellDAO
        val spreadSheetCellDAOStructArray = new Array[String](schemaFields.length)
        val currentSpreadSheetCellDAO: Array[SpreadSheetCellDAO] = excelKeyValueTuple._2.get.asInstanceOf[Array[SpreadSheetCellDAO]]
        spreadSheetCellDAOStructArray(0) = currentSpreadSheetCellDAO(i).getFormattedValue
        spreadSheetCellDAOStructArray(1) = currentSpreadSheetCellDAO(i).getComment
        spreadSheetCellDAOStructArray(2) = currentSpreadSheetCellDAO(i).getFormula
        spreadSheetCellDAOStructArray(3) = currentSpreadSheetCellDAO(i).getAddress
        spreadSheetCellDAOStructArray(4) = currentSpreadSheetCellDAO(i).getSheetName
        // add row representing one Excel row
        rowArray(i) = spreadSheetCellDAOStructArray
        i += 1
      }
      Some(Row.fromSeq(rowArray))
    })

  }

}

Source File: utils.scala From spark-http-stream with BSD 2-Clause "Simplified" License

5 votes

package org.apache.spark.sql.execution.streaming.http

import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.TimestampType
import org.apache.spark.SparkConf
import org.apache.commons.io.IOUtils
import org.apache.spark.serializer.KryoSerializer
import java.io.InputStream
import com.esotericsoftware.kryo.io.Input
import java.io.ByteArrayOutputStream

class WrongArgumentException(name: String, value: Any)
		extends RuntimeException(s"wrong argument: $name=$value") {
}

class MissingRequiredArgumentException(map: Map[String, String], paramName: String)
		extends RuntimeException(s"missing required argument: $paramName, all parameters=$map") {
}

class InvalidSerializerNameException(serializerName: String)
		extends RuntimeException(s"invalid serializer name: $serializerName") {
}

object SchemaUtils {
	def buildSchema(schema: StructType, includesTimestamp: Boolean, timestampColumnName: String = "_TIMESTAMP_"): StructType = {
		if (!includesTimestamp)
			schema;
		else
			StructType(schema.fields.toSeq :+ StructField(timestampColumnName, TimestampType, false));
	}
}

object Params {
	
	def deserialize(bytes: Array[Byte]): Any = {
		val kryo = kryoSerializer.newKryo();
		val input = new Input();
		input.setBuffer(bytes);
		kryo.readClassAndObject(input);
	}
}

Source File: HttpStreamServerClientTest.scala From spark-http-stream with BSD 2-Clause "Simplified" License

5 votes

import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.streaming.http.HttpStreamClient
import org.junit.Assert
import org.junit.Test
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.BooleanType
import org.apache.spark.sql.types.FloatType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.ByteType
import org.apache.spark.sql.execution.streaming.http.HttpStreamServer
import org.apache.spark.sql.execution.streaming.http.StreamPrinter
import org.apache.spark.sql.execution.streaming.http.HttpStreamServerSideException


class HttpStreamServerClientTest {
	val ROWS1 = Array(Row("hello1", 1, true, 0.1f, 0.1d, 1L, '1'.toByte),
		Row("hello2", 2, false, 0.2f, 0.2d, 2L, '2'.toByte),
		Row("hello3", 3, true, 0.3f, 0.3d, 3L, '3'.toByte));

	val ROWS2 = Array(Row("hello"),
		Row("world"),
		Row("bye"),
		Row("world"));

	@Test
	def testHttpStreamIO() {
		//starts a http server
		val kryoSerializer = new KryoSerializer(new SparkConf());
		val server = HttpStreamServer.start("/xxxx", 8080);

		val spark = SparkSession.builder.appName("testHttpTextSink").master("local[4]")
			.getOrCreate();
		spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/");

		val sqlContext = spark.sqlContext;
		import spark.implicits._
		//add a local message buffer to server, with 2 topics registered
		server.withBuffer()
			.addListener(new StreamPrinter())
			.createTopic[(String, Int, Boolean, Float, Double, Long, Byte)]("topic-1")
			.createTopic[String]("topic-2");

		val client = HttpStreamClient.connect("http://localhost:8080/xxxx");
		//tests schema of topics
		val schema1 = client.fetchSchema("topic-1");
		Assert.assertArrayEquals(Array[Object](StringType, IntegerType, BooleanType, FloatType, DoubleType, LongType, ByteType),
			schema1.fields.map(_.dataType).asInstanceOf[Array[Object]]);

		val schema2 = client.fetchSchema("topic-2");
		Assert.assertArrayEquals(Array[Object](StringType),
			schema2.fields.map(_.dataType).asInstanceOf[Array[Object]]);

		//prepare to consume messages
		val sid1 = client.subscribe("topic-1")._1;
		val sid2 = client.subscribe("topic-2")._1;

		//produces some data
		client.sendRows("topic-1", 1, ROWS1);

		val sid4 = client.subscribe("topic-1")._1;
		val sid5 = client.subscribe("topic-2")._1;

		client.sendRows("topic-2", 1, ROWS2);

		//consumes data
		val fetched = client.fetchStream(sid1).map(_.originalRow);
		Assert.assertArrayEquals(ROWS1.asInstanceOf[Array[Object]], fetched.asInstanceOf[Array[Object]]);
		//it is empty now
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid1).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid2).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid4).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);

		client.unsubscribe(sid4);
		try {
			client.fetchStream(sid4);
			//exception should be thrown, because subscriber id is invalidated
			Assert.assertTrue(false);
		}
		catch {
			case e: Throwable ⇒
				e.printStackTrace();
				Assert.assertEquals(classOf[HttpStreamServerSideException], e.getClass);
		}

		server.stop();
	}
}

Source File: VectorSlicerOp.scala From mleap with Apache License 2.0

5 votes

package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.core.types.TensorShape
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.mleap.TypeConverters.sparkToMleapDataShape
import org.apache.spark.sql.types.StructField


class VectorSlicerOp extends SimpleSparkOp[VectorSlicer] {
  override val Model: OpModel[SparkBundleContext, VectorSlicer] = new OpModel[SparkBundleContext, VectorSlicer] {
    override val klazz: Class[VectorSlicer] = classOf[VectorSlicer]

    override def opName: String = Bundle.BuiltinOps.feature.vector_slicer

    override def store(model: Model, obj: VectorSlicer)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz))
      val dataset = context.context.dataset.get

      val namedIndicesMap: Array[(String, Int)] = if(obj.getNames.nonEmpty) {
        extractNamedIndices(obj.getInputCol, obj.getNames, dataset)
      } else { Array() }
      val (names, namedIndices) = namedIndicesMap.unzip
      val inputShape = sparkToMleapDataShape(dataset.schema(obj.getInputCol), dataset).asInstanceOf[TensorShape]

      model.withValue("indices", Value.longList(obj.getIndices.map(_.toLong).toSeq)).
        withValue("names", Value.stringList(names)).
        withValue("named_indices", Value.intList(namedIndices)).
        withValue("input_size", Value.int(inputShape.dimensions.get.head))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): VectorSlicer = {
      val names = model.value("names").getStringList
      new VectorSlicer(uid = "").setIndices(model.value("indices").getLongList.map(_.toInt).toArray).
        setNames(names.toArray)
    }

    private def extractNamedIndices(inputCol: String,
                                    names: Array[String],
                                    dataset: DataFrame): Array[(String, Int)] = {
      names.zip(getFeatureIndicesFromNames(dataset.schema(inputCol), names))
    }

    private def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
      require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
        + s" to be Vector type, but it was type ${col.dataType} instead.")
      val inputAttr = AttributeGroup.fromStructField(col)
      names.map { name =>
        require(inputAttr.hasAttr(name),
          s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
        inputAttr.getAttr(name).index.get
      }
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: VectorSlicer): VectorSlicer = {
    new VectorSlicer(uid = uid).setIndices(model.getIndices).setNames(model.getNames)
  }

  override def sparkInputs(obj: VectorSlicer): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: VectorSlicer): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
}

Source File: OneHotEncoderOp.scala From mleap with Apache License 2.0

5 votes

package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import org.apache.spark.ml.attribute.{Attribute, BinaryAttribute, NominalAttribute, NumericAttribute}
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.feature.OneHotEncoderModel
import org.apache.spark.sql.types.StructField

import scala.util.{Failure, Try}

object OneHotEncoderOp {
  def sizeForField(field: StructField): Int = {
    val attr = Attribute.fromStructField(field)

    (attr match {
      case nominal: NominalAttribute =>
        if (nominal.values.isDefined) {
          Try(nominal.values.get.length)
        } else if (nominal.numValues.isDefined) {
          Try(nominal.numValues.get)
        } else {
          Failure(new RuntimeException(s"invalid nominal value for field ${field.name}"))
        }
      case binary: BinaryAttribute =>
        Try(2)
      case _: NumericAttribute =>
        Failure(new RuntimeException(s"invalid numeric attribute for field ${field.name}"))
      case _ =>
        Failure(new RuntimeException(s"unsupported attribute for field ${field.name}")) // optimistic about unknown attributes
    }).get
  }
}

class OneHotEncoderOp extends SimpleSparkOp[OneHotEncoderModel] {
  override val Model: OpModel[SparkBundleContext, OneHotEncoderModel] = new OpModel[SparkBundleContext, OneHotEncoderModel] {
    override val klazz: Class[OneHotEncoderModel] = classOf[OneHotEncoderModel]

    override def opName: String = Bundle.BuiltinOps.feature.one_hot_encoder

    override def store(model: Model, obj: OneHotEncoderModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz))

      val df = context.context.dataset.get
      val categorySizes = obj.getInputCols.map { f ⇒ OneHotEncoderOp.sizeForField(df.schema(f)) }

      model.withValue("category_sizes", Value.intList(categorySizes))
        .withValue("drop_last", Value.boolean(obj.getDropLast))
        .withValue("handle_invalid", Value.string(obj.getHandleInvalid))

    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): OneHotEncoderModel = {
      new OneHotEncoderModel(uid = "", categorySizes = model.value("category_sizes").getIntList.toArray)
          .setDropLast(model.value("drop_last").getBoolean)
          .setHandleInvalid(model.value("handle_invalid").getString)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: OneHotEncoderModel): OneHotEncoderModel = {
    new OneHotEncoderModel(uid = uid, categorySizes = model.categorySizes)
      .setDropLast(model.getDropLast)
      .setHandleInvalid(model.getHandleInvalid)
  }

  override def sparkInputs(obj: OneHotEncoderModel): Seq[ParamSpec] = Seq(ParamSpec("input", obj.inputCols))

  override def sparkOutputs(obj: OneHotEncoderModel): Seq[ParamSpec] = Seq(ParamSpec("output", obj.outputCols))

}

Source File: ReverseStringIndexerOp.scala From mleap with Apache License 2.0

5 votes

package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.core.types.{DataShape, ScalarShape}
import org.apache.spark.ml.attribute.{Attribute, BinaryAttribute, NominalAttribute, NumericAttribute}
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.feature.IndexToString
import org.apache.spark.sql.types.StructField
import ml.combust.mleap.runtime.types.BundleTypeConverters._

import scala.util.{Failure, Try}


object ReverseStringIndexerOp {
  def labelsForField(field: StructField): Array[String] = {
    val attr = Attribute.fromStructField(field)

    (attr match {
      case nominal: NominalAttribute =>
        if (nominal.values.isDefined) {
          Try(nominal.values.get)
        } else {
          Failure(new RuntimeException(s"invalid nominal value for field ${field.name}"))
        }
      case _: BinaryAttribute =>
        Failure(new RuntimeException(s"invalid binary attribute for field ${field.name}"))
      case _: NumericAttribute =>
        Failure(new RuntimeException(s"invalid numeric attribute for field ${field.name}"))
      case _ =>
        Failure(new RuntimeException(s"unsupported attribute for field ${field.name}")) // optimistic about unknown attributes
    }).get
  }
}

class ReverseStringIndexerOp extends SimpleSparkOp[IndexToString] {
  override val Model: OpModel[SparkBundleContext, IndexToString] = new OpModel[SparkBundleContext, IndexToString] {
    override val klazz: Class[IndexToString] = classOf[IndexToString]

    override def opName: String = Bundle.BuiltinOps.feature.reverse_string_indexer

    override def store(model: Model, obj: IndexToString)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val labels = obj.get(obj.labels).getOrElse {
        assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz))
        val df = context.context.dataset.get
        ReverseStringIndexerOp.labelsForField(df.schema(obj.getInputCol))
      }

      model.withValue("labels", Value.stringList(labels)).
        withValue("input_shape", Value.dataShape(ScalarShape(false)))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): IndexToString = {
      model.getValue("input_shape").map(_.getDataShape: DataShape).foreach {
        shape =>
          require(shape.isScalar, "cannot deserialize non-scalar input to Spark IndexToString model")
      }
      
      new IndexToString(uid = "").setLabels(model.value("labels").getStringList.toArray)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: IndexToString): IndexToString = {
    new IndexToString(uid = uid).setLabels(model.getLabels)
  }

  override def sparkInputs(obj: IndexToString): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: IndexToString): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
}

Source File: MathUnary.scala From mleap with Apache License 2.0

5 votes

package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType}
import org.apache.spark.sql.functions.udf


    private val className = classOf[MathUnary].getName

    override def load(path: String): MathUnary = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      val dataPath = new Path(path, "data").toString

      val data = sparkSession.read.parquet(dataPath).select("operation").head()
      val operation = data.getAs[String](0)

      val model = MathUnaryModel(UnaryOperation.forName(operation))
      val transformer = new MathUnary(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }

}

Source File: WrappersSpec.scala From sparksql-scalapb with Apache License 2.0

5 votes

package scalapb.spark

import com.example.protos.wrappers._
import org.apache.spark.sql.SparkSession
import org.apache.hadoop.io.ArrayPrimitiveWritable
import scalapb.GeneratedMessageCompanion
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.ArrayType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.Row

import org.scalatest.BeforeAndAfterAll
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.must.Matchers

class WrappersSpec extends AnyFlatSpec with Matchers with BeforeAndAfterAll {
  val spark: SparkSession = SparkSession
    .builder()
    .appName("ScalaPB Demo")
    .master("local[2]")
    .getOrCreate()

  import spark.implicits.StringToColumn

  val data = Seq(
    PrimitiveWrappers(
      intValue = Option(45),
      stringValue = Option("boo"),
      ints = Seq(17, 19, 25),
      strings = Seq("foo", "bar")
    ),
    PrimitiveWrappers(
      intValue = None,
      stringValue = None,
      ints = Seq(17, 19, 25),
      strings = Seq("foo", "bar")
    )
  )

  "converting df with primitive wrappers" should "work with primitive implicits" in {
    import ProtoSQL.withPrimitiveWrappers.implicits._
    val df = ProtoSQL.withPrimitiveWrappers.createDataFrame(spark, data)
    df.schema.fields.map(_.dataType).toSeq must be(
      Seq(
        IntegerType,
        StringType,
        ArrayType(IntegerType, false),
        ArrayType(StringType, false)
      )
    )
    df.collect must contain theSameElementsAs (
      Seq(
        Row(45, "boo", Seq(17, 19, 25), Seq("foo", "bar")),
        Row(null, null, Seq(17, 19, 25), Seq("foo", "bar"))
      )
    )
  }

  "converting df with primitive wrappers" should "work with default implicits" in {
    import ProtoSQL.implicits._
    val df = ProtoSQL.createDataFrame(spark, data)
    df.schema.fields.map(_.dataType).toSeq must be(
      Seq(
        StructType(Seq(StructField("value", IntegerType, true))),
        StructType(Seq(StructField("value", StringType, true))),
        ArrayType(
          StructType(Seq(StructField("value", IntegerType, true))),
          false
        ),
        ArrayType(
          StructType(Seq(StructField("value", StringType, true))),
          false
        )
      )
    )
    df.collect must contain theSameElementsAs (
      Seq(
        Row(
          Row(45),
          Row("boo"),
          Seq(Row(17), Row(19), Row(25)),
          Seq(Row("foo"), Row("bar"))
        ),
        Row(
          null,
          null,
          Seq(Row(17), Row(19), Row(25)),
          Seq(Row("foo"), Row("bar"))
        )
      )
    )
  }
}

Source File: KustoSourceTests.scala From azure-kusto-spark with Apache License 2.0

5 votes

package com.microsoft.kusto.spark

import com.microsoft.kusto.spark.datasource.KustoSourceOptions
import com.microsoft.kusto.spark.utils.{KustoDataSourceUtils => KDSU}
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.junit.runner.RunWith
import org.scalamock.scalatest.MockFactory
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers}

@RunWith(classOf[JUnitRunner])
class KustoSourceTests extends FlatSpec with MockFactory with Matchers with BeforeAndAfterAll {
  private val loggingLevel: Option[String] = Option(System.getProperty("logLevel"))
  if (loggingLevel.isDefined) KDSU.setLoggingLevel(loggingLevel.get)

  private val nofExecutors = 4
  private val spark: SparkSession = SparkSession.builder()
    .appName("KustoSource")
    .master(f"local[$nofExecutors]")
    .getOrCreate()

  private var sc: SparkContext = _
  private var sqlContext: SQLContext = _
  private val cluster: String = "KustoCluster"
  private val database: String = "KustoDatabase"
  private val query: String = "KustoTable"
  private val appId: String = "KustoSinkTestApplication"
  private val appKey: String = "KustoSinkTestKey"
  private val appAuthorityId: String = "KustoSinkAuthorityId"

  override def beforeAll(): Unit = {
    super.beforeAll()

    sc = spark.sparkContext
    sqlContext = spark.sqlContext
  }

  override def afterAll(): Unit = {
    super.afterAll()

    sc.stop()
  }

  "KustoDataSource" should "recognize Kusto and get the correct schema" in {
    val spark: SparkSession = SparkSession.builder()
      .appName("KustoSource")
      .master(f"local[$nofExecutors]")
      .getOrCreate()

    val customSchema = "colA STRING, colB INT"

    val df = spark.sqlContext
      .read
      .format("com.microsoft.kusto.spark.datasource")
      .option(KustoSourceOptions.KUSTO_CLUSTER, cluster)
      .option(KustoSourceOptions.KUSTO_DATABASE, database)
      .option(KustoSourceOptions.KUSTO_QUERY, query)
      .option(KustoSourceOptions.KUSTO_AAD_APP_ID, appId)
      .option(KustoSourceOptions.KUSTO_AAD_APP_SECRET, appKey)
      .option(KustoSourceOptions.KUSTO_AAD_AUTHORITY_ID, appAuthorityId)
      .option(KustoSourceOptions.KUSTO_CUSTOM_DATAFRAME_COLUMN_TYPES, customSchema)
      .load("src/test/resources/")

    val expected = StructType(Array(StructField("colA", StringType, nullable = true),StructField("colB", IntegerType, nullable = true)))
    assert(df.schema.equals(expected))
  }
}

Source File: DataFrameModifierHelper.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.driver.writer

import com.stratio.sparta.sdk.pipeline.autoCalculations.AutoCalculatedField
import com.stratio.sparta.sdk.pipeline.output.Output
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{Column, DataFrame}

object DataFrameModifierHelper {

  def applyAutoCalculateFields(dataFrame: DataFrame,
                               autoCalculateFields: Seq[AutoCalculatedField],
                               auxSchema: StructType): DataFrame =
    autoCalculateFields.headOption match {
      case Some(firstAutoCalculate) =>
        applyAutoCalculateFields(
          addColumnToDataFrame(dataFrame, firstAutoCalculate, auxSchema), autoCalculateFields.drop(1), auxSchema)
      case None =>
        dataFrame
    }

  private[driver] def addColumnToDataFrame(dataFrame: DataFrame,
                                   autoCalculateField: AutoCalculatedField,
                                   auxSchema: StructType): DataFrame = {
    (autoCalculateField.fromNotNullFields,
      autoCalculateField.fromPkFields,
      autoCalculateField.fromFields,
      autoCalculateField.fromFixedValue) match {
      case (Some(fromNotNullFields), _, _, _) =>
        val fields = fieldsWithAuxMetadata(dataFrame.schema.fields, auxSchema.fields).flatMap(field =>
          if (!field.nullable) Some(col(field.name)) else None).toSeq
        addField(fromNotNullFields.field.name, fromNotNullFields.field.outputType, dataFrame, fields)
      case (None, Some(fromPkFields), _, _) =>
        val fields = fieldsWithAuxMetadata(dataFrame.schema.fields, auxSchema.fields).flatMap(field =>
          if (field.metadata.contains(Output.PrimaryKeyMetadataKey)) Some(col(field.name)) else None).toSeq
        addField(fromPkFields.field.name, fromPkFields.field.outputType, dataFrame, fields)
      case (None, None, Some(fromFields), _) =>
        val fields = autoCalculateField.fromFields.get.fromFields.map(field => col(field))
        addField(fromFields.field.name, fromFields.field.outputType, dataFrame, fields)
      case (None, None, None, Some(fromFixedValue)) =>
        addLiteral(fromFixedValue.field.name, fromFixedValue.field.outputType, dataFrame, fromFixedValue.value)
      case _ => dataFrame
    }
  }

  private[driver] def addField(name: String, outputType: String, dataFrame: DataFrame, fields: Seq[Column]): DataFrame =
    outputType match {
      case "string" => dataFrame.withColumn(name, concat_ws(Output.Separator, fields: _*))
      case "array" => dataFrame.withColumn(name, array(fields: _*))
      case "map" => dataFrame.withColumn(name, struct(fields: _*))
      case _ => dataFrame
    }

  private[driver] def addLiteral(name: String, outputType: String, dataFrame: DataFrame, literal: String): DataFrame =
    outputType match {
      case "string" => dataFrame.withColumn(name, lit(literal))
      case "array" => dataFrame.withColumn(name, array(lit(literal)))
      case "map" => dataFrame.withColumn(name, struct(lit(literal)))
      case _ => dataFrame
    }

  private[driver] def fieldsWithAuxMetadata(dataFrameFields: Array[StructField], auxFields: Array[StructField]) =
    dataFrameFields.map(field => {
      auxFields.find(auxField => auxField.name == field.name) match {
        case Some(auxFounded) => field.copy(metadata = auxFounded.metadata)
        case None => field
      }
    })
}

Source File: RawDataWriterHelper.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.driver.writer

import com.stratio.sparta.driver.factory.SparkContextFactory
import com.stratio.sparta.driver.step.RawData
import com.stratio.sparta.sdk.pipeline.output.Output
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType}
import org.apache.spark.streaming.dstream.DStream


object RawDataWriterHelper {

  def writeRawData(rawData: RawData, outputs: Seq[Output], input: DStream[Row]): Unit = {
    val RawSchema = StructType(Seq(
      StructField(rawData.timeField, TimestampType, nullable = false),
      StructField(rawData.dataField, StringType, nullable = true)))
    val eventTime = AggregationTime.millisToTimeStamp(System.currentTimeMillis())

    input.map(row => Row.merge(Row(eventTime), row))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty()) {
          val rawDataFrame = SparkContextFactory.sparkSessionInstance.createDataFrame(rdd, RawSchema)

          WriterHelper.write(rawDataFrame, rawData.writerOptions, Map.empty[String, String], outputs)
        }
      })
  }
}

Source File: CubeMakerTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.driver.test.cube

import java.sql.Timestamp

import com.github.nscala_time.time.Imports._
import com.stratio.sparta.driver.step.{Cube, CubeOperations, Trigger}
import com.stratio.sparta.driver.writer.WriterOptions
import com.stratio.sparta.plugin.default.DefaultField
import com.stratio.sparta.plugin.cube.field.datetime.DateTimeField
import com.stratio.sparta.plugin.cube.operator.count.CountOperator
import com.stratio.sparta.sdk.pipeline.aggregation.cube.{Dimension, DimensionValue, DimensionValuesTime, InputFields}
import com.stratio.sparta.sdk.pipeline.schema.TypeOp
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType, TimestampType}
import org.apache.spark.streaming.TestSuiteBase
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class CubeMakerTest extends TestSuiteBase {

  val PreserverOrder = false

  
  def getEventOutput(timestamp: Timestamp, millis: Long):
  Seq[Seq[(DimensionValuesTime, InputFields)]] = {
    val dimensionString = Dimension("dim1", "eventKey", "identity", new DefaultField)
    val dimensionTime = Dimension("minute", "minute", "minute", new DateTimeField)
    val dimensionValueString1 = DimensionValue(dimensionString, "value1")
    val dimensionValueString2 = dimensionValueString1.copy(value = "value2")
    val dimensionValueString3 = dimensionValueString1.copy(value = "value3")
    val dimensionValueTs = DimensionValue(dimensionTime, timestamp)
    val tsMap = Row(timestamp)
    val valuesMap1 = InputFields(Row("value1", timestamp), 1)
    val valuesMap2 = InputFields(Row("value2", timestamp), 1)
    val valuesMap3 = InputFields(Row("value3", timestamp), 1)

    Seq(Seq(
      (DimensionValuesTime("cubeName", Seq(dimensionValueString1, dimensionValueTs)), valuesMap1),
      (DimensionValuesTime("cubeName", Seq(dimensionValueString2, dimensionValueTs)), valuesMap2),
      (DimensionValuesTime("cubeName", Seq(dimensionValueString3, dimensionValueTs)), valuesMap3)
    ))
  }
}

Source File: Parser.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.sdk.pipeline.transformation

import java.io.{Serializable => JSerializable}

import com.stratio.sparta.sdk.pipeline.schema.TypeOp
import com.stratio.sparta.sdk.properties.{CustomProperties, Parameterizable}
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructField, StructType}

import scala.util.{Failure, Success, Try}

abstract class Parser(order: Integer,
                      inputField: Option[String],
                      outputFields: Seq[String],
                      schema: StructType,
                      properties: Map[String, JSerializable])
  extends Parameterizable(properties) with Ordered[Parser] with CustomProperties {

  val customKey = "transformationOptions"
  val customPropertyKey = "transformationOptionsKey"
  val customPropertyValue = "transformationOptionsValue"
  val propertiesWithCustom = properties ++ getCustomProperties

  val outputFieldsSchema = schema.fields.filter(field => outputFields.contains(field.name))

  val inputFieldRemoved = Try(propertiesWithCustom.getBoolean("removeInputField")).getOrElse(false)

  val inputFieldIndex = inputField match {
    case Some(field) => Try(schema.fieldIndex(field)).getOrElse(0)
    case None => 0
  }

  val whenErrorDo = Try(WhenError.withName(propertiesWithCustom.getString("whenError")))
    .getOrElse(WhenError.Error)

  def parse(data: Row): Seq[Row]

  def getOrder: Integer = order

  def checkFields(keyMap: Map[String, JSerializable]): Map[String, JSerializable] =
    keyMap.flatMap(key => if (outputFields.contains(key._1)) Some(key) else None)

  def compare(that: Parser): Int = this.getOrder.compareTo(that.getOrder)

  //scalastyle:off
  def returnWhenError(exception: Exception): Null =
  whenErrorDo match {
    case WhenError.Null => null
    case _ => throw exception
  }

  //scalastyle:on

  def parseToOutputType(outSchema: StructField, inputValue: Any): Any =
    Try(TypeOp.transformValueByTypeOp(outSchema.dataType, inputValue.asInstanceOf[Any]))
      .getOrElse(returnWhenError(new IllegalStateException(
        s"Error parsing to output type the value: ${inputValue.toString}")))

  def returnData(newData: Try[Seq[_]], prevData: Seq[_]): Seq[Row] =
    newData match {
      case Success(data) => Seq(Row.fromSeq(prevData ++ data))
      case Failure(e) => whenErrorDo match {
        case WhenError.Discard => Seq.empty[Row]
        case _ => throw e
      }
    }

  def returnData(newData: Try[Row], prevData: Row): Seq[Row] =
    newData match {
      case Success(data) => Seq(Row.merge(prevData, data))
      case Failure(e) => whenErrorDo match {
        case WhenError.Discard => Seq.empty[Row]
        case _ => throw e
      }
    }

  def removeIndex(row: Seq[_], inputFieldIndex: Int): Seq[_] = if (row.size < inputFieldIndex) row
    else row.take(inputFieldIndex) ++ row.drop(inputFieldIndex + 1)

  def removeInputField(row: Row): Seq[_] = {
    if (inputFieldRemoved && inputField.isDefined)
      removeIndex(row.toSeq, inputFieldIndex)
    else
      row.toSeq
  }


}

object Parser {

  final val ClassSuffix = "Parser"
  final val DefaultOutputType = "string"
  final val TypesFromParserClass = Map("datetime" -> "timestamp")
}

Source File: ParserTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.sdk.pipeline.transformation

import java.io.{Serializable => JSerializable}

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class ParserTest extends WordSpec with Matchers {

  "Parser" should {

    val parserTest = new ParserMock(
      1,
      Some("input"),
      Seq("output"),
      StructType(Seq(StructField("some", StringType))),
      Map()
    )

    "Order must be " in {
      val expected = 1
      val result = parserTest.getOrder
      result should be(expected)
    }

    "Parse must be " in {
      val event = Row("value")
      val expected = Seq(event)
      val result = parserTest.parse(event)
      result should be(expected)
    }

    "checked fields not be contained in outputs must be " in {
      val keyMap = Map("field" -> "value")
      val expected = Map()
      val result = parserTest.checkFields(keyMap)
      result should be(expected)
    }

    "checked fields are contained in outputs must be " in {
      val keyMap = Map("output" -> "value")
      val expected = keyMap
      val result = parserTest.checkFields(keyMap)
      result should be(expected)
    }

    "classSuffix must be " in {
      val expected = "Parser"
      val result = Parser.ClassSuffix
      result should be(expected)
    }
  }
}

Source File: RedisOutput.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.output.redis

import java.io.Serializable

import com.stratio.sparta.plugin.output.redis.dao.AbstractRedisDAO
import com.stratio.sparta.sdk.pipeline.output.Output._
import com.stratio.sparta.sdk.pipeline.output.{Output, SaveModeEnum}
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row}


class RedisOutput(name: String, properties: Map[String, Serializable])
  extends Output(name, properties) with AbstractRedisDAO with Serializable {

  override val hostname = properties.getString("hostname", DefaultRedisHostname)
  override val port = properties.getString("port", DefaultRedisPort).toInt

  override def save(dataFrame: DataFrame, saveMode: SaveModeEnum.Value, options: Map[String, String]): Unit = {
    val tableName = getTableNameFromOptions(options)
    val schema = dataFrame.schema

    validateSaveMode(saveMode)

    dataFrame.foreachPartition{ rowList =>
      rowList.foreach{ row =>
        val valuesList = getValuesList(row,schema.fieldNames)
        val hashKey = getHashKeyFromRow(valuesList, schema)
        getMeasuresFromRow(valuesList, schema).foreach { case (measure, value) =>
          hset(hashKey, measure.name, value)
        }
      }
    }
  }

  def getHashKeyFromRow(valuesList: Seq[(String, String)], schema: StructType): String =
    valuesList.flatMap{ case (key, value) =>
      val fieldSearch = schema.fields.find(structField =>
        structField.metadata.contains(Output.PrimaryKeyMetadataKey) && structField.name == key)

      fieldSearch.map(structField => s"${structField.name}$IdSeparator$value")
    }.mkString(IdSeparator)

  def getMeasuresFromRow(valuesList: Seq[(String, String)], schema: StructType): Seq[(StructField, String)] =
    valuesList.flatMap{ case (key, value) =>
      val fieldSearch = schema.fields.find(structField =>
          structField.metadata.contains(Output.MeasureMetadataKey) &&
          structField.name == key)
      fieldSearch.map(field => (field, value))
    }

  def getValuesList(row: Row, fieldNames: Array[String]): Seq[(String, String)] =
    fieldNames.zip(row.toSeq).map{ case (key, value) => (key, value.toString)}.toSeq
}

Source File: KiteMorphlineImpl.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.transformation.morphline

import com.typesafe.config.Config
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructField
import org.kitesdk.morphline.api.{Command, MorphlineContext, Record}
import org.kitesdk.morphline.base.Compiler

case class KiteMorphlineImpl(config: Config, outputFieldsSchema: Array[StructField]) {

  private val morphlineContext: MorphlineContext = new MorphlineContext.Builder().build()

  private val collector: ThreadLocal[EventCollector] = new ThreadLocal[EventCollector]() {
    override def initialValue(): EventCollector = new EventCollector(outputFieldsSchema)
  }

  private val morphline: ThreadLocal[Command] = new ThreadLocal[Command]() {
    override def initialValue(): Command = new Compiler().compile(config, morphlineContext, collector.get())
  }

  def process(inputRecord: Record): Row = {
    val coll = collector.get()
    coll.reset()
    morphline.get().process(inputRecord)
    coll.row
  }
}

Source File: EventCollector.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.transformation.morphline

import com.stratio.sparta.sdk.pipeline.schema.TypeOp
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructField
import org.kitesdk.morphline.api.{Command, Record}

class EventCollector(outputFieldsSchema: Array[StructField]) extends Command {

  @volatile var row: Row = Row.empty

  override def notify(p1: Record): Unit = {}

  def reset(): Unit = row = Row.empty

  //scalastyle:off
  override def getParent: Command = null

  //scalastyle:om

  override def process(recordProcess: Record): Boolean = {
      Option(recordProcess) match {
        case Some(record) =>
          row = Row.fromSeq(outputFieldsSchema.map(field =>
            Option(record.getFirstValue(field.name)) match {
              case Some(value) =>
                TypeOp.transformValueByTypeOp(field.dataType, value.asInstanceOf[Any])
              case None =>
                throw new IllegalStateException(s"Impossible to parse field: ${field.name}.")
            }
          ).toList)
          true
        case None =>
          row = Row.empty
          false
      }
  }
}

Source File: MorphlinesParser.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.transformation.morphline

import java.io.{ByteArrayInputStream, Serializable => JSerializable}
import java.util.concurrent.ConcurrentHashMap

import com.stratio.sparta.sdk.pipeline.transformation.Parser
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructField, StructType}
import org.kitesdk.morphline.api.Record

import scala.collection.JavaConverters._
import scala.util.Try

class MorphlinesParser(order: Integer,
                       inputField: Option[String],
                       outputFields: Seq[String],
                       schema: StructType,
                       properties: Map[String, JSerializable])
  extends Parser(order, inputField, outputFields, schema, properties) {

  assert(inputField.isDefined, "It's necessary to define one inputField in the Morphline Transformation")

  private val config: String = properties.getString("morphline")

  override def parse(row: Row): Seq[Row] = {
    val inputValue = Option(row.get(inputFieldIndex))
    val newData = Try {
      inputValue match {
        case Some(s: String) =>
          if (s.isEmpty) returnWhenError(new IllegalStateException(s"Impossible to parse because value is empty"))
          else parseWithMorphline(new ByteArrayInputStream(s.getBytes("UTF-8")))
        case Some(b: Array[Byte]) =>
          if (b.length == 0)  returnWhenError(new IllegalStateException(s"Impossible to parse because value is empty"))
          else parseWithMorphline(new ByteArrayInputStream(b))
        case _ =>
          returnWhenError(new IllegalStateException(s"Impossible to parse because value is empty"))
      }
    }
    returnData(newData, removeInputFieldMorphline(row))
  }

   private def removeIndex(row: Row, inputFieldIndex: Int): Row =
    if (row.size < inputFieldIndex) row
    else Row.fromSeq(row.toSeq.take(inputFieldIndex) ++ row.toSeq.drop(inputFieldIndex + 1))


   private def removeInputFieldMorphline(row: Row): Row =
    if (inputFieldRemoved && inputField.isDefined) removeIndex(row, inputFieldIndex)
    else row

  private def parseWithMorphline(value: ByteArrayInputStream): Row = {
    val record = new Record()
    record.put(inputField.get, value)
    MorphlinesParser(order, config, outputFieldsSchema).process(record)
  }
}

object MorphlinesParser {

  private val instances = new ConcurrentHashMap[String, KiteMorphlineImpl].asScala

  def apply(order: Integer, config: String, outputFieldsSchema: Array[StructField]): KiteMorphlineImpl = {
    instances.get(config) match {
      case Some(kiteMorphlineImpl) =>
        kiteMorphlineImpl
      case None =>
        val kiteMorphlineImpl = KiteMorphlineImpl(ConfigFactory.parseString(config), outputFieldsSchema)
        instances.putIfAbsent(config, kiteMorphlineImpl)
        kiteMorphlineImpl
    }
  }
}

Source File: CassandraOutputTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.output.cassandra

import java.io.{Serializable => JSerializable}

import com.datastax.spark.connector.cql.CassandraConnector
import com.stratio.sparta.sdk._
import com.stratio.sparta.sdk.properties.JsoneyString
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.mock.MockitoSugar
import org.scalatest.{FlatSpec, Matchers}

@RunWith(classOf[JUnitRunner])
class CassandraOutputTest extends FlatSpec with Matchers with MockitoSugar with AnswerSugar {

  val s = "sum"
  val properties = Map(("connectionHost", "127.0.0.1"), ("connectionPort", "9042"))

  "getSparkConfiguration" should "return a Seq with the configuration" in {
    val configuration = Map(("connectionHost", "127.0.0.1"), ("connectionPort", "9042"))
    val cass = CassandraOutput.getSparkConfiguration(configuration)

    cass should be(List(("spark.cassandra.connection.host", "127.0.0.1"), ("spark.cassandra.connection.port", "9042")))
  }

  "getSparkConfiguration" should "return all cassandra-spark config" in {
    val config: Map[String, JSerializable] = Map(
      ("sparkProperties" -> JsoneyString(
        "[{\"sparkPropertyKey\":\"spark.cassandra.input.fetch.size_in_rows\",\"sparkPropertyValue\":\"2000\"}," +
          "{\"sparkPropertyKey\":\"spark.cassandra.input.split.size_in_mb\",\"sparkPropertyValue\":\"64\"}]")),
      ("anotherProperty" -> "true")
    )

    val sparkConfig = CassandraOutput.getSparkConfiguration(config)

    sparkConfig.exists(_ == ("spark.cassandra.input.fetch.size_in_rows" -> "2000")) should be(true)
    sparkConfig.exists(_ == ("spark.cassandra.input.split.size_in_mb" -> "64")) should be(true)
    sparkConfig.exists(_ == ("anotherProperty" -> "true")) should be(false)
  }

  "getSparkConfiguration" should "not return cassandra-spark config" in {
    val config: Map[String, JSerializable] = Map(
      ("hadoopProperties" -> JsoneyString(
        "[{\"sparkPropertyKey\":\"spark.cassandra.input.fetch.size_in_rows\",\"sparkPropertyValue\":\"2000\"}," +
          "{\"sparkPropertyKey\":\"spark.cassandra.input.split.size_in_mb\",\"sparkPropertyValue\":\"64\"}]")),
      ("anotherProperty" -> "true")
    )

    val sparkConfig = CassandraOutput.getSparkConfiguration(config)

    sparkConfig.exists(_ == ("spark.cassandra.input.fetch.size_in_rows" -> "2000")) should be(false)
    sparkConfig.exists(_ == ("spark.cassandra.input.split.size_in_mb" -> "64")) should be(false)
    sparkConfig.exists(_ == ("anotherProperty" -> "true")) should be(false)
  }
}

Source File: LastValueOperatorTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.lastValue

import java.util.Date

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class LastValueOperatorTest extends WordSpec with Matchers {

  "LastValue operator" should {

    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new LastValueOperator("lastValue", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new LastValueOperator("lastValue", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new LastValueOperator("lastValue", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      val inputFields4 = new LastValueOperator("lastValue", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields4.processMap(Row(1, 2)) should be(Some(1L))

      val inputFields5 = new LastValueOperator("lastValue", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"))
      inputFields5.processMap(Row(1, 2)) should be(None)

      val inputFields6 = new LastValueOperator("lastValue", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"
        }))
      inputFields6.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new LastValueOperator("lastValue", initSchema, Map())
      inputFields.processReduce(Seq()) should be(None)

      val inputFields2 = new LastValueOperator("lastValue", initSchema, Map())
      inputFields2.processReduce(Seq(Some(1), Some(2))) should be(Some(2))

      val inputFields3 = new LastValueOperator("lastValue", initSchema, Map())
      inputFields3.processReduce(Seq(Some("a"), Some("b"))) should be(Some("b"))
    }

    "associative process must be " in {
      val inputFields = new LastValueOperator("lastValue", initSchema, Map())
      val resultInput = Seq((Operator.OldValuesKey, Some(1L)),
        (Operator.NewValuesKey, Some(1L)),
        (Operator.NewValuesKey, None))
      inputFields.associativity(resultInput) should be(Some(1L))

      val inputFields2 = new LastValueOperator("lastValue", initSchema, Map("typeOp" -> "int"))
      val resultInput2 = Seq((Operator.OldValuesKey, Some(1L)),
        (Operator.NewValuesKey, Some(1L)))
      inputFields2.associativity(resultInput2) should be(Some(1))

      val inputFields3 = new LastValueOperator("lastValue", initSchema, Map("typeOp" -> null))
      val resultInput3 = Seq((Operator.OldValuesKey, Some(1)),
        (Operator.NewValuesKey, Some(2)))
      inputFields3.associativity(resultInput3) should be(Some(2))

      val inputFields4 = new LastValueOperator("lastValue", initSchema, Map())
      val resultInput4 = Seq()
      inputFields4.associativity(resultInput4) should be(None)

      val inputFields5 = new LastValueOperator("lastValue", initSchema, Map())
      val date = new Date()
      val resultInput5 = Seq((Operator.NewValuesKey, Some(date)))
      inputFields5.associativity(resultInput5) should be(Some(date))
    }
  }
}

Source File: StddevOperatorTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.stddev

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class StddevOperatorTest extends WordSpec with Matchers {

  "Std dev operator" should {

    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new StddevOperator("stdev", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new StddevOperator("stdev", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new StddevOperator("stdev", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      val inputFields4 = new StddevOperator("stdev", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row("1", 2)) should be(Some(1))

      val inputFields6 = new StddevOperator("stdev", initSchema, Map("inputField" -> "field1"))
      inputFields6.processMap(Row(1.5, 2)) should be(Some(1.5))

      val inputFields7 = new StddevOperator("stdev", initSchema, Map("inputField" -> "field1"))
      inputFields7.processMap(Row(5L, 2)) should be(Some(5L))

      val inputFields8 = new StddevOperator("stdev", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields8.processMap(Row(1, 2)) should be(Some(1L))

      val inputFields9 = new StddevOperator("stdev", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"))
      inputFields9.processMap(Row(1, 2)) should be(None)

      val inputFields10 = new StddevOperator("stdev", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"
        }))
      inputFields10.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new StddevOperator("stdev", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new StddevOperator("stdev", initSchema, Map())
      inputFields2.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) should be
      (Some(2.8284271247461903))

      val inputFields3 = new StddevOperator("stdev", initSchema, Map())
      inputFields3.processReduce(Seq(Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) should be
      (Some(2.850438562747845))

      val inputFields4 = new StddevOperator("stdev", initSchema, Map())
      inputFields4.processReduce(Seq(None)) should be(Some(0d))

      val inputFields5 = new StddevOperator("stdev", initSchema, Map("typeOp" -> "string"))
      inputFields5.processReduce(
        Seq(Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) should be(Some("2.850438562747845"))
    }

    "processReduce distinct must be " in {
      val inputFields = new StddevOperator("stdev", initSchema, Map("distinct" -> "true"))
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new StddevOperator("stdev", initSchema, Map("distinct" -> "true"))
      inputFields2.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) should be
      (Some(2.8284271247461903))

      val inputFields3 = new StddevOperator("stdev", initSchema, Map("distinct" -> "true"))
      inputFields3.processReduce(Seq(Some(1), Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) should be
      (Some(2.850438562747845))

      val inputFields4 = new StddevOperator("stdev", initSchema, Map("distinct" -> "true"))
      inputFields4.processReduce(Seq(None)) should be(Some(0d))

      val inputFields5 = new StddevOperator("stdev", initSchema, Map("typeOp" -> "string", "distinct" -> "true"))
      inputFields5.processReduce(
        Seq(Some(1), Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) should be(Some("2.850438562747845"))
    }
  }
}

Source File: MedianOperatorTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.median

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class MedianOperatorTest extends WordSpec with Matchers {

  "Median operator" should {

    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new MedianOperator("median", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new MedianOperator("median", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new MedianOperator("median", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      val inputFields4 = new MedianOperator("median", initSchema, Map("inputField" -> "field1"))
      inputFields4.processMap(Row("1", 2)) should be(Some(1))

      val inputFields6 = new MedianOperator("median", initSchema, Map("inputField" -> "field1"))
      inputFields6.processMap(Row(1.5, 2)) should be(Some(1.5))

      val inputFields7 = new MedianOperator("median", initSchema, Map("inputField" -> "field1"))
      inputFields7.processMap(Row(5L, 2)) should be(Some(5L))

      val inputFields8 = new MedianOperator("median", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields8.processMap(Row(1, 2)) should be(Some(1L))

      val inputFields9 = new MedianOperator("median", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"))
      inputFields9.processMap(Row(1, 2)) should be(None)

      val inputFields10 = new MedianOperator("median", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"
        }))
      inputFields10.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new MedianOperator("median", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new MedianOperator("median", initSchema, Map())
      inputFields2.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) should be(Some(3d))

      val inputFields3 = new MedianOperator("median", initSchema, Map())
      inputFields3.processReduce(Seq(Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) should be(Some(3))

      val inputFields4 = new MedianOperator("median", initSchema, Map())
      inputFields4.processReduce(Seq(None)) should be(Some(0d))

      val inputFields5 = new MedianOperator("median", initSchema, Map("typeOp" -> "string"))
      inputFields5.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) should be(Some("3.0"))
    }

    "processReduce distinct must be " in {
      val inputFields = new MedianOperator("median", initSchema, Map("distinct" -> "true"))
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new MedianOperator("median", initSchema, Map("distinct" -> "true"))
      inputFields2.processReduce(Seq(Some(1), Some(1), Some(2), Some(3), Some(7), Some(7))) should be(Some(2.5))

      val inputFields3 = new MedianOperator("median", initSchema, Map("distinct" -> "true"))
      inputFields3.processReduce(Seq(Some(1), Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) should be(Some(3))

      val inputFields4 = new MedianOperator("median", initSchema, Map("distinct" -> "true"))
      inputFields4.processReduce(Seq(None)) should be(Some(0d))

      val inputFields5 = new MedianOperator("median", initSchema, Map("typeOp" -> "string", "distinct" -> "true"))
      inputFields5.processReduce(Seq(Some(1), Some(1), Some(2), Some(3), Some(7), Some(7))) should be(Some("2.5"))
    }
  }
}

Source File: ModeOperatorTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.mode

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class ModeOperatorTest extends WordSpec with Matchers {

  "Mode operator" should {

    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new ModeOperator("mode", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new ModeOperator("mode", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new ModeOperator("mode", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      val inputFields4 = new ModeOperator("mode", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields4.processMap(Row(1, 2)) should be(Some(1L))

      val inputFields5 = new ModeOperator("mode", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"))
      inputFields5.processMap(Row(1, 2)) should be(None)

      val inputFields6 = new ModeOperator("mode", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"
        }))
      inputFields6.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new ModeOperator("mode", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(List()))

      val inputFields2 = new ModeOperator("mode", initSchema, Map())
      inputFields2.processReduce(Seq(Some("hey"), Some("hey"), Some("hi"))) should be(Some(List("hey")))

      val inputFields3 = new ModeOperator("mode", initSchema, Map())
      inputFields3.processReduce(Seq(Some("1"), Some("1"), Some("4"))) should be(Some(List("1")))

      val inputFields4 = new ModeOperator("mode", initSchema, Map())
      inputFields4.processReduce(Seq(
        Some("1"), Some("1"), Some("4"), Some("4"), Some("4"), Some("4"))) should be(Some(List("4")))

      val inputFields5 = new ModeOperator("mode", initSchema, Map())
      inputFields5.processReduce(Seq(
        Some("1"), Some("1"), Some("2"), Some("2"), Some("4"), Some("4"))) should be(Some(List("1", "2", "4")))

      val inputFields6 = new ModeOperator("mode", initSchema, Map())
      inputFields6.processReduce(Seq(
        Some("1"), Some("1"), Some("2"), Some("2"), Some("4"), Some("4"), Some("5"))
      ) should be(Some(List("1", "2", "4")))
    }
  }
}

Source File: RangeOperatorTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.range

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class RangeOperatorTest extends WordSpec with Matchers {

  "Range operator" should {

    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new RangeOperator("range", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new RangeOperator("range", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new RangeOperator("range", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      val inputFields4 = new RangeOperator("range", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row("1", 2)) should be(Some(1))

      val inputFields6 = new RangeOperator("range", initSchema, Map("inputField" -> "field1"))
      inputFields6.processMap(Row(1.5, 2)) should be(Some(1.5))

      val inputFields7 = new RangeOperator("range", initSchema, Map("inputField" -> "field1"))
      inputFields7.processMap(Row(5L, 2)) should be(Some(5L))

      val inputFields8 = new RangeOperator("range", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields8.processMap(Row(1, 2)) should be(Some(1L))

      val inputFields9 = new RangeOperator("range", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"))
      inputFields9.processMap(Row(1, 2)) should be(None)

      val inputFields10 = new RangeOperator("range", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"
        }))
      inputFields10.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new RangeOperator("range", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new RangeOperator("range", initSchema, Map())
      inputFields2.processReduce(Seq(Some(1), Some(1))) should be(Some(0))

      val inputFields3 = new RangeOperator("range", initSchema, Map())
      inputFields3.processReduce(Seq(Some(1), Some(2), Some(4))) should be(Some(3))

      val inputFields4 = new RangeOperator("range", initSchema, Map())
      inputFields4.processReduce(Seq(None)) should be(Some(0d))

      val inputFields5 = new RangeOperator("range", initSchema, Map("typeOp" -> "string"))
      inputFields5.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) should be(Some("6.0"))
    }

    "processReduce distinct must be " in {
      val inputFields = new RangeOperator("range", initSchema, Map("distinct" -> "true"))
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new RangeOperator("range", initSchema, Map("distinct" -> "true"))
      inputFields2.processReduce(Seq(Some(1), Some(1))) should be(Some(0))

      val inputFields3 = new RangeOperator("range", initSchema, Map("distinct" -> "true"))
      inputFields3.processReduce(Seq(Some(1), Some(2), Some(4))) should be(Some(3))

      val inputFields4 = new RangeOperator("range", initSchema, Map("distinct" -> "true"))
      inputFields4.processReduce(Seq(None)) should be(Some(0d))

      val inputFields5 = new RangeOperator("range", initSchema, Map("typeOp" -> "string", "distinct" -> "true"))
      inputFields5.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) should be(Some("6.0"))
    }
  }
}

Source File: AccumulatorOperatorTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.accumulator

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class AccumulatorOperatorTest extends WordSpec with Matchers {

  "Accumulator operator" should {

    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false)
    ))

    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new AccumulatorOperator("accumulator", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new AccumulatorOperator("accumulator", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new AccumulatorOperator("accumulator", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      val inputFields4 = new AccumulatorOperator("accumulator", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields4.processMap(Row(1, 2)) should be(Some(1L))

      val inputFields5 = new AccumulatorOperator("accumulator", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":2}]"))
      inputFields5.processMap(Row(1, 2)) should be(None)

      val inputFields6 = new AccumulatorOperator("accumulator", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":2}]"
        }))
      inputFields6.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new AccumulatorOperator("accumulator", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(Seq()))

      val inputFields2 = new AccumulatorOperator("accumulator", initSchema, Map())
      inputFields2.processReduce(Seq(Some(1), Some(1))) should be(Some(Seq("1", "1")))

      val inputFields3 = new AccumulatorOperator("accumulator", initSchema, Map())
      inputFields3.processReduce(Seq(Some("a"), Some("b"))) should be(Some(Seq("a", "b")))
    }

    "associative process must be " in {
      val inputFields = new AccumulatorOperator("accumulator", initSchema, Map())
      val resultInput = Seq((Operator.OldValuesKey, Some(Seq(1L))),
        (Operator.NewValuesKey, Some(Seq(2L))),
        (Operator.NewValuesKey, None))
      inputFields.associativity(resultInput) should be(Some(Seq("1", "2")))

      val inputFields2 = new AccumulatorOperator("accumulator", initSchema, Map("typeOp" -> "arraydouble"))
      val resultInput2 = Seq((Operator.OldValuesKey, Some(Seq(1))),
        (Operator.NewValuesKey, Some(Seq(3))))
      inputFields2.associativity(resultInput2) should be(Some(Seq(1d, 3d)))

      val inputFields3 = new AccumulatorOperator("accumulator", initSchema, Map("typeOp" -> null))
      val resultInput3 = Seq((Operator.OldValuesKey, Some(Seq(1))),
        (Operator.NewValuesKey, Some(Seq(1))))
      inputFields3.associativity(resultInput3) should be(Some(Seq("1", "1")))
    }
  }
}

Source File: FirstValueOperatorTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.firstValue

import java.util.Date

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class FirstValueOperatorTest extends WordSpec with Matchers {

  "FirstValue operator" should {

    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new FirstValueOperator("firstValue", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new FirstValueOperator("firstValue", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new FirstValueOperator("firstValue", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      val inputFields4 = new FirstValueOperator("firstValue", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields4.processMap(Row(1, 2)) should be(Some(1L))

      val inputFields5 = new FirstValueOperator("firstValue", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"))
      inputFields5.processMap(Row(1, 2)) should be(None)

      val inputFields6 = new FirstValueOperator("firstValue", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"
        }))
      inputFields6.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new FirstValueOperator("firstValue", initSchema, Map())
      inputFields.processReduce(Seq()) should be(None)

      val inputFields2 = new FirstValueOperator("firstValue", initSchema, Map())
      inputFields2.processReduce(Seq(Some(1), Some(2))) should be(Some(1))

      val inputFields3 = new FirstValueOperator("firstValue", initSchema, Map())
      inputFields3.processReduce(Seq(Some("a"), Some("b"))) should be(Some("a"))
    }

    "associative process must be " in {
      val inputFields = new FirstValueOperator("firstValue", initSchema, Map())
      val resultInput = Seq((Operator.OldValuesKey, Some(1L)),
        (Operator.NewValuesKey, Some(1L)),
        (Operator.NewValuesKey, None))
      inputFields.associativity(resultInput) should be(Some(1L))

      val inputFields2 = new FirstValueOperator("firstValue", initSchema, Map("typeOp" -> "int"))
      val resultInput2 = Seq((Operator.OldValuesKey, Some(1L)),
        (Operator.NewValuesKey, Some(1L)))
      inputFields2.associativity(resultInput2) should be(Some(1))

      val inputFields3 = new FirstValueOperator("firstValue", initSchema, Map("typeOp" -> null))
      val resultInput3 = Seq((Operator.OldValuesKey, Some(1)),
        (Operator.NewValuesKey, Some(1)),
        (Operator.NewValuesKey, None))
      inputFields3.associativity(resultInput3) should be(Some(1))

      val inputFields4 = new FirstValueOperator("firstValue", initSchema, Map())
      val resultInput4 = Seq()
      inputFields4.associativity(resultInput4) should be(None)

      val inputFields5 = new FirstValueOperator("firstValue", initSchema, Map())
      val date = new Date()
      val resultInput5 = Seq((Operator.NewValuesKey, Some(date)))
      inputFields5.associativity(resultInput5) should be(Some(date))
    }
  }
}

Source File: MeanAssociativeOperatorTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.mean

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class MeanAssociativeOperatorTest extends WordSpec with Matchers {

  "Mean operator" should {

    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new MeanAssociativeOperator("avg", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new MeanAssociativeOperator("avg", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new MeanAssociativeOperator("avg", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      val inputFields4 = new MeanAssociativeOperator("avg", initSchema, Map("inputField" -> "field1"))
      inputFields4.processMap(Row("1", 2)) should be(Some(1))

      val inputFields6 = new MeanAssociativeOperator("avg", initSchema, Map("inputField" -> "field1"))
      inputFields6.processMap(Row(1.5, 2)) should be(Some(1.5))

      val inputFields7 = new MeanAssociativeOperator("avg", initSchema, Map("inputField" -> "field1"))
      inputFields7.processMap(Row(5L, 2)) should be(Some(5L))

      val inputFields8 = new MeanAssociativeOperator("avg", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields8.processMap(Row(1, 2)) should be(Some(1L))

      val inputFields9 = new MeanAssociativeOperator("avg", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"))
      inputFields9.processMap(Row(1, 2)) should be(None)

      val inputFields10 = new MeanAssociativeOperator("avg", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"
        }))
      inputFields10.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new MeanAssociativeOperator("avg", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(List()))

      val inputFields2 = new MeanAssociativeOperator("avg", initSchema, Map())
      inputFields2.processReduce(Seq(Some(1), Some(1), None)) should be (Some(List(1.0, 1.0)))

      val inputFields3 = new MeanAssociativeOperator("avg", initSchema, Map())
      inputFields3.processReduce(Seq(Some(1), Some(2), Some(3), None)) should be(Some(List(1.0, 2.0, 3.0)))

      val inputFields4 = new MeanAssociativeOperator("avg", initSchema, Map())
      inputFields4.processReduce(Seq(None)) should be(Some(List()))
    }

    "processReduce distinct must be " in {
      val inputFields = new MeanAssociativeOperator("avg", initSchema, Map("distinct" -> "true"))
      inputFields.processReduce(Seq()) should be(Some(List()))

      val inputFields2 = new MeanAssociativeOperator("avg", initSchema, Map("distinct" -> "true"))
      inputFields2.processReduce(Seq(Some(1), Some(1), None)) should be(Some(List(1.0)))

      val inputFields3 = new MeanAssociativeOperator("avg", initSchema, Map("distinct" -> "true"))
      inputFields3.processReduce(Seq(Some(1), Some(3), Some(1), None)) should be(Some(List(1.0, 3.0)))

      val inputFields4 = new MeanAssociativeOperator("avg", initSchema, Map("distinct" -> "true"))
      inputFields4.processReduce(Seq(None)) should be(Some(List()))
    }

    "associative process must be " in {
      val inputFields = new MeanAssociativeOperator("avg", initSchema, Map())
      val resultInput =
        Seq((Operator.OldValuesKey, Some(Map("count" -> 1d, "sum" -> 2d, "mean" -> 2d))), (Operator.NewValuesKey, None))
      inputFields.associativity(resultInput) should be(Some(Map("count" -> 1.0, "sum" -> 2.0, "mean" -> 2.0)))

      val inputFields2 = new MeanAssociativeOperator("avg", initSchema, Map())
      val resultInput2 = Seq((Operator.OldValuesKey, Some(Map("count" -> 1d, "sum" -> 2d, "mean" -> 2d))),
        (Operator.NewValuesKey, Some(Seq(1d))))
      inputFields2.associativity(resultInput2) should be(Some(Map("sum" -> 3.0, "count" -> 2.0, "mean" -> 1.5)))
    }
  }
}

Source File: MeanOperatorTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.mean

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class MeanOperatorTest extends WordSpec with Matchers {

  "Mean operator" should {

    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new MeanOperator("avg", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new MeanOperator("avg", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new MeanOperator("avg", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      val inputFields4 = new MeanOperator("avg", initSchema, Map("inputField" -> "field1"))
      inputFields4.processMap(Row("1", 2)) should be(Some(1))

      val inputFields6 = new MeanOperator("avg", initSchema, Map("inputField" -> "field1"))
      inputFields6.processMap(Row(1.5, 2)) should be(Some(1.5))

      val inputFields7 = new MeanOperator("avg", initSchema, Map("inputField" -> "field1"))
      inputFields7.processMap(Row(5L, 2)) should be(Some(5L))

      val inputFields8 = new MeanOperator("avg", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields8.processMap(Row(1, 2)) should be(Some(1L))

      val inputFields9 = new MeanOperator("avg", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"))
      inputFields9.processMap(Row(1, 2)) should be(None)

      val inputFields10 = new MeanOperator("avg", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"
        }))
      inputFields10.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new MeanOperator("avg", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new MeanOperator("avg", initSchema, Map())
      inputFields2.processReduce(Seq(Some(1), Some(1), None)) should be(Some(1))

      val inputFields3 = new MeanOperator("avg", initSchema, Map())
      inputFields3.processReduce(Seq(Some(1), Some(2), Some(3), None)) should be(Some(2))

      val inputFields4 = new MeanOperator("avg", initSchema, Map())
      inputFields4.processReduce(Seq(None)) should be(Some(0d))

      val inputFields5 = new MeanOperator("avg", initSchema, Map("typeOp" -> "string"))
      inputFields5.processReduce(Seq(Some(1), Some(1))) should be(Some("1.0"))
    }

    "processReduce distinct must be " in {
      val inputFields = new MeanOperator("avg", initSchema, Map("distinct" -> "true"))
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new MeanOperator("avg", initSchema, Map("distinct" -> "true"))
      inputFields2.processReduce(Seq(Some(1), Some(1), None)) should be(Some(1))

      val inputFields3 = new MeanOperator("avg", initSchema, Map("distinct" -> "true"))
      inputFields3.processReduce(Seq(Some(1), Some(3), Some(1), None)) should be(Some(2))

      val inputFields4 = new MeanOperator("avg", initSchema, Map("distinct" -> "true"))
      inputFields4.processReduce(Seq(None)) should be(Some(0d))

      val inputFields5 = new MeanOperator("avg", initSchema, Map("typeOp" -> "string", "distinct" -> "true"))
      inputFields5.processReduce(Seq(Some(1), Some(1))) should be(Some("1.0"))
    }
  }
}

Source File: OperatorEntityCountTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.entityCount

import java.io.{Serializable => JSerializable}

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class OperatorEntityCountTest extends WordSpec with Matchers {

  "EntityCount" should {
    val props = Map(
      "inputField" -> "inputField".asInstanceOf[JSerializable],
      "split" -> ",".asInstanceOf[JSerializable])
    val schema = StructType(Seq(StructField("inputField", StringType)))
    val entityCount = new OperatorEntityCountMock("op1", schema, props)
    val inputFields = Row("hello,bye")

    "Return the associated precision name" in {
      val expected = Option(Seq("hello", "bye"))
      val result = entityCount.processMap(inputFields)
      result should be(expected)
    }

    "Return empty list" in {
      val expected = None
      val result = entityCount.processMap(Row())
      result should be(expected)
    }
  }
}

Source File: EntityCountOperatorTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.entityCount

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class EntityCountOperatorTest extends WordSpec with Matchers {

  "Entity Count Operator" should {

    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new EntityCountOperator("entityCount", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new EntityCountOperator("entityCount", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new EntityCountOperator("entityCount", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row("hola holo", 2)) should be(Some(Seq("hola holo")))

      val inputFields4 = new EntityCountOperator("entityCount", initSchema, Map("inputField" -> "field1", "split" -> ","))
      inputFields4.processMap(Row("hola holo", 2)) should be(Some(Seq("hola holo")))

      val inputFields5 = new EntityCountOperator("entityCount", initSchema, Map("inputField" -> "field1", "split" -> "-"))
      inputFields5.processMap(Row("hola-holo", 2)) should be(Some(Seq("hola", "holo")))

      val inputFields6 = new EntityCountOperator("entityCount", initSchema, Map("inputField" -> "field1", "split" -> ","))
      inputFields6.processMap(Row("hola,holo adios", 2)) should be(Some(Seq("hola", "holo " + "adios")))

      val inputFields7 = new EntityCountOperator("entityCount", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"!=\", \"value\":\"hola\"}]"))
      inputFields7.processMap(Row("hola", 2)) should be(None)

      val inputFields8 = new EntityCountOperator("entityCount", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"!=\", \"value\":\"hola\"}]",
          "split" -> " "))
      inputFields8.processMap(Row("hola holo", 2)) should be(Some(Seq("hola", "holo")))
    }

    "processReduce must be " in {
      val inputFields = new EntityCountOperator("entityCount", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(Seq()))

      val inputFields2 = new EntityCountOperator("entityCount", initSchema, Map())
      inputFields2.processReduce(Seq(Some(Seq("hola", "holo")))) should be(Some(Seq("hola", "holo")))

      val inputFields3 = new EntityCountOperator("entityCount", initSchema, Map())
      inputFields3.processReduce(Seq(Some(Seq("hola", "holo", "hola")))) should be(Some(Seq("hola", "holo", "hola")))
    }

    "associative process must be " in {
      val inputFields = new EntityCountOperator("entityCount", initSchema, Map())
      val resultInput = Seq((Operator.OldValuesKey, Some(Map("hola" -> 1L, "holo" -> 1L))),
        (Operator.NewValuesKey, None))
      inputFields.associativity(resultInput) should be(Some(Map("hola" -> 1L, "holo" -> 1L)))

      val inputFields2 = new EntityCountOperator("entityCount", initSchema, Map("typeOp" -> "int"))
      val resultInput2 = Seq((Operator.OldValuesKey, Some(Map("hola" -> 1L, "holo" -> 1L))),
        (Operator.NewValuesKey, Some(Seq("hola"))))
      inputFields2.associativity(resultInput2) should be(Some(Map()))

      val inputFields3 = new EntityCountOperator("entityCount", initSchema, Map("typeOp" -> null))
      val resultInput3 = Seq((Operator.OldValuesKey, Some(Map("hola" -> 1L, "holo" -> 1L))))
      inputFields3.associativity(resultInput3) should be(Some(Map("hola" -> 1L, "holo" -> 1L)))
    }
  }
}

Source File: SumOperatorTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.sum

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class SumOperatorTest extends WordSpec with Matchers {

  "Sum operator" should {

    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new SumOperator("sum", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new SumOperator("sum", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new SumOperator("sum", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      val inputFields4 = new SumOperator("sum", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row("1", 2)) should be(Some(1))

      val inputFields6 = new SumOperator("sum", initSchema, Map("inputField" -> "field1"))
      inputFields6.processMap(Row(1.5, 2)) should be(Some(1.5))

      val inputFields7 = new SumOperator("sum", initSchema, Map("inputField" -> "field1"))
      inputFields7.processMap(Row(5L, 2)) should be(Some(5L))

      val inputFields8 = new SumOperator("sum", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields8.processMap(Row(1, 2)) should be(Some(1L))

      val inputFields9 = new SumOperator("sum", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"))
      inputFields9.processMap(Row(1, 2)) should be(None)

      val inputFields10 = new SumOperator("sum", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"
        }))
      inputFields10.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new SumOperator("sum", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new SumOperator("sum", initSchema, Map())
      inputFields2.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) should be(Some(20d))

      val inputFields3 = new SumOperator("sum", initSchema, Map())
      inputFields3.processReduce(Seq(Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) should be(Some(20d))

      val inputFields4 = new SumOperator("sum", initSchema, Map())
      inputFields4.processReduce(Seq(None)) should be(Some(0d))
    }

    "processReduce distinct must be " in {
      val inputFields = new SumOperator("sum", initSchema, Map("distinct" -> "true"))
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new SumOperator("sum", initSchema, Map("distinct" -> "true"))
      inputFields2.processReduce(Seq(Some(1), Some(2), Some(1))) should be(Some(3d))
    }

    "associative process must be " in {
      val inputFields = new SumOperator("count", initSchema, Map())
      val resultInput = Seq((Operator.OldValuesKey, Some(1L)),
        (Operator.NewValuesKey, Some(1L)),
        (Operator.NewValuesKey, None))
      inputFields.associativity(resultInput) should be(Some(2d))

      val inputFields2 = new SumOperator("count", initSchema, Map("typeOp" -> "string"))
      val resultInput2 = Seq((Operator.OldValuesKey, Some(1L)),
        (Operator.NewValuesKey, Some(1L)),
        (Operator.NewValuesKey, None))
      inputFields2.associativity(resultInput2) should be(Some("2.0"))
    }
  }
}

Source File: FullTextOperatorTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.fullText

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class FullTextOperatorTest extends WordSpec with Matchers {

  "FullText operator" should {

    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new FullTextOperator("fullText", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new FullTextOperator("fullText", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new FullTextOperator("fullText", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      val inputFields4 = new FullTextOperator("fullText", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields4.processMap(Row(1, 2)) should be(Some(1L))

      val inputFields5 = new FullTextOperator("fullText", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"))
      inputFields5.processMap(Row(1, 2)) should be(None)

      val inputFields6 = new FullTextOperator("fullText", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"
        }))
      inputFields6.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new FullTextOperator("fullText", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(""))

      val inputFields2 = new FullTextOperator("fullText", initSchema, Map())
      inputFields2.processReduce(Seq(Some(1), Some(1))) should be(Some(s"1${Operator.SpaceSeparator}1"))

      val inputFields3 = new FullTextOperator("fullText", initSchema, Map())
      inputFields3.processReduce(Seq(Some("a"), Some("b"))) should be(Some(s"a${Operator.SpaceSeparator}b"))
    }

    "associative process must be " in {
      val inputFields = new FullTextOperator("fullText", initSchema, Map())
      val resultInput = Seq((Operator.OldValuesKey, Some(2)), (Operator.NewValuesKey, None))
      inputFields.associativity(resultInput) should be(Some("2"))

      val inputFields2 = new FullTextOperator("fullText", initSchema, Map("typeOp" -> "arraystring"))
      val resultInput2 = Seq((Operator.OldValuesKey, Some(2)),
        (Operator.NewValuesKey, Some(1)))
      inputFields2.associativity(resultInput2) should be(Some(Seq(s"2${Operator.SpaceSeparator}1")))

      val inputFields3 = new FullTextOperator("fullText", initSchema, Map("typeOp" -> null))
      val resultInput3 = Seq((Operator.OldValuesKey, Some(2)), (Operator.OldValuesKey, Some(3)))
      inputFields3.associativity(resultInput3) should be(Some(s"2${Operator.SpaceSeparator}3"))
    }
  }
}

Source File: TotalEntityCountOperatorTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.totalEntityCount

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class TotalEntityCountOperatorTest extends WordSpec with Matchers {

  "Entity Count Operator" should {

    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new TotalEntityCountOperator("totalEntityCount", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new TotalEntityCountOperator("totalEntityCount", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new TotalEntityCountOperator("totalEntityCount", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row("hola holo", 2)) should be(Some(Seq("hola holo")))

      val inputFields4 =
        new TotalEntityCountOperator("totalEntityCount", initSchema, Map("inputField" -> "field1", "split" -> ","))
      inputFields4.processMap(Row("hola holo", 2)) should be(Some(Seq("hola holo")))

      val inputFields5 =
        new TotalEntityCountOperator("totalEntityCount", initSchema, Map("inputField" -> "field1", "split" -> "-"))
      inputFields5.processMap(Row("hola-holo", 2)) should be(Some(Seq("hola", "holo")))

      val inputFields6 =
        new TotalEntityCountOperator("totalEntityCount", initSchema, Map("inputField" -> "field1", "split" -> ","))
      inputFields6.processMap(Row("hola,holo adios", 2)) should be(Some(Seq("hola", "holo " + "adios")))

      val inputFields7 = new TotalEntityCountOperator("totalEntityCount", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"!=\", \"value\":\"hola\"}]"))
      inputFields7.processMap(Row("hola", 2)) should be(None)

      val inputFields8 = new TotalEntityCountOperator("totalEntityCount", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"!=\", \"value\":\"hola\"}]",
          "split" -> " "))
      inputFields8.processMap(Row("hola holo", 2)) should be
      (Some(Seq("hola", "holo")))
    }

    "processReduce must be " in {
      val inputFields = new TotalEntityCountOperator("totalEntityCount", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(0L))

      val inputFields2 = new TotalEntityCountOperator("totalEntityCount", initSchema, Map())
      inputFields2.processReduce(Seq(Some(Seq("hola", "holo")))) should be(Some(2L))

      val inputFields3 = new TotalEntityCountOperator("totalEntityCount", initSchema, Map())
      inputFields3.processReduce(Seq(Some(Seq("hola", "holo", "hola")))) should be(Some(3L))

      val inputFields4 = new TotalEntityCountOperator("totalEntityCount", initSchema, Map())
      inputFields4.processReduce(Seq(None)) should be(Some(0L))
    }

    "processReduce distinct must be " in {
      val inputFields = new TotalEntityCountOperator("totalEntityCount", initSchema, Map("distinct" -> "true"))
      inputFields.processReduce(Seq()) should be(Some(0L))

      val inputFields2 = new TotalEntityCountOperator("totalEntityCount", initSchema, Map("distinct" -> "true"))
      inputFields2.processReduce(Seq(Some(Seq("hola", "holo", "hola")))) should be(Some(2L))
    }

    "associative process must be " in {
      val inputFields = new TotalEntityCountOperator("totalEntityCount", initSchema, Map())
      val resultInput = Seq((Operator.OldValuesKey, Some(2)), (Operator.NewValuesKey, None))
      inputFields.associativity(resultInput) should be(Some(2))

      val inputFields2 = new TotalEntityCountOperator("totalEntityCount", initSchema, Map("typeOp" -> "int"))
      val resultInput2 = Seq((Operator.OldValuesKey, Some(2)),
        (Operator.NewValuesKey, Some(1)))
      inputFields2.associativity(resultInput2) should be(Some(3))

      val inputFields3 = new TotalEntityCountOperator("totalEntityCount", initSchema, Map("typeOp" -> null))
      val resultInput3 = Seq((Operator.OldValuesKey, Some(2)))
      inputFields3.associativity(resultInput3) should be(Some(2))
    }
  }
}

Source File: MorphlinesParserTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.transformation.morphline

import java.io.Serializable

import com.stratio.sparta.sdk.pipeline.input.Input
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, Matchers, WordSpecLike}


@RunWith(classOf[JUnitRunner])
class MorphlinesParserTest extends WordSpecLike with Matchers with BeforeAndAfter with BeforeAndAfterAll {

  val morphlineConfig = """
          id : test1
          importCommands : ["org.kitesdk.**"]
          commands: [
          {
              readJson {},
          }
          {
              extractJsonPaths {
                  paths : {
                      col1 : /col1
                      col2 : /col2
                  }
              }
          }
          {
            java {
              code : "return child.process(record);"
            }
          }
          {
              removeFields {
                  blacklist:["literal:_attachment_body"]
              }
          }
          ]
                        """
  val inputField = Some(Input.RawDataKey)
  val outputsFields = Seq("col1", "col2")
  val props: Map[String, Serializable] = Map("morphline" -> morphlineConfig)

  val schema = StructType(Seq(StructField("col1", StringType), StructField("col2", StringType)))

  val parser = new MorphlinesParser(1, inputField, outputsFields, schema, props)

  "A MorphlinesParser" should {

    "parse a simple json" in {
      val simpleJson =
        """{
            "col1":"hello",
            "col2":"word"
            }
        """
      val input = Row(simpleJson)
      val result = parser.parse(input)

      val expected = Seq(Row(simpleJson, "hello", "world"))

      result should be eq(expected)
    }

    "parse a simple json removing raw" in {
      val simpleJson =
        """{
            "col1":"hello",
            "col2":"word"
            }
        """
      val input = Row(simpleJson)
      val result = parser.parse(input)

      val expected = Seq(Row("hello", "world"))

      result should be eq(expected)
    }

    "exclude not configured fields" in {
      val simpleJson =
        """{
            "col1":"hello",
            "col2":"word",
            "col3":"!"
            }
        """
      val input = Row(simpleJson)
      val result = parser.parse(input)

      val expected = Seq(Row(simpleJson, "hello", "world"))

      result should be eq(expected)
    }
  }
}

Source File: DateTimeParserTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.transformation.datetime

import com.stratio.sparta.sdk.properties.JsoneyString
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpecLike}

@RunWith(classOf[JUnitRunner])
class DateTimeParserTest extends WordSpecLike with Matchers {

  val inputField = Some("ts")
  val outputsFields = Seq("ts")

  //scalastyle:off
  "A DateTimeParser" should {
    "parse unixMillis to string" in {
      val input = Row(1416330788000L)
      val schema = StructType(Seq(StructField("ts", StringType)))

      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "unixMillis"))
          .parse(input)

      val expected = Seq(Row(1416330788000L, "1416330788000"))

      assertResult(result)(expected)
    }

    "parse unix to string" in {
      val input = Row(1416330788)
      val schema = StructType(Seq(StructField("ts", StringType)))

      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "unix"))
          .parse(input)

      val expected = Seq(Row(1416330788, "1416330788000"))

      assertResult(result)(expected)
    }

    "parse unix to string removing raw" in {
      val input = Row(1416330788)
      val schema = StructType(Seq(StructField("ts", StringType)))

      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "unix",
          "removeInputField" -> JsoneyString.apply("true")))
          .parse(input)

      val expected = Seq(Row("1416330788000"))

      assertResult(result)(expected)
    }

    "not parse anything if the field does not match" in {
      val input = Row("1212")
      val schema = StructType(Seq(StructField("otherField", StringType)))

      an[IllegalStateException] should be thrownBy new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "unixMillis")).parse(input)
    }

    "not parse anything and generate a new Date" in {
      val input = Row("anything")
      val schema = StructType(Seq(StructField("ts", StringType)))

      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "autoGenerated"))
          .parse(input)

      assertResult(result.head.size)(2)
    }

    "Auto generated if inputFormat does not exist" in {
      val input = Row("1416330788")
      val schema = StructType(Seq(StructField("ts", StringType)))

      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map()).parse(input)

      assertResult(result.head.size)(2)
    }

    "parse dateTime in hive format" in {
      val input = Row("2015-11-08 15:58:58")
      val schema = StructType(Seq(StructField("ts", StringType)))

      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "hive"))
          .parse(input)

      val expected = Seq(Row("2015-11-08 15:58:58", "1446998338000"))

      assertResult(result)(expected)
    }
  }
}

Source File: package.scala From OAP with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering
import org.apache.spark.sql.execution.datasources.oap.io.ColumnStatistics
import org.apache.spark.sql.execution.datasources.oap.utils.OapUtils
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StructField, StructType}

package object oap {
  type Key = InternalRow

  def order(sf: StructField): Ordering[Key] = GenerateOrdering.create(StructType(Array(sf)))

  // Return if the rowGroup or file can be skipped by min max statistics
  def isSkippedByStatistics(
      columnStats: Array[ColumnStatistics],
      filter: Filter,
      schema: StructType): Boolean = filter match {
    case Or(left, right) =>
      isSkippedByStatistics(columnStats, left, schema) &&
          isSkippedByStatistics(columnStats, right, schema)
    case And(left, right) =>
      isSkippedByStatistics(columnStats, left, schema) ||
          isSkippedByStatistics(columnStats, right, schema)
    case IsNotNull(attribute) =>
      val idx = schema.fieldIndex(attribute)
      val stat = columnStats(idx)
      !stat.hasNonNullValue
    case EqualTo(attribute, handle) =>
      val key = OapUtils.keyFromAny(handle)
      val idx = schema.fieldIndex(attribute)
      val stat = columnStats(idx)
      val comp = order(schema(idx))
      (OapUtils.keyFromBytes(stat.min, schema(idx).dataType), OapUtils.keyFromBytes(
        stat.max, schema(idx).dataType)) match {
        case (Some(v1), Some(v2)) => comp.gt(v1, key) || comp.lt(v2, key)
        case _ => false
      }
    case LessThan(attribute, handle) =>
      val key = OapUtils.keyFromAny(handle)
      val idx = schema.fieldIndex(attribute)
      val stat = columnStats(idx)
      val comp = order(schema(idx))
      OapUtils.keyFromBytes(stat.min, schema(idx).dataType) match {
        case Some(v) => comp.gteq(v, key)
        case None => false
      }
    case LessThanOrEqual(attribute, handle) =>
      val key = OapUtils.keyFromAny(handle)
      val idx = schema.fieldIndex(attribute)
      val stat = columnStats(idx)
      val comp = order(schema(idx))
      OapUtils.keyFromBytes(stat.min, schema(idx).dataType) match {
        case Some(v) => comp.gt(v, key)
        case None => false
      }
    case GreaterThan(attribute, handle) =>
      val key = OapUtils.keyFromAny(handle)
      val idx = schema.fieldIndex(attribute)
      val stat = columnStats(idx)
      val comp = order(schema(idx))
      OapUtils.keyFromBytes(stat.max, schema(idx).dataType) match {
        case Some(v) => comp.lteq(v, key)
        case None => false
      }
    case GreaterThanOrEqual(attribute, handle) =>
      val key = OapUtils.keyFromAny(handle)
      val idx = schema.fieldIndex(attribute)
      val stat = columnStats(idx)
      val comp = order(schema(idx))
      OapUtils.keyFromBytes(stat.max, schema(idx).dataType) match {
        case Some(v) => comp.lt(v, key)
        case None => false
      }
    case _ => false
  }
}


class OapException(message: String, cause: Throwable) extends Exception(message, cause) {

  def this(message: String) = this(message, null)
}

Source File: AnnotationUtils.scala From glow with Apache License 2.0

5 votes

package io.projectglow.vcf

import org.apache.spark.sql.types.{ArrayType, DataType, IntegerType, StringType, StructField, StructType}

// Unified VCF annotation representation, used by SnpEff and VEP
object AnnotationUtils {

  // Delimiter between annotation fields
  val annotationDelimiter = "|"
  val annotationDelimiterRegex = "\\|"

  // Fractional delimiter for struct subfields
  val structDelimiter = "/"
  val structDelimiterRegex = "\\/"

  // Delimiter for array subfields
  val arrayDelimiter = "&"

  // Struct subfield schemas
  private val rankTotalStruct = StructType(
    Seq(StructField("rank", IntegerType), StructField("total", IntegerType)))
  private val posLengthStruct = StructType(
    Seq(StructField("pos", IntegerType), StructField("length", IntegerType)))
  private val referenceVariantStruct = StructType(
    Seq(StructField("reference", StringType), StructField("variant", StringType)))

  // Special schemas for SnpEff subfields
  private val snpEffFieldsToSchema: Map[String, DataType] = Map(
    "Annotation" -> ArrayType(StringType),
    "Rank" -> rankTotalStruct,
    "cDNA_pos/cDNA_length" -> posLengthStruct,
    "CDS_pos/CDS_length" -> posLengthStruct,
    "AA_pos/AA_length" -> posLengthStruct,
    "Distance" -> IntegerType
  )

  // Special schemas for VEP subfields
  private val vepFieldsToSchema: Map[String, DataType] = Map(
    "Consequence" -> ArrayType(StringType),
    "EXON" -> rankTotalStruct,
    "INTRON" -> rankTotalStruct,
    "cDNA_position" -> IntegerType,
    "CDS_position" -> IntegerType,
    "Protein_position" -> IntegerType,
    "Amino_acids" -> referenceVariantStruct,
    "Codons" -> referenceVariantStruct,
    "Existing_variation" -> ArrayType(StringType),
    "DISTANCE" -> IntegerType,
    "STRAND" -> IntegerType,
    "FLAGS" -> ArrayType(StringType)
  )

  // Special schemas for LOFTEE (as VEP plugin) subfields
  private val lofteeFieldsToSchema: Map[String, DataType] = Map(
    "LoF_filter" -> ArrayType(StringType),
    "LoF_flags" -> ArrayType(StringType),
    "LoF_info" -> ArrayType(StringType)
  )

  // Default string schema for annotation subfield
  val allFieldsToSchema: Map[String, DataType] =
    (snpEffFieldsToSchema ++ vepFieldsToSchema ++ lofteeFieldsToSchema).withDefaultValue(StringType)
}

Source File: DateTimeDataFixture.scala From spark-vector with Apache License 2.0

5 votes

package com.actian.spark_vector.vector

import java.util.TimeZone

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DateType, StructField, StructType, TimestampType}

import com.actian.spark_vector.test.util.DateHelper.{ansiDateFor, timestampFor}

object DateTimeDataFixture {
  def timeRDD(sparkContext: SparkContext): (RDD[Seq[Any]], StructType) = createTimeRDD(sparkContext, timeData)

  private[vector] val tz = TimeZone.getTimeZone("GMT-06:00")

  private[vector] val utc = TimeZone.getTimeZone("UTC")

  private[vector] val timeData = Seq(
    Seq[Any](
      timestampFor(1995, 1, 22, 18, 3, 29, 234, tz),
      timestampFor(1996, 2, 22, 18, 3, 29, 234),
      timestampFor(1997, 2, 22, 18, 3, 29, 234),
      timestampFor(1998, 1, 22, 18, 3, 29, 234, tz),
      timestampFor(1999, 2, 22, 18, 3, 29, 234),
      timestampFor(2000, 2, 22, 18, 3, 29, 234),
      timestampFor(2015, 11, 23, 18, 3, 29, 123, tz),
      timestampFor(2015, 11, 23, 18, 3, 29, 123),
      ansiDateFor(1995, 2, 22)),
    Seq[Any](
      timestampFor(2015, 3, 2, 17, 52, 12, 678, tz),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 3, 2, 17, 52, 12, 678, tz),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 11, 13, 17, 52, 12, 123, tz),
      ansiDateFor(2015, 4, 2)))

  private def createTimeRDD(sparkContext: SparkContext, data: Seq[Seq[Any]]): (RDD[Seq[Any]], StructType) = {
    val schema = StructType(Seq(
      StructField("tswtz", TimestampType),
      StructField("tsntz", TimestampType),
      StructField("tsltz", TimestampType),
      StructField("tswtz4", TimestampType),
      StructField("tsntz4", TimestampType),
      StructField("tsltz4", TimestampType),
      StructField("tmwtz", TimestampType),
      StructField("tmntz", TimestampType),
      StructField("tmltz", TimestampType),
      StructField("tmwtz3", TimestampType),
      StructField("tmntz3", TimestampType),
      StructField("tmltz3", TimestampType),
      StructField("date", DateType)))

    (sparkContext.parallelize(data, 2), schema)
  }

  def createTimeTable(connectionProps: VectorConnectionProperties)(tableName: String): Unit = {
    VectorJDBC.withJDBC(connectionProps) { cxn =>
      cxn.dropTable(tableName)
      cxn.executeStatement(
        s"""|create table ${tableName} (
            |  tswtz timestamp with time zone,
            |  tsntz timestamp without time zone,
            |  tsltz timestamp with local time zone,
            |  tswtz4 timestamp(4) with time zone,
            |  tsntz4 timestamp(4) without time zone,
            |  tsltz4 timestamp(4) with local time zone,
            |  tmwtz time with time zone,
            |  tmntz time without time zone,
            |  tmltz time with local time zone,
            |  tmwtz3 time(3) with time zone,
            |  tmntz3 time(3) without time zone,
            |  tmltz3 time(3) with local time zone,
            |  dt date
            |)""".stripMargin)
    }
  }
}

Source File: RDDFixtures.scala From spark-vector with Apache License 2.0

5 votes

package com.actian.spark_vector

import java.util.Date

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ DateType, IntegerType, StringType, StructField, StructType }

import com.actian.spark_vector.test.util.StructTypeUtil.createSchema


trait RDDFixtures {
  // poor man's fixture, for other approaches see:
  // http://www.scalatest.org/user_guide/sharing_fixtures
  def createRecordRdd(sc: SparkContext): (RDD[Seq[Any]], StructType) = {
    val input = Seq(
      Seq(42, "a"),
      Seq(43, "b"))
    val inputRdd = sc.parallelize(input, 2)
    val inputSchema = createSchema("id" -> IntegerType, "name" -> StringType)
    (inputRdd, inputSchema)
  }

  def createRowRDD(sc: SparkContext): (RDD[Seq[Any]], StructType) = {
    val input = Seq(
      Seq[Any](42, "a", new Date(), new Date()),
      Seq[Any](43, "b", new Date(), new Date()))
    val inputRdd = sc.parallelize(input, 2)
    val inputSchema = createSchema("id" -> IntegerType, "name" -> StringType, "date" -> DateType)
    (inputRdd, inputSchema)
  }

  def wideRDD(sc: SparkContext, columnCount: Int, rowCount: Int = 2): (RDD[Row], StructType) = {
    val data: Row = Row.fromSeq(1 to columnCount)

    val fields = for (i <- 1 to rowCount) yield {
      StructField("field_" + i, IntegerType, true)
    }

    val inputSchema = StructType(fields.toSeq)

    val input = for (i <- 1 to rowCount) yield {
      data
    }

    val inputRDD = sc.parallelize(input, 2)
    (inputRDD, inputSchema)
  }
}

Source File: CurrentPersistenceIdsQuerySourceProvider.scala From apache-spark-test with Apache License 2.0

5 votes

package akka.persistence.jdbc.spark.sql.execution.streaming

import org.apache.spark.sql.execution.streaming.{ LongOffset, Offset, Source }
import org.apache.spark.sql.sources.{ DataSourceRegister, StreamSourceProvider }
import org.apache.spark.sql.types.{ StringType, StructField, StructType }
import org.apache.spark.sql.{ SQLContext, _ }

object CurrentPersistenceIdsQuerySourceProvider {
  val name = "current-persistence-id"
  val schema: StructType = StructType(Array(
    StructField("persistence_id", StringType, nullable = false)
  ))
}

class CurrentPersistenceIdsQuerySourceProvider extends StreamSourceProvider with DataSourceRegister with Serializable {
  override def sourceSchema(
    sqlContext: SQLContext,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): (String, StructType) = {
    CurrentPersistenceIdsQuerySourceProvider.name -> CurrentPersistenceIdsQuerySourceProvider.schema
  }

  override def createSource(
    sqlContext: SQLContext,
    metadataPath: String,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): Source = {
    new CurrentPersistenceIdsQuerySourceImpl(sqlContext, parameters("path"))
  }
  override def shortName(): String = CurrentPersistenceIdsQuerySourceProvider.name
}

class CurrentPersistenceIdsQuerySourceImpl(val sqlContext: SQLContext, val readJournalPluginId: String) extends Source with ReadJournalSource {
  override def schema: StructType = CurrentPersistenceIdsQuerySourceProvider.schema

  override def getOffset: Option[Offset] = {
    val offset = maxPersistenceIds
    println("[CurrentPersistenceIdsQuery]: Returning maximum offset: " + offset)
    Some(LongOffset(offset))
  }

  override def getBatch(_start: Option[Offset], _end: Offset): DataFrame = {
    val (start, end) = getStartEnd(_start, _end)
    println(s"[CurrentPersistenceIdsQuery]: Getting currentPersistenceIds from start: $start, end: $end")
    import sqlContext.implicits._
    persistenceIds(start, end).toDF()
  }
}

Source File: HelloWorldDataSource.scala From apache-spark-test with Apache License 2.0

5 votes

package com.github.dnvriend.spark.datasources.helloworld

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{ BaseRelation, DataSourceRegister, RelationProvider, TableScan }
import org.apache.spark.sql.types.{ StringType, StructField, StructType }
import org.apache.spark.sql.{ Row, SQLContext }

class HelloWorldDataSource extends RelationProvider with DataSourceRegister with Serializable {
  override def shortName(): String = "helloworld"

  override def hashCode(): Int = getClass.hashCode()

  override def equals(other: scala.Any): Boolean = other.isInstanceOf[HelloWorldDataSource]

  override def toString: String = "HelloWorldDataSource"

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = {
    val path = parameters.get("path")
    path match {
      case Some(p) => new HelloWorldRelationProvider(sqlContext, p, parameters)
      case _       => throw new IllegalArgumentException("Path is required for Tickets datasets")
    }
  }
}

class HelloWorldRelationProvider(val sqlContext: SQLContext, path: String, parameters: Map[String, String]) extends BaseRelation with TableScan {
  import sqlContext.implicits._

  override def schema: StructType = StructType(Array(
    StructField("key", StringType, nullable = false),
    StructField("value", StringType, nullable = true)
  ))

  override def buildScan(): RDD[Row] =
    Seq(
      "path" -> path,
      "message" -> parameters.getOrElse("message", ""),
      "name" -> s"Hello ${parameters.getOrElse("name", "")}",
      "hello_world" -> "Hello World!"
    ).toDF.rdd
}

Source File: TextPiperSuite.scala From glow with Apache License 2.0

5 votes

package io.projectglow.transformers.pipe

import scala.collection.JavaConverters._

import org.apache.spark.SparkException
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import io.projectglow.Glow
import io.projectglow.sql.GlowBaseTest

class TextPiperSuite extends GlowBaseTest {
  override def afterEach(): Unit = {
    Glow.transform("pipe_cleanup", spark.emptyDataFrame)
    super.afterEach()
  }

  def pipeText(df: DataFrame): DataFrame = {
    val options =
      Map("inputFormatter" -> "text", "outputFormatter" -> "text", "cmd" -> """["cat", "-"]""")
    new PipeTransformer().transform(df, options)
  }

  test("text input and output") {
    val sess = spark
    import sess.implicits._

    val output = pipeText(Seq("hello", "world").toDF())
    assert(output.count() == 2)
    assert(output.schema == StructType(Seq(StructField("text", StringType))))
    assert(output.orderBy("text").as[String].collect.toSeq == Seq("hello", "world"))
  }

  test("text input requires one column") {
    val sess = spark
    import sess.implicits._

    val df = Seq(Seq("hello", "world"), Seq("foo", "bar")).toDF()
    assertThrows[IllegalArgumentException](pipeText(df))
  }

  test("text input requires string column") {
    val sess = spark
    import sess.implicits._

    val df = Seq(Seq(5), Seq(6)).toDF()
    assertThrows[IllegalArgumentException](pipeText(df))
  }

  test("does not break on null row") {
    val sess = spark
    import sess.implicits._

    val df = Seq("hello", null, "hello").toDF()
    val output = pipeText(df)
    assert(output.count() == 2)
    assert(output.filter("text = 'hello'").count == 2)
  }

  test("command fails") {
    val sess = spark
    import sess.implicits._

    val df = Seq("hello", "world").toDF()
    val options =
      Map(
        "inputFormatter" -> "text",
        "outputFormatter" -> "text",
        "cmd" -> """["bash", "-c", "exit 1"]""")

    val ex = intercept[SparkException] {
      new PipeTransformer().transform(df, options)
    }
    assert(ex.getMessage.contains("Subprocess exited with status 1"))

    // threads should still be cleaned up
    eventually {
      assert(
        !Thread
          .getAllStackTraces
          .asScala
          .keySet
          .exists(_.getName.startsWith(ProcessHelper.STDIN_WRITER_THREAD_PREFIX)))
      assert(
        !Thread
          .getAllStackTraces
          .asScala
          .keySet
          .exists(_.getName.startsWith(ProcessHelper.STDERR_READER_THREAD_PREFIX)))
    }
  }
}

Source File: MomentAggState.scala From glow with Apache License 2.0

5 votes

package io.projectglow.sql.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

import io.projectglow.common.GlowLogging


  def toInternalRow(row: InternalRow, offset: Int = 0): InternalRow = {
    row.update(offset, if (count > 0) mean else null)
    row.update(offset + 1, if (count > 0) Math.sqrt(m2 / (count - 1)) else null)
    row.update(offset + 2, if (count > 0) min else null)
    row.update(offset + 3, if (count > 0) max else null)
    row
  }

  def toInternalRow: InternalRow = {
    toInternalRow(new GenericInternalRow(4))
  }
}

object MomentAggState extends GlowLogging {
  val schema = StructType(
    Seq(
      StructField("mean", DoubleType),
      StructField("stdDev", DoubleType),
      StructField("min", DoubleType),
      StructField("max", DoubleType)
    )
  )

  def merge(s1: MomentAggState, s2: MomentAggState): MomentAggState = {
    if (s1.count == 0) {
      return s2
    } else if (s2.count == 0) {
      return s1
    }

    val newState = MomentAggState()
    newState.count = s1.count + s2.count
    val delta = s2.mean - s1.mean
    val deltaN = delta / newState.count
    newState.mean = s1.mean + deltaN * s2.count

    // higher order moments computed according to:
    // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics
    newState.m2 = s1.m2 + s2.m2 + delta * deltaN * s1.count * s2.count

    newState.min = Math.min(s1.min, s2.min)
    newState.max = Math.max(s1.max, s2.max)
    newState
  }
}

Source File: UTF8TextOutputFormatter.scala From glow with Apache License 2.0

5 votes

package io.projectglow.transformers.pipe

import java.io.InputStream

import scala.collection.JavaConverters._

import org.apache.commons.io.IOUtils
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String


class UTF8TextOutputFormatter() extends OutputFormatter {

  override def makeIterator(stream: InputStream): Iterator[Any] = {
    val schema = StructType(Seq(StructField("text", StringType)))
    val iter = IOUtils.lineIterator(stream, "UTF-8").asScala.map { s =>
      new GenericInternalRow(Array(UTF8String.fromString(s)): Array[Any])
    }
    Iterator(schema) ++ iter
  }
}

class UTF8TextOutputFormatterFactory extends OutputFormatterFactory {
  override def name: String = "text"

  override def makeOutputFormatter(options: Map[String, String]): OutputFormatter = {
    new UTF8TextOutputFormatter
  }
}

Source File: CSVOutputFormatter.scala From glow with Apache License 2.0

5 votes

package io.projectglow.transformers.pipe

import java.io.InputStream

import scala.collection.JavaConverters._

import com.univocity.parsers.csv.CsvParser
import org.apache.commons.io.IOUtils
import org.apache.spark.sql.execution.datasources.csv.{CSVDataSourceUtils, CSVUtils, UnivocityParserUtils}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import io.projectglow.SparkShim.{CSVOptions, UnivocityParser}

class CSVOutputFormatter(parsedOptions: CSVOptions) extends OutputFormatter {

  private def getSchema(record: Array[String]): StructType = {
    val header =
      CSVDataSourceUtils.makeSafeHeader(
        record,
        SQLConf.get.caseSensitiveAnalysis,
        parsedOptions
      )
    val fields = header.map { fieldName =>
      StructField(fieldName, StringType, nullable = true)
    }
    StructType(fields)
  }

  
  override def makeIterator(stream: InputStream): Iterator[Any] = {
    val lines = IOUtils.lineIterator(stream, "UTF-8").asScala
    val filteredLines = CSVUtils.filterCommentAndEmpty(lines, parsedOptions)

    if (filteredLines.isEmpty) {
      return Iterator.empty
    }

    val firstLine = filteredLines.next
    val csvParser = new CsvParser(parsedOptions.asParserSettings)
    val firstRecord = csvParser.parseLine(firstLine)
    val schema = getSchema(firstRecord)
    val univocityParser = new UnivocityParser(schema, schema, parsedOptions)

    val parsedIter =
      UnivocityParserUtils.parseIterator(
        Iterator(firstLine) ++ filteredLines,
        univocityParser,
        schema
      )

    val parsedIterWithoutHeader = if (parsedOptions.headerFlag) {
      parsedIter.drop(1)
    } else {
      parsedIter
    }

    Iterator(schema) ++ parsedIterWithoutHeader.map(_.copy)
  }
}

class CSVOutputFormatterFactory extends OutputFormatterFactory {
  override def name: String = "csv"

  override def makeOutputFormatter(
      options: Map[String, String]
  ): OutputFormatter = {
    val parsedOptions =
      new CSVOptions(
        options,
        SQLConf.get.csvColumnPruning,
        SQLConf.get.sessionLocalTimeZone
      )
    new CSVOutputFormatter(parsedOptions)
  }
}

Source File: package.scala From sparkpipe-core with Apache License 2.0

5 votes

package software.uncharted.sparkpipe.ops.core.dataframe

import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.types.{StructType, StructField}


  // Can't test because DataFrameWriter is currently marked final
  // $COVERAGE-OFF$
  def write(
    path: String,
    format: String = "parquet",
    options: Map[String, String] = Map[String, String]()
  )(input: DataFrame): DataFrame = {
    if (path.length > 0) {
      input.write.format(format).options(options).save(path)
    } else {
      input.write.format(format).options(options).save()
    }
    input
  }
  // $COVERAGE-ON$
}

Source File: SQLTransformerSuite.scala From sona with Apache License 2.0

5 votes

package com.tencent.angel.sona.ml.feature

import com.tencent.angel.sona.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.sql.types.{LongType, StructField, StructType}
import org.apache.spark.storage.StorageLevel

class SQLTransformerSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  test("transform numeric data") {
    val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
    val sqlTrans = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
     val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0))
      .toDF("id", "v1", "v2", "v3", "v4")
    val resultSchema = sqlTrans.transformSchema(original.schema)
    testTransformerByGlobalCheckFunc[(Int, Double, Double)](
      original,
      sqlTrans,
      "id",
      "v1",
      "v2",
      "v3",
      "v4") { rows =>
      assert(rows.head.schema.toString == resultSchema.toString)
      assert(resultSchema == expected.schema)
      assert(rows == expected.collect().toSeq)
      assert(original.sparkSession.catalog.listTables().count() == 0)
    }
  }

  test("read/write") {
    val t = new SQLTransformer()
      .setStatement("select * from __THIS__")
    testDefaultReadWrite(t)
  }

  test("transformSchema") {
    val df = spark.range(10)
    val outputSchema = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")
      .transformSchema(df.schema)
    val expected = StructType(Seq(StructField("id1", LongType, nullable = false)))
    assert(outputSchema === expected)
  }

  ignore("SPARK-22538: SQLTransformer should not unpersist given dataset") {
    val df = spark.range(10).toDF()
    df.cache()
    df.count()
    assert(df.storageLevel != StorageLevel.NONE)
    val sqlTrans = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")
    testTransformerByGlobalCheckFunc[Long](df, sqlTrans, "id1") { _ => }
    assert(df.storageLevel != StorageLevel.NONE)
  }
}

Source File: KCore.scala From sona with Apache License 2.0

5 votes

package com.tencent.angel.sona.graph.kcore
import com.tencent.angel.sona.context.PSContext
import org.apache.spark.SparkContext
import com.tencent.angel.sona.graph.params._
import com.tencent.angel.sona.ml.Transformer
import com.tencent.angel.sona.ml.param.ParamMap
import com.tencent.angel.sona.ml.util.Identifiable
import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.storage.StorageLevel

class KCore(override val uid: String) extends Transformer
  with HasSrcNodeIdCol with HasDstNodeIdCol with HasOutputNodeIdCol with HasOutputCoreIdCol
  with HasStorageLevel with HasPartitionNum with HasPSPartitionNum with HasUseBalancePartition {

  def this() = this(Identifiable.randomUID("KCore"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val edges = dataset.select($(srcNodeIdCol), $(dstNodeIdCol)).rdd
      .map(row => (row.getLong(0), row.getLong(1)))
      .filter(e => e._1 != e._2)

    edges.persist(StorageLevel.DISK_ONLY)

    val maxId = edges.map(e => math.max(e._1, e._2)).max() + 1
    val minId = edges.map(e => math.min(e._1, e._2)).min()
    val nodes = edges.flatMap(e => Iterator(e._1, e._2))
    val numEdges = edges.count()

    println(s"minId=$minId maxId=$maxId numEdges=$numEdges level=${$(storageLevel)}")

    // Start PS and init the model
    println("start to run ps")
    PSContext.getOrCreate(SparkContext.getOrCreate())

    val model = KCorePSModel.fromMinMax(minId, maxId, nodes, $(psPartitionNum), $(useBalancePartition))
    var graph = edges.flatMap(e => Iterator((e._1, e._2), (e._2, e._1)))
      .groupByKey($(partitionNum))
      .mapPartitionsWithIndex((index, edgeIter) =>
        Iterator(KCoreGraphPartition.apply(index, edgeIter)))

    graph.persist($(storageLevel))
    graph.foreachPartition(_ => Unit)
    graph.foreach(_.initMsgs(model))

    var curIteration = 0
    var numMsgs = model.numMsgs()
    var prev = graph
    println(s"numMsgs=$numMsgs")

    do {
      curIteration += 1
      graph = prev.map(_.process(model, numMsgs, curIteration == 1))
      graph.persist($(storageLevel))
      graph.count()
      prev.unpersist(true)
      prev = graph
      model.resetMsgs()
      numMsgs = model.numMsgs()
      println(s"curIteration=$curIteration numMsgs=$numMsgs")
    } while (numMsgs > 0)

    val retRDD = graph.map(_.save()).flatMap{case (nodes,cores) => nodes.zip(cores)}
      .map(r => Row.fromSeq(Seq[Any](r._1, r._2)))

    dataset.sparkSession.createDataFrame(retRDD, transformSchema(dataset.schema))
  }

  override def transformSchema(schema: StructType): StructType = {
    StructType(Seq(
      StructField(s"${$(outputNodeIdCol)}", LongType, nullable = false),
      StructField(s"${$(outputCoreIdCol)}", IntegerType, nullable = false)
    ))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

}

Source File: Predictor.scala From sona with Apache License 2.0

5 votes

package com.tencent.angel.sona.ml.common
import com.tencent.angel.mlcore.conf.{MLCoreConf, SharedConf}
import com.tencent.angel.ml.math2.utils.{DataBlock, LabeledData}
import org.apache.spark.broadcast.Broadcast
import com.tencent.angel.sona.ml.common.MathImplicits._
import com.tencent.angel.sona.core.{AngelGraphModel, ExecutorContext}
import com.tencent.angel.sona.data.LocalMemoryDataBlock
import org.apache.spark.linalg
import org.apache.spark.linalg.Vectors
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{Row, SPKSQLUtils}

import scala.collection.mutable.ListBuffer

class Predictor(bcValue: Broadcast[ExecutorContext],
                featIdx: Int, predictionCol: String, probabilityCol: String,
                bcConf: Broadcast[SharedConf]) extends Serializable {

  @transient private lazy val executorContext: ExecutorContext = {
    bcValue.value
  }

  @transient private lazy implicit val dim: Long = {
    executorContext.conf.getLong(MLCoreConf.ML_FEATURE_INDEX_RANGE)
  }

  @transient private lazy val appendedSchema: StructType = if (probabilityCol.nonEmpty) {
    new StructType(Array[StructField](StructField(probabilityCol, DoubleType),
      StructField(predictionCol, DoubleType)))
  } else {
    new StructType(Array[StructField](StructField(predictionCol, DoubleType)))
  }

  def predictRDD(data: Iterator[Row]): Iterator[Row] = {
    val localModel = executorContext.borrowModel(bcConf.value)
    val batchSize = 1024
    val storage = new LocalMemoryDataBlock(batchSize, batchSize * 1024 * 1024)

    var count = 0
    val cachedRows: Array[Row] = new Array[Row](batchSize)
    val result: ListBuffer[Row] = ListBuffer[Row]()
    data.foreach {
      case row if count != 0 && count % batchSize == 0 =>
        predictInternal(localModel, storage, cachedRows, result)

        storage.clean()
        storage.put(new LabeledData(row.get(featIdx).asInstanceOf[linalg.Vector], 0.0))
        cachedRows(count % batchSize) = row
        count += 1
      case row =>
        storage.put(new LabeledData(row.get(featIdx).asInstanceOf[linalg.Vector], 0.0))
        cachedRows(count % batchSize) = row
        count += 1
    }

    predictInternal(localModel, storage, cachedRows, result)

    executorContext.returnModel(localModel)

    result.toIterator
  }

  private def predictInternal(model: AngelGraphModel,
                              storage: DataBlock[LabeledData],
                              cachedRows: Array[Row],
                              result: ListBuffer[Row]): Unit = {
    val predicted = model.predict(storage)

    if (appendedSchema.length == 1) {
      predicted.zipWithIndex.foreach {
        case (res, idx) =>
          result.append(SPKSQLUtils.append(cachedRows(idx), appendedSchema, res.pred))
      }
    } else {
      predicted.zipWithIndex.foreach {
        case (res, idx) =>
          result.append(SPKSQLUtils.append(cachedRows(idx), appendedSchema, res.proba, res.predLabel))
      }
    }

  }

  def predictRaw(features: linalg.Vector): linalg.Vector = {
    val localModel = executorContext.borrowModel(bcConf.value)

    val res = localModel.predict(new LabeledData(features, 0.0))

    executorContext.returnModel(localModel)
    Vectors.dense(res.pred, -res.pred)
  }
}

Source File: Correlation.scala From sona with Apache License 2.0

5 votes

package com.tencent.angel.sona.ml.stat

import org.apache.spark.linalg.{SQLDataTypes, Vector}

import scala.collection.JavaConverters._
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.types.{StructField, StructType}

/**
 * API for correlation functions in MLlib, compatible with DataFrames and Datasets.
 *
 * The functions in this package generalize the functions in [[org.apache.spark.sql.Dataset#stat]]
 * to spark.ml's Vector types.
 */
object Correlation {

  /**
   * :: Experimental ::
   * Compute the correlation matrix for the input Dataset of Vectors using the specified method.
   * Methods currently supported: `pearson` (default), `spearman`.
   *
   * @param dataset A dataset or a dataframe
   * @param column The name of the column of vectors for which the correlation coefficient needs
   *               to be computed. This must be a column of the dataset, and it must contain
   *               Vector objects.
   * @param method String specifying the method to use for computing correlation.
   *               Supported: `pearson` (default), `spearman`
   * @return A dataframe that contains the correlation matrix of the column of vectors. This
   *         dataframe contains a single row and a single column of name
   *         '$METHODNAME($COLUMN)'.
   * @throws IllegalArgumentException if the column is not a valid column in the dataset, or if
   *                                  the content of this column is not of type Vector.
   *
   *  Here is how to access the correlation coefficient:
   *  {{{
   *    val data: Dataset[Vector] = ...
   *    val Row(coeff: Matrix) = Correlation.corr(data, "value").head
   *    // coeff now contains the Pearson correlation matrix.
   *  }}}
   *
   * @note For Spearman, a rank correlation, we need to create an RDD[Double] for each column
   * and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector],
   * which is fairly costly. Cache the input Dataset before calling corr with `method = "spearman"`
   * to avoid recomputing the common lineage.
   */

  def corr(dataset: Dataset[_], column: String, method: String): DataFrame = {
    val rdd = dataset.select(column).rdd.map {
      case Row(v: Vector) => v
    }
    val oldM = Statistics.corr(rdd, method)
    val name = s"$method($column)"
    val schema = StructType(Array(StructField(name, SQLDataTypes.MatrixType, nullable = false)))
    dataset.sparkSession.createDataFrame(Seq(Row(oldM)).asJava, schema)
  }

  /**
   * Compute the Pearson correlation matrix for the input Dataset of Vectors.
   */

  def corr(dataset: Dataset[_], column: String): DataFrame = {
    corr(dataset, column, "pearson")
  }
}

Source File: SONAMetadataUtils.scala From sona with Apache License 2.0

5 votes

package org.apache.spark.sql.util

import com.tencent.angel.sona.ml.attribute._
import org.apache.spark.linalg.VectorUDT
import org.apache.spark.sql.types.StructField

import scala.collection.immutable.HashMap


/**
 * Helper utilities for algorithms using ML metadata
 */
object SONAMetadataUtils {

  /**
   * Examine a schema to identify the number of classes in a label column.
   * Returns None if the number of labels is not specified, or if the label column is continuous.
   */
  def getNumClasses(labelSchema: StructField): Option[Int] = {
    Attribute.fromStructField(labelSchema) match {
      case binAttr: BinaryAttribute => Some(2)
      case nomAttr: NominalAttribute => nomAttr.getNumValues
      case _: NumericAttribute | UnresolvedAttribute => None
    }
  }

  /**
   * Examine a schema to identify categorical (Binary and Nominal) features.
   *
   * @param featuresSchema  Schema of the features column.
   *                        If a feature does not have metadata, it is assumed to be continuous.
   *                        If a feature is Nominal, then it must have the number of values
   *                        specified.
   * @return  Map: feature index to number of categories.
   *          The map's set of keys will be the set of categorical feature indices.
   */
  def getCategoricalFeatures(featuresSchema: StructField): Map[Int, Int] = {
    val metadata = AttributeGroup.fromStructField(featuresSchema)
    if (metadata.attributes.isEmpty) {
      HashMap.empty[Int, Int]
    } else {
      metadata.attributes.get.zipWithIndex.flatMap { case (attr, idx) =>
        if (attr == null) {
          Iterator()
        } else {
          attr match {
            case _: NumericAttribute | UnresolvedAttribute => Iterator()
            case binAttr: BinaryAttribute => Iterator(idx -> 2)
            case nomAttr: NominalAttribute =>
              nomAttr.getNumValues match {
                case Some(numValues: Int) => Iterator(idx -> numValues)
                case None => throw new IllegalArgumentException(s"Feature $idx is marked as" +
                  " Nominal (categorical), but it does not have the number of values specified.")
              }
          }
        }
      }.toMap
    }
  }

  /**
   * Takes a Vector column and a list of feature names, and returns the corresponding list of
   * feature indices in the column, in order.
   * @param col  Vector column which must have feature names specified via attributes
   * @param names  List of feature names
   */
  def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Long] = {
    require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
      + s" to be Vector type, but it was type ${col.dataType} instead.")
    val inputAttr = AttributeGroup.fromStructField(col)
    names.map { name =>
      require(inputAttr.hasAttr(name),
        s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
      inputAttr.getAttr(name).index.get
    }
  }
}

Source File: SparkSpreadsheetServiceWriteSuite.scala From spark-google-spreadsheets with Apache License 2.0

5 votes

package com.github.potix2.spark.google.spreadsheets

import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService.SparkSpreadsheet
import com.google.api.services.sheets.v4.model.{ExtendedValue, CellData, RowData}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.scalatest.{BeforeAndAfter, FlatSpec}

import scala.collection.JavaConverters._

class SparkSpreadsheetServiceWriteSuite extends FlatSpec with BeforeAndAfter {
  private val serviceAccountId = "53797494708-ds5v22b6cbpchrv2qih1vg8kru098k9i@developer.gserviceaccount.com"
  private val testCredentialPath = "src/test/resources/spark-google-spreadsheets-test-eb7b191d1e1d.p12"
  private val TEST_SPREADSHEET_NAME = "WriteSuite"
  private val TEST_SPREADSHEET_ID = "163Ja2OWUephWjIa-jpwTlvGcg8EJwCFCfxrF7aI117s"

  private val context: SparkSpreadsheetService.SparkSpreadsheetContext =
    SparkSpreadsheetService.SparkSpreadsheetContext(Some(serviceAccountId), new java.io.File(testCredentialPath))

  var spreadsheet: SparkSpreadsheet = null
  var worksheetName: String = ""

  def definedSchema: StructType = {
    new StructType()
      .add(new StructField("col_1", DataTypes.StringType))
      .add(new StructField("col_2", DataTypes.LongType))
      .add(new StructField("col_3", DataTypes.StringType))
  }

  case class Elem(col_1: String, col_2: Long, col_3: String)

  def extractor(e: Elem): RowData =
    new RowData().setValues(
      List(
        new CellData().setUserEnteredValue(
          new ExtendedValue().setStringValue(e.col_1)
        ),
        new CellData().setUserEnteredValue(
          new ExtendedValue().setNumberValue(e.col_2.toDouble)
        ),
        new CellData().setUserEnteredValue(
          new ExtendedValue().setStringValue(e.col_3)
        )
      ).asJava
    )

  before {
    spreadsheet = context.findSpreadsheet(TEST_SPREADSHEET_ID)
    worksheetName = scala.util.Random.alphanumeric.take(16).mkString
    val data = List(
      Elem("a", 1L, "x"),
      Elem("b", 2L, "y"),
      Elem("c", 3L, "z")
    )

    spreadsheet.addWorksheet(worksheetName, definedSchema, data, extractor)
  }

  after {
    spreadsheet.deleteWorksheet(worksheetName)
  }

  behavior of "A Spreadsheet"
  it should "find the new worksheet" in {
    val newWorksheet = spreadsheet.findWorksheet(worksheetName)
    assert(newWorksheet.isDefined)
    assert(newWorksheet.get.name == worksheetName)
    assert(newWorksheet.get.headers == Seq("col_1", "col_2", "col_3"))

    val rows = newWorksheet.get.rows
    assert(rows.head == Map("col_1" -> "a", "col_2" -> "1", "col_3" -> "x"))
  }

  behavior of "SparkWorksheet#updateCells"
  it should "update values in a worksheet" in {
    val newWorksheet = spreadsheet.findWorksheet(worksheetName)
    assert(newWorksheet.isDefined)

    val newData = List(
      Elem("f", 5L, "yy"),
      Elem("e", 4L, "xx"),
      Elem("c", 3L, "z"),
      Elem("b", 2L, "y"),
      Elem("a", 1L, "x")
    )

    newWorksheet.get.updateCells(definedSchema, newData, extractor)

    val rows = newWorksheet.get.rows
    assert(rows.head == Map("col_1" -> "f", "col_2" -> "5", "col_3" -> "yy"))
    assert(rows.last == Map("col_1" -> "a", "col_2" -> "1", "col_3" -> "x"))
  }
}

Source File: SpreadsheetRelation.scala From spark-google-spreadsheets with Apache License 2.0

5 votes

package com.github.potix2.spark.google.spreadsheets

import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService.SparkSpreadsheetContext
import com.github.potix2.spark.google.spreadsheets.util._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

case class SpreadsheetRelation protected[spark] (
                                                  context:SparkSpreadsheetContext,
                                                  spreadsheetName: String,
                                                  worksheetName: String,
                                                  userSchema: Option[StructType] = None)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan with InsertableRelation {

  import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService._

  override def schema: StructType = userSchema.getOrElse(inferSchema())

  private lazy val aWorksheet: SparkWorksheet =
    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Right(aWorksheet) => aWorksheet
      case Left(e) => throw e
    }

  private lazy val rows: Seq[Map[String, String]] = aWorksheet.rows

  private[spreadsheets] def findWorksheet(spreadsheetName: String, worksheetName: String)(implicit ctx: SparkSpreadsheetContext): Either[Throwable, SparkWorksheet] =
    for {
      sheet <- findSpreadsheet(spreadsheetName).toRight(new RuntimeException(s"no such spreadsheet: $spreadsheetName")).right
      worksheet <- sheet.findWorksheet(worksheetName).toRight(new RuntimeException(s"no such worksheet: $worksheetName")).right
    } yield worksheet

  override def buildScan(): RDD[Row] = {
    val aSchema = schema
    sqlContext.sparkContext.makeRDD(rows).mapPartitions { iter =>
      iter.map { m =>
        var index = 0
        val rowArray = new Array[Any](aSchema.fields.length)
        while(index < aSchema.fields.length) {
          val field = aSchema.fields(index)
          rowArray(index) = if (m.contains(field.name)) {
            TypeCast.castTo(m(field.name), field.dataType, field.nullable)
          } else {
            null
          }
          index += 1
        }
        Row.fromSeq(rowArray)
      }
    }
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    if(!overwrite) {
      sys.error("Spreadsheet tables only support INSERT OVERWRITE for now.")
    }

    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Right(w) =>
        w.updateCells(data.schema, data.collect().toList, Util.toRowData)
      case Left(e) =>
        throw e
    }
  }

  private def inferSchema(): StructType =
    StructType(aWorksheet.headers.toList.map { fieldName =>
      StructField(fieldName, StringType, nullable = true)
    })

}

Source File: TestData.scala From drunken-data-quality with Apache License 2.0

5 votes

package de.frosner.ddq.testutils

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object TestData {

  def makeIntegerDf(spark: SparkSession, numbers: Seq[Int]): DataFrame =
    spark.createDataFrame(
      spark.sparkContext.makeRDD(numbers.map(Row(_))),
      StructType(List(StructField("column", IntegerType, nullable = false)))
    )

  def makeNullableStringDf(spark: SparkSession, strings: Seq[String]): DataFrame =
    spark.createDataFrame(spark.sparkContext.makeRDD(strings.map(Row(_))), StructType(List(StructField("column", StringType, nullable = true))))

  def makeIntegersDf(spark: SparkSession, row1: Seq[Int], rowN: Seq[Int]*): DataFrame = {
    val rows = row1 :: rowN.toList
    val numCols = row1.size
    val rdd = spark.sparkContext.makeRDD(rows.map(Row(_:_*)))
    val schema = StructType((1 to numCols).map(idx => StructField("column" + idx, IntegerType, nullable = false)))
    spark.createDataFrame(rdd, schema)
  }

}

Source File: SchemaWithInfo.scala From flamy with Apache License 2.0

5 votes

package com.flaminem.flamy.model.metadata

import com.flaminem.flamy.conf.FlamyContext
import com.flaminem.flamy.model.IOFormat
import com.flaminem.flamy.model.metadata.TableWithInfo.getSparkSchema
import com.flaminem.flamy.model.names.{SchemaName, TableName}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}


class SchemaWithInfo(
  override val creationTime: Option[Long],
  override val location: String,
  val name: SchemaName,
  val numTables: Option[Int],
  fileSize: Option[Long],
  fileCount: Option[Long],
  modificationTime: Option[Long]
) extends ItemWithInfo {

  def formattedNumTables: String = {
    numTables.map{_.toString}.getOrElse("")
  }

  override def getFormattedInfo(context: FlamyContext, humanReadable: Boolean): Seq[String] = {
    Seq(
      name.toString,
      formattedNumTables,
      formattedFileSize(context, humanReadable),
      formattedFileCount(context),
      formattedModificationTime(context)
    )
  }

  override def getFileSize: Option[Long] = {
    fileSize
  }

  override def getFileCount: Option[Long] = {
    fileCount
  }

  override def getModificationTime(context: FlamyContext, refresh: Boolean = false): Option[Long] = {
    modificationTime
  }

  override def toString: String = {
    name.toString
  }

}

object SchemaWithInfo {

  val getSparkSchema: StructType = {
    StructType(Seq(
      StructField("schema", StringType),
      StructField("num_tables", LongType),
      StructField("size", LongType),
      StructField("num_files", LongType),
      StructField("modification_time", LongType)
    ))
  }

  def getInfoHeader: Seq[String] = {
    getSparkSchema.fields.map{_.name}
  }

}

Source File: ContinuousDistributionBuilderFactory.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.deeplang.doperables.dataframe.report.distribution.continuous

import org.apache.spark.mllib.stat.MultivariateStatisticalSummary
import org.apache.spark.sql.types.StructField

import io.deepsense.deeplang.doperables.dataframe.report.distribution._
import io.deepsense.deeplang.utils.SparkTypeConverter._
import io.deepsense.deeplang.utils.aggregators._

object ContinuousDistributionBuilderFactory {

  def prepareBuilder(
      columnIndex: Int,
      field: StructField,
      multivarStats: MultivariateStatisticalSummary): DistributionBuilder = {
    val columnStats = ColumnStats.fromMultiVarStats(multivarStats, columnIndex)
    // MultivarStats inits min with Double.MaxValue and max with MinValue.
    // If there is at least one not (null or NaN) its guaranteed to change min/max values.
    // TODO Its a bit hacky. Find more elegant solution. Example approaches:
    // - Filter out nulls? Problematic because we operate on vectors for performance.
    // - Remade spark aggregators to return options?
    val hasOnlyNulls =
      columnStats.min == Double.MaxValue &&
        columnStats.max == Double.MinValue

    if (!hasOnlyNulls) {
      val histogram = {
        val buckets = BucketsCalculator.calculateBuckets(field.dataType,
          columnStats)
        HistogramAggregator(buckets, true).mapInput(getColumnAsDouble(columnIndex))
      }
      val missing = CountOccurenceAggregator[Option[Any]](None).mapInput(getOption(columnIndex))
      val colStats = columnStats
      ContinuousDistributionBuilder(histogram, missing, field, colStats)
    } else {
      NoDistributionBuilder(field.name, NoDistributionReasons.OnlyNulls)
    }
  }
}

Source File: DiscreteDistributionBuilder.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.deeplang.doperables.dataframe.report.distribution.discrete

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{BooleanType, StringType, StructField}

import io.deepsense.deeplang.doperables.dataframe.report.DataFrameReportGenerator
import io.deepsense.deeplang.doperables.dataframe.report.distribution.{DistributionBuilder, NoDistributionReasons}
import io.deepsense.deeplang.doperables.report.ReportUtils
import io.deepsense.deeplang.utils.aggregators.Aggregator
import io.deepsense.deeplang.utils.aggregators.AggregatorBatch.BatchedResult
import io.deepsense.reportlib.model.{DiscreteDistribution, Distribution, NoDistribution}

case class DiscreteDistributionBuilder(
    categories: Aggregator[Option[scala.collection.mutable.Map[String, Long]], Row],
    missing: Aggregator[Long, Row],
    field: StructField)
  extends DistributionBuilder {

  def allAggregators: Seq[Aggregator[_, Row]] = Seq(categories, missing)

  override def build(results: BatchedResult): Distribution = {
    val categoriesMap = results.forAggregator(categories)
    val nullsCount = results.forAggregator(missing)

    categoriesMap match {
      case Some(occurrencesMap) => {
        val labels = field.dataType match {
          case StringType => occurrencesMap.keys.toSeq.sorted
          // We always want two labels, even when all elements are true or false
          case BooleanType => Seq(false.toString, true.toString)
        }
        val counts = labels.map(occurrencesMap.getOrElse(_, 0L))
        DiscreteDistribution(
          field.name,
          s"Discrete distribution for ${field.name} column",
          nullsCount,
          labels.map(ReportUtils.shortenLongStrings(_,
            DataFrameReportGenerator.StringPreviewMaxLength)),
          counts)
      }
      case None => NoDistribution(
        field.name,
        NoDistributionReasons.TooManyDistinctCategoricalValues
      )
    }
  }
}

Source File: GBTClassificationModel.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.deeplang.doperables.spark.wrappers.models

import org.apache.spark.ml.classification.{GBTClassificationModel => SparkGBTClassificationModel, GBTClassifier => SparkGBTClassifier}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

import io.deepsense.commons.utils.Logging
import io.deepsense.deeplang.doperables.report.CommonTablesGenerators.SparkSummaryEntry
import io.deepsense.deeplang.doperables.report.{CommonTablesGenerators, Report}
import io.deepsense.deeplang.doperables.spark.wrappers.params.common.PredictorParams
import io.deepsense.deeplang.doperables.stringindexingwrapper.StringIndexingWrapperModel
import io.deepsense.deeplang.doperables.{LoadableWithFallback, SparkModelWrapper}
import io.deepsense.deeplang.params.Param
import io.deepsense.sparkutils.ML

class GBTClassificationModel(vanilaModel: VanillaGBTClassificationModel)
  extends StringIndexingWrapperModel[SparkGBTClassificationModel, SparkGBTClassifier](vanilaModel) {

  def this() = this(new VanillaGBTClassificationModel())
}

class VanillaGBTClassificationModel()
  extends SparkModelWrapper[SparkGBTClassificationModel, SparkGBTClassifier]
  with LoadableWithFallback[SparkGBTClassificationModel, SparkGBTClassifier]
  with PredictorParams
  with Logging {

  override private[deeplang] def _transformSchema(schema: StructType): Option[StructType] = {
    val predictionColumnName = $(predictionColumn)
    Some(StructType(schema.fields :+ StructField(predictionColumnName, DoubleType)))
  }

  override val params: Array[Param[_]] =
    Array(featuresColumn, predictionColumn)

  override def report: Report = {
    val summary =
      List(
        SparkSummaryEntry(
          name = "number of features",
          value = sparkModel.numFeatures,
          description = "Number of features the model was trained on."))

    super.report
      .withReportName(
        s"${this.getClass.getSimpleName} with ${sparkModel.numTrees} trees")
      .withAdditionalTable(CommonTablesGenerators.modelSummary(summary))
      .withAdditionalTable(
        CommonTablesGenerators.decisionTree(
          sparkModel.treeWeights,
          sparkModel.trees),
        2)
  }

  override protected def transformerName: String = classOf[GBTClassificationModel].getSimpleName

  override def tryToLoadModel(path: String): Option[SparkGBTClassificationModel] = {
    ML.ModelLoading.GBTClassification(path)
  }
}

Source File: RandomForestClassificationModel.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.deeplang.doperables.spark.wrappers.models

import org.apache.spark.ml.classification.{RandomForestClassificationModel => SparkRandomForestClassificationModel, RandomForestClassifier => SparkRandomForestClassifier}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

import io.deepsense.deeplang.doperables.report.CommonTablesGenerators.SparkSummaryEntry
import io.deepsense.deeplang.doperables.report.{CommonTablesGenerators, Report}
import io.deepsense.deeplang.doperables.spark.wrappers.params.common.ProbabilisticClassifierParams
import io.deepsense.deeplang.doperables.stringindexingwrapper.StringIndexingWrapperModel
import io.deepsense.deeplang.doperables.{LoadableWithFallback, SparkModelWrapper}
import io.deepsense.deeplang.params.Param
import io.deepsense.sparkutils.ML

class RandomForestClassificationModel(
    vanillaModel: VanillaRandomForestClassificationModel)
  extends StringIndexingWrapperModel[
    SparkRandomForestClassificationModel,
    SparkRandomForestClassifier](vanillaModel) {

  def this() = this(new VanillaRandomForestClassificationModel())
}

class VanillaRandomForestClassificationModel
  extends SparkModelWrapper[
    SparkRandomForestClassificationModel,
    SparkRandomForestClassifier]
  with LoadableWithFallback[
    SparkRandomForestClassificationModel,
    SparkRandomForestClassifier]
  with ProbabilisticClassifierParams {

  override private[deeplang] def _transformSchema(schema: StructType): Option[StructType] = {
    val predictionColumnName = $(predictionColumn)
    val probabilityColumnName = $(probabilityColumn)
    val rawPredictionColumnName = $(rawPredictionColumn)
    Some(StructType(schema.fields ++ Seq(
      StructField(predictionColumnName, DoubleType),
      StructField(probabilityColumnName, new io.deepsense.sparkutils.Linalg.VectorUDT),
      StructField(rawPredictionColumnName, new io.deepsense.sparkutils.Linalg.VectorUDT)
    )))
  }

  override val params: Array[Param[_]] = Array(
    featuresColumn,
    predictionColumn,
    probabilityColumn,
    rawPredictionColumn) // thresholds

  override def report: Report = {
    val treeWeight = SparkSummaryEntry(
      name = "tree weights",
      value = sparkModel.treeWeights,
      description = "Weights for each tree."
    )

    super.report
      .withAdditionalTable(CommonTablesGenerators.modelSummary(List(treeWeight)))
  }

  override protected def transformerName: String =
    classOf[RandomForestClassificationModel].getSimpleName

  override def tryToLoadModel(path: String): Option[SparkRandomForestClassificationModel] = {
    ML.ModelLoading.randomForestClassification(path)
  }
}

Source File: CustomCodeColumnTransformer.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.deeplang.doperables

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.doperations.exceptions.CustomOperationExecutionException
import io.deepsense.deeplang.OperationExecutionDispatcher.Result
import io.deepsense.deeplang.params.{CodeSnippetParam, Param}
import io.deepsense.deeplang.params.choice.ChoiceParam
import org.apache.spark.sql.types.{DataType, StructField, StructType}


abstract class CustomCodeColumnTransformer() extends MultiColumnTransformer {

  import CustomCodeColumnTransformer._

  val targetType = ChoiceParam[TargetTypeChoice](
    name = "target type",
    description = Some("Target type of the columns."))
  def getTargetType: TargetTypeChoice = $(targetType)
  def setTargetType(value: TargetTypeChoice): this.type = set(targetType, value)

  val codeParameter: CodeSnippetParam
  def getCodeParameter: String = $(codeParameter)
  def setCodeParameter(value: String): this.type = set(codeParameter, value)

  def runCode(context: ExecutionContext, code: String): Result

  def isValid(context: ExecutionContext, code: String): Boolean

  def getComposedCode(
      userCode: String,
      inputColumn: String,
      outputColumn: String,
      targetType: DataType): String

  override def getSpecificParams: Array[Param[_]]


  private def executeCode(
      code: String,
      inputColumn: String,
      outputColumn: String,
      context: ExecutionContext,
      dataFrame: DataFrame): DataFrame = {
    runCode(context, code) match {
      case Left(error) =>
        throw CustomOperationExecutionException(s"Execution exception:\n\n$error")

      case Right(_) =>
        val sparkDataFrame =
          context.dataFrameStorage.getOutputDataFrame(OutputPortNumber).getOrElse {
            throw CustomOperationExecutionException(
              "Operation finished successfully, but did not produce a DataFrame.")
          }

        val newSparkDataFrame = context.sparkSQLSession.createDataFrame(
          sparkDataFrame.rdd,
          transformSingleColumnSchema(inputColumn, outputColumn, dataFrame.schema.get).get)
        DataFrame.fromSparkDataFrame(newSparkDataFrame)
    }
  }

  override def transformSingleColumn(
      inputColumn: String,
      outputColumn: String,
      context: ExecutionContext,
      dataFrame: DataFrame): DataFrame = {
    val code = getComposedCode(
      $(codeParameter), inputColumn, outputColumn, getTargetType.columnType)
    logger.debug(s"Code to be validated and executed:\n$code")

    if (!isValid(context, code)) {
      throw CustomOperationExecutionException("Code validation failed")
    }

    context.dataFrameStorage.withInputDataFrame(InputPortNumber, dataFrame.sparkDataFrame) {
      executeCode(code, inputColumn, outputColumn, context, dataFrame)
    }
  }

  override def transformSingleColumnSchema(
      inputColumn: String,
      outputColumn: String,
      schema: StructType): Option[StructType] = {
    MultiColumnTransformer.assertColumnExist(inputColumn, schema)
    MultiColumnTransformer.assertColumnDoesNotExist(outputColumn, schema)
    Some(schema.add(StructField(outputColumn, getTargetType.columnType, nullable = true)))
  }
}

object CustomCodeColumnTransformer {
  val InputPortNumber: Int = 0
  val OutputPortNumber: Int = 0
}

Source File: CountVectorizerExample.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.deeplang.doperations.examples

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType}

import io.deepsense.deeplang.doperables.dataframe.{DataFrame, DataFrameBuilder}
import io.deepsense.deeplang.doperations.spark.wrappers.estimators.CountVectorizer

class CountVectorizerExample extends AbstractOperationExample[CountVectorizer]{
  override def dOperation: CountVectorizer = {
    val op = new CountVectorizer()
    op.estimator
      .setInputColumn("lines")
      .setNoInPlace("lines_out")
      .setMinTF(3)
    op.set(op.estimator.extractParamMap())
  }

  override def inputDataFrames: Seq[DataFrame] = {
    val rows = Seq(
      Row("a a a b b c c c d ".split(" ").toSeq),
      Row("c c c c c c".split(" ").toSeq),
      Row("a".split(" ").toSeq),
      Row("e e e e e".split(" ").toSeq))
    val rdd = sparkContext.parallelize(rows)
    val schema = StructType(Seq(StructField("lines", ArrayType(StringType, containsNull = true))))
    Seq(DataFrameBuilder(sparkSQLSession).buildDataFrame(schema, rdd))
  }
}

Source File: UnionIntegSpec.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.deeplang.doperations

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.doperations.exceptions.SchemaMismatchException
import io.deepsense.deeplang.inference.{InferContext, InferenceWarnings}
import io.deepsense.deeplang.{DKnowledge, DeeplangIntegTestSupport}

class UnionIntegSpec extends DeeplangIntegTestSupport {

  import DeeplangIntegTestSupport._
  val schema1 = StructType(List(
    StructField("column1", DoubleType),
    StructField("column2", DoubleType)))

  val rows1_1 = Seq(
    Row(1.0, 2.0),
    Row(2.0, 3.0)
  )

  "Union" should {
    "return a union of two DataFrames" in {
      val rows1_2 = Seq(
        Row(2.0, 4.0),
        Row(4.0, 6.0)
      )

      val df1 = createDataFrame(rows1_1, schema1)
      val df2 = createDataFrame(rows1_2, schema1)

      val merged = Union()
        .executeUntyped(Vector(df1, df2))(executionContext)
        .head.asInstanceOf[DataFrame]

      assertDataFramesEqual(
        merged, createDataFrame(rows1_1 ++ rows1_2, schema1))
    }

    "throw for mismatching types in DataFrames" in {
      val schema2 = StructType(List(
        StructField("column1", StringType),
        StructField("column2", DoubleType)))

      val rows2_1 = Seq(
        Row("a", 1.0),
        Row("b", 1.0)
      )

      val df1 = createDataFrame(rows1_1, schema1)
      val df2 = createDataFrame(rows2_1, schema2)

      a [SchemaMismatchException] should be thrownBy {
        Union().executeUntyped(Vector(df1, df2))(executionContext)
      }
    }

    "throw for mismatching column names in DataFrames" in {
      val schema2 = StructType(List(
        StructField("column1", DoubleType),
        StructField("different_column_name", DoubleType)))

      val rows2_1 = Seq(
        Row(1.1, 1.0),
        Row(1.1, 1.0)
      )

      val df1 = createDataFrame(rows1_1, schema1)
      val df2 = createDataFrame(rows2_1, schema2)

      a [SchemaMismatchException] should be thrownBy {
        Union().executeUntyped(Vector(df1, df2))(executionContext)
      }
    }
  }

  it should {
    "propagate schema when both schemas match" in {
      val structType = StructType(Seq(
        StructField("x", DoubleType),
        StructField("y", DoubleType)))
      val knowledgeDF1 = DKnowledge(DataFrame.forInference(structType))
      val knowledgeDF2 = DKnowledge(DataFrame.forInference(structType))
      Union().inferKnowledgeUntyped(Vector(knowledgeDF1, knowledgeDF2))(mock[InferContext]) shouldBe
        (Vector(knowledgeDF1), InferenceWarnings())
    }
    "generate error when schemas don't match" in {
      val structType1 = StructType(Seq(
        StructField("x", DoubleType)))
      val structType2 = StructType(Seq(
        StructField("y", DoubleType)))
      val knowledgeDF1 = DKnowledge(DataFrame.forInference(structType1))
      val knowledgeDF2 = DKnowledge(DataFrame.forInference(structType2))
      an [SchemaMismatchException] shouldBe thrownBy(
        Union().inferKnowledgeUntyped(Vector(knowledgeDF1, knowledgeDF2))(mock[InferContext]))
    }
  }
}

Source File: DataFrameSplitterIntegSpec.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.deeplang.doperations

import scala.collection.JavaConverters._

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.scalatest.Matchers
import org.scalatest.prop.GeneratorDrivenPropertyChecks

import io.deepsense.deeplang._
import io.deepsense.deeplang.doperables.dataframe.DataFrame

class DataFrameSplitterIntegSpec
  extends DeeplangIntegTestSupport
  with GeneratorDrivenPropertyChecks
  with Matchers {

  "SplitDataFrame" should {
    "split randomly one df into two df in given range" in {

      val input = Range(1, 100)

      val parameterPairs = List(
        (0.0, 0),
        (0.3, 1),
        (0.5, 2),
        (0.8, 3),
        (1.0, 4))

      for((splitRatio, seed) <- parameterPairs) {
        val rdd = createData(input)
        val df = executionContext.dataFrameBuilder.buildDataFrame(createSchema, rdd)
        val (df1, df2) = executeOperation(
          executionContext,
          new Split()
            .setSplitMode(
              SplitModeChoice.Random()
                .setSplitRatio(splitRatio)
                .setSeed(seed / 2)))(df)
        validateSplitProperties(df, df1, df2)
      }
    }

    "split conditionally one df into two df in given range" in {

      val input = Range(1, 100)

      val condition = "value > 20"
      val predicate: Int => Boolean = _ > 20

      val (expectedDF1, expectedDF2) =
        (input.filter(predicate), input.filter(!predicate(_)))

      val rdd = createData(input)
      val df = executionContext.dataFrameBuilder.buildDataFrame(createSchema, rdd)
      val (df1, df2) = executeOperation(
        executionContext,
        new Split()
          .setSplitMode(
            SplitModeChoice.Conditional()
              .setCondition(condition)))(df)
      df1.sparkDataFrame.collect().map(_.get(0)) should contain theSameElementsAs expectedDF1
      df2.sparkDataFrame.collect().map(_.get(0)) should contain theSameElementsAs expectedDF2
      validateSplitProperties(df, df1, df2)
    }
  }

  private def createSchema: StructType = {
    StructType(List(
      StructField("value", IntegerType, nullable = false)
    ))
  }

  private def createData(data: Seq[Int]): RDD[Row] = {
    sparkContext.parallelize(data.map(Row(_)))
  }

  private def executeOperation(context: ExecutionContext, operation: DOperation)
                              (dataFrame: DataFrame): (DataFrame, DataFrame) = {
    val operationResult = operation.executeUntyped(Vector[DOperable](dataFrame))(context)
    val df1 = operationResult.head.asInstanceOf[DataFrame]
    val df2 = operationResult.last.asInstanceOf[DataFrame]
    (df1, df2)
  }

  def validateSplitProperties(inputDF: DataFrame, outputDF1: DataFrame, outputDF2: DataFrame)
      : Unit = {
    val dfCount = inputDF.sparkDataFrame.count()
    val df1Count = outputDF1.sparkDataFrame.count()
    val df2Count = outputDF2.sparkDataFrame.count()
    val rowsDf = inputDF.sparkDataFrame.collectAsList().asScala
    val rowsDf1 = outputDF1.sparkDataFrame.collectAsList().asScala
    val rowsDf2 = outputDF2.sparkDataFrame.collectAsList().asScala
    val intersect = rowsDf1.intersect(rowsDf2)
    intersect.size shouldBe 0
    (df1Count + df2Count) shouldBe dfCount
    rowsDf.toSet shouldBe rowsDf1.toSet.union(rowsDf2.toSet)
  }
}

Source File: DataFrameReportPerformanceSpec.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.deeplang.doperables.dataframe

import java.sql.Timestamp
import java.text.{DateFormat, SimpleDateFormat}
import java.util.TimeZone

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StructField, StructType, TimestampType}
import org.scalatest.{BeforeAndAfter, Ignore}

import io.deepsense.commons.utils.{DoubleUtils, Logging}
import io.deepsense.deeplang.{TestFiles, DeeplangIntegTestSupport}

// It's ignored because it does not have got assertions, it only prints report generation time.
@Ignore
class DataFrameReportPerformanceSpec
    extends DeeplangIntegTestSupport
    with BeforeAndAfter
    with TestFiles
    with Logging {
  val testFile = absoluteTestsDirPath.pathWithoutScheme + "/demand_without_header.csv"

  "DataFrame" should {
    "generate report" when {
      "DataFrame has 17K of rows" in {
        val numberOfTries = 10
        var results: Seq[Double] = Seq()
        for (i <- 1 to numberOfTries) {
          val dataFrame: DataFrame = demandDataFrame()
          val start = System.nanoTime()
          val report = dataFrame.report
          val end = System.nanoTime()
          val time1: Double = (end - start).toDouble / 1000000000.0
          results = results :+ time1
          logger.debug("Report generation time: {}", DoubleUtils.double2String(time1))
        }
        logger.debug(
          "Mean report generation time: {}",
          DoubleUtils.double2String(results.fold(0D)(_ + _) / numberOfTries.toDouble))
      }
    }
  }

  private def demandDataFrame(): DataFrame = {
    val rddString: RDD[String] = executionContext.sparkContext.textFile(testFile)
    val data: RDD[Row] = rddString.map(DataFrameHelpers.demandString2Row)
    executionContext.dataFrameBuilder.buildDataFrame(demandSchema, data)
  }

  private def demandSchema: StructType = StructType(Seq(
    StructField("datetime", TimestampType),
    StructField("log_count", DoubleType),
    StructField("workingday", DoubleType),
    StructField("holiday", DoubleType),
    StructField("season2", DoubleType),
    StructField("season3", DoubleType),
    StructField("season4", DoubleType)))

  private def timestamp(s: String): Timestamp = {
    val format: DateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    format.setTimeZone(TimeZone.getTimeZone("UTC"))
    new Timestamp(format.parse(s).getTime)
  }
}

private object DataFrameHelpers {
  def demandString2Row(s: String): Row = {
    val split = s.split(",")
    Row(
      timestamp(split(0)),
      split(1).toDouble,
      split(2).toDouble,
      split(3).toDouble,
      split(4).toDouble,
      split(5).toDouble,
      split(6).toDouble
    )
  }

  private def timestamp(s: String): Timestamp = {
    val format: DateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    format.setTimeZone(TimeZone.getTimeZone("UTC"))
    new Timestamp(format.parse(s).getTime)
  }
}

Source File: AbstractEvaluatorSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.deeplang.doperables

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.ParamPair
import io.deepsense.deeplang.{DKnowledge, DeeplangIntegTestSupport}
import io.deepsense.sparkutils.Linalg.Vectors

abstract class AbstractEvaluatorSmokeTest extends DeeplangIntegTestSupport {

  def className: String

  val evaluator: Evaluator

  val evaluatorParams: Seq[ParamPair[_]]

  val inputDataFrameSchema = StructType(Seq(
    StructField("s", StringType),
    StructField("prediction", DoubleType),
    StructField("rawPrediction", new io.deepsense.sparkutils.Linalg.VectorUDT),
    StructField("label", DoubleType)
  ))

  val inputDataFrame: DataFrame = {
    val rowSeq = Seq(
      Row("aAa bBb cCc dDd eEe f", 1.0, Vectors.dense(2.1, 2.2, 2.3), 3.0),
      Row("das99213 99721 8i!#@!", 4.0, Vectors.dense(5.1, 5.2, 5.3), 6.0)
    )
    createDataFrame(rowSeq, inputDataFrameSchema)
  }

  def setUpStubs(): Unit = ()

  className should {
    "successfully run _evaluate()" in {
      setUpStubs()
      evaluator.set(evaluatorParams: _*)._evaluate(executionContext, inputDataFrame)
    }
    "successfully run _infer()" in {
      evaluator.set(evaluatorParams: _*)._infer(DKnowledge(inputDataFrame))
    }
    "successfully run report" in {
      evaluator.set(evaluatorParams: _*).report
    }
  }
}

Source File: EstimatorModelWrapperFixtures.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.deeplang.doperables.spark.wrappers.estimators

import scala.language.reflectiveCalls

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml
import org.apache.spark.ml.param.{ParamMap, Param => SparkParam}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.report.Report
import io.deepsense.deeplang.doperables.serialization.SerializableSparkModel
import io.deepsense.deeplang.doperables.{SparkEstimatorWrapper, SparkModelWrapper}
import io.deepsense.deeplang.params.wrappers.spark.SingleColumnCreatorParamWrapper
import io.deepsense.deeplang.params.{Param, Params}
import io.deepsense.sparkutils.ML

object EstimatorModelWrapperFixtures {

  class SimpleSparkModel private[EstimatorModelWrapperFixtures]()
    extends ML.Model[SimpleSparkModel] {

    def this(x: String) = this()

    override val uid: String = "modelId"

    val predictionCol = new SparkParam[String](uid, "name", "description")

    def setPredictionCol(value: String): this.type = set(predictionCol, value)

    override def copy(extra: ParamMap): this.type = defaultCopy(extra)

    override def transformDF(dataset: DataFrame): DataFrame = {
      dataset.selectExpr("*", "1 as " + $(predictionCol))
    }

    @DeveloperApi
    override def transformSchema(schema: StructType): StructType = ???
  }

  class SimpleSparkEstimator extends ML.Estimator[SimpleSparkModel] {

    def this(x: String) = this()

    override val uid: String = "estimatorId"

    val predictionCol = new SparkParam[String](uid, "name", "description")

    override def fitDF(dataset: DataFrame): SimpleSparkModel =
      new SimpleSparkModel().setPredictionCol($(predictionCol))

    override def copy(extra: ParamMap): ML.Estimator[SimpleSparkModel] = defaultCopy(extra)

    @DeveloperApi
    override def transformSchema(schema: StructType): StructType = {
      schema.add(StructField($(predictionCol), IntegerType, nullable = false))
    }
  }

  trait HasPredictionColumn extends Params {
    val predictionColumn = new SingleColumnCreatorParamWrapper[
        ml.param.Params { val predictionCol: SparkParam[String] }](
      "prediction column",
      None,
      _.predictionCol)
    setDefault(predictionColumn, "abcdefg")

    def getPredictionColumn(): String = $(predictionColumn)
    def setPredictionColumn(value: String): this.type = set(predictionColumn, value)
  }

  class SimpleSparkModelWrapper
    extends SparkModelWrapper[SimpleSparkModel, SimpleSparkEstimator]
    with HasPredictionColumn {

    override val params: Array[Param[_]] = Array(predictionColumn)
    override def report: Report = ???

    override protected def loadModel(
      ctx: ExecutionContext,
      path: String): SerializableSparkModel[SimpleSparkModel] = ???
  }

  class SimpleSparkEstimatorWrapper
    extends SparkEstimatorWrapper[SimpleSparkModel, SimpleSparkEstimator, SimpleSparkModelWrapper]
    with HasPredictionColumn {

    override val params: Array[Param[_]] = Array(predictionColumn)
    override def report: Report = ???
  }
}

Source File: EstimatorModelWrapperIntegSpec.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.deeplang.doperables.spark.wrappers.estimators

import io.deepsense.deeplang.DeeplangIntegTestSupport
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructType, StructField}

class EstimatorModelWrapperIntegSpec extends DeeplangIntegTestSupport {

  import io.deepsense.deeplang.doperables.spark.wrappers.estimators.EstimatorModelWrapperFixtures._

  val inputDF = {
    val rowSeq = Seq(Row(1), Row(2), Row(3))
    val schema = StructType(Seq(StructField("x", IntegerType, nullable = false)))
    createDataFrame(rowSeq, schema)
  }

  val estimatorPredictionParamValue = "estimatorPrediction"

  val expectedSchema = StructType(Seq(
    StructField("x", IntegerType, nullable = false),
    StructField(estimatorPredictionParamValue, IntegerType, nullable = false)
  ))

  val transformerPredictionParamValue = "modelPrediction"

  val expectedSchemaForTransformerParams = StructType(Seq(
    StructField("x", IntegerType, nullable = false),
    StructField(transformerPredictionParamValue, IntegerType, nullable = false)
  ))

  "EstimatorWrapper" should {
    "_fit() and transform() + transformSchema() with parameters inherited" in {

      val transformer = createEstimatorAndFit()

      val transformOutputSchema =
        transformer._transform(executionContext, inputDF).sparkDataFrame.schema
      transformOutputSchema shouldBe expectedSchema

      val inferenceOutputSchema = transformer._transformSchema(inputDF.sparkDataFrame.schema)
      inferenceOutputSchema shouldBe Some(expectedSchema)
    }

    "_fit() and transform() + transformSchema() with parameters overwritten" in {

      val transformer = createEstimatorAndFit().setPredictionColumn(transformerPredictionParamValue)

      val transformOutputSchema =
        transformer._transform(executionContext, inputDF).sparkDataFrame.schema
      transformOutputSchema shouldBe expectedSchemaForTransformerParams

      val inferenceOutputSchema = transformer._transformSchema(inputDF.sparkDataFrame.schema)
      inferenceOutputSchema shouldBe Some(expectedSchemaForTransformerParams)
    }

    "_fit_infer().transformSchema() with parameters inherited" in {

      val estimatorWrapper = new SimpleSparkEstimatorWrapper()
        .setPredictionColumn(estimatorPredictionParamValue)

      estimatorWrapper._fit_infer(inputDF.schema)
        ._transformSchema(inputDF.sparkDataFrame.schema) shouldBe Some(expectedSchema)
    }

    "_fit_infer().transformSchema() with parameters overwritten" in {

      val estimatorWrapper = new SimpleSparkEstimatorWrapper()
        .setPredictionColumn(estimatorPredictionParamValue)
      val transformer =
        estimatorWrapper._fit_infer(inputDF.schema).asInstanceOf[SimpleSparkModelWrapper]
      val transformerWithParams = transformer.setPredictionColumn(transformerPredictionParamValue)

      val outputSchema = transformerWithParams._transformSchema(inputDF.sparkDataFrame.schema)
      outputSchema shouldBe Some(expectedSchemaForTransformerParams)
    }
  }

  private def createEstimatorAndFit(): SimpleSparkModelWrapper = {

    val estimatorWrapper = new SimpleSparkEstimatorWrapper()
      .setPredictionColumn(estimatorPredictionParamValue)

    val transformer =
      estimatorWrapper._fit(executionContext, inputDF).asInstanceOf[SimpleSparkModelWrapper]
    transformer.getPredictionColumn() shouldBe estimatorPredictionParamValue

    transformer
  }
}

Source File: StructFieldJsonProtocol.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.reportlib.model

import org.apache.spark.sql.types.{DataType, StructField}
import spray.json._

import io.deepsense.commons.json.EnumerationSerializer
import io.deepsense.commons.types.{ColumnType, SparkConversions}

trait StructFieldJsonProtocol
  extends DefaultJsonProtocol
  with MetadataJsonProtocol
  with DataTypeJsonProtocol {

  implicit val failureCodeFormat = EnumerationSerializer.jsonEnumFormat(ColumnType)

  // StructField format without metadata, with deeplangType appended
  implicit val structFieldFormat = new RootJsonFormat[StructField] {
    val c = (s: String, d: DataType, b: Boolean) => StructField(s, d, b)
    implicit val rawFormat = jsonFormat(c, "name", "dataType", "nullable")

    override def write(obj: StructField): JsValue = {
      val jsObject = obj.toJson(rawFormat).asJsObject

      val deeplangType =
        SparkConversions.sparkColumnTypeToColumnType(obj.dataType)

      JsObject(jsObject.fields + ("deeplangType" -> deeplangType.toJson))
    }

    override def read(json: JsValue): StructField = {
      json.convertTo(rawFormat)
    }
  }
}

Source File: ReportContentTestFactory.scala From seahorse-workflow-executor with Apache License 2.0

5 votes

package io.deepsense.reportlib.model.factory

import io.deepsense.reportlib.model.{ReportType, ReportContent}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}

trait ReportContentTestFactory {

  import ReportContentTestFactory._

  def testReport: ReportContent = ReportContent(
    reportName,
    reportType,
    Seq(TableTestFactory.testEmptyTable),
    Map(ReportContentTestFactory.categoricalDistName ->
      DistributionTestFactory.testCategoricalDistribution(
        ReportContentTestFactory.categoricalDistName),
      ReportContentTestFactory.continuousDistName ->
      DistributionTestFactory.testContinuousDistribution(
        ReportContentTestFactory.continuousDistName)
    )
  )

}

object ReportContentTestFactory extends ReportContentTestFactory {
  val continuousDistName = "continuousDistributionName"
  val categoricalDistName = "categoricalDistributionName"
  val reportName = "TestReportContentName"
  val reportType = ReportType.Empty

  val someReport: ReportContent = ReportContent("empty", ReportType.Empty)
}

Source File: ListTablesSuite.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.sql

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{BooleanType, StringType, StructField, StructType}
import org.apache.spark.sql.catalyst.TableIdentifier

class ListTablesSuite extends QueryTest with BeforeAndAfter with SharedSQLContext {
  import testImplicits._

  private lazy val df = (1 to 10).map(i => (i, s"str$i")).toDF("key", "value")

  before {
    df.registerTempTable("ListTablesSuiteTable")
  }

  after {
    sqlContext.catalog.unregisterTable(TableIdentifier("ListTablesSuiteTable"))
  }

  test("get all tables") {
    checkAnswer(
      sqlContext.tables().filter("tableName = 'ListTablesSuiteTable'"),
      Row("ListTablesSuiteTable", true))

    checkAnswer(
      sql("SHOW tables").filter("tableName = 'ListTablesSuiteTable'"),
      Row("ListTablesSuiteTable", true))

    sqlContext.catalog.unregisterTable(TableIdentifier("ListTablesSuiteTable"))
    assert(sqlContext.tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0)
  }

  test("getting all Tables with a database name has no impact on returned table names") {
    checkAnswer(
      sqlContext.tables("DB").filter("tableName = 'ListTablesSuiteTable'"),
      Row("ListTablesSuiteTable", true))

    checkAnswer(
      sql("show TABLES in DB").filter("tableName = 'ListTablesSuiteTable'"),
      Row("ListTablesSuiteTable", true))

    sqlContext.catalog.unregisterTable(TableIdentifier("ListTablesSuiteTable"))
    assert(sqlContext.tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0)
  }

  test("query the returned DataFrame of tables") {
    val expectedSchema = StructType(
      StructField("tableName", StringType, false) ::
      StructField("isTemporary", BooleanType, false) :: Nil)

    Seq(sqlContext.tables(), sql("SHOW TABLes")).foreach {
      case tableDF =>
        assert(expectedSchema === tableDF.schema)

        tableDF.registerTempTable("tables")
        checkAnswer(
          sql(
            "SELECT isTemporary, tableName from tables WHERE tableName = 'ListTablesSuiteTable'"),
          Row(true, "ListTablesSuiteTable")
        )
        checkAnswer(
          sqlContext.tables().filter("tableName = 'tables'").select("tableName", "isTemporary"),
          Row("tables", true))
        sqlContext.dropTempTable("tables")
    }
  }
}

Source File: GroupedDatasetSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql

import org.apache.spark.api.python.PythonEvalType
import org.apache.spark.sql.catalyst.plans.logical.AnalysisBarrier
import org.apache.spark.sql.execution.python.PythonUDF
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{LongType, StructField, StructType}

class GroupedDatasetSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  private val scalaUDF = udf((x: Long) => { x + 1 })
  private lazy val datasetWithUDF = spark.range(1).toDF("s").select($"s", scalaUDF($"s"))

  private def assertContainsAnalysisBarrier(ds: Dataset[_], atLevel: Int = 1): Unit = {
    assert(atLevel >= 0)
    var children = Seq(ds.queryExecution.logical)
    (1 to atLevel).foreach { _ =>
      children = children.flatMap(_.children)
    }
    val barriers = children.collect {
      case ab: AnalysisBarrier => ab
    }
    assert(barriers.nonEmpty, s"Plan does not contain AnalysisBarrier at level $atLevel:\n" +
      ds.queryExecution.logical)
  }

  test("SPARK-24373: avoid running Analyzer rules twice on RelationalGroupedDataset") {
    val groupByDataset = datasetWithUDF.groupBy()
    val rollupDataset = datasetWithUDF.rollup("s")
    val cubeDataset = datasetWithUDF.cube("s")
    val pivotDataset = datasetWithUDF.groupBy().pivot("s", Seq(1, 2))
    datasetWithUDF.cache()
    Seq(groupByDataset, rollupDataset, cubeDataset, pivotDataset).foreach { rgDS =>
      val df = rgDS.count()
      assertContainsAnalysisBarrier(df)
      assertCached(df)
    }

    val flatMapGroupsInRDF = datasetWithUDF.groupBy().flatMapGroupsInR(
      Array.emptyByteArray,
      Array.emptyByteArray,
      Array.empty,
      StructType(Seq(StructField("s", LongType))))
    val flatMapGroupsInPandasDF = datasetWithUDF.groupBy().flatMapGroupsInPandas(PythonUDF(
      "pyUDF",
      null,
      StructType(Seq(StructField("s", LongType))),
      Seq.empty,
      PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
      true))
    Seq(flatMapGroupsInRDF, flatMapGroupsInPandasDF).foreach { df =>
      assertContainsAnalysisBarrier(df, 2)
      assertCached(df)
    }
    datasetWithUDF.unpersist(true)
  }

  test("SPARK-24373: avoid running Analyzer rules twice on KeyValueGroupedDataset") {
    val kvDasaset = datasetWithUDF.groupByKey(_.getLong(0))
    datasetWithUDF.cache()
    val mapValuesKVDataset = kvDasaset.mapValues(_.getLong(0)).reduceGroups(_ + _)
    val keysKVDataset = kvDasaset.keys
    val flatMapGroupsKVDataset = kvDasaset.flatMapGroups((k, _) => Seq(k))
    val aggKVDataset = kvDasaset.count()
    val otherKVDataset = spark.range(1).groupByKey(_ + 1)
    val cogroupKVDataset = kvDasaset.cogroup(otherKVDataset)((k, _, _) => Seq(k))
    Seq((mapValuesKVDataset, 1),
        (keysKVDataset, 2),
        (flatMapGroupsKVDataset, 2),
        (aggKVDataset, 1),
        (cogroupKVDataset, 2)).foreach { case (df, analysisBarrierDepth) =>
      assertContainsAnalysisBarrier(df, analysisBarrierDepth)
      assertCached(df)
    }
    datasetWithUDF.unpersist(true)
  }
}

Source File: BlockingSource.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}


class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
}

Source File: MockSourceProvider.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql.streaming.util

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.StreamSourceProvider
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}


class MockSourceProvider extends StreamSourceProvider {
  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", MockSourceProvider.fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    MockSourceProvider.sourceProviderFunction()
  }
}

object MockSourceProvider {
  // Function to generate sources. May provide multiple sources if the user implements such a
  // function.
  private var sourceProviderFunction: () => Source = _

  final val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  def withMockSources(source: Source, otherSources: Source*)(f: => Unit): Unit = {
    var i = 0
    val sources = source +: otherSources
    sourceProviderFunction = () => {
      val source = sources(i % sources.length)
      i += 1
      source
    }
    try {
      f
    } finally {
      sourceProviderFunction = null
    }
  }
}

Source File: SpreadsheetRelation.scala From mimir with Apache License 2.0

5 votes

package mimir.exec.spark.datasource.google.spreadsheet

import mimir.exec.spark.datasource.google.spreadsheet.SparkSpreadsheetService.SparkSpreadsheetContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

case class SpreadsheetRelation protected[spark] (
                                                  context:SparkSpreadsheetContext,
                                                  spreadsheetName: String,
                                                  worksheetName: String,
                                                  userSchema: Option[StructType] = None)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan with InsertableRelation {

  import mimir.exec.spark.datasource.google.spreadsheet.SparkSpreadsheetService._

  private val fieldMap = scala.collection.mutable.Map[String, String]()
  override def schema: StructType = userSchema.getOrElse(inferSchema())

  private lazy val aWorksheet: SparkWorksheet =
    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Right(aWorksheet) => aWorksheet
      case Left(e) => throw e
    }

  private lazy val rows: Seq[Map[String, String]] = aWorksheet.rows

  private[spreadsheet] def findWorksheet(spreadsheetName: String, worksheetName: String)(implicit ctx: SparkSpreadsheetContext): Either[Throwable, SparkWorksheet] =
    for {
      sheet <- findSpreadsheet(spreadsheetName).toRight(new RuntimeException(s"no such spreadsheet: $spreadsheetName")).right
      worksheet <- sheet.findWorksheet(worksheetName).toRight(new RuntimeException(s"no such worksheet: $worksheetName")).right
    } yield worksheet

  override def buildScan(): RDD[Row] = {
    val aSchema = schema
    val schemaMap = fieldMap.toMap
    sqlContext.sparkContext.makeRDD(rows).mapPartitions { iter =>
      iter.map { m =>
        var index = 0
        val rowArray = new Array[Any](aSchema.fields.length)
        while(index < aSchema.fields.length) {
          val field = aSchema.fields(index)
          rowArray(index) = if (m.contains(field.name)) {
            TypeCast.castTo(m(field.name), field.dataType, field.nullable)
          } else if (schemaMap.contains(field.name) && m.contains(schemaMap(field.name))) {
            TypeCast.castTo(m(schemaMap(field.name)), field.dataType, field.nullable)
          } else {
            null
          }
          index += 1
        }
        Row.fromSeq(rowArray)
      }
    }
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    if(!overwrite) {
      sys.error("Spreadsheet tables only support INSERT OVERWRITE for now.")
    }

    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Right(w) =>
        w.updateCells(data.schema, data.collect().toList, Util.toRowData)
      case Left(e) =>
        throw e
    }
  }

  def sanitizeColumnName(name: String): String =
  {
    name
      .replaceAll("[^a-zA-Z0-9]+", "_")    // Replace sequences of non-alphanumeric characters with underscores
      .replaceAll("_+$", "")               // Strip trailing underscores
      .replaceAll("^[0-9_]+", "")          // Strip leading underscores and digits
  }

  private def inferSchema(): StructType =
    StructType(aWorksheet.headers.toList.map { fieldName => {
      val sanitizedName = sanitizeColumnName(fieldName)
      fieldMap.put(sanitizedName, fieldName)
      StructField(sanitizedName, StringType, true)
    }})

}

Source File: MimirUDF.scala From mimir with Apache License 2.0

5 votes

package mimir.exec.spark.udf

import java.sql.{ Timestamp, Date }

import org.apache.spark.sql.types.{ DataType, StructType, StructField }

import mimir.algebra._
import mimir.exec.spark._
import mimir.util.SparkUtils

class MimirUDF {
  def getPrimitive(t:Type, value:Any) = value match {
    case null => NullPrimitive()
    case _ => t match {
      //case TInt() => IntPrimitive(value.asInstanceOf[Long])
      case TInt() => IntPrimitive(value.asInstanceOf[Long])
      case TFloat() => FloatPrimitive(value.asInstanceOf[Double])
      case TDate() => SparkUtils.convertDate(value.asInstanceOf[Date])
      case TTimestamp() => SparkUtils.convertTimestamp(value.asInstanceOf[Timestamp])
      case TString() => StringPrimitive(value.asInstanceOf[String])
      case TBool() => BoolPrimitive(value.asInstanceOf[Boolean])
      case TRowId() => RowIdPrimitive(value.asInstanceOf[String])
      case TType() => TypePrimitive(Type.fromString(value.asInstanceOf[String]))
      //case TAny() => NullPrimitive()
      //case TUser(name) => name.toLowerCase
      //case TInterval() => Primitive(value.asInstanceOf[Long])
      case _ => StringPrimitive(value.asInstanceOf[String])
    }
  }
  def getNative(primitive : PrimitiveValue) : AnyRef = 
    primitive match {
      case NullPrimitive() => null
      case RowIdPrimitive(s) => s
      case StringPrimitive(s) => s
      case IntPrimitive(i) => new java.lang.Long(i)
      case FloatPrimitive(f) => new java.lang.Double(f)
      case BoolPrimitive(b) => new java.lang.Boolean(b)
      case ts@TimestampPrimitive(y,m,d,h,mm,s,ms) => SparkUtils.convertTimestamp(ts)
      case dt@DatePrimitive(y,m,d) => SparkUtils.convertDate(dt)
      case x =>  x.asString
    }
  def getStructType(datatypes:Seq[DataType]): StructType = {
    StructType(datatypes.map(dti => StructField("", RAToSpark.getInternalSparkType(dti), true)))
  }
}

Source File: LibSVMResponseRowDeserializer.scala From sagemaker-spark with Apache License 2.0

5 votes

package com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers

import org.apache.spark.ml.linalg.{SparseVector, SQLDataTypes}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.{ContentTypes, ResponseRowDeserializer}


  override val accepts: String = ContentTypes.TEXT_LIBSVM

  private def parseLibSVMRow(record: String): Row = {
    val items = record.split(' ')
    val label = items.head.toDouble
    val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
      val entry = item.split(':')
      val index = entry(0).toInt - 1
      val value = entry(1).toDouble
      (index, value)
    }.unzip
    Row(label, new SparseVector(dim, indices.toArray, values.toArray))
  }

  override val schema: StructType = StructType(
    Array(
      StructField(labelColumnName, DoubleType, nullable = false),
      StructField(featuresColumnName, SQLDataTypes.VectorType, nullable = false)))
}

Source File: ProtobufRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0

5 votes

package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.protobuf.ProtobufConverter

class ProtobufRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val labelColumnName = "label"
  val featuresColumnName = "features"
  val schema = StructType(Array(StructField(labelColumnName, DoubleType), StructField(
    featuresColumnName, VectorType)))

  it should "serialize a dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "serialize a sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "fail to set schema on invalid features name" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    intercept[IllegalArgumentException] {
      val rrs = new ProtobufRequestRowSerializer(Some(schema), featuresColumnName = "doesNotExist")
    }
  }


  it should "fail on invalid types" in {
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new ProtobufRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new ProtobufRequestRowSerializer(Some(validSchema))
  }
}

Source File: UnlabeledLibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0

5 votes

package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class UnlabeledLibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val schema = StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  "UnlabeledLibSVMRequestRowSerializer" should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert ("0.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert("0.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs =
      new UnlabeledLibSVMRequestRowSerializer(featuresColumnName = "mangoes are not features")
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }

  it should "fail on invalid features type" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row =
      new GenericRowWithSchema(values = Seq(1.0, "FEATURESSSSSZ!1!").toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }


  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))

    val rrs = new UnlabeledLibSVMRequestRowSerializer(Some(validSchema))
  }

  it should "fail to validate incorrect schema" in {
    val invalidSchema = StructType(Array(
      StructField("features", StringType, nullable = false)))

    intercept[IllegalArgumentException] {
      new UnlabeledLibSVMRequestRowSerializer(Some(invalidSchema))
    }
  }
}

Source File: LibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0

5 votes

package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest._
import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers.LibSVMResponseRowDeserializer

class LibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {
  val schema = new LibSVMResponseRowDeserializer(10).schema

  "LibSVMRequestRowSerializer" should "serialize sparse vector" in {

    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert ("1.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {

    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "ignore other columns" in {
    val schemaWithExtraColumns = StructType(Array(
      StructField("name", StringType, nullable = false),
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false),
        StructField("favorite activity", StringType, nullable = false)))

    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq("Elizabeth", 1.0, vec, "Crying").toArray,
      schema = schemaWithExtraColumns)

    val rrs = new LibSVMRequestRowSerializer(Some(schemaWithExtraColumns))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema), featuresColumnName = "i do not exist dear sir!")
    }
  }

  it should "fail on invalid label column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema),
        labelColumnName = "Sir! I must protest! I do not exist!")
    }
  }

  it should "fail on invalid types" in {
    val schemaWithInvalidLabelType = StructType(Array(
      StructField("label", StringType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidLabelType))
    }
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new LibSVMRequestRowSerializer(Some(validSchema))
  }
}

Source File: UnlabeledCSVRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0

5 votes

package unit.com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.serializers.UnlabeledCSVRequestRowSerializer

class UnlabeledCSVRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {
  val schema: StructType =
    StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  it should "serialize sparse vector" in {

    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    val sparseString = "-100.0," + "0.0," * 9 + "100.1," + "0.0," * 88 + "0.0\n"
    assert (sparseString == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("10.0,-100.0,2.0\n" == serialized)
  }
}

Source File: HashSetRowIterator.scala From BigDatalog with Apache License 2.0

5 votes

package edu.ucla.cs.wis.bigdatalog.spark.storage.set.hashset

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class ObjectHashSetRowIterator(set: ObjectHashSet) extends Iterator[InternalRow] {
  val rawIter = set.iterator()

  override final def hasNext(): Boolean = {
    rawIter.hasNext
  }

  override final def next(): InternalRow = {
    rawIter.next()
  }
}

class IntKeysHashSetRowIterator(set: IntKeysHashSet) extends Iterator[InternalRow] {
  val rawIter = set.iterator()
  val uRow = new UnsafeRow()
  val bufferHolder = new BufferHolder()
  val rowWriter = new UnsafeRowWriter()

  override final def hasNext(): Boolean = {
    rawIter.hasNext
  }

  override final def next(): InternalRow = {
    bufferHolder.reset()
    rowWriter.initialize(bufferHolder, 1)
    rowWriter.write(0, rawIter.next())

    uRow.pointTo(bufferHolder.buffer, 1, bufferHolder.totalSize())
    uRow
  }
}

class LongKeysHashSetRowIterator(set: LongKeysHashSet) extends Iterator[InternalRow] {
  val rawIter = set.iterator()
  val numFields = set.schemaInfo.arity
  val uRow = new UnsafeRow()
  val bufferHolder = new BufferHolder()
  val rowWriter = new UnsafeRowWriter()

  override final def hasNext(): Boolean = {
    rawIter.hasNext
  }

  override final def next(): InternalRow = {
    bufferHolder.reset()
    rowWriter.initialize(bufferHolder, numFields)

    val value = rawIter.nextLong()
    if (numFields == 2) {
      rowWriter.write(0, (value >> 32).toInt)
      rowWriter.write(1, value.toInt)
    } else {
      rowWriter.write(0, value)
    }
    uRow.pointTo(bufferHolder.buffer, numFields, bufferHolder.totalSize())
    uRow
  }
}

object HashSetRowIterator {
  def create(set: HashSet): Iterator[InternalRow] = {
    set match {
      //case set: UnsafeFixedWidthSet => set.iterator().asScala
      case set: IntKeysHashSet => new IntKeysHashSetRowIterator(set)
      case set: LongKeysHashSet => new LongKeysHashSetRowIterator(set)
      case set: ObjectHashSet => new ObjectHashSetRowIterator(set)
    }
  }
}

Source File: LibSVMRelation.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.ml.source.libsvm

import com.google.common.base.Objects

import org.apache.spark.Logging
import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrameReader, DataFrame, Row, SQLContext}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}


@Since("1.6.0")
class DefaultSource extends RelationProvider with DataSourceRegister {

  @Since("1.6.0")
  override def shortName(): String = "libsvm"

  @Since("1.6.0")
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String])
    : BaseRelation = {
    val path = parameters.getOrElse("path",
      throw new IllegalArgumentException("'path' must be specified"))
    val numFeatures = parameters.getOrElse("numFeatures", "-1").toInt
    val vectorType = parameters.getOrElse("vectorType", "sparse")
    new LibSVMRelation(path, numFeatures, vectorType)(sqlContext)
  }
}

Source File: MetadataUtils.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.ml.util

import scala.collection.immutable.HashMap

import org.apache.spark.ml.attribute._
import org.apache.spark.mllib.linalg.VectorUDT
import org.apache.spark.sql.types.StructField



  def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
    require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
      + s" to be Vector type, but it was type ${col.dataType} instead.")
    val inputAttr = AttributeGroup.fromStructField(col)
    names.map { name =>
      require(inputAttr.hasAttr(name),
        s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
      inputAttr.getAttr(name).index.get
    }
  }
}

Source File: LocalRelation.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, analysis}
import org.apache.spark.sql.types.{StructField, StructType}

object LocalRelation {
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  def apply(output1: StructField, output: StructField*): LocalRelation = {
    new LocalRelation(StructType(output1 +: output).toAttributes)
  }

  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }

  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }
}

case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil)
  extends LeafNode with analysis.MultiInstanceRelation {

  
  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type]
  }

  override protected def stringArgs = Iterator(output)

  override def sameResult(plan: LogicalPlan): Boolean = plan match {
    case LocalRelation(otherOutput, otherData) =>
      otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data
    case _ => false
  }

  override lazy val statistics =
    Statistics(sizeInBytes = output.map(_.dataType.defaultSize).sum * data.length)
}

Source File: SparkSqlSerializer.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.sql.execution

import java.nio.ByteBuffer
import java.util.{HashMap => JavaHashMap}

import scala.reflect.ClassTag
import com.esotericsoftware.kryo.io.{Input, Output}
import com.esotericsoftware.kryo.{Kryo, Serializer}
import com.twitter.chill.ResourcePool
import org.apache.spark.serializer.{KryoSerializer, SerializerInstance}
import org.apache.spark.sql.types.{Decimal, StructField, StructType}
import org.apache.spark.util.MutablePair
import org.apache.spark.{SparkConf, SparkEnv}


//private[sql]
class SparkSqlSerializer(conf: SparkConf) extends KryoSerializer(conf) {
  override def newKryo(): Kryo = {
    val kryo = super.newKryo()
    kryo.setRegistrationRequired(false)
    kryo.register(classOf[MutablePair[_, _]])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericRow])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericInternalRow])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericMutableRow])
    kryo.register(classOf[java.math.BigDecimal], new JavaBigDecimalSerializer)
    kryo.register(classOf[BigDecimal], new ScalaBigDecimalSerializer)

    kryo.register(classOf[Decimal])
    kryo.register(classOf[JavaHashMap[_, _]])

    // APS
    kryo.register(classOf[StructType])
    kryo.register(classOf[StructField])

    kryo.setReferences(false)
    kryo
  }
}

private[execution] class KryoResourcePool(size: Int)
  extends ResourcePool[SerializerInstance](size) {

  val ser: SparkSqlSerializer = {
    val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf())
    new SparkSqlSerializer(sparkConf)
  }

  def newInstance(): SerializerInstance = ser.newInstance()
}

//private[sql]
object SparkSqlSerializer {
  @transient lazy val resourcePool = new KryoResourcePool(30)

  private[this] def acquireRelease[O](fn: SerializerInstance => O): O = {
    val kryo = resourcePool.borrow
    try {
      fn(kryo)
    } finally {
      resourcePool.release(kryo)
    }
  }

  def serialize[T: ClassTag](o: T): Array[Byte] =
    acquireRelease { k =>
      k.serialize(o).array()
    }

  def deserialize[T: ClassTag](bytes: Array[Byte]): T =
    acquireRelease { k =>
      k.deserialize[T](ByteBuffer.wrap(bytes))
    }
}

private[sql] class JavaBigDecimalSerializer extends Serializer[java.math.BigDecimal] {
  def write(kryo: Kryo, output: Output, bd: java.math.BigDecimal) {
    // TODO: There are probably more efficient representations than strings...
    output.writeString(bd.toString)
  }

  def read(kryo: Kryo, input: Input, tpe: Class[java.math.BigDecimal]): java.math.BigDecimal = {
    new java.math.BigDecimal(input.readString())
  }
}

private[sql] class ScalaBigDecimalSerializer extends Serializer[BigDecimal] {
  def write(kryo: Kryo, output: Output, bd: BigDecimal) {
    // TODO: There are probably more efficient representations than strings...
    output.writeString(bd.toString)
  }

  def read(kryo: Kryo, input: Input, tpe: Class[BigDecimal]): BigDecimal = {
    new java.math.BigDecimal(input.readString())
  }
}

Source File: resources.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}


case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("Results", StringType, nullable = false)() :: Nil
  }
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val jarList = sparkSession.sparkContext.listJars()
    if (jars.nonEmpty) {
      for {
        jarName <- jars.map(f => new Path(f).getName)
        jarPath <- jarList if jarPath.contains(jarName)
      } yield Row(jarPath)
    } else {
      jarList.map(Row(_))
    }
  }
}

Source File: DDLSourceLoadSuite.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.sql.sources

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}


// please note that the META-INF/services had to be modified for the test directory for this to work
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name") {
    intercept[RuntimeException] {
      caseInsensitiveContext.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {
    caseInsensitiveContext.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("specify full classname with duplicate formats") {
    caseInsensitiveContext.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("should fail to load ORC without HiveContext") {
    intercept[ClassNotFoundException] {
      caseInsensitiveContext.read.format("orc").load()
    }
  }
}


class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceTwo extends RelationProvider  with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

Source File: ScalaSparkSQLBySchema.scala From learning-spark with Apache License 2.0

5 votes

package com.javachen.spark.examples.sparksql

import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}

object ScalaSparkSQLBySchema {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("ScalaSparkSQL"))
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    // Create an RDD
    val people = sc.textFile("people.txt")

    // The schema is encoded in a string
    val schemaString = "name age"

    // Import Spark SQL data types and Row.
    import org.apache.spark.sql._

    // Generate the schema based on the string of schema
    val schema =
      StructType(
        schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))

    // Convert records of the RDD (people) to Rows.
    val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim))

    // Apply the schema to the RDD.
    val peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema)

    // Register the DataFrames as a table.
    peopleDataFrame.registerTempTable("people")

    // SQL statements can be run by using the sql methods provided by sqlContext.
    val results = sqlContext.sql("SELECT name FROM people")

    // The results of SQL queries are DataFrames and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    results.map(t => "Name: " + t(0)).collect().foreach(println)
  }
}

Source File: MongodbSchemaIT.scala From Spark-MongoDB with Apache License 2.0

5 votes

package com.stratio.datasource.mongodb.schema

import java.text.SimpleDateFormat
import java.util.Locale

import com.stratio.datasource.MongodbTestConstants
import com.stratio.datasource.mongodb.config.{MongodbConfig, MongodbConfigBuilder}
import com.stratio.datasource.mongodb.partitioner.MongodbPartitioner
import com.stratio.datasource.mongodb.rdd.MongodbRDD
import com.stratio.datasource.mongodb._
import org.apache.spark.sql.mongodb.{TemporaryTestSQLContext, TestSQLContext}
import org.apache.spark.sql.types.{ArrayType, StringType, StructField, TimestampType}
import org.junit.runner.RunWith
import org.scalatest._
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class MongodbSchemaIT extends FlatSpec
with Matchers
with MongoEmbedDatabase
with TestBsonData
with MongodbTestConstants {

  private val host: String = "localhost"
  private val collection: String = "testCol"
  private val readPreference = "secondaryPreferred"

  val testConfig = MongodbConfigBuilder()
    .set(MongodbConfig.Host,List(host + ":" + mongoPort))
    .set(MongodbConfig.Database,db)
    .set(MongodbConfig.Collection,collection)
    .set(MongodbConfig.SamplingRatio,1.0)
    .set(MongodbConfig.ReadPreference, readPreference)
    .build()

  val mongodbPartitioner = new MongodbPartitioner(testConfig)

  val mongodbRDD = new MongodbRDD(TemporaryTestSQLContext, testConfig, mongodbPartitioner)

  behavior of "A schema"

  it should "be inferred from rdd with primitives" + scalaBinaryVersion in {
    withEmbedMongoFixture(primitiveFieldAndType) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 7
      schema.fieldNames should contain allOf("string", "integer", "long", "double", "boolean", "null")

      schema.printTreeString()
    }
  }

  it should "be inferred from rdd with complex fields" + scalaBinaryVersion in {
    withEmbedMongoFixture(complexFieldAndType1) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 13

      schema.fields filter {
        case StructField(name, ArrayType(StringType, _), _, _) => Set("arrayOfNull", "arrayEmpty") contains name
        case _ => false
      } should have size 2

      schema.printTreeString()
    }
  }

  it should "resolve type conflicts between fields" + scalaBinaryVersion in {
    withEmbedMongoFixture(primitiveFieldValueTypeConflict) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 7

      schema.printTreeString()
    }
  }

  it should "be inferred from rdd with more complex fields" + scalaBinaryVersion in {
    withEmbedMongoFixture(complexFieldAndType2) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 5

      schema.printTreeString()
    }
  }

  it should "read java.util.Date fields as timestamptype" + scalaBinaryVersion in {
    val dfunc = (s: String) => new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy", Locale.ENGLISH).parse(s)
    import com.mongodb.casbah.Imports.DBObject
    val stringAndDate = List(DBObject("string" -> "this is a simple string.", "date" -> dfunc("Mon Aug 10 07:52:49 EDT 2015")))
    withEmbedMongoFixture(stringAndDate) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 3
      schema.fields.filter(_.name == "date").head.dataType should equal(TimestampType)
      schema.printTreeString()
    }
  }
}

Source File: LOFSuite.scala From spark-lof with Apache License 2.0

5 votes

package org.apache.spark.ml.outlier

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.apache.spark.sql.functions._

object LOFSuite {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("LOFExample")
      .master("local[4]")
      .getOrCreate()

    val schema = new StructType(Array(
      new StructField("col1", DataTypes.DoubleType),
      new StructField("col2", DataTypes.DoubleType)))
    val df = spark.read.schema(schema).csv("data/outlier.csv")

    val assembler = new VectorAssembler()
      .setInputCols(df.columns)
      .setOutputCol("features")
    val data = assembler.transform(df).repartition(4)

    val startTime = System.currentTimeMillis()
    val result = new LOF()
      .setMinPts(5)
      .transform(data)
    val endTime = System.currentTimeMillis()
    result.count()

    // Outliers have much higher LOF value than normal data
    result.sort(desc(LOF.lof)).head(10).foreach { row =>
      println(row.get(0) + " | " + row.get(1) + " | " + row.get(2))
    }
    println("Total time = " + (endTime - startTime) / 1000.0 + "s")
  }
}

Source File: CloudPartitionTest.scala From cloud-integration with Apache License 2.0

5 votes

package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.sql._
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}


abstract class CloudPartitionTest extends AbstractCloudRelationTest {

import testImplicits._

  ctest(
    "save-findClass-partitioned-part-columns-in-data",
    "Save sets of files in explicitly set up partition tree; read") {
    withTempPathDir("part-columns", None) { path =>
      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(path, s"p1=$p1/p2=$p2")
        val df = sparkContext
          .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1))
          .toDF("a", "b", "p1")

         df.write
          .format(dataSourceName)
          .mode(SaveMode.ErrorIfExists)
          .save(partitionDir.toString)
        // each of these directories as its own success file; there is
        // none at the root
        resolveSuccessFile(partitionDir, true)
      }

      val dataSchemaWithPartition =
        StructType(
          dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        spark.read.options(Map(
          "path" -> path.toString,
          "dataSchema" -> dataSchemaWithPartition.json)).format(dataSourceName)
          .load())
    }
  }
}

Source File: PileupTestBase.scala From bdg-sequila with Apache License 2.0

5 votes

package org.biodatageeks.sequila.tests.pileup

import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext}
import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
import org.apache.spark.sql.types.{IntegerType, ShortType, StringType, StructField, StructType}
import org.scalatest.{BeforeAndAfter, FunSuite}

class PileupTestBase extends FunSuite
  with DataFrameSuiteBase
  with BeforeAndAfter
  with SharedSparkContext{

  val sampleId = "NA12878.multichrom.md"
  val samResPath: String = getClass.getResource("/multichrom/mdbam/samtools.pileup").getPath
  val referencePath: String = getClass.getResource("/reference/Homo_sapiens_assembly18_chr1_chrM.small.fasta").getPath
  val bamPath: String = getClass.getResource(s"/multichrom/mdbam/${sampleId}.bam").getPath
  val cramPath : String = getClass.getResource(s"/multichrom/mdcram/${sampleId}.cram").getPath
  val tableName = "reads_bam"
  val tableNameCRAM = "reads_cram"

  val schema: StructType = StructType(
    List(
      StructField("contig", StringType, nullable = true),
      StructField("position", IntegerType, nullable = true),
      StructField("reference", StringType, nullable = true),
      StructField("coverage", ShortType, nullable = true),
      StructField("pileup", StringType, nullable = true),
      StructField("quality", StringType, nullable = true)
    )
  )
  before {
    System.setProperty("spark.kryo.registrator", "org.biodatageeks.sequila.pileup.serializers.CustomKryoRegistrator")
    spark
      .conf.set("spark.sql.shuffle.partitions",1) //FIXME: In order to get orderBy in Samtools tests working - related to exchange partitions stage
    spark.sql(s"DROP TABLE IF EXISTS $tableName")
    spark.sql(
      s"""
         |CREATE TABLE $tableName
         |USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource
         |OPTIONS(path "$bamPath")
         |
      """.stripMargin)

    spark.sql(s"DROP TABLE IF EXISTS $tableNameCRAM")
    spark.sql(
      s"""
         |CREATE TABLE $tableNameCRAM
         |USING org.biodatageeks.sequila.datasources.BAM.CRAMDataSource
         |OPTIONS(path "$cramPath", refPath "$referencePath" )
         |
      """.stripMargin)

    val mapToString = (map: Map[Byte, Short]) => {
      if (map == null)
        "null"
      else
        map.map({
          case (k, v) => k.toChar -> v}).mkString.replace(" -> ", ":")
    }

    val byteToString = ((byte: Byte) => byte.toString)

    spark.udf.register("mapToString", mapToString)
    spark.udf.register("byteToString", byteToString)
  }

}

Source File: A_1_DataFrameTest.scala From wow-spark with MIT License

5 votes

package com.sev7e0.wow.sql

import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}


      //定义一个schema
    val schemaString = "name age"
    val fields = schemaString.split(" ")
      .map(filedName => StructField(filedName, StringType, nullable = true))
    val structType = StructType(fields)

    val personRDD = sparkSession.sparkContext.textFile("src/main/resources/sparkresource/people.txt")
      .map(_.split(","))
      //将RDD转换为行
      .map(attr => Row(attr(0), attr(1).trim))
    //将schema应用于RDD，并创建df
    sparkSession.createDataFrame(personRDD,structType).createOrReplaceTempView("people1")
    val dataFrameBySchema = sparkSession.sql("select name,age from people1 where age > 19 ")
    dataFrameBySchema.show()
  }

}

Source File: JDBCUtilityFunctions.scala From gimel with Apache License 2.0

5 votes

package com.paypal.gimel.jdbc.utilities

import scala.collection.immutable.Map

import org.apache.spark.sql.types.StructField

import com.paypal.gimel.common.conf.GimelConstants

object JDBCUtilityFunctions {
  
  def prepareCreateStatement(sql: String, dbtable: String, dataSetProps: Map[String, Any]): String = {
    // Here we remove the SELECT portion and have only the CREATE portion of the DDL supplied so that we can use that to create the table
    val sqlParts = sql.split(" ")
    val lenPartKeys = sqlParts.length
    val index = sqlParts.indexWhere(_.toUpperCase() == "SELECT")
    val createOnly: String = sqlParts.slice(0, index - 1).mkString(" ")

    // Here we remove the PCATALOG prefix => we replace pcatalog.storagetype.storagesystem.DB.Table with DB.Table
    val createParts = createOnly.split(" ")
    val pcatSQL = createParts.map(element => {
      if (element.toLowerCase().contains(GimelConstants.PCATALOG_STRING)) {
        // we replace pcatalog.storagetype.storagesystem.DB.Table with DB.Table
        element.split('.').tail.mkString(".").split('.').tail.mkString(".").split('.').tail.mkString(".")
      }
      else {
        element
      }
    }
    ).mkString(" ")

    val sparkSchema = dataSetProps(GimelConstants.TABLE_FILEDS).asInstanceOf[Array[StructField]]
    // From the dataframe schema, translate them into Teradata data types
    val gimelSchema: Array[com.paypal.gimel.common.catalog.Field] = sparkSchema.map(x => {
      com.paypal.gimel.common.catalog.Field(x.name, SparkToJavaConverter.getTeradataDataType(x.dataType), x.nullable)
    })
    val colList: Array[String] = gimelSchema.map(x => (x.fieldName + " " + (x.fieldType) + ","))
    val conCatColumns = colList.mkString("").dropRight(1)
    val colQulifier = s"""(${conCatColumns})"""

    // Here we inject back the columns with data types back in the SQL statemnt
    val newSqlParts = pcatSQL.split(" ")
    val PCATindex = newSqlParts.indexWhere(_.toUpperCase().contains("TABLE"))
    val catPrefix = newSqlParts.slice(0, PCATindex + 2).mkString(" ")
    val catSuffix = newSqlParts.slice(PCATindex + 2, newSqlParts.length).mkString(" ")
    val fullStatement = s"""${catPrefix} ${colQulifier} ${catSuffix}"""
    fullStatement.trim()
  }


}

Source File: JdbcConstants.scala From gimel with Apache License 2.0

5 votes

package com.paypal.gimel.jdbc.conf

import org.apache.spark.sql.types.{StringType, StructField, StructType}

import com.paypal.gimel.common.conf.GimelConstants

object JdbcConstants {

  // basic variable references
  val HDFS_PREFIX: String = "hdfs:///user"
  val TD_PASS_FILENAME_DEFAULT: String = "pass.dat"
  val P_FILEPATH = s"${HDFS_PREFIX}/${GimelConstants.USER_NAME}/password/teradata/${TD_PASS_FILENAME_DEFAULT}"
  val MYSQL = "MYSQL"
  val TERADATA = "TERADATA"
  val ORCALE = "ORACLE"
  val POSTGRESQL = "POSTGRESQL"

  val HDFS_FILE_SOURCE = "hdfs"
  val LOCAL_FILE_SOURCE = "local"
  val DEFAULT_P_FILE_SOURCE = HDFS_FILE_SOURCE
  val JDBC_FILE_PASSWORD_STRATEGY = "file"
  val JDBC_INLINE_PASSWORD_STRATEGY = "inline"
  val JDBC_PROXY_USERNAME = "gimelproxyuser"
  val JDBC_CUSTOM_PASSWORD_STRATEGY = "custom"
  val JDBC_DEFAULT_PASSWORD_STRATEGY = JDBC_CUSTOM_PASSWORD_STRATEGY
  val JDBC_AUTH_REQUEST_TYPE = "JDBC"

  // default TD properties
  val DEFAULT_TD_SESSIONS = 5
  val DEFAULT_CHARSET = "UTF16"
  val DEFAULT_SESSIONS = "6"
  val TD_FASTLOAD_KEY: String = "FASTLOAD"
  val TD_FASTEXPORT_KEY: String = "FASTEXPORT"
  val TD_FASTLOAD_KEY_LC: String = TD_FASTLOAD_KEY.toLowerCase
  val TD_FASTEXPORT_KEY_LC: String = TD_FASTEXPORT_KEY.toLowerCase

  // JDBC READ configs
  val MAX_TD_JDBC_READ_PARTITIONS = 24
  val MAX_FAST_EXPORT_READ_PARTITIONS = 2
  val DEFAULT_READ_FETCH_SIZE = 1000
  val DEFAULT_LOWER_BOUND = 0
  val DEFAULT_UPPER_BOUND = 20
  val DEFAULT_READ_TYPE = "BATCH"
  val READ_OPERATION = "read"

  // JDBC write configs
  val GIMEL_TEMP_PARTITION = "GIMEL_TEMP_PARTITION"
  val DEFAULT_WRITE_BATCH_SIZE = 10000
  val MAX_TD_JDBC_WRITE_PARTITIONS: Int = 24
  val MAX_FAST_LOAD_WRITE_PARTITIONS: Int = 2
  val DEFAULT_INSERT_STRATEGY = "insert"
  val DEFAULT_WRITE_TYPE = "BATCH"
  val WRITE_OPERATION = "write"
  val REPARTITION_METHOD = "repartition"
  val COALESCE_METHOD = "coalesce"

  // partitions for Systems other than Teradata
  val DEFAULT_JDBC_READ_PARTITIONS = 100
  val DEFAULT_JDBC_WRTIE_PARTITIONS = 100
  val DEFAULT_JDBC_PER_PROCESS_MAX_ROWS_LIMIT: Long = 1000000000L
  val DEFAULT_JDBC_PER_PROCESS_MAX_ROWS_LIMIT_STRING: String = DEFAULT_JDBC_PER_PROCESS_MAX_ROWS_LIMIT.toString

  // pushdown constants
  val DEF_JDBC_PUSH_DOWN_SCHEMA: StructType = new StructType(fields = Seq(
    StructField("QUERY_EXECUTION", StringType, nullable = false)
  ).toArray)
}

Source File: RestApiConsumer.scala From gimel with Apache License 2.0

5 votes

package com.paypal.gimel.restapi.reader

import scala.language.implicitConversions

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import com.paypal.gimel.common.gimelservices.GimelServiceUtilities
import com.paypal.gimel.logger.Logger
import com.paypal.gimel.restapi.conf.RestApiClientConfiguration



object RestApiConsumer {

  val logger: Logger = Logger()
  val utils: GimelServiceUtilities = GimelServiceUtilities()

  def consume(sparkSession: SparkSession, conf: RestApiClientConfiguration): DataFrame = {
    def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName()
    logger.info(" @Begin --> " + MethodName)

    val responsePayload = conf.httpsFlag match {
      case false => utils.get(conf.resolvedUrl.toString)
      case true => utils.httpsGet(conf.resolvedUrl.toString)
    }
    conf.parsePayloadFlag match {
      case false =>
        logger.info("NOT Parsing payload.")
        val rdd: RDD[String] = sparkSession.sparkContext.parallelize(Seq(responsePayload))
        val rowRdd: RDD[Row] = rdd.map(Row(_))
        val field: StructType = StructType(Seq(StructField(conf.payloadFieldName, StringType)))
        sparkSession.sqlContext.createDataFrame(rowRdd, field)
      case true =>
        logger.info("Parsing payload to fields - as requested.")
        val rdd: RDD[String] = sparkSession.sparkContext.parallelize(Seq(responsePayload))
        sparkSession.sqlContext.read.json(rdd)
    }
  }

}

Source File: TransformerSpec.scala From modelmatrix with Apache License 2.0

5 votes

package com.collective.modelmatrix.transform

import com.collective.modelmatrix.{ModelFeature, ModelMatrix, TestSparkContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalatest.FlatSpec

class TransformerSpec extends FlatSpec with TestSparkContext {

  val sqlContext = ModelMatrix.sqlContext(sc)

  val schema = StructType(Seq(
    StructField("adv_site", StringType),
    StructField("adv_id", IntegerType)
  ))

  val input = Seq(
    Row("cnn.com", 1),
    Row("bbc.com", 2),
    Row("hbo.com", 1),
    Row("mashable.com", 3)
  )

  val isActive = true
  val withAllOther = true

  // Can't call 'day_of_week' with String
  val badFunctionType = ModelFeature(isActive, "advertisement", "f1", "day_of_week(adv_site, 'UTC')", Top(95.0, allOther = false))

  // Not enough parameters for 'concat'
  val wrongParametersCount = ModelFeature(isActive, "advertisement", "f2", "concat(adv_site)", Top(95.0, allOther = false))

  val df = sqlContext.createDataFrame(sc.parallelize(input), schema)

  "Transformer" should "report failed feature extraction" in {
    val features = Transformer.extractFeatures(df, Seq(badFunctionType, wrongParametersCount))
    assert(features.isLeft)
    val errors = features.fold(identity, _ => sys.error("Should not be here"))

    assert(errors.length == 2)
    assert(errors(0).feature == badFunctionType)
    assert(errors(1).feature == wrongParametersCount)
  }

}

Source File: IdentityTransformerSpec.scala From modelmatrix with Apache License 2.0

5 votes

package com.collective.modelmatrix.transform

import com.collective.modelmatrix.{ModelFeature, ModelMatrix, TestSparkContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalatest.FlatSpec

import scalaz.syntax.either._
import scalaz.{-\/, \/-}

class IdentityTransformerSpec extends FlatSpec with TestSparkContext {

  val sqlContext = ModelMatrix.sqlContext(sc)

  val schema = StructType(Seq(
    StructField("adv_site", StringType),
    StructField("adv_id", IntegerType)
  ))

  val input = Seq(
      Row("cnn.com", 1),
      Row("bbc.com", 2),
      Row("hbo.com", 1),
      Row("mashable.com", 3)
  )

  val isActive = true
  val withAllOther = true

  val adSite = ModelFeature(isActive, "Ad", "ad_site", "adv_site", Identity)
  val adId = ModelFeature(isActive, "Ad", "ad_id", "adv_id", Identity)

  val df = sqlContext.createDataFrame(sc.parallelize(input), schema)
  val transformer = new IdentityTransformer(Transformer.extractFeatures(df, Seq(adSite, adId)) match {
    case -\/(err) => sys.error(s"Can't extract features: $err")
    case \/-(suc) => suc
  })

  "Identity Transformer" should "support integer typed model feature" in {
    val valid = transformer.validate(adId)
    assert(valid == TypedModelFeature(adId, IntegerType).right)
  }

  it should "fail if feature column doesn't exists" in {
    val failed = transformer.validate(adSite.copy(feature = "adv_site"))
    assert(failed == FeatureTransformationError.FeatureColumnNotFound("adv_site").left)
  }

  it should "fail if column type is not supported" in {
    val failed = transformer.validate(adSite)
    assert(failed == FeatureTransformationError.UnsupportedTransformDataType("ad_site", StringType, Identity).left)
  }

}

Source File: DataFrameNaFunctionsSpec.scala From spark-spec with MIT License

5 votes

package com.github.mrpowers.spark.spec.sql

import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.apache.spark.sql.Row
import org.scalatest._
import com.github.mrpowers.spark.daria.sql.SparkSessionExt._

import com.github.mrpowers.spark.spec.SparkSessionTestWrapper

import com.github.mrpowers.spark.fast.tests.DatasetComparer

class DataFrameNaFunctionsSpec extends FunSpec with SparkSessionTestWrapper with DatasetComparer {

  import spark.implicits._

  describe("#drop") {

    it("drops rows that contains null values") {

      val sourceData = List(
        Row(1, null),
        Row(null, null),
        Row(3, 30),
        Row(10, 20)
      )

      val sourceSchema = List(
        StructField("num1", IntegerType, true),
        StructField("num2", IntegerType, true)
      )

      val sourceDF = spark.createDataFrame(
        spark.sparkContext.parallelize(sourceData),
        StructType(sourceSchema)
      )

      val actualDF = sourceDF.na.drop()

      val expectedData = List(
        Row(3, 30),
        Row(10, 20)
      )

      val expectedSchema = List(
        StructField("num1", IntegerType, true),
        StructField("num2", IntegerType, true)
      )

      val expectedDF = spark.createDataFrame(
        spark.sparkContext.parallelize(expectedData),
        StructType(expectedSchema)
      )

      assertSmallDatasetEquality(actualDF, expectedDF)

    }

  }

  describe("#fill") {

    it("Returns a new DataFrame that replaces null or NaN values in numeric columns with value") {

      val sourceDF = spark.createDF(
        List(
          (1, null),
          (null, null),
          (3, 30),
          (10, 20)
        ), List(
          ("num1", IntegerType, true),
          ("num2", IntegerType, true)
        )
      )

      val actualDF = sourceDF.na.fill(77)

      val expectedDF = spark.createDF(
        List(
          (1, 77),
          (77, 77),
          (3, 30),
          (10, 20)
        ), List(
          ("num1", IntegerType, false),
          ("num2", IntegerType, false)
        )
      )

      assertSmallDatasetEquality(actualDF, expectedDF)

    }

  }

  describe("#replace") {
    pending
  }

}

Source File: N1qlSpec.scala From couchbase-spark-connector with Apache License 2.0

5 votes

package com.couchbase.spark.n1ql

import com.couchbase.client.core.CouchbaseException
import com.couchbase.client.java.error.QueryExecutionException
import com.couchbase.client.java.query.N1qlQuery
import org.apache.spark.{SparkConf, SparkContext, SparkException}
import org.apache.spark.sql.sources.EqualTo
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.scalatest._
import com.couchbase.spark._
import com.couchbase.spark.connection.CouchbaseConnection
import com.couchbase.spark.sql.N1QLRelation
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import scala.util.control.NonFatal

class N1qlSpec extends FunSuite with Matchers with BeforeAndAfterAll {

  private val master = "local[2]"
  private val appName = "cb-int-specs1"

  private var spark: SparkSession = _


  override def beforeAll(): Unit = {
    spark = SparkSession
      .builder()
      .master(master)
      .appName(appName)
      .config("spark.couchbase.username", "Administrator")
      .config("spark.couchbase.password", "password")
      // Open 2 buckets as tests below rely on it
      .config("com.couchbase.bucket.default", "")
      .config("com.couchbase.bucket.travel-sample", "")
      .getOrCreate()
  }

  override def afterAll(): Unit = {
    CouchbaseConnection().stop()
    spark.stop()
  }

  test("Creating N1QLRelation with default bucket, when two buckets exist, should fail") {
    assertThrows[IllegalStateException] {
      spark.read
        .format("com.couchbase.spark.sql.DefaultSource")
        .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline")))
        .option("schemaFilter", "`type` = 'airline'")
        .schema(StructType(StructField("name", StringType) :: Nil))
        .load()
    }
  }

  test("Creating N1QLRelation with non-default bucket, when two buckets exist, should succeed") {
    spark.read
      .format("com.couchbase.spark.sql.DefaultSource")
      .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline")))
      .option("schemaFilter", "`type` = 'airline'")
      .option("bucket", "travel-sample")
      .schema(StructType(StructField("name", StringType) :: Nil))
      .load()
  }

  test("N1QL failures should fail the Observable") {
    try {
      spark.sparkContext
        .couchbaseQuery(N1qlQuery.simple("BAD QUERY"), bucketName = "default")
        .collect()
        .foreach(println)
      fail()
    }
    catch {
      case e: SparkException =>
        assert (e.getCause.isInstanceOf[QueryExecutionException])
        val err = e.getCause.asInstanceOf[QueryExecutionException]
        assert (err.getMessage == "syntax error - at QUERY")
      case NonFatal(e) =>
        println(e)
        fail()
    }
  }
}

Source File: CouchbaseDataFrameSpec.scala From couchbase-spark-connector with Apache License 2.0

5 votes

package com.couchbase.spark.sql

import com.couchbase.spark.connection.CouchbaseConnection
import org.apache.avro.generic.GenericData.StringType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession}
import org.apache.spark.sql.sources.EqualTo
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest._
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class CouchbaseDataFrameSpec extends FlatSpec with Matchers with BeforeAndAfterAll {

  private val master = "local[2]"
  private val appName = "cb-int-specs1"

  private var spark: SparkSession = null


  override def beforeAll(): Unit = {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)
      .set("spark.couchbase.nodes", "127.0.0.1")
      .set("com.couchbase.username", "Administrator")
      .set("com.couchbase.password", "password")
      .set("com.couchbase.bucket.default", "")
      .set("com.couchbase.bucket.travel-sample", "")
    spark = SparkSession.builder().config(conf).getOrCreate()

    loadData()
  }

  override def afterAll(): Unit = {
    CouchbaseConnection().stop()
    spark.stop()
  }

  def loadData(): Unit = {

  }

  "If two buckets are used and the bucket is specified the API" should
    "not fail" in {
    val ssc = spark.sqlContext
    ssc.read.couchbase(EqualTo("type", "airline"), Map("bucket" -> "travel-sample"))
  }

  "The DataFrame API" should "infer the schemas" in {
    val ssc = spark.sqlContext
    import com.couchbase.spark.sql._

    val airline = ssc.read.couchbase(EqualTo("type", "airline"), Map("bucket" -> "travel-sample"))
    val airport = ssc.read.couchbase(EqualTo("type", "airport"), Map("bucket" -> "travel-sample"))
    val route = ssc.read.couchbase(EqualTo("type", "route"), Map("bucket" -> "travel-sample"))
    val landmark = ssc.read.couchbase(EqualTo("type", "landmark"), Map("bucket" -> "travel-sample"))


    airline
      .limit(10)
      .write
      .mode(SaveMode.Overwrite)
      .couchbase(Map("bucket" -> "default"))

    // TODO: validate schemas which are inferred on a field and type basis

  }

  it should "write and ignore" in {
    val ssc = spark.sqlContext
    import com.couchbase.spark.sql._

    // create df, write it twice
    val data = ("Michael", 28, true)
    val df = ssc.createDataFrame(spark.sparkContext.parallelize(Seq(data)))

    df.write
      .mode(SaveMode.Ignore)
      .couchbase(options = Map("idField" -> "_1", "bucket" -> "default"))

    df.write
      .mode(SaveMode.Ignore)
      .couchbase(options = Map("idField" -> "_1", "bucket" -> "default"))
  }

  it should "filter based on a function" in {
    val ssc = spark.sqlContext
    import com.couchbase.spark.sql._

    val airlineBySubstrCountry: DataFrame = ssc.read.couchbase(
      EqualTo("'substr(country, 0, 6)'", "United"), Map("bucket" -> "travel-sample"))

    airlineBySubstrCountry.count() should equal(6797)
  }

}

Source File: StatisticsTest.scala From OAP with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.datasources.oap.statistics

import java.io.ByteArrayOutputStream

import scala.collection.mutable.ArrayBuffer

import org.scalatest.BeforeAndAfterEach

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.BaseOrdering
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering
import org.apache.spark.sql.execution.datasources.oap.filecache.FiberCache
import org.apache.spark.sql.execution.datasources.oap.index.RangeInterval
import org.apache.spark.sql.execution.datasources.oap.utils.{NonNullKeyReader, NonNullKeyWriter}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.unsafe.Platform
import org.apache.spark.unsafe.memory.MemoryBlock
import org.apache.spark.unsafe.types.UTF8String

abstract class StatisticsTest extends SparkFunSuite with BeforeAndAfterEach {

  protected def rowGen(i: Int): InternalRow = InternalRow(i, UTF8String.fromString(s"test#$i"))

  protected lazy val schema: StructType = StructType(StructField("a", IntegerType)
    :: StructField("b", StringType) :: Nil)
  @transient
  protected lazy val nnkw: NonNullKeyWriter = new NonNullKeyWriter(schema)
  @transient
  protected lazy val nnkr: NonNullKeyReader = new NonNullKeyReader(schema)
  @transient
  protected lazy val ordering: BaseOrdering = GenerateOrdering.create(schema)
  @transient
  protected lazy val partialOrdering: BaseOrdering =
    GenerateOrdering.create(StructType(schema.dropRight(1)))
  protected var out: ByteArrayOutputStream = _

  protected var intervalArray: ArrayBuffer[RangeInterval] = new ArrayBuffer[RangeInterval]()

  override def beforeEach(): Unit = {
    out = new ByteArrayOutputStream(8000)
  }

  override def afterEach(): Unit = {
    out.close()
    intervalArray.clear()
  }

  protected def generateInterval(
      start: InternalRow, end: InternalRow,
      startInclude: Boolean, endInclude: Boolean): Unit = {
    intervalArray.clear()
    intervalArray.append(new RangeInterval(start, end, startInclude, endInclude))
  }

  protected def checkInternalRow(row1: InternalRow, row2: InternalRow): Unit = {
    val res = row1 == row2 // it works..
    assert(res, s"row1: $row1 does not match $row2")
  }

  protected def wrapToFiberCache(out: ByteArrayOutputStream): FiberCache = {
    val bytes = out.toByteArray
    FiberCache(bytes)
  }
}

Source File: ProxyFeedback.scala From oni-ml with Apache License 2.0

5 votes

package org.opennetworkinsight.proxy

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StructType, StructField, StringType}
import scala.io.Source

import org.opennetworkinsight.proxy.ProxySchema._


object ProxyFeedback {

  
  def loadFeedbackDF(sc: SparkContext,
                     sqlContext: SQLContext,
                     feedbackFile: String,
                     duplicationFactor: Int): DataFrame = {


    val feedbackSchema = StructType(
      List(StructField(Date, StringType, nullable= true),
        StructField(Time, StringType, nullable= true),
        StructField(ClientIP, StringType, nullable= true),
        StructField(Host, StringType, nullable= true),
        StructField(ReqMethod, StringType, nullable= true),
        StructField(UserAgent, StringType, nullable= true),
        StructField(ResponseContentType, StringType, nullable= true),
        StructField(RespCode, StringType, nullable= true),
        StructField(FullURI, StringType, nullable= true)))

    if (new java.io.File(feedbackFile).exists) {

      val dateIndex = 0
      val timeIndex = 1
      val clientIpIndex = 2
      val hostIndex = 3
      val reqMethodIndex = 4
      val userAgentIndex = 5
      val resContTypeIndex = 6
      val respCodeIndex = 11
      val fullURIIndex = 18

      val fullURISeverityIndex = 22

      val lines = Source.fromFile(feedbackFile).getLines().toArray.drop(1)
      val feedback: RDD[String] = sc.parallelize(lines)

      sqlContext.createDataFrame(feedback.map(_.split("\t"))
        .filter(row => row(fullURISeverityIndex).trim.toInt == 3)
        .map(row => Row.fromSeq(List(row(dateIndex),
          row(timeIndex),
          row(clientIpIndex),
          row(hostIndex),
          row(reqMethodIndex),
          row(userAgentIndex),
          row(resContTypeIndex),
          row(respCodeIndex),
          row(fullURIIndex))))
        .flatMap(row => List.fill(duplicationFactor)(row)), feedbackSchema)
        .select(Date, Time, ClientIP, Host, ReqMethod, UserAgent, ResponseContentType, RespCode, FullURI)
    } else {
      sqlContext.createDataFrame(sc.emptyRDD[Row], feedbackSchema)
    }
  }
}

Source File: CountByRatingChart.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

package org.sparksamples

import java.awt.Font

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.jfree.chart.axis.CategoryLabelPositions

import scalax.chart.module.ChartFactories



    val customSchema = StructType(Array(
      StructField("user_id", IntegerType, true),
      StructField("movie_id", IntegerType, true),
      StructField("rating", IntegerType, true),
      StructField("timestamp", IntegerType, true)))

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder()
      .appName("SparkRatingData").config(spConfig)
      .getOrCreate()

    val rating_df = spark.read.format("com.databricks.spark.csv")
      .option("delimiter", "\t").schema(customSchema)
      .load("../../data/ml-100k/u.data")

    val rating_df_count = rating_df.groupBy("rating").count().sort("rating")
    //val rating_df_count_sorted = rating_df_count.sort("count")
    rating_df_count.show()
    val rating_df_count_collection = rating_df_count.collect()

    val ds = new org.jfree.data.category.DefaultCategoryDataset
    val mx = scala.collection.immutable.ListMap()

    for( x <- 0 until rating_df_count_collection.length) {
      val occ = rating_df_count_collection(x)(0)
      val count = Integer.parseInt(rating_df_count_collection(x)(1).toString)
      ds.addValue(count,"UserAges", occ.toString)
    }


    //val sorted =  ListMap(ratings_count.toSeq.sortBy(_._1):_*)
    //val ds = new org.jfree.data.category.DefaultCategoryDataset
    //sorted.foreach{ case (k,v) => ds.addValue(v,"Rating Values", k)}

    val chart = ChartFactories.BarChart(ds)
    val font = new Font("Dialog", Font.PLAIN,5);

    chart.peer.getCategoryPlot.getDomainAxis().
      setCategoryLabelPositions(CategoryLabelPositions.UP_90);
    chart.peer.getCategoryPlot.getDomainAxis.setLabelFont(font)
    chart.show()
    Util.sc.stop()
  }
}

Source File: UserRatingsChart.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

package org.sparksamples

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

import scalax.chart.module.ChartFactories


object UserRatingsChart {

  def main(args: Array[String]) {

    val customSchema = StructType(Array(
      StructField("user_id", IntegerType, true),
      StructField("movie_id", IntegerType, true),
      StructField("rating", IntegerType, true),
      StructField("timestamp", IntegerType, true)))

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder()
      .appName("SparkRatingData").config(spConfig)
      .getOrCreate()

    val rating_df = spark.read.format("com.databricks.spark.csv")
      .option("delimiter", "\t").schema(customSchema)
      .load("../../data/ml-100k/u.data")

    val rating_nos_by_user = rating_df.groupBy("user_id").count().sort("count")
    val ds = new org.jfree.data.category.DefaultCategoryDataset
    rating_nos_by_user.show(rating_nos_by_user.collect().length)
    val rating_nos_by_user_collect =rating_nos_by_user.collect()

    var mx = Map(0 -> 0)
    val min = 1
    val max = 1000
    val bins = 100

    val step = (max/bins).toInt
    for (i <- step until (max + step) by step) {
      mx += (i -> 0);
    }
    for( x <- 0 until rating_nos_by_user_collect.length) {
      val user_id = Integer.parseInt(rating_nos_by_user_collect(x)(0).toString)
      val count = Integer.parseInt(rating_nos_by_user_collect(x)(1).toString)
      ds.addValue(count,"Ratings", user_id)
    }
   // ------------------------------------------------------------------

    val chart = ChartFactories.BarChart(ds)
    chart.peer.getCategoryPlot.getDomainAxis().setVisible(false)

    chart.show()
    Util.sc.stop()
  }
}

Source File: UserData.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

package org.sparksamples.df
//import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType};
package object UserData {
  def main(args: Array[String]): Unit = {
    val customSchema = StructType(Array(
      StructField("no", IntegerType, true),
      StructField("age", StringType, true),
      StructField("gender", StringType, true),
      StructField("occupation", StringType, true),
      StructField("zipCode", StringType, true)));
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder()
      .appName("SparkUserData").config(spConfig)
      .getOrCreate()

    val user_df = spark.read.format("com.databricks.spark.csv")
      .option("delimiter", "|").schema(customSchema)
      .load("/home/ubuntu/work/ml-resources/spark-ml/data/ml-100k/u.user")
    val first = user_df.first()
    println("First Record : " + first)

    val num_genders = user_df.groupBy("gender").count().count()
    val num_occupations = user_df.groupBy("occupation").count().count()
    val num_zipcodes = user_df.groupBy("zipCode").count().count()

    println("num_users : " + user_df.count())
    println("num_genders : "+ num_genders)
    println("num_occupations : "+ num_occupations)
    println("num_zipcodes: " + num_zipcodes)
    println("Distribution by Occupation")
    println(user_df.groupBy("occupation").count().show())

  }
}

Source File: Util.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

package org.sparksamples

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructField, StructType}


object Util {
  val PATH = "/home/ubuntu/work/spark-2.0.0-bin-hadoop2.7/"
  val DATA_PATH= "../../../data/ml-100k"
  val PATH_MOVIES = DATA_PATH + "/u.item"

  def reduceDimension2(x: Vector) : String= {
    var i = 0
    var l = x.toArray.size
    var l_2 = l/2.toInt
    var x_ = 0.0
    var y_ = 0.0

    for(i <- 0 until l_2) {
      x_ += x(i).toDouble
    }
    for(i <- (l_2 + 1) until l) {
      y_ += x(i).toDouble
    }
    var t = x_ + "," + y_
    return t
  }

  def getMovieDataDF(spark : SparkSession) : DataFrame = {

    //1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
    // |0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
    val customSchema = StructType(Array(
      StructField("id", StringType, true),
      StructField("name", StringType, true),
      StructField("date", StringType, true),
      StructField("url", StringType, true)));
    val movieDf = spark.read.format("com.databricks.spark.csv")
      .option("delimiter", "|").schema(customSchema)
      .load(PATH_MOVIES)
    return movieDf
  }

}

Source File: PrettifyTest.scala From spark-testing-base with Apache License 2.0

5 votes

package com.holdenkarau.spark.testing

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalacheck.Gen
import org.scalacheck.Prop._
import org.scalacheck.util.Pretty
import org.scalatest.FunSuite
import org.scalatest.exceptions.GeneratorDrivenPropertyCheckFailedException
import org.scalatest.prop.Checkers

class PrettifyTest extends FunSuite with SharedSparkContext with Checkers with Prettify {
  implicit val propertyCheckConfig = PropertyCheckConfig(minSize = 2, maxSize = 2)

  test("pretty output of DataFrame's check") {
    val schema = StructType(List(StructField("name", StringType), StructField("age", IntegerType)))
    val sqlContext = new SQLContext(sc)
    val nameGenerator = new Column("name", Gen.const("Holden Hanafy"))
    val ageGenerator = new Column("age", Gen.const(20))

    val dataframeGen = DataframeGenerator.arbitraryDataFrameWithCustomFields(sqlContext, schema)(nameGenerator, ageGenerator)

    val actual = runFailingCheck(dataframeGen.arbitrary)
    val expected =
      Some("arg0 = <DataFrame: schema = [name: string, age: int], size = 2, values = ([Holden Hanafy,20], [Holden Hanafy,20])>")
    assert(actual == expected)
  }

  test("pretty output of RDD's check") {
    val rddGen = RDDGenerator.genRDD[(String, Int)](sc) {
      for {
        name <- Gen.const("Holden Hanafy")
        age <- Gen.const(20)
      } yield name -> age
    }

    val actual = runFailingCheck(rddGen)
    val expected =
      Some("""arg0 = <RDD: size = 2, values = ((Holden Hanafy,20), (Holden Hanafy,20))>""")
    assert(actual == expected)
  }

  test("pretty output of Dataset's check") {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val datasetGen = DatasetGenerator.genDataset[(String, Int)](sqlContext) {
      for {
        name <- Gen.const("Holden Hanafy")
        age <- Gen.const(20)
      } yield name -> age
    }

    val actual = runFailingCheck(datasetGen)
    val expected =
      Some("""arg0 = <Dataset: schema = [_1: string, _2: int], size = 2, values = ((Holden Hanafy,20), (Holden Hanafy,20))>""")
    assert(actual == expected)
  }

  private def runFailingCheck[T](genUnderTest: Gen[T])(implicit p: T => Pretty) = {
    val property = forAll(genUnderTest)(_ => false)
    val e = intercept[GeneratorDrivenPropertyCheckFailedException] {
      check(property)
    }
    takeSecondToLastLine(e.message)
  }

  private def takeSecondToLastLine(msg: Option[String]) =
    msg.flatMap(_.split("\n").toList.reverse.tail.headOption.map(_.trim))

}

Source File: MLScalaCheckTest.scala From spark-testing-base with Apache License 2.0

5 votes

package com.holdenkarau.spark.testing

import org.apache.spark.ml.linalg.SQLDataTypes.{MatrixType, VectorType}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{StructField, StructType}
import org.scalacheck.Prop.forAll
import org.scalatest.FunSuite
import org.scalatest.prop.Checkers

class MLScalaCheckTest extends FunSuite with SharedSparkContext with Checkers {
  // re-use the spark context
  override implicit def reuseContextIfPossible: Boolean = false

  test("vector generation") {
    val schema = StructType(List(StructField("vector", VectorType)))
    val sqlContext = new SQLContext(sc)
    val dataframeGen = DataframeGenerator.arbitraryDataFrame(sqlContext, schema)

    val property =
      forAll(dataframeGen.arbitrary) {
        dataframe => {
          dataframe.schema === schema && dataframe.count >= 0
        }
      }

    check(property)
  }

  test("matrix generation") {
    val schema = StructType(List(StructField("matrix", MatrixType)))
    val sqlContext = new SQLContext(sc)
    val dataframeGen = DataframeGenerator.arbitraryDataFrame(sqlContext, schema)

    val property =
      forAll(dataframeGen.arbitrary) {
        dataframe => {
          dataframe.schema === schema && dataframe.count >= 0
        }
      }

    check(property)
  }
}

Source File: PartitionAndSleepWorkload.scala From spark-bench with Apache License 2.0

5 votes

package com.ibm.sparktc.sparkbench.workload.exercise

import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions._
import com.ibm.sparktc.sparkbench.utils.SaveModes
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object PartitionAndSleepWorkload extends WorkloadDefaults {
  val name = "timedsleep"
  val partitions: Int = 48
  val sleepms: Long = 12000L

  def apply(m: Map[String, Any]) = new PartitionAndSleepWorkload(
    input = None,
    output = None,
    partitions = getOrDefault[Int](m, "partitions", partitions),
    sleepMS = getOrDefault[Long](m, "sleepms", sleepms, any2Long))
}

case class PartitionAndSleepWorkload(input: Option[String] = None,
                                     output: Option[String] = None,
                                     saveMode: String = SaveModes.error,
                                     partitions: Int,
                                     sleepMS: Long) extends Workload {

  def doStuff(spark: SparkSession): (Long, Unit) = time {

    val ms = sleepMS
    val stuff: RDD[Int] = spark.sparkContext.parallelize(0 until partitions * 100, partitions)

    val cool: RDD[(Int, Int)] = stuff.map { i =>
      Thread.sleep(ms)
      (i % 10, i + 42)
    }

    val yeah = cool.reduceByKey(_ + _)
    yeah.collect()
  }

  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {
    val (t, _) = doStuff(spark)

    val schema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("runtime", LongType, nullable = false)
      )
    )

    val timeList = spark.sparkContext.parallelize(Seq(Row("timedsleep", System.currentTimeMillis(), t)))

    spark.createDataFrame(timeList, schema)
  }
}

Source File: GraphDataGen.scala From spark-bench with Apache License 2.0

5 votes

package com.ibm.sparktc.sparkbench.datageneration

import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import com.ibm.sparktc.sparkbench.utils.{SaveModes, SparkBenchException}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions.{any2Long, getOrDefault, getOrThrow, time}
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.apache.spark.graphx.util.GraphGenerators

object GraphDataGen extends WorkloadDefaults {

  val name = "graph-data-generator"
  val defaultMu = 4.0
  val defaultSigma = 1.3
  val defaultSeed = -1L
  val defaultNumOfPartitions = 0

  override def apply(m: Map[String, Any]): GraphDataGen = {
      val numVertices = getOrThrow(m, "vertices").asInstanceOf[Int]
      val mu = getOrDefault[Double](m, "mu", defaultMu)
      val sigma = getOrDefault[Double](m, "sigma", defaultSigma)
      val numPartitions = getOrDefault[Int](m, "partitions", defaultNumOfPartitions)
      val seed = getOrDefault[Long](m, "seed", defaultSeed, any2Long)
      val output = {
        val str = getOrThrow(m, "output").asInstanceOf[String]
        val s = verifySuitabilityOfOutputFileFormat(str)
        Some(s)
      }
    val saveMode = getOrDefault[String](m, "save-mode", SaveModes.error)

    new GraphDataGen(
      numVertices = numVertices,
      input = None,
      output = output,
      saveMode = saveMode,
      mu = mu,
      sigma = sigma,
      seed = seed,
      numPartitions = numPartitions
    )
  }

  
  private[datageneration] def verifySuitabilityOfOutputFileFormat(str: String): String = {
    val strArr: Array[String] = str.split('.')

    (strArr.length, strArr.last) match {
      case (1, _) => throw SparkBenchException("Output file for GraphDataGen must have \".txt\" as the file extension." +
        "Please modify your config file.")
      case (2, "txt") => str
      case (_, _) => throw SparkBenchException("Due to limitations of the GraphX GraphLoader, " +
        "the graph data generators may only save files as \".txt\"." +
        "Please modify your config file.")
    }
  }

}

case class GraphDataGen (
                          numVertices: Int,
                          input: Option[String] = None,
                          output: Option[String],
                          saveMode: String,
                          mu: Double = 4.0,
                          sigma: Double = 1.3,
                          seed: Long = 1,
                          numPartitions: Int = 0
                        ) extends Workload {

  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {
    val timestamp = System.currentTimeMillis()
    val (generateTime, graph) = time(GraphGenerators.logNormalGraph(spark.sparkContext, numVertices, numPartitions, mu, sigma))
    val (convertTime, out) = time(graph.edges.map(e => s"${e.srcId.toString} ${e.dstId}"))
    val (saveTime, _) = time(out.saveAsTextFile(output.get))

    val timeResultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("generate", LongType, nullable = true),
        StructField("convert", LongType, nullable = true),
        StructField("save", LongType, nullable = true),
        StructField("total_runtime", LongType, nullable = false)
      )
    )
    val total = generateTime + convertTime + saveTime
    val timeList = spark.sparkContext.parallelize(Seq(Row(GraphDataGen.name, timestamp, generateTime, convertTime, saveTime, total)))
    spark.createDataFrame(timeList, timeResultSchema)
  }
}

Source File: LinearRegressionDataGen.scala From spark-bench with Apache License 2.0

5 votes

package com.ibm.sparktc.sparkbench.datageneration.mlgenerator

import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import com.ibm.sparktc.sparkbench.utils.{SaveModes, SparkBenchException}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions.{getOrDefault, getOrThrow, time}
import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object LinearRegressionDataGen extends WorkloadDefaults {
  val name = "data-generation-lr"
  // Application parameters #1million points have 200M data size
  val numOfExamples: Int = 40000
  val numOfFeatures: Int = 4
  val eps: Double = 0.5
  val intercepts: Double = 0.1
  val numOfPartitions: Int = 10
  val maxIteration: Int = 3
  override def apply(m: Map[String, Any]) = new LinearRegressionDataGen(
    numRows = getOrThrow(m, "rows").asInstanceOf[Int],
    numCols = getOrThrow(m, "cols").asInstanceOf[Int],
    output = Some(getOrThrow(m, "output").asInstanceOf[String]),
    saveMode = getOrDefault[String](m, "save-mode", SaveModes.error),
    eps = getOrDefault[Double](m, "eps", eps),
    intercepts = getOrDefault[Double](m, "intercepts", intercepts),
    numPartitions = getOrDefault[Int](m, "partitions", numOfPartitions)
  )
}

case class LinearRegressionDataGen (
                                      numRows: Int,
                                      numCols: Int,
                                      input: Option[String] = None,
                                      output: Option[String],
                                      saveMode: String,
                                      eps: Double,
                                      intercepts: Double,
                                      numPartitions: Int
                                   ) extends Workload {

  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {

    val timestamp = System.currentTimeMillis()

    val (generateTime, data): (Long, RDD[LabeledPoint]) = time {
      LinearDataGenerator.generateLinearRDD(
        spark.sparkContext,
        numRows,
        numCols,
        eps,
        numPartitions,
        intercepts
      )
    }

    import spark.implicits._
    val (convertTime, dataDF) = time {
      data.toDF
    }

    val (saveTime, _) = time {
      val outputstr = output.get
      if(outputstr.endsWith(".csv")) throw SparkBenchException("LabeledPoints cannot be saved to CSV. Please try outputting to Parquet instead.")
      writeToDisk(output.get, saveMode, dataDF, spark)
    }//TODO you can't output this to CSV. Parquet is fine

    val timeResultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("generate", LongType, nullable = true),
        StructField("convert", LongType, nullable = true),
        StructField("save", LongType, nullable = true),
        StructField("total_runtime", LongType, nullable = false)
      )
    )

    val total = generateTime + convertTime + saveTime

    val timeList = spark.sparkContext.parallelize(Seq(Row("kmeans", timestamp, generateTime, convertTime, saveTime, total)))

    spark.createDataFrame(timeList, timeResultSchema)

  }
}

Source File: KMeansDataGen.scala From spark-bench with Apache License 2.0

5 votes

package com.ibm.sparktc.sparkbench.datageneration.mlgenerator

import com.ibm.sparktc.sparkbench.workload.ml.KMeansWorkload
import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions._
import com.ibm.sparktc.sparkbench.utils.SaveModes
import org.apache.spark.mllib.util.KMeansDataGenerator
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types._

object KMeansDataGen extends WorkloadDefaults {
  val name = "data-generation-kmeans"
  override def apply(m: Map[String, Any]) = new KMeansDataGen(
    numRows = getOrThrow(m, "rows").asInstanceOf[Int],
    numCols = getOrThrow(m, "cols").asInstanceOf[Int],
    output = Some(getOrThrow(m, "output").asInstanceOf[String]),
    saveMode = getOrDefault[String](m, "save-mode", SaveModes.error),
    k = getOrDefault[Int](m, "k", KMeansWorkload.numOfClusters),
    scaling = getOrDefault[Double](m, "scaling", KMeansWorkload.scaling),
    numPartitions = getOrDefault[Int](m, "partitions", KMeansWorkload.numOfPartitions)
  )
}

case class KMeansDataGen(
                          numRows: Int,
                          numCols: Int,
                          input: Option[String] = None,
                          output: Option[String],
                          saveMode: String,
                          k: Int,
                          scaling: Double,
                          numPartitions: Int
                        ) extends Workload {

  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {
    val timestamp = System.currentTimeMillis()

    val (generateTime, data): (Long, RDD[Array[Double]]) = time {
      KMeansDataGenerator.generateKMeansRDD(
        spark.sparkContext,
        numRows,
        k,
        numCols,
        scaling,
        numPartitions
      )
    }

    val (convertTime, dataDF) = time {
      val schemaString = data.first().indices.map(i => "c" + i.toString).mkString(" ")
      val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, DoubleType, nullable = false))
      val schema = StructType(fields)
      val rowRDD = data.map(arr => Row(arr:_*))
      spark.createDataFrame(rowRDD, schema)
    }

    val (saveTime, _) = time { writeToDisk(output.get, saveMode, dataDF, spark) }

    val timeResultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("generate", LongType, nullable = true),
        StructField("convert", LongType, nullable = true),
        StructField("save", LongType, nullable = true),
        StructField("total_runtime", LongType, nullable = false)
      )
    )

    val total = generateTime + convertTime + saveTime

    val timeList = spark.sparkContext.parallelize(Seq(Row("kmeans", timestamp, generateTime, convertTime, saveTime, total)))

    spark.createDataFrame(timeList, timeResultSchema)
  }
}

Source File: BuildAndTeardownData.scala From spark-bench with Apache License 2.0

5 votes

package com.ibm.sparktc.sparkbench.testfixtures

import java.io.File

import com.holdenkarau.spark.testing.Utils
import com.ibm.sparktc.sparkbench.utils.SaveModes
import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk
import com.ibm.sparktc.sparkbench.workload.ml.KMeansWorkload
import org.apache.spark.mllib.util.KMeansDataGenerator
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

class BuildAndTeardownData(dirname: String = System.currentTimeMillis.toString) {
  val prefix = "/tmp/spark-bench-scalatest/" + dirname
  val sparkBenchTestFolder = s"$prefix/spark-bench-test"
  val kmeansFile = s"$sparkBenchTestFolder/kmeans-data.parquet"
  val sparkBenchDemoFolder = s"$prefix/spark-bench-demo"
  val spark = SparkSessionProvider.spark

  def createFolders(): Unit = {
    val fileSeq = Seq(new File(sparkBenchTestFolder), new File(sparkBenchDemoFolder))
    fileSeq.foreach(folder => folder.mkdirs())
  }

  def deleteFolders(): Unit = {
    Utils.deleteRecursively(new File(prefix))
  }

  def generateKMeansData(rows: Int, cols: Int, outputFile: String): Unit = {

    val data: RDD[Array[Double]] = KMeansDataGenerator.generateKMeansRDD(
      spark.sparkContext,
      rows,
      KMeansWorkload.numOfClusters,
      cols,
      KMeansWorkload.scaling,
      KMeansWorkload.numOfPartitions
    )

    val schemaString = data.first().indices.map(_.toString).mkString(" ")
    val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, DoubleType, nullable = false))
    val schema = StructType(fields)

    val rowRDD = data.map(arr => Row(arr:_*))

    val df = spark.createDataFrame(rowRDD, schema)

    writeToDisk(outputFile, SaveModes.overwrite, df, spark)
  }
}

Source File: KMeansWorkloadTest.scala From spark-bench with Apache License 2.0

5 votes

package com.ibm.sparktc.sparkbench.workload.ml

import java.io.File

import com.holdenkarau.spark.testing.Utils
import com.ibm.sparktc.sparkbench.testfixtures.SparkSessionProvider
import com.ibm.sparktc.sparkbench.utils.SaveModes
import com.ibm.sparktc.sparkbench.utils.SparkFuncs.{load, writeToDisk}
import org.apache.spark.mllib.util.KMeansDataGenerator
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers}

class KMeansWorkloadTest extends FlatSpec with Matchers with BeforeAndAfterEach {
  private val spark = SparkSessionProvider.spark
  private val fileName = s"/tmp/spark-bench-scalatest/kmeans-${java.util.UUID.randomUUID.toString}.csv"

  override def afterEach() {
    Utils.deleteRecursively(new File(fileName))
  }

  def makeDataFrame(): DataFrame = {
    val data: RDD[Array[Double]] = KMeansDataGenerator.generateKMeansRDD(
      spark.sparkContext, 1, 1, 1, KMeansWorkload.scaling, KMeansWorkload.numOfPartitions
    )
    val schemaString = data.first().indices.map(_.toString).mkString(" ")
    val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, DoubleType, nullable = false))
    val schema = StructType(fields)
    val rowRDD = data.map(arr => Row(arr: _*))
    spark.createDataFrame(rowRDD, schema)
  }

  "reconcileSchema" should "handle a StringType schema and turn it into a DoubleType Schema" in {
    val df2Disk = makeDataFrame()
    writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv"))
    val conf = Map("name" -> "kmeans", "input" -> fileName)
    val work = KMeansWorkload(conf)
    val df = load(spark, fileName)
    val ddf = work.reconcileSchema(df)
    ddf.schema.head.dataType shouldBe DoubleType
  }

  "The load function" should "parse the DataFrame it's given into an RDD[Vector]" in {
    val df = makeDataFrame()
    val conf = Map("name" -> "kmeans", "input" -> "")
    val work = KMeansWorkload(conf)
    val ddf = work.reconcileSchema(df)
    val (_, rdd) = work.loadToCache(ddf, spark)
    rdd.first()
  }

  it should "work even when we've pulled the data from disk" in {
    val df2Disk = makeDataFrame()
    writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv"))
    val conf = Map("name" -> "kmeans", "input" -> fileName)
    val work = KMeansWorkload(conf)
    val df = load(spark, fileName)
    val ddf = work.reconcileSchema(df)
    val (_, rdd) = work.loadToCache(ddf, spark)
    rdd.first()
  }

  "doWorkload" should "work" in {
    val df2Disk = makeDataFrame()
    writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv"))
    val conf = Map("name" -> "kmeans", "input" -> fileName)
    val work = KMeansWorkload(conf)
    val df = load(spark, fileName)
    val ddf = work.reconcileSchema(df)
    work.doWorkload(Some(ddf), spark)
  }
}

Source File: MetadataUtils.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.ml.util

import scala.collection.immutable.HashMap

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.types.StructField



  def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
    require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
      + s" to be Vector type, but it was type ${col.dataType} instead.")
    val inputAttr = AttributeGroup.fromStructField(col)
    names.map { name =>
      require(inputAttr.hasAttr(name),
        s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
      inputAttr.getAttr(name).index.get
    }
  }
}

Source File: VectorSlicerSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{StructField, StructType}

class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    val slicer = new VectorSlicer().setInputCol("feature")
    ParamsSuite.checkParams(slicer)
    assert(slicer.getIndices.length === 0)
    assert(slicer.getNames.length === 0)
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {
        slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true))))
      }
    }
  }

  test("feature validity checks") {
    import VectorSlicer._
    assert(validIndices(Array(0, 1, 8, 2)))
    assert(validIndices(Array.empty[Int]))
    assert(!validIndices(Array(-1)))
    assert(!validIndices(Array(1, 2, 1)))

    assert(validNames(Array("a", "b")))
    assert(validNames(Array.empty[String]))
    assert(!validNames(Array("", "b")))
    assert(!validNames(Array("a", "b", "a")))
  }

  test("Test vector slicer") {
    val data = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())
    )

    // Expected after selecting indices 1, 4
    val expected = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]])

    val resultAttrs = Array("f1", "f4").map(defaultAttr.withName)
    val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]])

    val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) }
    val df = spark.createDataFrame(rdd,
      StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField())))

    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    def validateResults(df: DataFrame): Unit = {
      df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 === vec2)
      }
      val resultMetadata = AttributeGroup.fromStructField(df.schema("result"))
      val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected"))
      assert(resultMetadata.numAttributes === expectedMetadata.numAttributes)
      resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) =>
        assert(a === b)
      }
    }

    vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty)
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array(1)).setNames(Array("f4"))
    validateResults(vectorSlicer.transform(df))

    vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4"))
    validateResults(vectorSlicer.transform(df))
  }

  test("read/write") {
    val t = new VectorSlicer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setIndices(Array(1, 3))
      .setNames(Array("a", "d"))
    testDefaultReadWrite(t)
  }
}

Source File: SQLTransformerSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.types.{LongType, StructField, StructType}

class SQLTransformerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new SQLTransformer())
  }

  test("transform numeric data") {
    val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
    val sqlTrans = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    val result = sqlTrans.transform(original)
    val resultSchema = sqlTrans.transformSchema(original.schema)
    val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0))
      .toDF("id", "v1", "v2", "v3", "v4")
    assert(result.schema.toString == resultSchema.toString)
    assert(resultSchema == expected.schema)
    assert(result.collect().toSeq == expected.collect().toSeq)
    assert(original.sparkSession.catalog.listTables().count() == 0)
  }

  test("read/write") {
    val t = new SQLTransformer()
      .setStatement("select * from __THIS__")
    testDefaultReadWrite(t)
  }

  test("transformSchema") {
    val df = spark.range(10)
    val outputSchema = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")
      .transformSchema(df.schema)
    val expected = StructType(Seq(StructField("id1", LongType, nullable = false)))
    assert(outputSchema === expected)
  }
}

Source File: MetastoreRelationSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class MetastoreRelationSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
  test("makeCopy and toJSON should work") {
    val table = CatalogTable(
      identifier = TableIdentifier("test", Some("db")),
      tableType = CatalogTableType.VIEW,
      storage = CatalogStorageFormat.empty,
      schema = StructType(StructField("a", IntegerType, true) :: Nil))
    val relation = MetastoreRelation("db", "test")(table, null)

    // No exception should be thrown
    relation.makeCopy(Array("db", "test"))
    // No exception should be thrown
    relation.toJSON
  }

  test("SPARK-17409: Do Not Optimize Query in CTAS (Hive Serde Table) More Than Once") {
    withTable("bar") {
      withTempView("foo") {
        sql("select 0 as id").createOrReplaceTempView("foo")
        // If we optimize the query in CTAS more than once, the following saveAsTable will fail
        // with the error: `GROUP BY position 0 is not in select list (valid range is [1, 1])`
        sql("CREATE TABLE bar AS SELECT * FROM foo group by id")
        checkAnswer(spark.table("bar"), Row(0) :: Nil)
        val tableMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier("bar"))
        assert(tableMetadata.provider == Some("hive"), "the expected table is a Hive serde table")
      }
    }
  }
}

Source File: SparkExecuteStatementOperationSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.hive.thriftserver

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{NullType, StructField, StructType}

class SparkExecuteStatementOperationSuite extends SparkFunSuite {
  test("SPARK-17112 `select null` via JDBC triggers IllegalArgumentException in ThriftServer") {
    val field1 = StructField("NULL", NullType)
    val field2 = StructField("(IF(true, NULL, NULL))", NullType)
    val tableSchema = StructType(Seq(field1, field2))
    val columns = SparkExecuteStatementOperation.getTableSchema(tableSchema).getColumnDescriptors()
    assert(columns.size() == 2)
    assert(columns.get(0).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
    assert(columns.get(1).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
  }
}

Source File: LocalRelation.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.types.{StructField, StructType}

object LocalRelation {
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  def apply(output1: StructField, output: StructField*): LocalRelation = {
    new LocalRelation(StructType(output1 +: output).toAttributes)
  }

  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }

  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }
}

case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil)
  extends LeafNode with analysis.MultiInstanceRelation {

  // A local relation must have resolved output.
  require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.")

  
  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type]
  }

  override protected def stringArgs: Iterator[Any] = {
    if (data.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def sameResult(plan: LogicalPlan): Boolean = {
    plan.canonicalized match {
      case LocalRelation(otherOutput, otherData) =>
        otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data
      case _ => false
    }
  }

  override lazy val statistics =
    Statistics(sizeInBytes =
      (output.map(n => BigInt(n.dataType.defaultSize))).sum * data.length)

  def toSQL(inlineTableName: String): String = {
    require(data.nonEmpty)
    val types = output.map(_.dataType)
    val rows = data.map { row =>
      val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql }
      cells.mkString("(", ", ", ")")
    }
    "VALUES " + rows.mkString(", ") +
      " AS " + inlineTableName +
      output.map(_.name).mkString("(", ", ", ")")
  }
}

Source File: ResolveInlineTables.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis

import scala.util.control.NonFatal

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Cast
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.types.{StructField, StructType}


  private[analysis] def convert(table: UnresolvedInlineTable): LocalRelation = {
    // For each column, traverse all the values and find a common data type and nullability.
    val fields = table.rows.transpose.zip(table.names).map { case (column, name) =>
      val inputTypes = column.map(_.dataType)
      val tpe = TypeCoercion.findWiderTypeWithoutStringPromotion(inputTypes).getOrElse {
        table.failAnalysis(s"incompatible types found in column $name for inline table")
      }
      StructField(name, tpe, nullable = column.exists(_.nullable))
    }
    val attributes = StructType(fields).toAttributes
    assert(fields.size == table.names.size)

    val newRows: Seq[InternalRow] = table.rows.map { row =>
      InternalRow.fromSeq(row.zipWithIndex.map { case (e, ci) =>
        val targetType = fields(ci).dataType
        try {
          if (e.dataType.sameType(targetType)) {
            e.eval()
          } else {
            Cast(e, targetType).eval()
          }
        } catch {
          case NonFatal(ex) =>
            table.failAnalysis(s"failed to evaluate expression ${e.sql}: ${ex.getMessage}")
        }
      })
    }

    LocalRelation(attributes, newRows)
  }
}

Source File: resources.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}


case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("Results", StringType, nullable = false)() :: Nil
  }
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val jarList = sparkSession.sparkContext.listJars()
    if (jars.nonEmpty) {
      for {
        jarName <- jars.map(f => new Path(f).getName)
        jarPath <- jarList if jarPath.contains(jarName)
      } yield Row(jarPath)
    } else {
      jarList.map(Row(_))
    }
  }
}

Source File: DDLSourceLoadSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}


// please note that the META-INF/services had to be modified for the test directory for this to work
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name") {
    intercept[RuntimeException] {
      spark.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {
    spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("specify full classname with duplicate formats") {
    spark.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("should fail to load ORC without Hive Support") {
    val e = intercept[AnalysisException] {
      spark.read.format("orc").load()
    }
    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
  }
}


class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceTwo extends RelationProvider  with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

Source File: BlockingSource.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}


class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
}

Source File: XGBoost.scala From uberdata with Apache License 2.0

5 votes

package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import eleflow.uberdata.models.UberXGBOOSTModel
import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{ArrayType, DoubleType, StructField, StructType}

import scala.reflect.ClassTag


class XGBoost[I](override val uid: String,
                 val models: RDD[(I, (UberXGBOOSTModel,
                   Seq[(ModelParamEvaluation[I])]))])(
  implicit kt: ClassTag[I],
  ord: Ordering[I] = null)
    extends ForecastBaseModel[XGBoostSmallModel[I]]
    with HasInputCol
    with HasOutputCol
    with DefaultParamsWritable
    with HasFeaturesCol
    with HasNFutures
    with HasGroupByCol {

  def this(
    models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))]
  )(implicit kt: ClassTag[I], ord: Ordering[I] ) =
    this(Identifiable.randomUID("xgboost"), models)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val schema = dataSet.schema
    val predSchema = transformSchema(schema)
    val joined = models.join(dataSet.rdd.map{case (r: Row) => (r.getAs[I]($(groupByCol).get), r)})

    val predictions = joined.map {
      case (id, ((bestModel, metrics), row)) =>
        val features = row.getAs[Array[org.apache.spark.ml.linalg.Vector]](
          IUberdataForecastUtil.FEATURES_COL_NAME
        )
        val label = DataTransformer.toFloat(row.getAs($(featuresCol)))
        val labelPoint = features.map { vec =>
          val array = vec.toArray.map(_.toFloat)
          LabeledPoint(label, null, array)
        }
        val matrix = new DMatrix(labelPoint.toIterator)
        val (ownFeaturesPrediction, forecast) = bestModel.boosterInstance
          .predict(matrix)
          .flatMap(_.map(_.toDouble))
          .splitAt(features.length)
        Row(
          row.toSeq :+ Vectors
            .dense(forecast) :+ SupportedAlgorithm.XGBoostAlgorithm.toString :+ bestModel.params
            .map(f => f._1 -> f._2.toString) :+ Vectors.dense(ownFeaturesPrediction): _*
        )
    }
    dataSet.sqlContext.createDataFrame(predictions, predSchema)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    schema.add(StructField($(outputCol), ArrayType(DoubleType)))
  }

  override def copy(extra: ParamMap): XGBoostSmallModel[I] = defaultCopy(extra)
}

Source File: XGBoostBaseBestModel.scala From uberdata with Apache License 2.0

5 votes

package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.evaluation.TimeSeriesEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasGroupByCol
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType}


trait BaseXGBoostBestModelFinder[G, M <: org.apache.spark.ml.ForecastBaseModel[M]]
    extends BestModelFinder[G, M]
    with HasGroupByCol {

  protected def buildTrainSchema(sparkContext: SparkContext): Broadcast[StructType] = sparkContext.broadcast {
    StructType(
      Seq(
        StructField($(groupByCol).get, FloatType),
        StructField(IUberdataForecastUtil.FEATURES_COL_NAME, ArrayType(new VectorUDT))))
  }


  protected def xGBoostEvaluation(row: Row,
                                  model: Booster,
                                  broadcastEvaluator: Broadcast[TimeSeriesEvaluator[G]],
                                  id: G,
                                  parameters: ParamMap): ModelParamEvaluation[G] = {
    val featuresArray = row
      .getAs[Array[org.apache.spark.ml.linalg.Vector]](IUberdataForecastUtil.FEATURES_COL_NAME)
      .map { vec =>
        val values = vec.toArray.map(DataTransformer.toFloat)
        LabeledPoint(values.head, null, values.tail)
      }
    val features = new DMatrix(featuresArray.toIterator)
    log.warn(s"Evaluating forecast for id $id, with xgboost")
    val prediction = model.predict(features).flatten
    val (forecastToBeValidated, _) = prediction.splitAt(featuresArray.length)
    val toBeValidated = featuresArray.zip(forecastToBeValidated)
    val metric = broadcastEvaluator.value.evaluate(toBeValidated.map(f =>
      (f._1.label.toDouble, f._2.toDouble)))
    val metricName = broadcastEvaluator.value.getMetricName
    new ModelParamEvaluation[G](
      id,
      metric,
      parameters,
      Some(metricName),
      SupportedAlgorithm.XGBoostAlgorithm)
  }
}

Source File: TimeSeriesGenerator.scala From uberdata with Apache License 2.0

5 votes

package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasGroupByCol
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, StructType}

import scala.reflect.ClassTag


  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val rdd = dataSet.rdd

    val sparkContext = dataSet.sqlContext.sparkContext
    val index = sparkContext.broadcast(dataSet.schema.fieldIndex($(timeCol).get))
    val labelColIndex =
      sparkContext.broadcast(dataSet.schema.fieldIndex($(groupByCol).get))
    val featuresColIndex =
      sparkContext.broadcast(dataSet.schema.fieldIndex($(featuresCol)))
    val grouped = rdd.map { case (row: Row) =>
      val timeColRow =
        IUberdataForecastUtil.convertColumnToLong(row, index.value)
      convertColumnToDouble(timeColRow, featuresColIndex)
    }.groupBy { row =>
      row.getAs[L](labelColIndex.value)
    }.map {
      case (key, values) =>
        val toBeUsed =
          values.toArray.sortBy(row => row.getAs[Long](index.value))
        (key, toBeUsed)
    }

    val toBeTrained = grouped.map {
      case (key, values) =>
        org.apache.spark.sql.Row(
          key,
          Vectors.dense(values.map(_.getAs[Double](featuresColIndex.value)))
        )
    }

    val trainSchema = transformSchema(dataSet.schema)
    dataSet.sqlContext.createDataFrame(toBeTrained, trainSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    val labelIndex = schema.fieldIndex($(groupByCol).get)
    StructType(
      Seq(
        schema.fields(labelIndex),
        StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT)
      )
    )
  }

  override def copy(extra: ParamMap): TimeSeriesGenerator[L] =
    defaultCopy(extra)

}

object TimeSeriesGenerator extends DefaultParamsReadable[TimeSeriesGenerator[_]] {

  override def load(path: String): TimeSeriesGenerator[_] = super.load(path)
}

Source File: XGBoostBigModel.scala From uberdata with Apache License 2.0

5 votes

package org.apache.spark.ml


import com.cloudera.sparkts.models.UberXGBoostModel
import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.feature.{LabeledPoint => SparkLabeledPoint}
import org.apache.spark.ml.param.shared.{HasIdCol, HasLabelCol}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}


class XGBoostBigModel[I](val uid: String, val models: Seq[(ParamMap, XGBoostModel)])
    extends ForecastBaseModel[XGBoostBigModel[I]]
    with HasLabelCol
    with HasIdCol {

  def setLabelcol(label: String): this.type = set(labelCol, label)

  def setIdcol(id: String): this.type = set(idCol, id)

  override def copy(extra: ParamMap): XGBoostBigModel[I] = new XGBoostBigModel[I](uid, models)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
            row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
            )
      }
      .join(prediction)
      .map {
        case (id, (features, predictValue)) =>
          Row(id, features, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }

  protected def predict(dataSet: Dataset[_]) = {
    val features = dataSet.rdd.map { case (row: Row) =>
      val features = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
      val id = row.getAs[I]($(idCol))
      SparkLabeledPoint(DataTransformer.toFloat(id), features)
    }.cache
    val (_, model) = models.head
    UberXGBoostModel.labelPredict(features.map(_.features.toDense), booster = model)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(getPredictionSchema)

  protected def getPredictionSchema: Array[StructField] = {
    Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    )
  }
}

Source File: VectorizeEncoder.scala From uberdata with Apache License 2.0

5 votes

package org.apache.spark.ml

import eleflow.uberdata.core.data.DataTransformer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, StructType}


class VectorizeEncoder(override val uid: String)
    extends Transformer
    with HasIdCol
    with HasTimeCol
    with HasInputCols
    with HasLabelCol
    with HasGroupByCol
    with HasOutputCol
    with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("vectorizer"))

  def setIdCol(input: String) = set(idCol, input)

  def setLabelCol(input: String) = set(labelCol, input)

  def setGroupByCol(toGroupBy: String) = set(groupByCol, Some(toGroupBy))

  def setInputCol(input: Array[String]) = set(inputCols, input)

  def setTimeCol(time: String) = set(timeCol, Some(time))

  def setOutputCol(output: String) = set(outputCol, output)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val context = dataSet.sqlContext.sparkContext
    val input = context.broadcast($(inputCols))
    val allColumnNames = dataSet.schema.map(_.name)

    val nonInputColumnIndexes = context.broadcast(
      allColumnNames.zipWithIndex.filter(
        f => !$(inputCols).contains(f._1) || f._1 == $(groupByCol).get || f._1 == $(idCol)
          || f._1 == $(timeCol).getOrElse("")))
    val result = dataSet.rdd.map { case (row: Row) =>
      val rowSeq = row.toSeq
      val nonInputColumns = nonInputColumnIndexes.value.map {
        case (_, index) => rowSeq(index)
      }
      val size = input.value.length
      val (values, indices) = input.value
        .filter(col => row.getAs(col) != null)
        .map { column =>
          DataTransformer.toDouble(row.getAs(column))
        }
        .zipWithIndex
        .filter(f => f._1 != 0d)
        .unzip
      Row(
        nonInputColumns :+ org.apache.spark.ml.linalg.Vectors
          .sparse(size, indices.toArray, values.toArray): _*
      )
    }
    val newSchema = transformSchema(dataSet.schema)
    dataSet.sqlContext.createDataFrame(result, newSchema)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(
      schema.filter(
        col =>
          !$(inputCols).contains(col.name) || col.name == $(groupByCol).getOrElse("") || col.name == $(idCol)
            || col.name == $(labelCol) || col.name == $(timeCol).getOrElse("")
      )
    ).add(StructField($(outputCol), new VectorUDT))
}

Source File: AllColumnsTimeSeriesGenerator.scala From uberdata with Apache License 2.0

5 votes

package org.apache.spark.ml

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}

import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset

import scala.reflect.ClassTag


  def setOutputCol(value: String): this.type = set(outputCol, value)

//  override def transform(dataSet: DataFrame): DataFrame = {
  override def transform(dataSet: Dataset[_] ): DataFrame = {
    val rdd = dataSet.rdd
    val sparkContext = dataSet.sqlContext.sparkContext
    val labelColIndex =
      sparkContext.broadcast(dataSet.schema.fieldIndex($(labelCol)))
    val keyValueDataSet = rdd.map { case (row: Row) =>
      Row(
        row.getAs[T](labelColIndex.value),
        row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol))
      )
    }
    val trainSchema = transformSchema(dataSet.schema)

    dataSet.sqlContext.createDataFrame(keyValueDataSet, trainSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    StructType(
      schema.filter(_.name == $(labelCol)).head +: Seq(
        StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT)
      )
    )
  }

  override def copy(extra: ParamMap): AllColumnsTimeSeriesGenerator[T, U] =
    defaultCopy(extra)
}

object AllColumnsTimeSeriesGenerator
    extends DefaultParamsReadable[AllColumnsTimeSeriesGenerator[_, _]] {

  override def load(path: String): AllColumnsTimeSeriesGenerator[_, _] =
    super.load(path)
}

Source File: ForecastPipelineStage.scala From uberdata with Apache License 2.0

5 votes

package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import org.apache.spark.ml.param.shared.{HasNFutures, HasPredictionCol, HasValidationCol}
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.types.{StructType, StringType, StructField, MapType}


trait ForecastPipelineStage
    extends PipelineStage
    with HasNFutures
    with HasPredictionCol
    with HasValidationCol {

  def setValidationCol(value: String): this.type = set(validationCol, value)

  override def transformSchema(schema: StructType): StructType = {
    schema
      .add(StructField($(validationCol), new VectorUDT))
      .add(StructField(IUberdataForecastUtil.ALGORITHM, StringType))
      .add(StructField(IUberdataForecastUtil.PARAMS, MapType(StringType, StringType)))
  }
}

Source File: XGBoostBigModelTimeSeries.scala From uberdata with Apache License 2.0

5 votes

package org.apache.spark.ml

import java.sql.Timestamp

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasTimeCol
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}


class XGBoostBigModelTimeSeries[I](override val uid: String,
                                   override val models: Seq[(ParamMap, XGBoostModel)])
                                  extends XGBoostBigModel[I](uid, models) with HasTimeCol{

  def setTimecol(time: String): this.type = set(timeCol, Some(time))

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
            (row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME),
              row.getAs[java.sql.Timestamp]($(timeCol).get)))
      }
      .join(prediction)
      .map {
        case (id, ((features, time), predictValue)) =>
          Row(id, features, time, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }


  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField($(timeCol).get, TimestampType),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    ) )
}

Source File: CarbonHiveMetastoreListener.scala From carbondata with Apache License 2.0

5 votes

package org.apache.carbondata.hive

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.metastore.MetaStorePreEventListener
import org.apache.hadoop.hive.metastore.api.{FieldSchema, MetaException}
import org.apache.hadoop.hive.metastore.events._
import org.apache.hadoop.hive.metastore.events.PreEventContext.PreEventType._
import org.apache.spark.sql.types.{DataType, StructField, StructType}

class CarbonHiveMetastoreListener(conf: Configuration) extends MetaStorePreEventListener(conf) {

  override def onEvent(preEventContext: PreEventContext): Unit = {
    preEventContext.getEventType match {
      case CREATE_TABLE =>
        val table = preEventContext.asInstanceOf[PreCreateTableEvent].getTable
        val tableProps = table.getParameters
        if (tableProps != null
          && (tableProps.get("spark.sql.sources.provider") == "org.apache.spark.sql.CarbonSource"
          || tableProps.get("spark.sql.sources.provider").equalsIgnoreCase("carbondata"))) {
          val numSchemaParts = tableProps.get("spark.sql.sources.schema.numParts")
          if (numSchemaParts != null && !numSchemaParts.isEmpty) {
            val parts = (0 until numSchemaParts.toInt).map { index =>
              val part = tableProps.get(s"spark.sql.sources.schema.part.${index}")
              if (part == null) {
                throw new MetaException(s"spark.sql.sources.schema.part.${index} is missing!")
              }
              part
            }
            // Stick all parts back to a single schema string.
            val schema = DataType.fromJson(parts.mkString).asInstanceOf[StructType]
            val hiveSchema = schema.map(toHiveColumn).asJava
            table.getSd.setCols(hiveSchema)
            table.getSd.setInputFormat("org.apache.carbondata.hive.MapredCarbonInputFormat")
            table.getSd.setOutputFormat("org.apache.carbondata.hive.MapredCarbonOutputFormat")
            val serdeInfo = table.getSd.getSerdeInfo
            serdeInfo.setSerializationLib("org.apache.carbondata.hive.CarbonHiveSerDe")
            val tablePath = serdeInfo.getParameters.get("tablePath")
            if (tablePath != null) {
              table.getSd.setLocation(tablePath)
            }
          }
        }
      case ALTER_TABLE =>
        val table = preEventContext.asInstanceOf[PreAlterTableEvent].getNewTable
        val tableProps = table.getParameters
        if (tableProps != null
          && (tableProps.get("spark.sql.sources.provider") == "org.apache.spark.sql.CarbonSource"
          || tableProps.get("spark.sql.sources.provider").equalsIgnoreCase("carbondata"))) {
          val numSchemaParts = tableProps.get("spark.sql.sources.schema.numParts")
          if (numSchemaParts != null && !numSchemaParts.isEmpty) {
            val schemaParts = (0 until numSchemaParts.toInt).map { index =>
              val schemaPart = tableProps.get(s"spark.sql.sources.schema.part.$index")
              if (schemaPart == null) {
                throw new MetaException(s"spark.sql.sources.schema.part.$index is missing!")
              }
              schemaPart
            }
            // Stick all schemaParts back to a single schema string.
            val schema = DataType.fromJson(schemaParts.mkString).asInstanceOf[StructType]
            val hiveSchema = schema.map(toHiveColumn).asJava
            table.getSd.setCols(hiveSchema)
          }
        }
      case _ =>
      // do nothing
    }
  }

  private def toHiveColumn(c: StructField): FieldSchema = {
    val typeString = if (c.metadata.contains("HIVE_TYPE_STRING")) {
      c.metadata.getString("HIVE_TYPE_STRING")
    } else {
      c.dataType.catalogString
    }
    new FieldSchema(c.name, typeString, c.getComment().orNull)
  }
}

org.apache.spark.sql.types.StructField Scala Examples