org.apache.spark.sql.types.StructField Scala Examples
The following examples show how to use org.apache.spark.sql.types.StructField.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: OnErrorSuite.scala From spark-snowflake with Apache License 2.0 | 6 votes |
package net.snowflake.spark.snowflake

import net.snowflake.client.jdbc.SnowflakeSQLException
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME
import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

/**
 * Integration tests for the `continue_on_error` write option.
 *
 * The fixture frame holds two JSON strings, one of which has an invalid key, and is
 * appended to a table with a single VARIANT column.
 */
class OnErrorSuite extends IntegrationSuiteBase {

  lazy val table = s"spark_test_table_$randomSuffix"

  lazy val schema = new StructType(
    Array(StructField("var", StringType, nullable = false))
  )

  lazy val df: DataFrame = {
    val invalidJsonKey = Row("{\"dsadas\nadsa\":12311}") // invalid json key
    val validJson = Row("{\"abc\":334}")
    sparkSession.createDataFrame(sc.parallelize(Seq(invalidJsonKey, validJson)), schema)
  }

  /** Appends `df` to `table`, applying `extraOptions` on top of the connector options. */
  private def appendFixture(extraOptions: Map[String, String]): Unit =
    df.write
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .options(extraOptions)
      .option("dbtable", table)
      .mode(SaveMode.Append)
      .save()

  override def beforeAll(): Unit = {
    super.beforeAll()
    jdbcUpdate(s"create or replace table $table(var variant)")
  }

  override def afterAll(): Unit = {
    jdbcUpdate(s"drop table $table")
    super.afterAll()
  }

  test("continue_on_error off") {
    assertThrows[SnowflakeSQLException] {
      appendFixture(Map.empty)
    }
  }

  test("continue_on_error on") {
    appendFixture(Map("continue_on_error" -> "on"))

    val loadedRows = sparkSession.read
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .option("dbtable", table)
      .load()
      .collect()

    assert(loadedRows.length == 1)
  }
}
Example 2
Source File: SnowflakePlan.scala From spark-snowflake with Apache License 2.0 | 5 votes |
package net.snowflake.spark.snowflake.pushdowns

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{StructField, StructType}

/**
 * Leaf physical plan that exposes an existing RDD of internal rows.
 *
 * @param output attributes describing each column of `rdd`
 * @param rdd    the rows to emit
 */
case class SnowflakePlan(output: Seq[Attribute], rdd: RDD[InternalRow]) extends SparkPlan {

  override def children: Seq[SparkPlan] = Nil

  /** Runs every row of `rdd` through an `UnsafeProjection` built from `output`. */
  protected override def doExecute(): RDD[InternalRow] = {
    val fields = for (attribute <- output)
      yield StructField(attribute.name, attribute.dataType, attribute.nullable)
    val rowSchema = StructType(fields)

    rdd.mapPartitions { partition =>
      // One projection instance per partition.
      val toUnsafe = UnsafeProjection.create(rowSchema)
      partition.map(toUnsafe)
    }
  }
}
Example 3
Source File: cogroup.scala From spark-tools with Apache License 2.0 | 5 votes |
package io.univalence.plumbus

import org.apache.spark.Partitioner
import org.apache.spark.rdd.{ CoGroupedRDD, RDD }
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ ArrayType, StructField }
import org.apache.spark.sql.{ types, DataFrame, Dataset, Encoder, KeyValueGroupedDataset, Row }

import scala.reflect.ClassTag
import scala.util.Try

/** Co-grouping helpers for typed Datasets and for DataFrames keyed by a column name. */
object cogroup {

  implicit class KVGD[K, A](val kvgd: KeyValueGroupedDataset[K, A]) {

    /** Not implemented: calling this throws `NotImplementedError`. */
    def cogroup[B](right: KeyValueGroupedDataset[K, B]): Dataset[(K, Seq[A], Seq[B])] =
      //Use SparkAddOn ?
      ???
  }

  /**
   * Co-groups two Datasets on keys extracted by `keyLeft` and `keyRight`.
   *
   * @return one `(key, left values, right values)` tuple per key produced by the RDD cogroup
   */
  def apply[A, B, K](left: Dataset[A], right: Dataset[B])(keyLeft: A => K, keyRight: B => K)(
    implicit encA: Encoder[A],
    encB: Encoder[B],
    encC: Encoder[K],
    enc: Encoder[(K, Seq[A], Seq[B])],
    ca: ClassTag[A],
    ck: ClassTag[K],
    cb: ClassTag[B]
  ): Dataset[(K, Seq[A], Seq[B])] =
    left.sparkSession.implicits
      .rddToDatasetHolder(
        RDD
          .rddToPairRDDFunctions(left.rdd.keyBy(keyLeft))
          .cogroup(right.rdd.keyBy(keyRight))
          .map({ case (k, (ia, ib)) => (k, ia.toSeq, ib.toSeq) })
      )
      .toDS

  /**
   * Co-groups `group` with each named sub-group on the string form of column `byKey`.
   *
   * The result schema is the columns of `group` followed by one array column per
   * sub-group, named after the sub-group, holding that sub-group's rows for the key.
   *
   * @param group         main frame; the first row found for a key supplies the leading columns
   * @param namedSubGroup `(column name, frame)` pairs appended as array columns
   * @param byKey         name of the key column; must exist in every frame
   * @param partitioner   partitioner used by the underlying `CoGroupedRDD`
   * @return `Failure(IllegalArgumentException)` when a frame lacks `byKey`; otherwise the
   *         lazily evaluated co-grouped frame
   */
  def cogroupDf(group: DataFrame, namedSubGroup: (String, DataFrame)*)(
    byKey: String,
    partitioner: Partitioner = Partitioner.defaultPartitioner(group.rdd, namedSubGroup.map(_._2.rdd): _*)
  ): Try[DataFrame] =
    Try {
      val subGroup: Seq[DataFrame]  = namedSubGroup.map(_._2)
      val allFrames: Seq[DataFrame] = group +: subGroup

      val allFramesKeyed: Seq[RDD[(String, Row)]] = allFrames.map { df =>
        val idx = df.columns.indexOf(byKey)
        // Fail eagerly (inside the Try) rather than with an index error on the executors.
        require(
          idx >= 0,
          s"key column '$byKey' not found in columns [${df.columns.mkString(", ")}]"
        )
        df.rdd.keyBy(_.get(idx).toString)
      }

      val cogroupRdd: CoGroupedRDD[String] = new CoGroupedRDD[String](allFramesKeyed, partitioner)

      val rowRdd: RDD[Row] = cogroupRdd.map { case (_, grouped) =>
        val rows: Array[Seq[Row]] = grouped.asInstanceOf[Array[Iterable[Row]]].map(_.toSeq)
        // NOTE(review): a key that is absent from `group` yields an empty first group and
        // `head` throws at execution time — confirm the intended handling.
        Row.fromSeq(rows.head.head.toSeq ++ rows.tail)
      }

      val schema = types.StructType(
        group.schema.fields ++ namedSubGroup.map {
          case (name, df) => StructField(name, ArrayType(df.schema))
        }
      )

      group.sparkSession.createDataFrame(rowRdd, schema)
    }
}
Example 4
Source File: DataFrameComparisonTest.scala From spark-tools with Apache License 2.0 | 5 votes |
package io.univalence.sparktest

import io.univalence.schema.SchemaComparator.SchemaError
import org.apache.spark.SparkContext
import org.apache.spark.sql.{ Row, SparkSession }
import org.apache.spark.sql.types.{ IntegerType, StructField, StructType }
import org.scalatest.FunSuite

/** Tests for `assertEquals` between DataFrames, and between a DataFrame and a Seq. */
class DataFrameComparisonTest extends FunSuite with SparkTest {

  val sharedSparkSession: SparkSession = ss
  val sc: SparkContext                 = ss.sparkContext

  // TODO : unordered
  ignore("should assertEquals unordered between equal DF") {
    val dfUT       = Seq(1, 2, 3).toDF("id")
    val dfExpected = Seq(3, 2, 1).toDF("id")

    dfUT.assertEquals(dfExpected)
  }

  // TODO : unordered
  ignore("should not assertEquals unordered between DF with different contents") {
    val dfUT       = Seq(1, 2, 3).toDF("id")
    val dfExpected = Seq(2, 1, 4).toDF("id")

    assertThrows[SparkTestError] {
      dfUT.assertEquals(dfExpected)
    }
  }

  test("should assertEquals ordered between equal DF") {
    val dfUT       = Seq(1, 2, 3).toDF("id")
    val dfExpected = Seq(1, 2, 3).toDF("id")

    dfUT.assertEquals(dfExpected)
  }

  test("should not assertEquals ordered between DF with different contents") {
    val dfUT       = Seq(1, 2, 3).toDF("id")
    val dfExpected = Seq(1, 3, 4).toDF("id")

    assertThrows[SparkTestError] {
      dfUT.assertEquals(dfExpected)
    }
  }

  test("should not assertEquals between DF with different schema") {
    val dfUT       = Seq(1, 2, 3).toDF("id")
    val dfExpected = Seq(1, 2, 3).toDF("di")

    assertThrows[SchemaError] {
      dfUT.assertEquals(dfExpected)
    }
  }

  test("assertEquals (DF & Seq) : a DF and a Seq with the same content are equal") {
    val seq = Seq(1, 2, 3)
    val df = ss.createDataFrame(
      sc.parallelize(seq.map(Row(_))),
      StructType(List(StructField("number", IntegerType, nullable = true)))
    )

    df.assertEquals(seq)
  }

  test("assertEquals (DF & Seq) : a DF and a Seq with different content are not equal") {
    val df    = Seq(1, 3, 3).toDF("number")
    val seqEx = Seq(1, 2, 3)

    assertThrows[SparkTestError] {
      df.assertEquals(seqEx)
    }
  }

  // The frames below differ in content, so the name states the expected failure; the
  // previous name claimed the frames were equal, which contradicted the assertion.
  test(
    "should not assertEquals ordered between DF with different contents and columns containing special character"
  ) {
    val dfUT       = Seq(1, 2, 3).toDF("id.a")
    val dfExpected = Seq(2, 1, 4).toDF("id.a")

    assertThrows[SparkTestError] {
      dfUT.assertEquals(dfExpected)
    }
  }
}
Example 5
Source File: ConfigurableDataGeneratorMain.scala From Spark.TableStatsExample with Apache License 2.0 | 5 votes |
package com.cloudera.sa.examples.tablestats

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType}
import org.apache.spark.{SparkContext, SparkConf}

import scala.collection.mutable
import scala.util.Random

/**
 * Generates a Parquet data set of random values whose columns alternate between
 * long (even index) and string (odd index).
 */
object ConfigurableDataGeneratorMain {

  /**
   * Entry point.
   *
   * Arguments: `<outputPath> <numberOfColumns> <numberOfRecords> <numberOfPartitions> [L]`.
   * A fifth argument equal to `L` runs with a local master. The usage line is printed
   * when fewer than the four mandatory arguments are supplied.
   *
   * @throws IllegalArgumentException if `numberOfPartitions` is not positive
   * @throws NumberFormatException    if a numeric argument is not an integer
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 4) {
      // Previously only the zero-argument case was caught; 1-3 arguments crashed on args(i).
      println("ConfigurableDataGeneratorMain <outputPath> <numberOfColumns> <numberOfRecords> <numberOfPartitions> <local>")
    } else {
      val outputPath = args(0)
      val numberOfColumns = args(1).toInt
      val numberOfRecords = args(2).toInt
      val numberOfPartitions = args(3).toInt
      // Guards the per-partition division below.
      require(numberOfPartitions > 0, "numberOfPartitions must be positive")
      val runLocal = args.length == 5 && args(4) == "L"

      val sc = createSparkContext(runLocal)
      try {
        val sqlContext = new org.apache.spark.sql.SQLContext(sc)

        //Part A: one seed element per partition
        val rowRDD = sc.parallelize((0 until numberOfPartitions).map(i => i), numberOfPartitions)

        //Part B: expand every seed into its share of random rows
        val recordsPerPartition = numberOfRecords / numberOfPartitions
        val megaDataRDD = rowRDD.flatMap { _ =>
          val random = new Random()
          (0 until recordsPerPartition).iterator.map[Row] { _ =>
            val values = Array.tabulate[Any](numberOfColumns) { i =>
              if (i % 2 == 0) random.nextInt(100).toLong else random.nextInt(100).toString
            }
            new GenericRow(values)
          }
        }

        //Part C: schema mirrors the long/string alternation used above
        val schema = StructType(
          (0 until numberOfColumns).map { i =>
            if (i % 2 == 0) StructField("longColumn_" + i, LongType, true)
            else StructField("stringColumn_" + i, StringType, true)
          }
        )

        val df = sqlContext.createDataFrame(megaDataRDD, schema)
        df.saveAsParquetFile(outputPath)
      } finally {
        //Part D: always release the context, even when generation fails
        sc.stop()
      }
    }
  }

  /** Builds a local context with compression disabled when `runLocal`, else a named cluster context. */
  private def createSparkContext(runLocal: Boolean): SparkContext =
    if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "test", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("ConfigurableDataGeneratorMain")
      new SparkContext(sparkConfig)
    }
}
Example 6
Source File: TestTableStatsSinglePathMain.scala From Spark.TableStatsExample with Apache License 2.0 | 5 votes |
package com.cloudera.sa.examples.tablestats

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType}
import org.scalatest.{FunSuite, BeforeAndAfterEach, BeforeAndAfterAll}

/** Runs the first-pass table statistics over a six-row sample and checks the results. */
class TestTableStatsSinglePathMain extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll{

  test("run table stats on sample data") {
    val sparkConfig = new SparkConf()
      .set("spark.broadcast.compress", "false")
      .set("spark.shuffle.compress", "false")
      .set("spark.shuffle.spill.compress", "false")
    val sc = new SparkContext("local", "test", sparkConfig)

    try {
      val sqlContext = new org.apache.spark.sql.SQLContext(sc)

      val schema = StructType(
        Array(
          StructField("id", LongType, true),
          StructField("name", StringType, true),
          StructField("age", LongType, true),
          StructField("gender", StringType, true),
          StructField("height", LongType, true),
          StructField("job_title", StringType, true)
        )
      )

      val sampleRows = Array(
        Row(1L, "Name.1", 20L, "M", 6L, "dad"),
        Row(2L, "Name.2", 20L, "F", 5L, "mom"),
        Row(3L, "Name.3", 20L, "F", 5L, "mom"),
        Row(4L, "Name.4", 20L, "M", 5L, "mom"),
        Row(5L, "Name.5", 10L, "M", 4L, "kid"),
        Row(6L, "Name.6", 8L, "M", 3L, "kid")
      )
      val df = sqlContext.createDataFrame(sc.parallelize(sampleRows), schema)

      val firstPassStats = TableStatsSinglePathMain.getFirstPassStat(df)

      // Column 0 is "id".
      val idStats = firstPassStats.columnStatsMap(0)
      assertResult(6L)(idStats.maxLong)
      assertResult(1L)(idStats.minLong)
      assertResult(21L)(idStats.sumLong)
      assertResult(3L)(idStats.avgLong)

      // Column 3 is "gender": two distinct values, four "M" and two "F".
      val genderCounts = firstPassStats.columnStatsMap(3).topNValues.topNCountsForColumnArray
      assertResult(2)(genderCounts.length)
      genderCounts.foreach { entry =>
        if (entry._1.equals("M")) {
          assertResult(4L)(entry._2)
        } else if (entry._1.equals("F")) {
          assertResult(2L)(entry._2)
        } else {
          throw new RuntimeException("Unknown gender: " + entry._1)
        }
      }
    } finally {
      sc.stop()
    }
  }
}
Example 7
Source File: MetadataUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util

import scala.collection.immutable.HashMap

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.types.StructField

  // NOTE(review): the declaration that encloses this method is not part of this excerpt;
  // the final closing brace below belongs to it.

  /**
   * Resolves feature names to their indices using the attribute metadata of `col`.
   *
   * @param col   column whose data type must be a `VectorUDT`
   * @param names feature names to look up; each must be present in the column's attributes
   * @return the index of each name, in the order of `names`
   * @throws IllegalArgumentException (via `require`) if `col` is not a vector column or a
   *                                  name has no matching attribute
   */
  def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
    require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col" +
      s" to be Vector type, but it was type ${col.dataType} instead.")
    val inputAttr = AttributeGroup.fromStructField(col)
    names.map { name =>
      require(inputAttr.hasAttr(name),
        s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
      // `index` is an Option; `.get` throws if the attribute carries no index.
      inputAttr.getAttr(name).index.get
    }
  }
}
Example 8
Source File: VectorSlicerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{StructField, StructType}

/** Tests for `VectorSlicer`: params, validity helpers, slicing by index/name, persistence. */
class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    val slicer = new VectorSlicer().setInputCol("feature")
    ParamsSuite.checkParams(slicer)
    assert(slicer.getIndices.length === 0)
    assert(slicer.getNames.length === 0)

    val inputSchema = StructType(Seq(StructField("feature", new VectorUDT, true)))
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {
        slicer.transformSchema(inputSchema)
      }
    }
  }

  test("feature validity checks") {
    import VectorSlicer._

    Seq(Array(0, 1, 8, 2), Array.empty[Int]).foreach(indices => assert(validIndices(indices)))
    Seq(Array(-1), Array(1, 2, 1)).foreach(indices => assert(!validIndices(indices)))

    Seq(Array("a", "b"), Array.empty[String]).foreach(names => assert(validNames(names)))
    Seq(Array("", "b"), Array("a", "b", "a")).foreach(names => assert(!validNames(names)))
  }

  test("Test vector slicer") {
    val inputVectors = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())
    )

    // Expected after selecting indices 1, 4
    val slicedVectors = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())
    )

    val baseAttr = NumericAttribute.defaultAttr
    val inputAttrs = Array("f0", "f1", "f2", "f3", "f4").map(baseAttr.withName)
    val inputGroup = new AttributeGroup("features", inputAttrs.asInstanceOf[Array[Attribute]])
    val slicedAttrs = Array("f1", "f4").map(baseAttr.withName)
    val slicedGroup = new AttributeGroup("expected", slicedAttrs.asInstanceOf[Array[Attribute]])

    val rowRdd = sc.parallelize(inputVectors.zip(slicedVectors)).map { case (in, out) => Row(in, out) }
    val frameSchema = StructType(Array(inputGroup.toStructField(), slicedGroup.toStructField()))
    val df = spark.createDataFrame(rowRdd, frameSchema)

    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    // Checks both the sliced values and the attribute metadata of the output column.
    def validateResults(df: DataFrame): Unit = {
      df.select("result", "expected").collect().foreach {
        case Row(actual: Vector, wanted: Vector) => assert(actual === wanted)
      }

      val actualMeta = AttributeGroup.fromStructField(df.schema("result"))
      val wantedMeta = AttributeGroup.fromStructField(df.schema("expected"))
      assert(actualMeta.numAttributes === wantedMeta.numAttributes)
      actualMeta.attributes.get.zip(wantedMeta.attributes.get).foreach {
        case (actualAttr, wantedAttr) => assert(actualAttr === wantedAttr)
      }
    }

    // The same two features selected by index only, by a mix, and by name only.
    Seq(
      (Array(1, 4), Array.empty[String]),
      (Array(1), Array("f4")),
      (Array.empty[Int], Array("f1", "f4"))
    ).foreach { case (indices, names) =>
      vectorSlicer.setIndices(indices).setNames(names)
      validateResults(vectorSlicer.transform(df))
    }
  }

  test("read/write") {
    val slicer = new VectorSlicer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setIndices(Array(1, 3))
      .setNames(Array("a", "d"))
    testDefaultReadWrite(slicer)
  }
}
Example 9
Source File: SQLTransformerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.types.{LongType, StructField, StructType}

/** Tests for `SQLTransformer`: params, transformation, persistence and schema derivation. */
class SQLTransformerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new SQLTransformer())
  }

  test("transform numeric data") {
    val input = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
    val transformer = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

    val transformed = transformer.transform(input)
    val derivedSchema = transformer.transformSchema(input.schema)
    val wanted = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0))
      .toDF("id", "v1", "v2", "v3", "v4")

    assert(transformed.schema.toString == derivedSchema.toString)
    assert(derivedSchema == wanted.schema)
    assert(transformed.collect().toSeq == wanted.collect().toSeq)
    // No table may remain registered in the catalog after the transform.
    assert(input.sparkSession.catalog.listTables().count() == 0)
  }

  test("read/write") {
    val transformer = new SQLTransformer().setStatement("select * from __THIS__")
    testDefaultReadWrite(transformer)
  }

  test("transformSchema") {
    val inputSchema = spark.range(10).schema
    val derivedSchema = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")
      .transformSchema(inputSchema)
    val wanted = StructType(Seq(StructField("id1", LongType, nullable = false)))
    assert(derivedSchema === wanted)
  }
}
Example 10
Source File: MetastoreRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/** Tests for `MetastoreRelation` copying/serialization and for CTAS on Hive serde tables. */
class MetastoreRelationSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  test("makeCopy and toJSON should work") {
    val viewSchema = StructType(Seq(StructField("a", IntegerType, nullable = true)))
    val catalogTable = CatalogTable(
      identifier = TableIdentifier("test", Some("db")),
      tableType = CatalogTableType.VIEW,
      storage = CatalogStorageFormat.empty,
      schema = viewSchema)
    val relation = MetastoreRelation("db", "test")(catalogTable, null)

    // No exception should be thrown
    relation.makeCopy(Array("db", "test"))
    // No exception should be thrown
    relation.toJSON
  }

  test("SPARK-17409: Do Not Optimize Query in CTAS (Hive Serde Table) More Than Once") {
    withTable("bar") {
      withTempView("foo") {
        sql("select 0 as id").createOrReplaceTempView("foo")
        // If we optimize the query in CTAS more than once, the following saveAsTable will fail
        // with the error: `GROUP BY position 0 is not in select list (valid range is [1, 1])`
        sql("CREATE TABLE bar AS SELECT * FROM foo group by id")
        checkAnswer(spark.table("bar"), Seq(Row(0)))

        val barMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier("bar"))
        assert(barMetadata.provider.contains("hive"), "the expected table is a Hive serde table")
      }
    }
  }
}
Example 11
Source File: SparkExecuteStatementOperationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{NullType, StructField, StructType}

/** Checks that `NullType` columns are mapped to the Hive `NULL_TYPE` column descriptor. */
class SparkExecuteStatementOperationSuite extends SparkFunSuite {

  test("SPARK-17112 `select null` via JDBC triggers IllegalArgumentException in ThriftServer") {
    val nullFields = Seq("NULL", "(IF(true, NULL, NULL))").map(name => StructField(name, NullType))
    val tableSchema = StructType(nullFields)

    val columns = SparkExecuteStatementOperation.getTableSchema(tableSchema).getColumnDescriptors()

    assert(columns.size() == 2)
    val hiveNullType = org.apache.hive.service.cli.Type.NULL_TYPE
    (0 until 2).foreach { position =>
      assert(columns.get(position).getType() == hiveNullType)
    }
  }
}
Example 12
Source File: LocalRelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.types.{StructField, StructType}

/** Factory methods for [[LocalRelation]]. */
object LocalRelation {

  /** Empty relation with the given output attributes. */
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  /** Empty relation whose attributes are derived from one or more struct fields. */
  def apply(output1: StructField, output: StructField*): LocalRelation = {
    val allFields = output1 +: output
    new LocalRelation(StructType(allFields).toAttributes)
  }

  /** Relation over external `Row`s, converted to Catalyst's internal representation. */
  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation =
    LocalRelation(output, toInternalRows(output, data))

  /** Relation over `Product`s, converted to Catalyst's internal representation. */
  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation =
    LocalRelation(output, toInternalRows(output, data))

  /** Converts external values to internal rows with a converter built from `output`. */
  private def toInternalRows(output: Seq[Attribute], data: Seq[Any]): Seq[InternalRow] = {
    val toCatalyst =
      CatalystTypeConverters.createToCatalystConverter(StructType.fromAttributes(output))
    data.map(external => toCatalyst(external).asInstanceOf[InternalRow])
  }
}

/**
 * Logical plan node holding its rows in memory.
 *
 * @param output resolved attributes describing the columns
 * @param data   the rows, already in internal form
 */
case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil)
  extends LeafNode with analysis.MultiInstanceRelation {

  // A local relation must have resolved output.
  require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.")

  /** Copy of this relation whose attributes are fresh instances; the data is shared. */
  override final def newInstance(): this.type = {
    val freshOutput = output.map(_.newInstance())
    LocalRelation(freshOutput, data).asInstanceOf[this.type]
  }

  /** Shows `<empty>` ahead of the output when there are no rows. */
  override protected def stringArgs: Iterator[Any] =
    if (data.nonEmpty) Iterator(output) else Iterator("<empty>", output)

  /** Same result when the other plan is a local relation with equal column types and rows. */
  override def sameResult(plan: LogicalPlan): Boolean =
    plan.canonicalized match {
      case other: LocalRelation =>
        other.output.map(_.dataType) == output.map(_.dataType) && other.data == data
      case _ => false
    }

  override lazy val statistics = {
    val defaultRowSize = output.map(_.dataType.defaultSize).sum
    Statistics(sizeInBytes = defaultRowSize * data.length)
  }

  /**
   * Renders the relation as an inline `VALUES` table.
   *
   * @param inlineTableName alias given to the inline table
   * @throws IllegalArgumentException (via `require`) if the relation has no rows
   */
  def toSQL(inlineTableName: String): String = {
    require(data.nonEmpty)
    val columnTypes = output.map(_.dataType)
    val renderedRows = data.map { row =>
      row
        .toSeq(columnTypes)
        .zip(columnTypes)
        .map { case (value, dataType) => Literal(value, dataType).sql }
        .mkString("(", ", ", ")")
    }
    val columnList = output.map(_.name).mkString("(", ", ", ")")
    "VALUES " + renderedRows.mkString(", ") + " AS " + inlineTableName + columnList
  }
}
Example 13
Source File: ResolveInlineTables.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import scala.util.control.NonFatal

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Cast
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.types.{StructField, StructType}

  /**
   * Turns an unresolved inline table into a `LocalRelation`.
   *
   * Each column gets the wider type found across its values and is nullable when any of
   * its values is; every cell is then evaluated, through a cast when its type differs
   * from the column type. Analysis fails when a column has incompatible types or a cell
   * cannot be evaluated.
   */
  private[analysis] def convert(table: UnresolvedInlineTable): LocalRelation = {
    // For each column, traverse all the values and find a common data type and nullability.
    val columns = table.rows.transpose
    val fields = columns.zip(table.names).map { case (cells, columnName) =>
      val widened = TypeCoercion
        .findWiderTypeWithoutStringPromotion(cells.map(_.dataType))
        .getOrElse {
          table.failAnalysis(s"incompatible types found in column $columnName for inline table")
        }
      val anyNullable = cells.exists(_.nullable)
      StructField(columnName, widened, nullable = anyNullable)
    }
    val attributes = StructType(fields).toAttributes
    assert(fields.size == table.names.size)

    val newRows: Seq[InternalRow] = table.rows.map { row =>
      val values = row.zipWithIndex.map { case (cell, columnIndex) =>
        val columnType = fields(columnIndex).dataType
        try {
          val typed = if (cell.dataType.sameType(columnType)) cell else Cast(cell, columnType)
          typed.eval()
        } catch {
          case NonFatal(ex) =>
            table.failAnalysis(s"failed to evaluate expression ${cell.sql}: ${ex.getMessage}")
        }
      }
      InternalRow.fromSeq(values)
    }

    LocalRelation(attributes, newRows)
  }
}
Example 14
Source File: resources.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * Lists the jars registered with the Spark context.
 *
 * @param jars when non-empty, only registered jars whose path contains the file name of
 *             one of these entries are returned
 */
case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {

  override val output: Seq[Attribute] =
    Seq(AttributeReference("Results", StringType, nullable = false)())

  /** One row per matching jar path, or one row per registered jar when `jars` is empty. */
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val registered = sparkSession.sparkContext.listJars()
    if (jars.isEmpty) {
      registered.map(path => Row(path))
    } else {
      val wantedNames = jars.map(jar => new Path(jar).getName)
      wantedNames.flatMap { wanted =>
        registered.filter(_.contains(wanted)).map(path => Row(path))
      }
    }
  }
}
Example 15
Source File: DDLSourceLoadSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// please note that the META-INF/services had to be modified for the test directory for this to work
/** Tests data source resolution by short name, by alias and by fully qualified class name. */
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name") {
    intercept[RuntimeException] {
      spark.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {
    // The comparison used to be evaluated and discarded, so the test could never fail.
    assert(spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false))))
  }

  test("specify full classname with duplicate formats") {
    // Same missing-assert fix as above.
    assert(spark.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false))))
  }

  test("should fail to load ORC without Hive Support") {
    val e = intercept[AnalysisException] {
      spark.read.format("orc").load()
    }
    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
  }
}

/** Fake source registered under "Fluet da Bomb"; shares that short name with [[FakeSourceTwo]]. */
class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

/** Second fake source registered under the same short name, "Fluet da Bomb". */
class FakeSourceTwo extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

/** Fake source registered under the short name "gathering quorum". */
class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}
Example 16
Source File: CarbonDataFrameExample.scala From CarbonDataLearning with GNU General Public License v3.0 | 5 votes |
package org.github.xubo245.carbonDataLearning.example

import org.apache.carbondata.examples.util.ExampleUtils
import org.apache.spark.sql.{SaveMode, SparkSession}

/** Writes a DataFrame as CarbonData and as CSV, then reads both back with an explicit schema. */
object CarbonDataFrameExample {

  /** Creates the Carbon session, runs the example and always closes the session. */
  def main(args: Array[String]): Unit = {
    val spark = ExampleUtils.createCarbonSession("CarbonDataFrameExample")
    try {
      exampleBody(spark)
    } finally {
      // Release the session even when the example fails part-way through.
      spark.close()
    }
  }

  /**
   * Runs the example against `spark`.
   *
   * @param spark  session to run on
   * @param csvDir directory for the CSV copy: data is written to `csvDir/1.csv` and read
   *               back from `csvDir`. Defaults to the location the example always used.
   */
  def exampleBody(
      spark: SparkSession,
      csvDir: String = "/Users/xubo/Desktop/xubo/git/carbondata3/examples/spark2/target/csv"
  ): Unit = {
    // Writes Dataframe to CarbonData file:
    import spark.implicits._
    val df = spark.sparkContext.parallelize(1 to 100)
      .map(x => ("a" + x % 10, "b", x))
      .toDF("c1", "c2", "number")

    // Saves dataframe to carbondata file
    df.write
      .format("carbondata")
      .option("tableName", "carbon_df_table")
      .option("partitionColumns", "c1") // a list of column names
      .mode(SaveMode.Overwrite)
      .save()

    spark.sql(""" SELECT * FROM carbon_df_table """).show()

    spark.sql("SHOW PARTITIONS carbon_df_table").show()

    // Specify schema
    import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
    val customSchema = StructType(Array(
      StructField("c1", StringType),
      StructField("c2", StringType),
      StructField("number", IntegerType)))

    // Reads carbondata to dataframe
    val carbondf = spark.read
      .format("carbondata")
      .schema(customSchema)
      // .option("dbname", "db_name") the system will use "default" as dbname if not set this option
      .option("tableName", "carbon_df_table")
      .load()

    // Writes the same dataframe as CSV
    df.write
      .format("csv")
      .option("tableName", "csv_df_table")
      .option("partitionColumns", "c1") // a list of column names
      // .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
      .mode(SaveMode.Overwrite)
      .csv(csvDir + "/1.csv")

    // Reads the CSV copy back to a dataframe
    val carbondf2 = spark.read
      .format("csv")
      .schema(customSchema)
      // .option("dbname", "db_name") the system will use "default" as dbname if not set this option
      .option("tableName", "csv_df_table")
      // .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
      .load(csvDir)

    carbondf2.show()

    // Dataframe operations
    carbondf.printSchema()
    carbondf.select($"c1", $"number" + 10).show()
    carbondf.filter($"number" > 31).show()

    spark.sql("DROP TABLE IF EXISTS carbon_df_table")
  }
}
Example 17
Source File: TestSFObjectWriter.scala From spark-salesforce with Apache License 2.0 | 5 votes |
package com.springml.spark.salesforce

import org.mockito.Mockito._
import org.mockito.Matchers._
import org.scalatest.mock.MockitoSugar
import org.scalatest.{ FunSuite, BeforeAndAfterEach}
import com.springml.salesforce.wave.api.BulkAPI
import org.apache.spark.{ SparkConf, SparkContext}
import com.springml.salesforce.wave.model.{ JobInfo, BatchInfo}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{ Row, DataFrame, SQLContext}
import org.apache.spark.sql.types.{ StructType, StringType, StructField}

/** Smoke test that drives a mocked `SFObjectWriter` with a two-row DataFrame. */
class TestSFObjectWriter extends FunSuite with MockitoSugar with BeforeAndAfterEach {
  val contact = "Contact"
  val jobId = "750B0000000WlhtIAC"
  val batchId = "751B0000000scSHIAY"
  val data = "Id,Description\n003B00000067Rnx,123456\n003B00000067Rnw,7890"

  val bulkAPI = mock[BulkAPI](withSettings().serializable())
  val writer = mock[SFObjectWriter]

  var sparkConf: SparkConf = _
  var sc: SparkContext = _

  /** Stubs the bulk API calls and starts a fresh local SparkContext for the test. */
  override def beforeEach(): Unit = {
    super.beforeEach()

    val jobInfo = new JobInfo
    jobInfo.setId(jobId)
    when(bulkAPI.createJob(contact)).thenReturn(jobInfo)

    val batchInfo = new BatchInfo
    batchInfo.setId(batchId)
    batchInfo.setJobId(jobId)
    when(bulkAPI.addBatch(jobId, data)).thenReturn(batchInfo)

    when(bulkAPI.closeJob(jobId)).thenReturn(jobInfo)
    when(bulkAPI.isCompleted(jobId)).thenReturn(true)

    sparkConf = new SparkConf().setMaster("local").setAppName("Test SF Object Update")
    sc = new SparkContext(sparkConf)
  }

  /**
   * Stops the SparkContext after every test. The context was previously stopped only at
   * the end of the test body, so a failing test left it running for later tests.
   */
  override def afterEach(): Unit = {
    try {
      if (sc != null) {
        sc.stop()
        sc = null
      }
    } finally {
      super.afterEach()
    }
  }

  /** Two-row frame with nullable string columns `id` and `desc`. */
  private def sampleDF(): DataFrame = {
    val rows = Seq(
      Row("003B00000067Rnx", "Desc1"),
      Row("001B00000067Rnx", "Desc2")
    )
    val schema = StructType(
      StructField("id", StringType, true) ::
      StructField("desc", StringType, true) :: Nil)

    new SQLContext(sc).createDataFrame(sc.parallelize(rows), schema)
  }

  test ("Write Object to Salesforce") {
    val df = sampleDF()
    // Only checks that building the header and invoking the (mocked) writer do not throw.
    Utils.csvHeadder(df.schema)
    writer.writeData(df.rdd)
  }
}
Example 18
Source File: SparkScoreDoc.scala From spark-lucenerdd with Apache License 2.0 | 5 votes |
package org.zouzias.spark.lucenerdd.models

import org.apache.lucene.document.Document
import org.apache.lucene.index.IndexableField
import org.apache.lucene.search.{IndexSearcher, ScoreDoc}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.inferNumericType
import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.{DocIdField, ScoreField, ShardField}
import scala.collection.JavaConverters._

/** Closed set of field kinds; the only variants are the objects declared below. */
sealed trait FieldType extends Serializable
object TextType extends FieldType
object IntType extends FieldType
object DoubleType extends FieldType
object LongType extends FieldType
object FloatType extends FieldType

// NOTE(review): this excerpt is truncated. The helper below is `private`, is
// followed by an unmatched closing brace, and is imported above as
// `SparkScoreDoc.inferNumericType`, so it presumably sits inside an
// `object SparkScoreDoc` whose header is not shown here. Confirm against the full file.

  /**
   * Maps a boxed number to its [[FieldType]] by runtime class.
   *
   * Boxed Double, Long, Integer and Float map to the matching numeric type;
   * any other `Number` (including `null`) falls through to `TextType`.
   */
  private def inferNumericType(num: Number): FieldType = {
    num match {
      case _: java.lang.Double => DoubleType
      case _: java.lang.Long => LongType
      case _: java.lang.Integer => IntType
      case _: java.lang.Float => FloatType
      case _ => TextType
    }
  }
}
Example 19
Source File: Schema.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.s2jobs

import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

/** Spark SQL schema definitions used by the s2jobs module. */
object Schema {

  /**
   * Schema of a graph element row: `CommonFields` followed by the nullable
   * string columns id, service, column, from, to, label and props, in that order.
   */
  val GraphElementSchema = StructType(
    CommonFields ++
      Seq("id", "service", "column", "from", "to", "label", "props")
        .map(name => StructField(name, StringType, nullable = true))
  )
}
Example 20
Source File: ExecutePython.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.script

import java.util
import java.util.UUID

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.util.FileUtil
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import jep.Jep
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.JavaConversions._

/**
 * Stop that writes the configured Python source to a uniquely named file
 * under `/tmp` and runs it in an embedded Jep interpreter.
 */
class ExecutePython extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Execute python script"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  /** Python source to execute; populated by [[setProperties]]. */
  var script : String = _

  /** Reads the `script` property from the stop configuration. */
  override def setProperties(map: Map[String, Any]): Unit = {
    script = MapUtil.get(map,"script").asInstanceOf[String]
  }

  /** Declares the single required `script` property. */
  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    val script = new PropertyDescriptor()
      .name("script")
      .displayName("script")
      .description("The code of python")
      .defaultValue("")
      .required(true)
    List(script)
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/script/python.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.ScriptGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {}

  /**
   * Runs the configured script. Neither `in` nor `out` is touched.
   *
   * The interpreter is closed in a `finally` so it is released even when
   * writing the script file or running the script throws.
   */
  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val jep = new Jep()
    try {
      val scriptPath = "/tmp/pythonExcutor-"+ UUID.randomUUID() +".py"
      FileUtil.writeFile(script,scriptPath)
      jep.runScript(scriptPath)
    } finally {
      jep.close()
    }
  }
}
Example 21
Source File: ExecutePythonWithDataFrame.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.script

import java.util
import java.util.UUID

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.util.FileUtil
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import jep.Jep
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.JavaConversions._

/**
 * Stop that runs a Python script in an embedded Jep interpreter, calls one of
 * its functions with the input DataFrame serialised as JSON rows, and writes
 * the returned list of dicts out as an all-string DataFrame.
 */
class ExecutePythonWithDataFrame extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Execute python script with dataframe"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  /** Python source to execute; populated by [[setProperties]]. */
  var script : String = _
  /** Name of the Python function to call with the data; populated by [[setProperties]]. */
  var execFunction : String = _

  /** Reads the `script` and `execFunction` properties from the stop configuration. */
  override def setProperties(map: Map[String, Any]): Unit = {
    script = MapUtil.get(map,"script").asInstanceOf[String]
    execFunction = MapUtil.get(map,"execFunction").asInstanceOf[String]
  }

  /** Declares the required properties; `execFunction` is listed first, then `script`. */
  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    val script = new PropertyDescriptor()
      .name("script")
      .displayName("script")
      .description("The code of python")
      .defaultValue("")
      .required(true)
    val execFunction = new PropertyDescriptor()
      .name("execFunction")
      .displayName("execFunction")
      .description("The function of python script to be executed.")
      .defaultValue("")
      .required(true)
    List(execFunction, script)
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/script/python.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.ScriptGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {}

  /**
   * Runs the script, calls `execFunction` on the collected input rows and
   * writes the result.
   *
   *  - The interpreter is always closed, even when the script fails.
   *  - An empty result yields an empty, column-less DataFrame instead of
   *    failing on `head`.
   *  - Each row is built by looking up the header keys, so values line up with
   *    their columns whatever each map's own iteration order is.
   *  - Values are stringified (nulls stay null) because every output column is
   *    declared as `StringType`.
   */
  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val df = in.read()

    val resultList = runPython(df.toJSON.collectAsList())

    val resultDF =
      if (resultList.isEmpty) {
        spark.createDataFrame(spark.sparkContext.emptyRDD[Row], StructType(Nil))
      } else {
        val header = resultList.head.keys.toList
        val rows = resultList.map { m =>
          Row(header.map(k => m.get(k).filter(_ != null).map(_.toString).orNull): _*)
        }
        val schema = StructType(header.map(fieldName => StructField(fieldName, StringType, nullable = true)))
        spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
      }
    out.write(resultDF)
  }

  /**
   * Executes the script and the configured function inside a Jep interpreter
   * and copies the returned list of maps into immutable Scala maps before the
   * interpreter is closed. Results are accumulated by prepending, so the
   * returned list is in reverse of the Python list order (as in the original
   * implementation).
   */
  private def runPython(listInfo: util.List[String]): List[Map[String, Any]] = {
    val jep = new Jep()
    try {
      val scriptPath = "/tmp/pythonExcutor-"+ UUID.randomUUID() +".py"
      FileUtil.writeFile(script,scriptPath)
      jep.runScript(scriptPath)

      // NOTE(review): the collected JSON rows are spliced into Python source via
      // their Java `toString`; input containing JSON `null`/`true`/`false` or
      // hostile text will not evaluate as intended. Consider passing the data
      // with `jep.set` instead.
      jep.eval(s"result = $execFunction($listInfo)")

      // `classOf[util.ArrayList[_]]` is the same runtime class as
      // `new util.ArrayList().getClass`, but does not type the elements as `Nothing`.
      val resultArrayList = jep.getValue("result", classOf[util.ArrayList[_]])
      println(resultArrayList)

      var resultList = List[Map[String, Any]]()
      val it = resultArrayList.iterator()
      while (it.hasNext) {
        val entry = it.next().asInstanceOf[java.util.HashMap[String, Any]]
        resultList = mapAsScalaMap(entry).toMap[String, Any] +: resultList
      }
      resultList
    } finally {
      jep.close()
    }
  }
}
Example 22
Source File: DataFrameRowParser.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.script

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf._
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

import scala.beans.BeanProperty

/**
 * Stop that splits the first column of every input row on "," and exposes the
 * pieces as nullable string columns named after the configured schema.
 */
class DataFrameRowParser extends ConfigurableStop{

  val authorEmail: String = "[email protected]"
  val description: String = "Create dataframe by schema"
  val inportList: List[String] = List(Port.DefaultPort.toString)
  val outportList: List[String] = List(Port.DefaultPort.toString)

  /** Column names of the output, joined by `separator`. */
  var schema: String = _
  /** Delimiter (passed to `String.split`) that separates the names in `schema`. */
  var separator: String = _

  /** Reads the `schema` and `separator` properties from the stop configuration. */
  override def setProperties(map: Map[String, Any]): Unit = {
    schema = MapUtil.get(map,"schema").asInstanceOf[String]
    separator = MapUtil.get(map,"separator").asInstanceOf[String]
  }

  /** Declares the required properties; `separator` is listed first, then `schema`. */
  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    val schemaProperty = new PropertyDescriptor().name("schema").displayName("schema").description("The schema of dataframe").defaultValue("").required(true)
    val separatorProperty = new PropertyDescriptor().name("separator").displayName("separator").description("The separator of schema").defaultValue("").required(true)
    List(separatorProperty, schemaProperty)
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/script/DataFrameRowParser.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.ScriptGroup.toString)
  }

  override def initialize(ctx: ProcessContext): Unit = {}

  /** Parses the rows, builds the all-string schema and writes the resulting DataFrame. */
  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val inDF = in.read()

    // Each input row carries one comma-separated string in column 0.
    val parsedRows = inDF.rdd.map { row =>
      Row.fromSeq(row.get(0).asInstanceOf[String].split(",").toSeq)
    }

    // One nullable string column per name in the configured schema.
    val columns = schema.split(separator).map(name => StructField(name, StringType, nullable = true))

    out.write(spark.createDataFrame(parsedRows, StructType(columns)))
  }
}
Example 23
Source File: WordSpliter.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.nlp

import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode
import com.huaban.analysis.jieba._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

/**
 * Stop that segments the first line of a text file with jieba and emits one
 * row per token in a single `words` column.
 */
class WordSpliter extends ConfigurableStop {

  val authorEmail: String = "[email protected]"
  val description: String = "Word segmentation"
  val inportList: List[String] = List(Port.AnyPort.toString)
  val outportList: List[String] = List(Port.DefaultPort.toString)

  /** Path of the text file to read; populated by [[setProperties]]. */
  var path:String = _

  val jiebaSegmenter = new JiebaSegmenter()
  /** Tokens collected so far; every call to [[segmenter]] appends to it. */
  var tokenARR:ArrayBuffer[String]=ArrayBuffer()

  /** Removes the characters matched by the pattern below from `str`, then appends each jieba token to `tokenARR`. */
  def segmenter(str:String): Unit ={
    val cleaned = str.replaceAll( "[\\p{P}+~$`^=|<>~`$^+=|<>¥×+\\s]" , "")
    jiebaSegmenter
      .process(cleaned,SegMode.SEARCH)
      .asScala
      .foreach(token => tokenARR += token.word)
  }

  /** Reads the file at `path`, segments its first row and writes the tokens as a DataFrame. */
  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val session: SparkSession = pec.get[SparkSession]()

    // Only the first row of the text file is segmented.
    val textDF = session.read.text(path)
    segmenter(textDF.head().getString(0))

    val tokenRows: List[Row] = tokenARR.map(word => Row(word)).toList
    val tokenRDD: RDD[Row] = session.sparkContext.makeRDD(tokenRows)
    val wordsSchema: StructType = StructType(Array(StructField("words",StringType)))

    val result: DataFrame = session.createDataFrame(tokenRDD,wordsSchema)
    out.write(result)
  }

  def initialize(ctx: ProcessContext): Unit = {

  }

  /** Reads the `path` property from the stop configuration. */
  def setProperties(map : Map[String, Any]) = {
    path = MapUtil.get(map,"path").asInstanceOf[String]
  }

  /** Declares the single required `path` property. */
  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    val pathProperty = new PropertyDescriptor().name("path").displayName("path").description("The path of text file").defaultValue("").required(true)
    List(pathProperty)
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/nlp/NLP.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.Alg_NLPGroup.toString)
  }
}
Example 24
Source File: PutIntoSolr.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.solr

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.solr.client.solrj.impl.HttpSolrClient
import org.apache.solr.client.solrj.response.UpdateResponse
import org.apache.solr.common.SolrInputDocument
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.DataFrame

/** Stop that writes every row of the input DataFrame to a Solr collection as one document. */
class PutIntoSolr extends ConfigurableStop{

  override val authorEmail: String = "[email protected]"
  override val description: String = "Write data to solr"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  /** Base URL of the Solr server; populated by [[setProperties]]. */
  var solrURL:String=_
  /** Name of the target collection; populated by [[setProperties]]. */
  var SolrCollection:String=_

  /** Reads the `solrURL` and `SolrCollection` properties from the stop configuration. */
  override def setProperties(map: Map[String, Any]): Unit = {
    solrURL=MapUtil.get(map,"solrURL").asInstanceOf[String]
    SolrCollection=MapUtil.get(map,"SolrCollection").asInstanceOf[String]
  }

  /** Client used by the current/last [[perform]] run; closed when that run ends. */
  var client: HttpSolrClient =_
  /** Document most recently sent by [[perform]]. */
  var doc:SolrInputDocument =_

  /**
   * Sends each input row to `solrURL/SolrCollection`, using the DataFrame
   * column names as Solr field names.
   *
   * One client is shared by all rows, a single commit is issued after the last
   * document, and the client is closed in a `finally`. Previously a new client
   * was built per row, never closed, and a commit was issued per row.
   *
   * NOTE(review): `df.collect()` pulls the whole DataFrame onto the driver.
   */
  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val df: DataFrame = in.read()

    val SchemaList: List[StructField] = df.schema.toList
    val fieldNames: Array[String] = SchemaList.map(_.name).toArray
    val url = solrURL+"/"+SolrCollection

    client = new HttpSolrClient.Builder(url).build()
    try {
      df.collect().foreach { row =>
        doc = new SolrInputDocument()
        for (x <- fieldNames.indices) {
          doc.addField(fieldNames(x), row.get(x))
        }
        client.add(doc)
      }
      client.commit()
    } finally {
      client.close()
    }
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/solr/PutSolr.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.SolrGroup)
  }

  /** Declares the required properties; `SolrCollection` is listed first, then `solrURL`. */
  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    val solrURL = new PropertyDescriptor()
      .name("solrURL")
      .displayName("SolrURL")
      .description("The url of solr")
      .defaultValue("")
      .required(true)
      .example("http://127.0.0.1:8886/solr")

    val SolrCollection = new PropertyDescriptor()
      .name("SolrCollection")
      .displayName("SolrCollection")
      .description("The name of collection")
      .defaultValue("")
      .required(true)
      .example("test")

    List(SolrCollection, solrURL)
  }
}
Example 25
Source File: PhoneNumberClean.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.clean

import cn.piflow.bundle.util.CleanUtil
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructField

/**
 * Stop that passes the configured columns through `CleanUtil.processPhonenum`
 * and returns a DataFrame with the original column names and order, where the
 * configured columns hold the cleaned values.
 */
class PhoneNumberClean extends ConfigurableStop{
  val authorEmail: String = "[email protected]"
  val description: String = "Cleaning data in mobile number format"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  /** Comma-separated names of the columns to clean; populated by [[setProperties]]. */
  var columnName:String=_

  /**
   * Runs two SQL passes over the temp view `thesis`: the first adds a
   * `<col>_new` column per configured column, the second projects the original
   * column list, substituting `<col>_new as <col>` where applicable.
   */
  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val sqlContext = pec.get[SparkSession]().sqlContext
    val source = in.read()
    source.createOrReplaceTempView("thesis")
    sqlContext.udf.register("regexPro",(str:String)=>CleanUtil.processPhonenum(str))

    val originalFields: Array[String] = source.schema.fieldNames
    val targets = columnName.split(",").toSet

    // Pass 1: append one cleaned "<col>_new" column per configured column.
    val cleanedColumns = targets.toSeq
      .map(c => ",regexPro(" + c + ") as " + c + "_new ")
      .mkString
    val withCleaned = sqlContext.sql("select * " + cleanedColumns + " from thesis")
    withCleaned.createOrReplaceTempView("thesis")

    // Pass 2: original column order, swapping in the cleaned value where one exists.
    val projection = originalFields
      .map(f => (if (targets.contains(f)) f + "_new as " + f else f) + ",")
      .mkString
    val result = sqlContext.sql(
      "select " + projection.substring(0, projection.length - 1) + " from thesis")

    out.write(result)
  }

  def initialize(ctx: ProcessContext): Unit = {

  }

  /** Reads the `columnName` property from the stop configuration. */
  def setProperties(map: Map[String, Any]): Unit = {
    columnName=MapUtil.get(map,key="columnName").asInstanceOf[String]
  }

  /** Declares the single required `columnName` property. */
  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    val columnNameProperty = new PropertyDescriptor()
      .name("columnName")
      .displayName("Column_Name")
      .description("Column names are what you want to clean,multiple column names are separated by commas")
      .defaultValue("")
      .required(true)
      .example("phonenumber")
    List(columnNameProperty)
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/clean/PhoneNumberClean.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.CleanGroup)
  }
}
Example 26
Source File: TitleClean.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.clean

import cn.piflow.bundle.util.CleanUtil
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructField

/**
 * Stop that passes the configured columns through `CleanUtil.processTitle`
 * and returns a DataFrame with the original column names and order, where the
 * configured columns hold the cleaned values.
 */
class TitleClean extends ConfigurableStop{
  val authorEmail: String = "[email protected]"
  val description: String = "Cleaning title data"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  /** Comma-separated names of the columns to clean; populated by [[setProperties]]. */
  var columnName:String=_

  /**
   * Runs two SQL passes over the temp view `thesis`: the first adds a
   * `<col>_new` column per configured column, the second projects the original
   * column list, substituting `<col>_new as <col>` where applicable.
   */
  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val sqlContext = pec.get[SparkSession]().sqlContext
    val source = in.read()
    source.createOrReplaceTempView("thesis")
    sqlContext.udf.register("regexPro",(str:String)=>CleanUtil.processTitle(str))

    val originalFields: Array[String] = source.schema.fieldNames
    val targets = columnName.split(",").toSet

    // Pass 1: append one cleaned "<col>_new" column per configured column.
    val cleanedColumns = targets.toSeq
      .map(c => ",regexPro(" + c + ") as " + c + "_new ")
      .mkString
    val withCleaned = sqlContext.sql("select * " + cleanedColumns + " from thesis")
    withCleaned.createOrReplaceTempView("thesis")

    // Pass 2: original column order, swapping in the cleaned value where one exists.
    val projection = originalFields
      .map(f => (if (targets.contains(f)) f + "_new as " + f else f) + ",")
      .mkString
    val result = sqlContext.sql(
      "select " + projection.substring(0, projection.length - 1) + " from thesis")

    out.write(result)
  }

  def initialize(ctx: ProcessContext): Unit = {

  }

  /** Reads the `columnName` property from the stop configuration. */
  def setProperties(map: Map[String, Any]): Unit = {
    columnName=MapUtil.get(map,key="columnName").asInstanceOf[String]
  }

  /** Declares the single required `columnName` property. */
  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    val columnNameProperty = new PropertyDescriptor()
      .name("columnName")
      .displayName("Column_Name")
      .description("Column names are what you want to clean,multiple column names are separated by commas")
      .defaultValue("")
      .required(true)
      .example("title")
    List(columnNameProperty)
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/clean/TitleClean.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.CleanGroup)
  }
}
Example 27
Source File: TestIndexing.scala From spark-solr with Apache License 2.0 | 5 votes |
package com.lucidworks.spark

import java.util.UUID

import com.lucidworks.spark.util.SolrDataFrameImplicits._
import com.lucidworks.spark.util.{ConfigurationConstants, SolrCloudUtil, SolrQuerySupport, SolrSupport}
import org.apache.spark.sql.functions.{concat, lit}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

/** Indexing tests that run against collections created on the test Solr cluster. */
class TestIndexing extends TestSuiteBuilder {

  /** Loads a headered CSV with schema inference through the databricks CSV source. */
  private def readCsv(location: String) =
    sparkSession.read.format("com.databricks.spark.csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load(location)

  /** Hard-commits the collection so every indexed document is visible to queries. */
  private def commit(collection: String): Unit =
    SolrSupport.getCachedCloudClient(zkHost).commit(collection, true, true)

  test("Load csv file and index to Solr") {
    val collectionName = s"testIndexing-${UUID.randomUUID()}"
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val taxiDF = readCsv("src/test/resources/test-data/nyc_yellow_taxi_sample_1k.csv")
      assert(taxiDF.count() == 999)

      // Add "lat,lon" location columns for pickup and dropoff.
      val withPickup = taxiDF.withColumn(
        "pickup_location",
        concat(taxiDF.col("pickup_latitude"), lit(","), taxiDF.col("pickup_longitude")))
      val withLocations = withPickup.withColumn(
        "dropoff_location",
        concat(taxiDF.col("dropoff_latitude"), lit(","), taxiDF.col("dropoff_longitude")))

      withLocations.write
        .option("zkhost", zkHost)
        .option(ConfigurationConstants.GENERATE_UNIQUE_KEY, "true")
        .solr(collectionName)

      // Explicit commit to make sure all docs are visible
      commit(collectionName)

      val readOptions = Map("zkhost" -> zkHost, "collection" -> collectionName)
      val indexedDF = sparkSession.read.format("solr").options(readOptions).load()
      indexedDF.printSchema()
      assert (indexedDF.count() == 999)
      indexedDF.take(10)
    } finally {
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }

  test("Solr field types config") {
    val collectionName = s"testIndexing-${UUID.randomUUID()}"
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val simpleDF = readCsv("src/test/resources/test-data/simple.csv")
      val writeOptions = Map(
        "zkhost" -> zkHost,
        "collection" -> collectionName,
        ConfigurationConstants.SOLR_FIELD_TYPES -> "ntitle:text_en,nrating:string")
      simpleDF.write.options(writeOptions).solr(collectionName)

      // Explicit commit to make sure all docs are visible
      commit(collectionName)

      val collectionUrl = SolrSupport.getSolrBaseUrl(zkHost) + collectionName + "/"
      val fieldTypes = SolrQuerySupport.getFieldTypes(Set.empty, collectionUrl, cloudClient, collectionName)
      assert(fieldTypes("nrating").fieldType === "string")
      assert(fieldTypes("ntitle").fieldType === "text_en")
    } finally {
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }

  test("Field additions") {
    val insertSchema = StructType(Array(
      StructField("index_only_field", DataTypes.StringType, nullable = true),
      StructField("store_only_field", DataTypes.BooleanType, nullable = true),
      StructField("a_s", DataTypes.StringType, nullable = true),
      StructField("s_b", DataTypes.StringType, nullable = true)
    ))
    val collection = "testFieldAdditions" + UUID.randomUUID().toString.replace("-", "_")
    try {
      SolrCloudUtil.buildCollection(zkHost, collection, null, 2, cloudClient, sc)
      val relation = new SolrRelation(
        Map("zkhost" -> zkHost, "collection" -> collection),
        sparkSession)
      val fieldsToAdd = SolrRelation.getFieldsToAdd(
        insertSchema,
        relation.conf,
        relation.solrVersion,
        relation.dynamicSuffixes)
      assert(fieldsToAdd.isEmpty)
    } finally {
      SolrCloudUtil.deleteCollection(collection, cluster)
    }
  }
}
Example 28
Source File: SparkSchemaFnsTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.spark

import io.eels.schema.{DateType, DecimalType, Field, LongType, Precision, Scale, StringType, StructType}
import org.apache.spark.sql.types.StructField
import org.scalatest.{FunSuite, Matchers}

/** Checks the eel <-> Spark schema conversions on one four-column schema. */
class SparkSchemaFnsTest extends FunSuite with Matchers {

  // Local alias so the Spark types do not clash with the eel types imported above.
  import org.apache.spark.sql.{types => sparkTypes}

  /** The schema expressed with eel types. */
  private def eelSchema = StructType(
    Field("a", StringType),
    Field("b", DecimalType(Precision(14), Scale(4))),
    Field("c", LongType.Signed),
    Field("d", DateType)
  )

  /** The same schema expressed with Spark types; every column is nullable. */
  private def sparkSchema = sparkTypes.StructType(
    Seq(
      StructField("a", sparkTypes.StringType, true),
      StructField("b", sparkTypes.DecimalType(14, 4), true),
      StructField("c", sparkTypes.LongType, true),
      StructField("d", sparkTypes.DateType, true)
    )
  )

  test("eel schema to spark") {
    SparkSchemaFns.toSparkSchema(eelSchema) shouldBe sparkSchema
  }

  test("spark schema to eel") {
    SparkSchemaFns.fromSparkSchema(sparkSchema) shouldBe eelSchema
  }
}
Example 29
Source File: StackSummarizerFactory.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.timeseries.summarize._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.GenericArrayData
import org.apache.spark.sql.types.{ ArrayType, StructField, StructType }

/**
 * Factory for a [[StackSummarizer]] that stacks the summarizers produced by
 * `factories` into a single array column.
 *
 * @param factories the factories to stack; must be non-empty and must not
 *                  contain an `OverlappableSummarizerFactory`
 * @throws IllegalArgumentException if either requirement is violated
 */
case class StackSummarizerFactory(factories: Seq[SummarizerFactory])
  extends SummarizerFactory {

  // Fail with a clear message instead of the opaque error `reduce` raises on an empty Seq.
  require(factories.nonEmpty, "At least one summarizer factory is required for stacking")

  factories.foreach { factory =>
    require(
      !factory.isInstanceOf[OverlappableSummarizerFactory],
      "Stacking overlappable summarizers are not supported"
    )
  }

  /** Union of the columns required by all stacked factories. */
  override val requiredColumns: ColumnList = factories.map(_.requiredColumns).reduce(_ ++ _)

  /** Instantiates every factory against `inputSchema` and wraps the results in a [[StackSummarizer]]. */
  def apply(inputSchema: StructType): Summarizer = {
    val summarizers = factories.map(f => f.apply(inputSchema))
    new StackSummarizer(inputSchema, prefixOpt, requiredColumns, summarizers)
  }
}

/**
 * Summarizer whose single output column is an array holding the output row of
 * each stacked summarizer.
 *
 * @param summarizers the summarizers to stack; must be non-empty and must all
 *                    share the same output schema
 * @throws IllegalArgumentException if either requirement is violated
 */
class StackSummarizer(
  override val inputSchema: StructType,
  override val prefixOpt: Option[String],
  override val requiredColumns: ColumnList,
  summarizers: Seq[Summarizer]
) extends Summarizer with InputAlwaysValid {

  override type T = InternalRow
  override type U = Seq[Any]
  override type V = Seq[InternalRow]

  // `schema` below reads `summarizers.head`, so an empty Seq must be rejected up front.
  require(summarizers.nonEmpty, "At least one summarizer is required for stacking")

  require(
    summarizers.forall(s => s.outputSchema == summarizers.head.outputSchema),
    s"Summarizers must have identical schemas to be stacked: ${summarizers.map(_.outputSchema).mkString(" vs. ")}"
  )

  /** One array column whose element type is the shared output schema of the stacked summarizers. */
  override val schema: StructType = StructType(
    StructField(StackSummarizer.stackColumn, ArrayType(summarizers.head.outputSchema)) :: Nil
  )

  override val summarizer =
    com.twosigma.flint.rdd.function.summarize.summarizer.StackSummarizer(summarizers)

  // Convert the output of `summarizer` to the InternalRow.
  override def fromV(v: V): InternalRow = InternalRow(new GenericArrayData(v))

  // Convert the InternalRow to the type of row expected by the `summarizer`.
  override def toT(r: InternalRow): T = r
}

object StackSummarizer {
  /** Name of the output column that holds the stacked rows. */
  val stackColumn = "stack"
}
Example 30
Source File: MetadataTransformUtils.scala From automl with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature.operator

import org.apache.spark.sql.types.{MetadataBuilder, StructField}

import scala.collection.mutable.ArrayBuffer

// NOTE(review): this excerpt is truncated. The method below is followed by an
// unmatched closing brace and uses `DERIVATION`, `createDerivation` and
// `cartesianWithArray`, none of which are defined here, so it presumably sits
// inside an enclosing object whose header is not shown. Confirm against the full file.

  /**
   * Builds the metadata for a column produced from `fields`.
   *
   * Starting from the first field's `DERIVATION` string array (or
   * `createDerivation(numFeatures)` when that field has none), the result is
   * combined left to right via `cartesianWithArray` with each remaining
   * field's derivation, obtained the same way. The returned builder starts
   * from the last field's metadata and stores the combined array under
   * `DERIVATION`.
   *
   * @param fields      input columns; at least two are required
   * @param numFeatures passed to `createDerivation` for fields without a stored derivation
   * @throws IllegalArgumentException if fewer than two fields are given
   */
  def vectorCartesianTransform(fields: Array[StructField], numFeatures: Int): MetadataBuilder = {
    if (fields.length < 2) {
      throw new IllegalArgumentException("the number of cols in the input DataFrame should be no less than 2")
    }
    var res = Array[String]()
    // Seed with the first field's derivation.
    if (fields.head.metadata.contains(DERIVATION)) {
      res = fields.head.metadata.getStringArray(DERIVATION)
    } else {
      res = createDerivation(numFeatures)
    }
    // Fold in every remaining field.
    for (i <- 1 until fields.length) {
      if (fields(i).metadata.contains(DERIVATION)) {
        res = cartesianWithArray(res, fields(i).metadata.getStringArray(DERIVATION))
      } else {
        res = cartesianWithArray(res, createDerivation(numFeatures))
      }
    }
    val metadata = fields.last.metadata
    new MetadataBuilder().withMetadata(metadata).putStringArray(DERIVATION, res)
  }
}
Example 31
Source File: Surrogate.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.tuner.surrogate

import com.tencent.angel.spark.automl.tuner.config.ConfigurationSpace
import org.apache.commons.logging.{Log, LogFactory}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

import scala.collection.mutable.ArrayBuffer

// NOTE(review): this excerpt is truncated. The members below include abstract
// methods and an unmatched closing brace, and read `minimize`, `preX` and
// `preY`, none of which are defined here, so they presumably belong to an
// abstract class or trait whose header is not shown. Confirm against the full file.

  /** Abstract: prediction for the point `X`, returned as a pair of doubles (meaning not shown in this excerpt). */
  def predict(X: Vector): (Double, Double)

  /** Abstract: stops the surrogate. */
  def stop(): Unit

  /** Best observation so far: [[curMin]] when `minimize` is set, otherwise [[curMax]]. */
  def curBest: (Vector, Double) = {
    if (minimize) curMin else curMax
  }

  /**
   * Returns `(null, Double.MaxValue)` when `preY` is empty; otherwise the point
   * at the index of the LARGEST entry of `preY`, paired with that entry negated.
   *
   * NOTE(review): taking the max and negating it is only a minimum if `preY`
   * holds negated objective values when minimising. Presumably that is the
   * case; verify against the code that fills `preY`.
   */
  def curMin: (Vector, Double) = {
    if (preY.isEmpty) (null, Double.MaxValue)
    else {
      val maxIdx: Int = preY.zipWithIndex.max._2
      (preX(maxIdx), -preY(maxIdx))
    }
  }

  /**
   * Returns `(null, Double.MinValue)` when `preY` is empty; otherwise the point
   * at the index of the largest entry of `preY`, paired with that entry.
   */
  def curMax: (Vector, Double) = {
    if (preY.isEmpty) (null, Double.MinValue)
    else {
      val maxIdx: Int = preY.zipWithIndex.max._2
      (preX(maxIdx), preY(maxIdx))
    }
  }
}
Example 32
Source File: BasicDataSourceSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource

import com.pingcap.tikv.exception.TiBatchWriteException
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/** Basic select / append / overwrite checks for the TiDB data source. */
class BasicDataSourceSuite extends BaseDataSourceTest("test_datasource_basic") {
  private val row1 = Row(null, "Hello")
  private val row2 = Row(2, "TiDB")
  private val row3 = Row(3, "Spark")
  private val row4 = Row(4, null)

  private val schema = StructType(
    List(StructField("i", IntegerType), StructField("s", StringType)))

  /** DataFrame holding `row3` and `row4`, the rows written by the write tests. */
  private def rowsToWrite(): org.apache.spark.sql.DataFrame = {
    val data: RDD[Row] = sc.makeRDD(List(row3, row4))
    sqlContext.createDataFrame(data, schema)
  }

  /** Saves `df` into the suite's table through the `tidb` source with the given save mode. */
  private def saveToTiDB(df: org.apache.spark.sql.DataFrame, mode: String): Unit =
    df.write
      .format("tidb")
      .options(tidbOptions)
      .option("database", database)
      .option("table", table)
      .mode(mode)
      .save()

  /** Recreates the table and seeds it with `row1` and `row2`. */
  override def beforeAll(): Unit = {
    super.beforeAll()

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(null, 'Hello'), (2, 'TiDB')")
  }

  test("Test Select") {
    if (!supportBatchWrite) {
      cancel
    }

    testTiDBSelect(Seq(row1, row2))
  }

  test("Test Write Append") {
    if (!supportBatchWrite) {
      cancel
    }

    saveToTiDB(rowsToWrite(), "append")

    testTiDBSelect(Seq(row1, row2, row3, row4))
  }

  test("Test Write Overwrite") {
    if (!supportBatchWrite) {
      cancel
    }

    val df = rowsToWrite()

    val caught = intercept[TiBatchWriteException] {
      saveToTiDB(df, "overwrite")
    }

    assert(
      caught.getMessage
        .equals("SaveMode: Overwrite is not supported. TiSpark only support SaveMode.Append."))
  }

  /** Drops the table, then always runs the parent cleanup. */
  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}
Example 33
Source File: UpperCaseColumnNameSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/** Checks that rows can be appended to a table whose column names are upper case. */
class UpperCaseColumnNameSuite
    extends BaseDataSourceTest("test_datasource_uppser_case_column_name") {

  private val row1 = Row(1, 2)

  private val schema = StructType(
    List(StructField("O_ORDERKEY", IntegerType), StructField("O_CUSTKEY", IntegerType)))

  /** Recreates the two-column table used by the test. */
  override def beforeAll(): Unit = {
    super.beforeAll()

    dropTable()
    // Each `|` must start its own line: `stripMargin` only removes a margin
    // character at the start of a line, so a one-line literal would leave the
    // second `|` inside the DDL.
    jdbcUpdate(s"""
                  |CREATE TABLE $dbtable (O_ORDERKEY INTEGER NOT NULL,
                  |                       O_CUSTKEY INTEGER NOT NULL);
       """.stripMargin)
  }

  test("Test insert upper case column name") {
    if (!supportBatchWrite) {
      cancel
    }

    val data: RDD[Row] = sc.makeRDD(List(row1))
    val df = sqlContext.createDataFrame(data, schema)
    df.write
      .format("tidb")
      .options(tidbOptions)
      .option("database", database)
      .option("table", table)
      .mode("append")
      .save()
  }

  /** Drops the table, then always runs the parent cleanup. */
  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}
Example 34
Source File: CheckUnsupportedSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource

import com.pingcap.tikv.exception.TiBatchWriteException
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * Verifies that writes to partitioned tables and to tables with generated columns
 * are rejected with a TiBatchWriteException carrying the expected message.
 */
class CheckUnsupportedSuite extends BaseDataSourceTest("test_datasource_check_unsupported") {

  override def beforeAll(): Unit =
    super.beforeAll()

  test("Test write to partition table") {
    if (!supportBatchWrite) cancel()

    dropTable()
    tidbStmt.execute("set @@tidb_enable_table_partition = 1")
    jdbcUpdate(
      s"create table $dbtable(i int, s varchar(128)) partition by range(i) (partition p0 values less than maxvalue)")
    jdbcUpdate(s"insert into $dbtable values(null, 'Hello')")

    val row1 = Row(null, "Hello")
    val row2 = Row(2, "TiDB")
    val row3 = Row(3, "Spark")
    val schema = StructType(Seq(StructField("i", IntegerType), StructField("s", StringType)))

    val failure = intercept[TiBatchWriteException] {
      tidbWrite(List(row2, row3), schema)
    }
    assert(
      failure.getMessage
        .equals("tispark currently does not support write data to partition table!"))

    // The rejected write must not have changed the table: only the JDBC-seeded row remains.
    testTiDBSelect(Seq(row1))
  }

  test("Check Virtual Generated Column") {
    if (!supportBatchWrite) cancel()

    dropTable()
    jdbcUpdate(s"create table $dbtable(i INT, c1 INT, c2 INT, c3 INT AS (c1 + c2))")

    val row1 = Row(1, 2, 3)
    val schema = StructType(
      Seq(
        StructField("i", IntegerType),
        StructField("c1", IntegerType),
        StructField("c2", IntegerType)))

    val failure = intercept[TiBatchWriteException] {
      tidbWrite(List(row1), schema)
    }
    assert(
      failure.getMessage
        .equals("tispark currently does not support write data to table with generated column!"))
  }

  test("Check Stored Generated Column") {
    if (!supportBatchWrite) cancel()

    dropTable()
    jdbcUpdate(s"create table $dbtable(i INT, c1 INT, c2 INT, c3 INT AS (c1 + c2) STORED)")

    val row1 = Row(1, 2, 3)
    val schema = StructType(
      Seq(
        StructField("i", IntegerType),
        StructField("c1", IntegerType),
        StructField("c2", IntegerType)))

    val failure = intercept[TiBatchWriteException] {
      tidbWrite(List(row1), schema)
    }
    assert(
      failure.getMessage
        .equals("tispark currently does not support write data to table with generated column!"))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}
Example 35
Source File: TiSparkTypeSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

/**
 * Round-trips `bigint` values, including Long.MaxValue, through the data source.
 */
class TiSparkTypeSuite extends BaseDataSourceTest("type_test") {
  // row1/row2 correspond to the JDBC-inserted rows; row3/row5 are written via tidbWrite.
  private val row1 = Row(null, "Hello")
  private val row2 = Row(2L, "TiDB")
  private val row3 = Row(3L, "Spark")
  private val row5 = Row(Long.MaxValue, "Duplicate")

  private val schema =
    StructType(Seq(StructField("i", LongType), StructField("s", StringType)))

  test("bigint test") {
    if (!supportBatchWrite) cancel()

    dropTable()
    jdbcUpdate(s"create table $dbtable(i bigint, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(null, 'Hello'), (2, 'TiDB')")

    val written = List(row3, row5)
    tidbWrite(written, schema)

    testTiDBSelect(List(row1, row2, row3, row5))
  }
}
Example 36
Source File: RegionSplitSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource

import com.pingcap.tikv.TiBatchWriteUtils
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/**
 * Writes three rows with `enableRegionSplit` / `regionSplitNum = 3` and checks that
 * three regions are reported afterwards, for an index and for the table records.
 */
class RegionSplitSuite extends BaseDataSourceTest("region_split_test") {
  private val row1 = Row(1)
  private val row2 = Row(2)
  private val row3 = Row(3)

  private val schema = StructType(Seq(StructField("a", IntegerType)))

  test("index region split test") {
    if (!supportBatchWrite) cancel()
    // do not test this case on tidb which does not support split region
    if (!isEnableSplitRegion) cancel()

    dropTable()
    jdbcUpdate(
      s"CREATE TABLE $dbtable ( `a` int(11), unique index(a)) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin")

    val splitOptions = Some(Map("enableRegionSplit" -> "true", "regionSplitNum" -> "3"))
    tidbWrite(List(row1, row2, row3), schema, splitOptions)

    val tableInfo = ti.tiSession.getCatalog.getTable(dbPrefix + database, table)
    val firstIndex = tableInfo.getIndices.get(0)
    val indexRegions = TiBatchWriteUtils.getRegionByIndex(ti.tiSession, tableInfo, firstIndex)
    val regionsNum = indexRegions.size()
    assert(regionsNum == 3)
  }

  test("table region split test") {
    if (!supportBatchWrite) cancel()
    // do not test this case on tidb which does not support split region
    if (!isEnableSplitRegion) cancel()

    dropTable()
    jdbcUpdate(
      s"CREATE TABLE $dbtable ( `a` int(11) DEFAULT NULL) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin")

    val splitOptions = Some(Map("enableRegionSplit" -> "true", "regionSplitNum" -> "3"))
    tidbWrite(List(row1, row2, row3), schema, splitOptions)

    val tableInfo = ti.tiSession.getCatalog.getTable(dbPrefix + database, table)
    val recordRegions = TiBatchWriteUtils.getRecordRegions(ti.tiSession, tableInfo)
    val regionsNum = recordRegions.size()
    assert(regionsNum == 3)
  }
}
Example 37
Source File: MissingParameterSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * Checks that a write without the `database` option fails with an
 * IllegalArgumentException carrying the expected message.
 */
class MissingParameterSuite extends BaseDataSourceTest("test_datasource_missing_parameter") {
  private val row1 = Row(null, "Hello")

  private val schema = StructType(
    Seq(StructField("i", IntegerType), StructField("s", StringType)))

  test("Missing parameter: database") {
    if (!supportBatchWrite) cancel()

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")

    // RDD/DataFrame construction and the write all happen inside intercept.
    val failure = intercept[IllegalArgumentException] {
      val input: RDD[Row] = sc.makeRDD(List(row1))
      val frame = sqlContext.createDataFrame(input, schema)
      // Deliberately no .option("database", ...) here.
      frame.write
        .format("tidb")
        .options(tidbOptions)
        .option("table", table)
        .mode("append")
        .save()
    }

    val expectedMessage = "requirement failed: Option 'database' is required."
    assert(failure.getMessage.equals(expectedMessage))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}
Example 38
Source File: ShardRowIDBitsSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/**
 * Reads and writes a table created with `SHARD_ROW_ID_BITS = 4`.
 */
class ShardRowIDBitsSuite extends BaseDataSourceTest("test_shard_row_id_bits") {
  private val row1 = Row(1)
  private val row2 = Row(2)
  private val row3 = Row(3)

  private val schema = StructType(Seq(StructField("a", IntegerType)))

  test("reading and writing a table with shard_row_id_bits") {
    if (!supportBatchWrite) cancel()

    dropTable()
    jdbcUpdate(s"CREATE TABLE $dbtable ( `a` int(11)) SHARD_ROW_ID_BITS = 4")
    jdbcUpdate(s"insert into $dbtable values(null)")

    tidbWrite(List(row1, row2, row3), schema)

    // The JDBC-inserted null row is expected alongside the three written rows.
    val expectedRows = List(Row(null), row1, row2, row3)
    testTiDBSelect(expectedRows, sortCol = "a")
  }
}
Example 39
Source File: OnlyOnePkSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/**
 * Appends rows to a table whose only column is its primary key (`i int primary key`).
 */
class OnlyOnePkSuite extends BaseDataSourceTest("test_datasource_only_one_pk") {
  private val row3 = Row(3)
  private val row4 = Row(4)

  private val schema = StructType(Seq(StructField("i", IntegerType)))

  override def beforeAll(): Unit = {
    super.beforeAll()
    dropTable()
    jdbcUpdate(s"create table $dbtable(i int primary key)")
  }

  test("Test Write Append") {
    if (!supportBatchWrite) cancel()

    val rowsToAppend: RDD[Row] = sc.makeRDD(List(row3, row4))
    val frame = sqlContext.createDataFrame(rowsToAppend, schema)
    val writer = frame.write
      .format("tidb")
      .options(tidbOptions)
      .option("database", database)
      .option("table", table)
      .mode("append")
    writer.save()

    testTiDBSelect(Seq(row3, row4))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}
Example 40
Source File: BatchWriteIssueSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark

import com.pingcap.tispark.datasource.BaseDataSourceTest
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * Replace-mode writes involving null values, run against three table layouts:
 * a composite unique index, a composite primary key, and a single-column primary key.
 */
class BatchWriteIssueSuite extends BaseDataSourceTest("test_batchwrite_issue") {

  override def beforeAll(): Unit = {
    super.beforeAll()
  }

  test("Combine unique index with null value test") {
    doTestNullValues(s"create table $dbtable(a int, b varchar(64), CONSTRAINT ab UNIQUE (a, b))")
  }

  test("Combine primary key with null value test") {
    doTestNullValues(s"create table $dbtable(a int, b varchar(64), PRIMARY KEY (a, b))")
  }

  test("PK is handler with null value test") {
    doTestNullValues(s"create table $dbtable(a int, b varchar(64), PRIMARY KEY (a))")
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }

  /**
   * Creates the table from `createTableSQL`, adds column `c` (default 'c33') after an
   * add/drop of a throwaway column, seeds four rows over JDBC, then performs two
   * replace-mode writes and checks column `c` of every row after each step.
   */
  private def doTestNullValues(createTableSQL: String): Unit = {
    if (!supportBatchWrite) cancel()

    val schema = StructType(
      Seq(
        StructField("a", IntegerType),
        StructField("b", StringType),
        StructField("c", StringType)))
    val options = Some(Map("replace" -> "true"))

    // Reads column `c` of the row with the given `a` through JDBC.
    def cOf(a: Int) = queryTiDBViaJDBC(s"select c from $dbtable where a=$a").head.head

    dropTable()
    jdbcUpdate(createTableSQL)
    jdbcUpdate(s"alter table $dbtable add column to_delete int")
    jdbcUpdate(s"alter table $dbtable add column c varchar(64) default 'c33'")
    jdbcUpdate(s"alter table $dbtable drop column to_delete")
    jdbcUpdate(s"""
                  |insert into $dbtable values(11, 'c12', null);
                  |insert into $dbtable values(21, 'c22', null);
                  |insert into $dbtable (a, b) values(31, 'c32');
                  |insert into $dbtable values(41, 'c42', 'c43');
                  |
      """.stripMargin)

    // Initial state: explicit nulls stay null, the omitted value takes the column default.
    assert(cOf(11) == null)
    assert(cOf(21) == null)
    assert(cOf(31).toString.equals("c33"))
    assert(cOf(41).toString.equals("c43"))

    {
      // First replace: a=11 gets a value, a=31 is overwritten with null.
      val row1 = Row(11, "c12", "c13")
      val row3 = Row(31, "c32", null)
      tidbWrite(List(row1, row3), schema, options)

      assert(cOf(11).toString.equals("c13"))
      assert(cOf(21) == null)
      assert(cOf(31) == null)
      assert(cOf(41).toString.equals("c43"))
    }

    {
      // Second replace: both targeted rows receive non-null values.
      val row1 = Row(11, "c12", "c213")
      val row3 = Row(31, "c32", "tt")
      tidbWrite(List(row1, row3), schema, options)

      assert(cOf(11).toString.equals("c213"))
      assert(cOf(21) == null)
      assert(cOf(31).toString.equals("tt"))
      assert(cOf(41).toString.equals("c43"))
    }
  }
}
Example 41
Source File: LockTimeoutSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.ttl

import com.pingcap.tikv.TTLManager
import com.pingcap.tikv.exception.GrpcException
import com.pingcap.tispark.datasource.BaseDataSourceTest
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * Makes the writer sleep after prewriting the primary key for longer than
 * `TTLManager.MANAGED_LOCK_TTL`, issues a concurrent JDBC read in the meantime,
 * and asserts on the resulting GrpcException chain.
 */
class LockTimeoutSuite extends BaseDataSourceTest("test_lock_timeout") {
  private val row1 = Row(1, "Hello")

  private val schema = StructType(
    Seq(StructField("i", IntegerType), StructField("s", StringType)))

  override def beforeAll(): Unit = {
    super.beforeAll()
    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
  }

  test("Test Lock TTL Timeout") {
    if (!supportTTLUpdate) cancel()

    val seconds = 1000
    // The reader wakes up 10 * seconds past the managed TTL; the writer sleeps 15 * seconds past it.
    val sleep1 = TTLManager.MANAGED_LOCK_TTL + 10 * seconds
    val sleep2 = TTLManager.MANAGED_LOCK_TTL + 15 * seconds

    val input: RDD[Row] = sc.makeRDD(List(row1))
    val frame = sqlContext.createDataFrame(input, schema)

    val reader = new Thread(new Runnable {
      override def run(): Unit = {
        Thread.sleep(sleep1)
        queryTiDBViaJDBC(s"select * from $dbtable")
      }
    })
    reader.start()

    val grpcException = intercept[GrpcException] {
      frame.write
        .format("tidb")
        .options(tidbOptions)
        .option("database", database)
        .option("table", table)
        .option("sleepAfterPrewritePrimaryKey", sleep2)
        .mode("append")
        .save()
    }

    assert(grpcException.getMessage.equals("retry is exhausted."))

    val cause = grpcException.getCause
    assert(cause.getMessage.startsWith("Txn commit primary key failed"))

    val rootMessage = cause.getCause.getMessage
    assert(
      rootMessage.startsWith(
        "Key exception occurred and the reason is retryable: \"Txn(Mvcc(TxnLockNotFound"))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}
Example 42
Source File: RedisStreamProvider.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package org.apache.spark.sql.redis.stream

import com.redislabs.provider.redis.util.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

/**
 * Structured Streaming source provider registered under the short name "redis".
 */
class RedisStreamProvider extends DataSourceRegister with StreamSourceProvider with Logging {

  override def shortName(): String = "redis"

  /**
   * Returns the provider name paired with the caller-supplied schema, or with a
   * single string column `_id` when no schema was supplied.
   */
  override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType],
                            providerName: String,
                            parameters: Map[String, String]): (String, StructType) = {
    val effectiveSchema = schema match {
      case Some(userSchema) => userSchema
      case None => StructType(Seq(StructField("_id", StringType)))
    }
    (providerName, effectiveSchema)
  }

  /**
   * Builds a RedisSource with the schema resolved by `sourceSchema`, starts it,
   * and returns it.
   */
  override def createSource(sqlContext: SQLContext, metadataPath: String,
                            schema: Option[StructType], providerName: String,
                            parameters: Map[String, String]): Source = {
    val resolvedSchema = sourceSchema(sqlContext, schema, providerName, parameters)._2
    val redisSource = new RedisSource(sqlContext, metadataPath, Some(resolvedSchema), parameters)
    redisSource.start()
    redisSource
  }
}
Example 43
Source File: HousePriceDataBusinessLogic.scala From bdd-spark with MIT License | 5 votes |
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.json4s._
import org.json4s.jackson.JsonMethods._

/**
 * Joins house-price data with postcode data on the "Postcode" column, optionally
 * filtering by a latitude/longitude box read from a JSON file, and writes results
 * to "results.parquet" through the supplied writer.
 */
object HousePriceDataBusinessLogic {
  import Spark._

  /** Joins the two frames on "Postcode". */
  def processHousePrices(housePrices : DataFrame, postcodes : DataFrame) : DataFrame =
    housePrices.join(postcodes, "Postcode")

  /** Joins the two frames on "Postcode" and hands the result to `parquetWriter`. */
  def processHousePricesAndSaveToParquet(housePrices : DataFrame, postcodes : DataFrame, parquetWriter: ParquetWriter) : Unit = {
    val joined = housePrices.join(postcodes, "Postcode")
    parquetWriter.write(joined, "results.parquet")
  }

  /**
   * Loads and joins the price and postcode files, keeps only rows inside the box
   * spanned by the first two coordinate pairs found under "coordinates" in
   * `geoFilename`, and writes the result.
   */
  def processDataFromFilesFilterItThenSaveItToParquet(reader: FileReader, geoFilename : String, priceFilename: String, postcodeFileName: String, writer: ParquetWriter) : Unit = {
    val joined = loadAndJoin(reader, priceFilename, postcodeFileName)

    // If this was real code, a geoJSON library would be sensible here. Dirty code follows:
    val coordinatesJson = parse(reader.readText(geoFilename))\\ "coordinates"
    val rawPairs = coordinatesJson match {
      case JArray(outer) => outer.map{ case JArray(inner) => inner }
    }
    // Each pair is read as (longitude, latitude).
    val points = rawPairs
      .map(pair => (pair(0), pair(1)))
      .map{ case (JDouble(long), JDouble(lat)) => (long, lat) }

    val minLat = Math.min(points(0)._2, points(1)._2)
    val maxLat = Math.max(points(0)._2, points(1)._2)
    val minLong = Math.min(points(0)._1, points(1)._1)
    val maxLong = Math.max(points(0)._1, points(1)._1)

    val latitudeClause = s"Latitude >= $minLat and Latitude <= $maxLat"
    val longitudeClause = s"Longitude >= $minLong and Longitude <= $maxLong"
    val filtered = joined.filter(latitudeClause).filter(longitudeClause)

    writer.write(filtered, "results.parquet")
  }

  /** Loads and joins the price and postcode files and writes the unfiltered result. */
  def processDataFromFilesAndSaveToParquet(reader: FileReader, priceFilename: String, postcodeFileName: String, writer: ParquetWriter) : Unit =
    writer.write(loadAndJoin(reader, priceFilename, postcodeFileName), "results.parquet")

  /**
   * Parses both comma-separated files into typed frames (Price/Postcode/HouseType and
   * Postcode/Latitude/Longitude) and joins them on "Postcode".
   */
  private def loadAndJoin(reader: FileReader, priceFilename: String, postcodeFileName: String): DataFrame = {
    val priceSchema = StructType(Seq(
      StructField("Price", DataTypes.IntegerType),
      StructField("Postcode", DataTypes.StringType),
      StructField("HouseType", DataTypes.StringType)
    ))
    val priceRows = reader
      .readLinesToRdd(priceFilename)
      .map(_.split(','))
      .map(cells => cells.map(_.trim()))
      .map(cells => Row(cells(0).toInt, cells(1), cells(2)))
    val priceDf = spark.createDataFrame(priceRows, priceSchema)

    val postcodeSchema = StructType(Seq(
      StructField("Postcode", DataTypes.StringType),
      StructField("Latitude", DataTypes.DoubleType),
      StructField("Longitude", DataTypes.DoubleType)
    ))
    val postcodeRows = reader
      .readLinesToRdd(postcodeFileName)
      .map(_.split(','))
      .map(cells => cells.map(_.trim()))
      .map(cells => Row(cells(0), cells(1).toDouble, cells(2).toDouble))
    val postcodeDf = spark.createDataFrame(postcodeRows, postcodeSchema)

    priceDf.join(postcodeDf, "Postcode")
  }
}
Example 44
Source File: Preprocess.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.BitCoin

import java.io.{ BufferedWriter, File, FileWriter }
import org.apache.spark.sql.types.{ DoubleType, IntegerType, StructField, StructType }
import org.apache.spark.sql.{ DataFrame, Row, SparkSession }
import scala.collection.mutable.ListBuffer

/**
 * Turns a price CSV into two text files: rolling windows of the "Delta" column
 * (one comma-separated window per line) and the "label" value that follows each window.
 */
object Preprocess {
  // How many of the first rows are omitted.
  val dropFirstCount: Int = 612000

  /**
   * Collects `data` to the driver, skips the first `dropFirstCount` rows, and for every
   * position `i` writes the `window` consecutive "Delta" values starting at `i` to
   * `xFilename` and the "label" of the row right after that window to `yFilename`.
   *
   * Both writers are closed even when collecting, reading a row, or writing fails;
   * previously an exception anywhere in the loop leaked both file handles.
   *
   * @param data      frame that must contain a Double "Delta" column and an Integer "label" column
   * @param window    number of consecutive rows per window
   * @param xFilename output path for the windows
   * @param yFilename output path for the labels
   */
  def rollingWindow(data: DataFrame, window: Int, xFilename: String, yFilename: String): Unit = {
    val xWriter = new BufferedWriter(new FileWriter(new File(xFilename)))
    try {
      val yWriter = new BufferedWriter(new FileWriter(new File(yFilename)))
      try {
        val zippedData = data.rdd.zipWithIndex().collect()
        System.gc()
        // TODO (from the original author): "slice first 614K" -- note the constant above is 612000.
        val dataStratified = zippedData.drop(dropFirstCount)

        var i = 0
        while (i < (dataStratified.length - window)) {
          val x = dataStratified
            .slice(i, i + window)
            .map(r => r._1.getAs[Double]("Delta")).toList
          val y = dataStratified.apply(i + window)._1.getAs[Integer]("label")

          xWriter.write(x.mkString(",") + "\n")
          yWriter.write(s"$y\n")

          i += 1
          // Flush periodically so partial output is visible while the loop runs.
          if (i % 10 == 0) {
            xWriter.flush()
            yWriter.flush()
          }
        }
      } finally {
        yWriter.close()
      }
    } finally {
      xWriter.close()
    }
  }

  /**
   * Loads the price CSV, derives "Delta" (Close - Open) and a 0/1 "label"
   * (1 when Close - Open > 0), and writes rolling windows of size 22.
   * The Spark session is stopped even if processing fails.
   */
  def main(args: Array[String]): Unit = {
    // TODO: modify these variables to match the desired files.
    val priceDataFileName: String = "C:/Users/admin-karim/Desktop/bitstampUSD_1-min_data_2012-01-01_to_2017-10-20.csv/bitstampUSD_1-min_data_2012-01-01_to_2017-10-20.csv"
    val outputDataFilePath: String = "output/scala_test_x.csv"
    val outputLabelFilePath: String = "output/scala_test_y.csv"

    val spark = SparkSession
      .builder()
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Bitcoin Preprocessing")
      .getOrCreate()

    try {
      val data = spark.read.format("com.databricks.spark.csv").option("header", "true").load(priceDataFileName)
      data.show(10)
      println((data.count(), data.columns.size))

      val dataWithDelta = data.withColumn("Delta", data("Close") - data("Open"))

      import org.apache.spark.sql.functions._
      import spark.sqlContext.implicits._

      val dataWithLabels = dataWithDelta.withColumn("label", when($"Close" - $"Open" > 0, 1).otherwise(0))

      rollingWindow(dataWithLabels, 22, outputDataFilePath, outputLabelFilePath)
    } finally {
      spark.stop()
    }
  }
}
Example 45
Source File: XSDToSchemaSuite.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml.util

import java.nio.file.Paths

import org.apache.spark.sql.types.{ArrayType, StructField, StructType, StringType}
import org.scalatest.funsuite.AnyFunSuite

/**
 * Checks that `XSDToSchema.read` turns basket.xsd into the expected nested schema.
 */
class XSDToSchemaSuite extends AnyFunSuite {

  test("Basic parsing") {
    val parsedSchema = XSDToSchema.read(Paths.get("src/test/resources/basket.xsd"))

    // Expected shape, built inside-out: basket { entry: array<{ key, value }> }.
    val entryStruct = StructType(Array(
      StructField("key", StringType),
      StructField("value", StringType)))
    val basketStruct = StructType(Array(
      StructField("entry", ArrayType(entryStruct))))
    val expectedSchema = StructType(Array(
      StructField("basket", basketStruct)))

    assert(expectedSchema === parsedSchema)
  }
}
Example 46
Source File: OptimizeHiveMetadataOnlyQuerySuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfter

import org.apache.spark.metrics.source.HiveCatalogMetrics
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.expressions.NamedExpression
import org.apache.spark.sql.catalyst.plans.logical.{Distinct, Filter, Project, SubqueryAlias}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.internal.SQLConf.OPTIMIZER_METADATA_ONLY
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/**
 * Counts partitions fetched from the metastore (via HiveCatalogMetrics) for
 * metadata-only queries over a table with partitions `part = 0..10`.
 */
class OptimizeHiveMetadataOnlyQuerySuite extends QueryTest with TestHiveSingleton
    with BeforeAndAfter with SQLTestUtils {

  import spark.implicits._

  override def beforeAll(): Unit = {
    super.beforeAll()
    sql("CREATE TABLE metadata_only (id bigint, data string) PARTITIONED BY (part int)")
    for (p <- 0 to 10) {
      sql(s"ALTER TABLE metadata_only ADD PARTITION (part=$p)")
    }
  }

  override protected def afterAll(): Unit = {
    try {
      sql("DROP TABLE IF EXISTS metadata_only")
    } finally {
      super.afterAll()
    }
  }

  test("SPARK-23877: validate metadata-only query pushes filters to metastore") {
    withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") {
      val startCount = HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount

      // verify the number of matching partitions
      val matching = sql("SELECT DISTINCT part FROM metadata_only WHERE part < 5").collect()
      assert(matching.length === 5)

      // verify that the partition predicate was pushed down to the metastore
      assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - startCount === 5)
    }
  }

  test("SPARK-23877: filter on projected expression") {
    withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") {
      val startCount = HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount

      // verify the matching partitions
      // Plan: Distinct(Filter(x < 5, Project(part + 1 AS x, <metadata_only>))).
      val tablePlan =
        spark.table("metadata_only").logicalPlan.asInstanceOf[SubqueryAlias].child
      val projectedX = ($"part" + 1).as("x").expr.asInstanceOf[NamedExpression]
      val distinctPlan =
        Distinct(Filter(($"x" < 5).expr, Project(Seq(projectedX), tablePlan)))
      val partitions = spark.internalCreateDataFrame(
        distinctPlan.queryExecution.toRdd,
        StructType(Seq(StructField("x", IntegerType))))

      checkAnswer(partitions, Seq(1, 2, 3, 4).toDF("x"))

      // verify that the partition predicate was not pushed down to the metastore
      assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - startCount == 11)
    }
  }
}
Example 47
Source File: SparkExecuteStatementOperationSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, NullType, StringType, StructField, StructType}

/**
 * Checks the column descriptors produced by `SparkExecuteStatementOperation.getTableSchema`
 * for NullType columns and for columns with and without comments.
 */
class SparkExecuteStatementOperationSuite extends SparkFunSuite {

  test("SPARK-17112 `select null` via JDBC triggers IllegalArgumentException in ThriftServer") {
    val tableSchema = StructType(Seq(
      StructField("NULL", NullType),
      StructField("(IF(true, NULL, NULL))", NullType)))

    val columns = SparkExecuteStatementOperation.getTableSchema(tableSchema).getColumnDescriptors()

    assert(columns.size() == 2)
    assert(columns.get(0).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
    assert(columns.get(1).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
  }

  test("SPARK-20146 Comment should be preserved") {
    val commented = StructField("column1", StringType).withComment("comment 1")
    val uncommented = StructField("column2", IntegerType)
    val tableSchema = StructType(Seq(commented, uncommented))

    val columns = SparkExecuteStatementOperation.getTableSchema(tableSchema).getColumnDescriptors()

    assert(columns.size() == 2)
    assert(columns.get(0).getType() == org.apache.hive.service.cli.Type.STRING_TYPE)
    assert(columns.get(0).getComment() == "comment 1")
    assert(columns.get(1).getType() == org.apache.hive.service.cli.Type.INT_TYPE)
    // A field without a comment is reported with an empty comment string.
    assert(columns.get(1).getComment() == "")
  }
}
Example 48
Source File: LocalRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils
import org.apache.spark.sql.types.{StructField, StructType}

/** Factory methods for building a LocalRelation from attributes, fields, or external data. */
object LocalRelation {
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  def apply(output1: StructField, output: StructField*): LocalRelation = {
    val allFields = output1 +: output
    new LocalRelation(StructType(allFields).toAttributes)
  }

  /** Converts external rows to InternalRow using a converter derived from `output`. */
  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = {
    val toCatalyst =
      CatalystTypeConverters.createToCatalystConverter(StructType.fromAttributes(output))
    val internalRows = data.map(row => toCatalyst(row).asInstanceOf[InternalRow])
    LocalRelation(output, internalRows)
  }

  /** Converts Product values to InternalRow using a converter derived from `output`. */
  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
    val toCatalyst =
      CatalystTypeConverters.createToCatalystConverter(StructType.fromAttributes(output))
    val internalRows = data.map(product => toCatalyst(product).asInstanceOf[InternalRow])
    LocalRelation(output, internalRows)
  }
}

  // NOTE(review): the members below use `output`, `data` and `isStreaming`, none of which
  // is declared in this excerpt -- presumably the enclosing class header was elided
  // upstream. They are reproduced as they appear; confirm against the full file.
  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data, isStreaming).asInstanceOf[this.type]
  }

  override protected def stringArgs: Iterator[Any] = {
    if (data.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def computeStats(): Statistics =
    Statistics(sizeInBytes = EstimationUtils.getSizePerRow(output) * data.length)

  /** Renders the rows as a SQL `VALUES ... AS name(cols)` inline table; requires non-empty data. */
  def toSQL(inlineTableName: String): String = {
    require(data.nonEmpty)
    val types = output.map(_.dataType)
    val rows = data.map { row =>
      val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql }
      cells.mkString("(", ", ", ")")
    }
    val columnList = output.map(_.name).mkString("(", ", ", ")")
    "VALUES " + rows.mkString(", ") + " AS " + inlineTableName + columnList
  }
}
Example 49
Source File: ResolveInlineTables.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import scala.util.control.NonFatal

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{StructField, StructType}

  // NOTE(review): the enclosing definition's header is not part of this excerpt; the
  // method and the trailing closing brace are reproduced as they appear.

  /**
   * Converts an inline table to a LocalRelation: finds one common type per column,
   * casts every cell whose type differs, and evaluates each cell eagerly.
   * Fails analysis when a column has incompatible types or an expression cannot be evaluated.
   */
  private[analysis] def convert(table: UnresolvedInlineTable): LocalRelation = {
    // For each column, traverse all the values and find a common data type and nullability.
    val fields = table.rows.transpose.zip(table.names).map { case (column, name) =>
      val columnTypes = column.map(_.dataType)
      val commonType = TypeCoercion.findWiderTypeWithoutStringPromotion(columnTypes).getOrElse {
        table.failAnalysis(s"incompatible types found in column $name for inline table")
      }
      StructField(name, commonType, nullable = column.exists(_.nullable))
    }
    val attributes = StructType(fields).toAttributes
    assert(fields.size == table.names.size)

    val newRows: Seq[InternalRow] = table.rows.map { row =>
      val evaluatedCells = row.zipWithIndex.map { case (e, ci) =>
        val targetType = fields(ci).dataType
        try {
          // Cast only when the expression's type differs from the column's common type.
          val aligned = if (e.dataType.sameType(targetType)) e else cast(e, targetType)
          aligned.eval()
        } catch {
          case NonFatal(ex) =>
            table.failAnalysis(s"failed to evaluate expression ${e.sql}: ${ex.getMessage}", ex)
        }
      }
      InternalRow.fromSeq(evaluatedCells)
    }

    LocalRelation(attributes, newRows)
  }
}
Example 50
Source File: CanonicalizeSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.Range
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/**
 * Semantic-equality checks for canonicalized `In` expressions and for nested
 * `GetStructField` accessors that differ only in the names they carry.
 */
class CanonicalizeSuite extends SparkFunSuite {

  test("SPARK-24276: IN expression with different order are semantically equal") {
    val range = Range(1, 1, 1, 1)
    val idAttr = range.output.head

    // Builds a two-element CreateArray of integer literals.
    def pair(first: Int, second: Int) = CreateArray(Seq(Literal(first), Literal(second)))

    val in1 = In(idAttr, Seq(Literal(1), Literal(2)))
    val in2 = In(idAttr, Seq(Literal(2), Literal(1)))
    val in3 = In(idAttr, Seq(Literal(1), Literal(2), Literal(3)))

    assert(in1.canonicalized.semanticHash() == in2.canonicalized.semanticHash())
    assert(in1.canonicalized.semanticHash() != in3.canonicalized.semanticHash())

    assert(range.where(in1).sameResult(range.where(in2)))
    assert(!range.where(in1).sameResult(range.where(in3)))

    val arrays1 = In(idAttr, Seq(pair(1, 2), pair(2, 1)))
    val arrays2 = In(idAttr, Seq(pair(2, 1), pair(1, 2)))
    val arrays3 = In(idAttr, Seq(pair(1, 2), pair(3, 1)))

    assert(arrays1.canonicalized.semanticHash() == arrays2.canonicalized.semanticHash())
    assert(arrays1.canonicalized.semanticHash() != arrays3.canonicalized.semanticHash())

    assert(range.where(arrays1).sameResult(range.where(arrays2)))
    assert(!range.where(arrays1).sameResult(range.where(arrays3)))
  }

  test("SPARK-26402: accessing nested fields with different cases in case insensitive mode") {
    val expId = NamedExpression.newExprId
    val qualifier = Seq.empty[String]
    val innerStruct = StructType(StructField("b", IntegerType, false) :: Nil)
    val structType = StructType(StructField("a", innerStruct, false) :: Nil)

    // Accessor for ordinal 0 of an attribute named `attrName`, labelled `fieldName`.
    // Both attributes share the same expression id and qualifier.
    def outerField(attrName: String, fieldName: String) =
      GetStructField(
        AttributeReference(attrName, structType, false)(expId, qualifier),
        0,
        Some(fieldName))

    // GetStructField with different names are semantically equal
    val fieldA1 = outerField("data1", "a1")
    val fieldA2 = outerField("data2", "a2")
    assert(fieldA1.semanticEquals(fieldA2))

    val fieldB1 = GetStructField(outerField("data1", "a1"), 0, Some("b1"))
    val fieldB2 = GetStructField(outerField("data2", "a2"), 0, Some("b2"))
    assert(fieldB1.semanticEquals(fieldB2))
  }
}
Example 51
Source File: HadoopFsRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import java.util.Locale

import scala.collection.mutable

import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.execution.FileRelation
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister}
import org.apache.spark.sql.types.{StructField, StructType}

/**
 * A file-based relation described by a file index, a partition schema, a data schema,
 * an optional bucket spec, a file format and options.
 */
case class HadoopFsRelation(
    location: FileIndex,
    partitionSchema: StructType,
    dataSchema: StructType,
    bucketSpec: Option[BucketSpec],
    fileFormat: FileFormat,
    options: Map[String, String])(val sparkSession: SparkSession)
  extends BaseRelation with FileRelation {

  override def sqlContext: SQLContext = sparkSession.sqlContext

  // Column name used for overlap detection: verbatim under case-sensitive analysis,
  // lower-cased (ROOT locale) otherwise.
  private def getColName(f: StructField): String = {
    val caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis
    if (caseSensitive) f.name else f.name.toLowerCase(Locale.ROOT)
  }

  // Partition fields whose (normalized) name also appears in the data schema.
  val overlappedPartCols = mutable.Map.empty[String, StructField]
  for (partitionField <- partitionSchema) {
    val partitionName = getColName(partitionField)
    if (dataSchema.exists(dataField => getColName(dataField) == partitionName)) {
      overlappedPartCols += partitionName -> partitionField
    }
  }

  // When data and partition schemas have overlapping columns, the output
  // schema respects the order of the data schema for the overlapping columns, and it
  // respects the data types of the partition schema.
  val schema: StructType = {
    val dataColumns = dataSchema.map(f => overlappedPartCols.getOrElse(getColName(f), f))
    val partitionOnlyColumns =
      partitionSchema.filterNot(f => overlappedPartCols.contains(getColName(f)))
    StructType(dataColumns ++ partitionOnlyColumns)
  }

  def partitionSchemaOption: Option[StructType] =
    if (partitionSchema.isEmpty) None else Some(partitionSchema)

  override def toString: String = {
    fileFormat match {
      case source: DataSourceRegister => source.shortName()
      case _ => "HadoopFiles"
    }
  }

  // Location size scaled by the configured file compression factor.
  override def sizeInBytes: Long = {
    val compressionFactor = sqlContext.conf.fileCompressionFactor
    (location.sizeInBytes * compressionFactor).toLong
  }

  override def inputFiles: Array[String] = location.inputFiles
}
Example 52
Source File: EvalPythonExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python

import java.io.File

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.{SparkEnv, TaskContext}
import org.apache.spark.api.python.ChainedPythonFunctions
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{DataType, StructField, StructType}
import org.apache.spark.util.Utils

/**
 * Base physical operator for evaluating Python UDFs: it projects the UDF arguments out of
 * each child row, hands them to [[evaluate]], and joins the evaluated rows back onto the
 * buffered input rows.
 *
 * @param udfs   the Python UDFs to evaluate
 * @param output output attributes; those past `child.output.length` are produced here
 * @param child  operator supplying the input rows
 */
abstract class EvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan)
  extends SparkPlan {

  def children: Seq[SparkPlan] = child :: Nil

  override def producedAttributes: AttributeSet = AttributeSet(output.drop(child.output.length))

  /**
   * Flattens a chain of single-argument nested UDFs into one [[ChainedPythonFunctions]],
   * returning it together with the argument expressions of the innermost UDF.
   */
  private def collectFunctions(udf: PythonUDF): (ChainedPythonFunctions, Seq[Expression]) =
    udf.children match {
      case Seq(inner: PythonUDF) =>
        val (innerChain, innerArgs) = collectFunctions(inner)
        (ChainedPythonFunctions(innerChain.funcs :+ udf.func), innerArgs)
      case args =>
        // No argument may contain another UDF, otherwise it could not be evaluated directly.
        assert(args.forall(arg => arg.find(_.isInstanceOf[PythonUDF]).isEmpty))
        (ChainedPythonFunctions(Seq(udf.func)), udf.children)
    }

  /**
   * Evaluates `funcs` over the projected argument rows.
   *
   * @param funcs      chained functions, one entry per UDF
   * @param argOffsets for each UDF, the positions of its arguments inside a projected row
   * @param iter       projected argument rows
   * @param schema     schema of the projected argument rows
   * @param context    context of the running task
   */
  protected def evaluate(
      funcs: Seq[ChainedPythonFunctions],
      argOffsets: Array[Array[Int]],
      iter: Iterator[InternalRow],
      schema: StructType,
      context: TaskContext): Iterator[InternalRow]

  protected override def doExecute(): RDD[InternalRow] = {
    val copiedInput = child.execute().map(_.copy())

    copiedInput.mapPartitions { rows =>
      val taskContext = TaskContext.get()

      // Input rows are buffered here and drained again when they are combined with the
      // rows coming back from Python.
      val rowQueue = HybridRowQueue(
        taskContext.taskMemoryManager(),
        new File(Utils.getLocalDir(SparkEnv.get.conf)),
        child.output.length)
      taskContext.addTaskCompletionListener[Unit] { _ =>
        rowQueue.close()
      }

      val (pyFuncs, inputs) = udfs.map(collectFunctions).unzip

      // Flatten all arguments, keeping a single slot for semantically equal expressions.
      val allInputs = new ArrayBuffer[Expression]
      val dataTypes = new ArrayBuffer[DataType]
      val argOffsets = inputs.map { udfArgs =>
        udfArgs.map { arg =>
          val existing = allInputs.indexWhere(_.semanticEquals(arg))
          if (existing >= 0) {
            existing
          } else {
            allInputs += arg
            dataTypes += arg.dataType
            allInputs.length - 1
          }
        }.toArray
      }.toArray

      val projection = newMutableProjection(allInputs, child.output)
      val schema = StructType(dataTypes.zipWithIndex.map { case (dataType, index) =>
        StructField(s"_$index", dataType)
      })

      // Remember every input row so it can be joined with its result later.
      val projectedRows = rows.map { inputRow =>
        rowQueue.add(inputRow.asInstanceOf[UnsafeRow])
        projection(inputRow)
      }

      val evaluatedRows = evaluate(pyFuncs, argOffsets, projectedRows, schema, taskContext)

      val joined = new JoinedRow
      val resultProj = UnsafeProjection.create(output, output)

      evaluatedRows.map { evaluated =>
        resultProj(joined(rowQueue.remove(), evaluated))
      }
    }
  }
}
Example 53
Source File: BatchEvalPythonExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python

import scala.collection.JavaConverters._

import net.razorvine.pickle.{Pickler, Unpickler}

import org.apache.spark.TaskContext
import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{StructField, StructType}

/**
 * Evaluates Python UDFs by pickling argument rows in batches of 100, running them through
 * a [[PythonUDFRunner]] and unpickling the results.
 */
case class BatchEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan)
  extends EvalPythonExec(udfs, output, child) {

  protected override def evaluate(
      funcs: Seq[ChainedPythonFunctions],
      argOffsets: Array[Array[Int]],
      iter: Iterator[InternalRow],
      schema: StructType,
      context: TaskContext): Iterator[InternalRow] = {
    EvaluatePython.registerPicklers()  // register pickler for Row

    val dataTypes = schema.map(_.dataType)
    val needConversion = dataTypes.exists(EvaluatePython.needConversionInPython)

    // Memoization is enabled only when rows are serialized together with their schema.
    val pickle = new Pickler(needConversion)

    // Rows going to Python, converted one by one and then pickled in groups of 100.
    val convertedRows = iter.map { row =>
      if (needConversion) {
        EvaluatePython.toJava(row, schema)
      } else {
        // Fast path: none of the column types needs conversion on the Python side.
        val values = new Array[Any](row.numFields)
        var col = 0
        while (col < row.numFields) {
          val colType = dataTypes(col)
          values(col) = EvaluatePython.toJava(row.get(col, colType), colType)
          col += 1
        }
        values
      }
    }
    val inputIterator = convertedRows.grouped(100).map(batch => pickle.dumps(batch.toArray))

    // Pickled results coming back from Python.
    val outputIterator = new PythonUDFRunner(funcs, PythonEvalType.SQL_BATCHED_UDF, argOffsets)
      .compute(inputIterator, context.partitionId(), context)

    val unpickle = new Unpickler
    val mutableRow = new GenericInternalRow(1)
    val singleUdf = udfs.length == 1
    val resultType =
      if (singleUdf) udfs.head.dataType
      else StructType(udfs.map(u => StructField("", u.dataType, u.nullable)))
    val fromJava = EvaluatePython.makeFromJava(resultType)

    outputIterator
      .flatMap { pickled =>
        unpickle.loads(pickled).asInstanceOf[java.util.ArrayList[Any]].asScala
      }
      .map { result =>
        if (singleUdf) {
          // Fast path for a single UDF: the same one-column row is reused for every result.
          mutableRow(0) = fromJava(result)
          mutableRow
        } else {
          fromJava(result).asInstanceOf[InternalRow]
        }
      }
  }
}
Example 54
Source File: resources.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * Lists the jars known to the Spark context, optionally restricted to those whose path
 * contains the file name of one of the requested `jars`.
 *
 * @param jars jar paths to look for; an empty sequence lists every jar
 */
case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {

  override val output: Seq[Attribute] =
    Seq(AttributeReference("Results", StringType, nullable = false)())

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val knownJars = sparkSession.sparkContext.listJars()
    if (jars.isEmpty) {
      knownJars.map(path => Row(path))
    } else {
      // Only the last path component of each requested jar is matched.
      val requestedNames = jars.map(requested => new Path(requested).getName)
      requestedNames.flatMap { name =>
        knownJars.filter(_.contains(name)).map(path => Row(path))
      }
    }
  }
}
Example 55
Source File: StreamingGlobalLimitExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.util.concurrent.TimeUnit.NANOSECONDS

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.execution.streaming.state.StateStoreOps
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType}
import org.apache.spark.util.CompletionIterator

/**
 * Limits a stream to `streamLimit` rows in total, keeping the number of rows emitted so
 * far in the state store under a single null key.
 *
 * @param streamLimit maximum number of rows to emit
 * @param child       operator supplying the rows
 * @param stateInfo   state store information for this operator
 * @param outputMode  output mode of the query; must be Append when executed
 */
case class StreamingGlobalLimitExec(
    streamLimit: Long,
    child: SparkPlan,
    stateInfo: Option[StatefulOperatorStateInfo] = None,
    outputMode: Option[OutputMode] = None)
  extends UnaryExecNode with StateStoreWriter {

  private val keySchema = StructType(Array(StructField("key", NullType)))
  private val valueSchema = StructType(Array(StructField("value", LongType)))

  override protected def doExecute(): RDD[InternalRow] = {
    metrics // force lazy init at driver

    assert(outputMode.contains(InternalOutputModes.Append),
      "StreamingGlobalLimitExec is only valid for streams in Append output mode")

    child.execute().mapPartitionsWithStateStore(
        getStateInfo,
        keySchema,
        valueSchema,
        indexOrdinal = None,
        sqlContext.sessionState,
        Some(sqlContext.streams.stateStoreCoordinator)) { (store, rows) =>
      val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null)))
      val numOutputRows = longMetric("numOutputRows")
      val numUpdatedStateRows = longMetric("numUpdatedStateRows")
      val allUpdatesTimeMs = longMetric("allUpdatesTimeMs")
      val commitTimeMs = longMetric("commitTimeMs")
      val startedAtNs = System.nanoTime

      // Row count carried over from earlier batches; zero when nothing is stored yet.
      val preBatchRowCount: Long = Option(store.get(key)).fold(0L)(_.getLong(0))
      var cumulativeRowCount = preBatchRowCount

      // Let rows through only while the running total is still below the limit.
      val limited = rows.filter { _ =>
        val underLimit = cumulativeRowCount < streamLimit
        if (underLimit) {
          cumulativeRowCount += 1
        }
        underLimit
      }

      CompletionIterator[InternalRow, Iterator[InternalRow]](limited, {
        // Persist the new total only when this batch actually emitted rows.
        if (cumulativeRowCount > preBatchRowCount) {
          numUpdatedStateRows += 1
          numOutputRows += cumulativeRowCount - preBatchRowCount
          store.put(key, getValueRow(cumulativeRowCount))
        }
        allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - startedAtNs)
        commitTimeMs += timeTakenMs { store.commit() }
        setStoreMetrics(store)
      })
    }
  }

  override def output: Seq[Attribute] = child.output

  override def outputPartitioning: Partitioning = child.outputPartitioning

  override def requiredChildDistribution: Seq[Distribution] = AllTuples :: Nil

  /** Wraps `value` in a single-column row matching `valueSchema`. */
  private def getValueRow(value: Long): UnsafeRow = {
    val toUnsafe = UnsafeProjection.create(valueSchema)
    toUnsafe(new GenericInternalRow(Array[Any](value)))
  }
}
Example 56
Source File: BlockingSource.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/**
 * Source and sink provider whose `createSource` waits on [[BlockingSource.latch]] before
 * handing out a source. The source reports offset 0 and produces empty batches; the sink
 * discards every batch.
 */
class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  /**
   * Blocks until [[BlockingSource.latch]] is released, then returns the dummy source.
   * The latch must have been assigned beforehand; it is dereferenced unconditionally.
   */
  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema

      override def getOffset: Option[Offset] = Some(new LongOffset(0))

      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }

      // Explicit `: Unit =` replaces the deprecated procedure syntax `def stop() {}`.
      override def stop(): Unit = {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  /** Latch awaited by `createSource`; starts out as `null` and is set by the user. */
  var latch: CountDownLatch = null
}
Example 57
Source File: MockSourceProvider.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.util

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.StreamSourceProvider
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/**
 * Source provider that hands out whatever sources were installed through
 * [[MockSourceProvider.withMockSources]].
 */
class MockSourceProvider extends StreamSourceProvider {

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) =
    ("dummySource", MockSourceProvider.fakeSchema)

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source =
    MockSourceProvider.sourceProviderFunction()
}

object MockSourceProvider {
  // Produces the next source to hand out. Only set while `withMockSources` is running.
  private var sourceProviderFunction: () => Source = _

  final val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  /**
   * Runs `f` with the given sources installed; they are handed out round-robin, one per
   * `createSource` call. The provider function is reset to `null` afterwards, even when
   * `f` throws.
   *
   * @param source       first source to hand out
   * @param otherSources further sources, used in order after `source`
   * @param f            code to run while the sources are installed
   */
  def withMockSources(source: Source, otherSources: Source*)(f: => Unit): Unit = {
    val available = source +: otherSources
    var handedOut = 0
    sourceProviderFunction = () => {
      val next = available(handedOut % available.length)
      handedOut += 1
      next
    }
    try {
      f
    } finally {
      sourceProviderFunction = null
    }
  }
}
Example 58
Source File: TableColumnsParser.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.parser

import java.util.Locale

import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
import org.apache.spark.sql.catalyst.util.DataTypeParser
import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField}

/**
 * Parser fragments for a parenthesised, comma-separated list of table columns, each
 * either annotated with metadata or followed by an optional comment.
 */
trait TableColumnsParser
  extends AbstractSparkSQLParser
  with DataTypeParser
  with AnnotationParser {

  /** Keyword that introduces a column comment. */
  protected def commentIndicator: Keyword

  /**
   * A column name: any identifier, or a keyword that is not a reserved word.
   * The keyword is upper-cased with `Locale.ROOT` so the reserved-word lookup does not
   * depend on the JVM default locale (e.g. the Turkish dotted/dotless `i`).
   */
  protected lazy val columnName = acceptMatch("column name", {
    case lexical.Identifier(chars) => chars
    case lexical.Keyword(chars)
      if !sqlReservedWords.contains(chars.toUpperCase(Locale.ROOT)) => chars
  })

  /** `( col, col, ... )` */
  protected lazy val tableColumns: Parser[Seq[StructField]] =
    "(" ~> repsep(annotatedCol, ",") <~ ")"

  /**
   * One column: `name metadata type`, or `name type [commentIndicator 'text']`.
   * In the second form a present comment is stored in the field metadata under the
   * lower-cased comment keyword. Every column is nullable.
   */
  protected lazy val annotatedCol: Parser[StructField] =
    columnName ~ metadata ~ dataType ^^ {
      case name ~ md ~ typ =>
        StructField(name, typ, nullable = true, metadata = toTableMetadata(md))
    } |
    columnName ~ dataType ~ (commentIndicator ~> stringLit).? ^^ {
      case name ~ typ ~ cm =>
        val meta = cm match {
          case Some(comment) =>
            new MetadataBuilder()
              .putString(commentIndicator.str.toLowerCase(Locale.ROOT), comment)
              .build()
          case None => Metadata.empty
        }
        StructField(name, typ, nullable = true, meta)
    }
}
Example 59
Source File: ShowTablesUsingCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import java.util.Locale

import org.apache.spark.sql.sources.DatasourceCatalog
import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext}
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.types.{StringType, StructField, StructType}

/**
 * Lists the relations exposed by a data source provider as rows of
 * `(TABLE_NAME, IS_TEMPORARY, KIND)`.
 *
 * @param provider name of the data source provider to instantiate
 * @param options  options passed to the provider, wrapped case-insensitively
 */
private[sql]
case class ShowTablesUsingCommand(provider: String, options: Map[String, String])
  extends LogicalPlan
  with RunnableCommand {

  override def output: Seq[Attribute] = StructType(
    StructField("TABLE_NAME", StringType, nullable = false) ::
    StructField("IS_TEMPORARY", StringType, nullable = false) ::
    StructField("KIND", StringType, nullable = false) ::
    Nil
  ).toAttributes

  /**
   * @throws RuntimeException if the provider is not a [[DatasourceCatalog]]
   */
  override def run(sqlContext: SQLContext): Seq[Row] = {
    val dataSource: Any = DatasourceResolver.resolverFor(sqlContext).newInstanceOf(provider)

    dataSource match {
      case describableRelation: DatasourceCatalog =>
        describableRelation
          .getRelations(sqlContext, new CaseInsensitiveMap(options))
          .map(relationInfo => Row(
            relationInfo.name,
            // Locale.ROOT keeps the emitted values independent of the JVM default locale.
            relationInfo.isTemporary.toString.toUpperCase(Locale.ROOT),
            relationInfo.kind.toUpperCase(Locale.ROOT)))
      case _ =>
        throw new RuntimeException(s"The provided data source $provider does not support " +
          "showing its relations.")
    }
  }
}
Example 60
Source File: DescribeTableUsingCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.TableIdentifierUtils._
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.sources.{DatasourceCatalog, RelationInfo}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext}

/**
 * Describes one relation of a data source provider as a single `(TABLE_NAME, DDL_STMT)` row.
 *
 * @param name     identifier of the relation to describe
 * @param provider name of the data source provider to instantiate
 * @param options  options passed to the provider, wrapped case-insensitively
 */
private[sql]
case class DescribeTableUsingCommand(
    name: TableIdentifier,
    provider: String,
    options: Map[String, String])
  extends LogicalPlan
  with RunnableCommand {

  override def output: Seq[Attribute] = StructType(
    Seq(
      StructField("TABLE_NAME", StringType, nullable = false),
      StructField("DDL_STMT", StringType, nullable = false))
  ).toAttributes

  /**
   * Always returns exactly one row: the relation name and its DDL (empty string when the
   * relation has none), or two empty strings when the catalog does not know the relation.
   */
  override def run(sqlContext: SQLContext): Seq[Row] = {
    // Convert the table name according to the case-sensitivity settings
    val tableId = name.toSeq
    val resolver = DatasourceResolver.resolverFor(sqlContext)
    val catalog = resolver.newInstanceOfTyped[DatasourceCatalog](provider)
    val lookup = catalog.getRelation(sqlContext, tableId, new CaseInsensitiveMap(options))

    val description = lookup match {
      case Some(RelationInfo(relName, _, _, ddl, _)) => Row(relName, ddl.getOrElse(""))
      case None => Row("", "")
    }
    Seq(description)
  }
}
Example 61
Source File: dependenciesSystemTable.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis.systables

import java.util.Locale

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.TableDependencyCalculator
import org.apache.spark.sql.sources.{RelationKind, Table}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}

/**
 * System table provider that emits one row per (base object, dependent object) pair found
 * in the catalog.
 */
object DependenciesSystemTableProvider extends SystemTableProvider with LocalSpark {

  /**
   * Builds the dependency rows. Each row holds the base schema/name/type, the dependent
   * schema/name/type and the dependency type id, in the column order of
   * [[DependenciesSystemTable]].
   */
  override def execute(): Seq[Row] = {
    val tables = getTables(sqlContext.catalog)
    val dependentsMap = buildDependentsMap(tables)

    // Upper-cased kind of a table: falls back to `Table` when the plan has no kind and to
    // the unknown marker when the identifier is not in the catalog. Locale.ROOT keeps the
    // result independent of the JVM default locale.
    def kindOf(tableIdentifier: TableIdentifier): String =
      tables
        .get(tableIdentifier)
        .map(plan => RelationKind.kindOf(plan).getOrElse(Table).name)
        .getOrElse(DependenciesSystemTable.UnknownType)
        .toUpperCase(Locale.ROOT)

    dependentsMap.flatMap {
      case (tableIdent, dependents) =>
        val curKind = kindOf(tableIdent)
        dependents.map { dependent =>
          val dependentKind = kindOf(dependent)
          Row(
            tableIdent.database.orNull,
            tableIdent.table,
            curKind,
            dependent.database.orNull,
            dependent.table,
            dependentKind,
            ReferenceDependency.id)
        }
    }.toSeq
  }

  override val schema: StructType = DependenciesSystemTable.schema
}

/** Column definitions of the dependencies system table. */
object DependenciesSystemTable extends SchemaEnumeration {
  val baseSchemaName = Field("BASE_SCHEMA_NAME", StringType, nullable = true)
  val baseObjectName = Field("BASE_OBJECT_NAME", StringType, nullable = false)
  val baseObjectType = Field("BASE_OBJECT_TYPE", StringType, nullable = false)
  val dependentSchemaName = Field("DEPENDENT_SCHEMA_NAME", StringType, nullable = true)
  val dependentObjectName = Field("DEPENDENT_OBJECT_NAME", StringType, nullable = false)
  val dependentObjectType = Field("DEPENDENT_OBJECT_TYPE", StringType, nullable = false)
  val dependencyType = Field("DEPENDENCY_TYPE", IntegerType, nullable = false)

  /** Kind reported for identifiers that are not present in the catalog. */
  private[DependenciesSystemTable] val UnknownType = "UNKNOWN"
}
Example 62
Source File: CaseSensitivityUtils.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.analysis.{Analyzer, Catalog}
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.util.CollectionUtils._

import scala.util.{Failure, Success, Try}

/**
 * Raised when a schema contains duplicate field names once case sensitivity rules have
 * been applied. The message lists the duplicates followed by both schemas.
 *
 * @param originalSchema  the schema as it was given
 * @param schema          the schema after applying the case sensitivity rules
 * @param duplicateFields names that occur more than once in `schema`
 */
// NOTE(review): the margin-delimited message below sits on a single line in this copy, so
// `stripMargin` would leave the `|` characters in place — presumably the line breaks were
// lost when the file was extracted; confirm against the upstream source.
case class DuplicateFieldsException(
    originalSchema: StructType,
    schema: StructType,
    duplicateFields: Set[String])
  extends RuntimeException(
    s"""Given schema contains duplicate fields after applying case sensitivity rules: |${duplicateFields.mkString(", ")} |Given schema: |$originalSchema |After applying case sensitivity rules: |$schema """.stripMargin)
// NOTE(review): the brace below closes an enclosing definition that is not part of this excerpt.
}
Example 63
Source File: CollapseExpandSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.catalyst.analysis.CollapseExpandSuite.SqlLikeCatalystSourceRelation
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.sources.sql.SqlLikeRelation
import org.apache.spark.sql.sources.{BaseRelation, CatalystSource, Table}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.util.PlanComparisonUtils._
import org.apache.spark.sql.{GlobalSapSQLContext, Row}
import org.mockito.Matchers._
import org.mockito.Mockito._
import org.scalatest.FunSuite
import org.scalatest.mock.MockitoSugar

/** Checks that [[CollapseExpand]] turns `Expand` nodes into a single `Project`. */
class CollapseExpandSuite extends FunSuite with MockitoSugar with GlobalSapSQLContext {

  /** Leaf plan without any output, used as the child of the plans under test. */
  case object Leaf extends LeafNode {
    override def output: Seq[Attribute] = Seq.empty
  }

  test("Expansion with a single sequence of projections is correctly collapsed") {
    val projections = Seq(Seq('a.string, Literal(1)))
    val expand = Expand(projections, Seq('a.string, 'gid.int), Leaf)

    val collapsed = CollapseExpand(expand)

    val expected = Project(Seq('a.string, Literal(1) as "gid"), Leaf)
    assertResult(normalizeExprIds(expected))(normalizeExprIds(collapsed))
  }

  test("Expansion with multiple projections is correctly collapsed") {
    val projections = Seq(
      Seq('a.string, Literal(1)),
      Seq('b.string, Literal(1)))
    val expand = Expand(projections, Seq('a.string, 'gid1.int, 'b.string, 'gid2.int), Leaf)

    val collapsed = CollapseExpand(expand)

    val expected = Project(
      Seq('a.string, Literal(1) as "gid1", 'b.string, Literal(1) as "gid2"),
      Leaf)
    assertResult(normalizeExprIds(expected))(normalizeExprIds(collapsed))
  }

  test("Expand pushdown integration") {
    // Mocked relation that accepts Expand plans and returns three rows, two of them equal.
    val relation = mock[SqlLikeCatalystSourceRelation]
    when(relation.supportsLogicalPlan(any[Expand])).thenReturn(true)
    when(relation.isMultiplePartitionExecution(any[Seq[CatalystSource]])).thenReturn(true)
    when(relation.schema).thenReturn(StructType(StructField("foo", StringType) :: Nil))
    when(relation.relationName).thenReturn("t")
    when(relation.logicalPlanToRDD(any[LogicalPlan]))
      .thenReturn(sc.parallelize(Seq(Row("a", 1), Row("b", 1), Row("a", 1))))
    sqlc.baseRelationToDataFrame(relation).registerTempTable("t")

    val distinctCount = sqlc.sql("SELECT COUNT(DISTINCT foo) FROM t")

    val Seq(Row(ct)) = distinctCount.collect().toSeq
    assertResult(2)(ct)
  }
}

object CollapseExpandSuite {
  /** Combination of traits that the pushdown test needs to mock in one go. */
  abstract class SqlLikeCatalystSourceRelation
    extends BaseRelation
    with Table
    with SqlLikeRelation
    with CatalystSource
}
Example 64
Source File: ResolveCountDistinctStarSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.Aggregate
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalatest.FunSuite
import org.scalatest.Inside._
import org.scalatest.mock.MockitoSugar
import org.apache.spark.sql.catalyst.dsl.plans.DslLogicalPlan
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, Count}

import scala.collection.mutable.ArrayBuffer

/** Checks that `COUNT(DISTINCT *)` is expanded to the attributes of the underlying relation. */
class ResolveCountDistinctStarSuite extends FunSuite with MockitoSugar {

  /** Relation with an integer `age` column and a string `name` column. */
  val persons = new LogicalRelation(new BaseRelation {
    override def sqlContext: SQLContext = mock[SQLContext]

    override def schema: StructType =
      StructType(StructField("age", IntegerType) :: StructField("name", StringType) :: Nil)
  })

  test("Count distinct star is resolved correctly") {
    val countDistinctStar =
      AggregateExpression(Count(UnresolvedStar(None) :: Nil), Complete, true)
    val projection = persons.select(UnresolvedAlias(countDistinctStar))

    // The stock analyzer leaves the star inside the count unresolved.
    val partiallyResolved = SimpleAnalyzer.execute(projection)
    val resolvedAggregate = ResolveCountDistinctStar(SimpleAnalyzer).apply(partiallyResolved)

    inside(resolvedAggregate) {
      case Aggregate(
          Nil,
          ArrayBuffer(Alias(AggregateExpression(Count(expressions), Complete, true), _)),
          _) =>
        val countedColumns = expressions.collect { case attr: AttributeReference => attr.name }
        assert(countedColumns.toSet == Set("name", "age"))
    }
    assert(resolvedAggregate.resolved)
  }
}
Example 65
Source File: HiveSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark

import com.sap.spark.{GlobalSparkContext, WithSapHiveContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.SapHiveContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.scalatest.FunSuite

/** Checks that `newSession` yields a separate [[SapHiveContext]] with its own catalog. */
class HiveSuite
  extends FunSuite
  with GlobalSparkContext
  with WithSapHiveContext {

  /** Two string columns, `foo` and `bar`. */
  val schema = StructType(Seq(
    StructField("foo", StringType),
    StructField("bar", StringType)))

  test("NewSession returns a new SapHiveContext") {
    val original = sqlc.asInstanceOf[SapHiveContext]
    val fresh = original.newSession()

    assert(fresh.isInstanceOf[SapHiveContext])
    assert(fresh != original)
  }

  test("NewSession returns a hive context whose catalog is separated to the current one") {
    val fresh = sqlc.newSession()
    val emptyFrame = fresh.createDataFrame(sc.emptyRDD[Row], schema)
    emptyFrame.registerTempTable("foo")

    // The table registered in the new session must not leak into the original one.
    assert(!sqlc.tableNames().contains("foo"))
    assert(fresh.tableNames().contains("foo"))
  }
}
Example 66
Source File: DummyRelationUtils.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.util

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.{ColumnName, Row, SQLContext}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.sources.sql.SqlLikeRelation
import org.apache.spark.sql.types.{StructField, StructType}

  /**
   * Configurable [[CatalystSource]] for tests. Every capability check answers `true`
   * unless a deciding function is supplied, and plans are turned into a
   * `LogicalPlanRDD` unless a conversion function is supplied.
   *
   * @param schema                           schema reported by the relation
   * @param isMultiplePartitionExecutionFunc optional decision for `isMultiplePartitionExecution`
   * @param supportsLogicalPlanFunc          optional decision for `supportsLogicalPlan`
   * @param supportsExpressionFunc           optional decision for `supportsExpression`
   * @param logicalPlanToRDDFunc             optional replacement for the plan-to-RDD conversion
   */
  case class DummyCatalystSourceRelation(
      schema: StructType,
      isMultiplePartitionExecutionFunc: Option[Seq[CatalystSource] => Boolean] = None,
      supportsLogicalPlanFunc: Option[LogicalPlan => Boolean] = None,
      supportsExpressionFunc: Option[Expression => Boolean] = None,
      logicalPlanToRDDFunc: Option[LogicalPlan => RDD[Row]] = None)
     (@transient implicit val sqlContext: SQLContext)
    extends BaseRelation
    with CatalystSource {

    override def isMultiplePartitionExecution(relations: Seq[CatalystSource]): Boolean =
      isMultiplePartitionExecutionFunc.forall(decide => decide(relations))

    override def supportsLogicalPlan(plan: LogicalPlan): Boolean =
      supportsLogicalPlanFunc.forall(decide => decide(plan))

    override def supportsExpression(expr: Expression): Boolean =
      supportsExpressionFunc.forall(decide => decide(expr))

    override def logicalPlanToRDD(plan: LogicalPlan): RDD[Row] =
      logicalPlanToRDDFunc match {
        case Some(convert) => convert(plan)
        case None => new LogicalPlanRDD(plan, sqlContext.sparkContext)
      }
  }
}
Example 67
Source File: ExcelRelation.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel

import scala.collection.JavaConversions._

import org.apache.spark.sql.sources.{ BaseRelation, TableScan }
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.ArrayType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql._
import org.apache.spark.rdd.RDD

import org.apache.hadoop.conf._
import org.apache.hadoop.mapreduce._

import org.apache.commons.logging.LogFactory
import org.apache.commons.logging.Log

import org.zuinnote.hadoop.office.format.common.dao._
import org.zuinnote.hadoop.office.format.mapreduce._

import org.zuinnote.spark.office.excel.util.ExcelFile

  /**
   * Loads the Excel rows through `ExcelFile.load` and turns each of them into one Spark
   * [[Row]] holding one entry per cell. Every entry is a string array filled with the
   * cell's formatted value, comment, formula, address and sheet name, in that order.
   *
   * NOTE(review): the enclosing class is not part of this excerpt; `sqlContext`,
   * `location`, `hadoopParams` and `schema` are presumably its members — confirm.
   *
   * @return an RDD with one row per loaded Excel row
   */
  override def buildScan: RDD[Row] = {
    // read ExcelRows
    val excelRowsRDD = ExcelFile.load(sqlContext, location, hadoopParams)
    // map to schema
    val schemaFields = schema.fields
    excelRowsRDD.flatMap(excelKeyValueTuple => {
      // map the Excel row data structure to a Spark SQL schema
      val rowArray = new Array[Any](excelKeyValueTuple._2.get.length)
      var i = 0;
      // `x` itself is unused; the loop only counts cells and indexes them through `i`.
      for (x <- excelKeyValueTuple._2.get) { // parse through the SpreadSheetCellDAO
        // NOTE(review): sized from the schema but written at fixed indices 0-4 below, so
        // this presumably relies on the schema having at least five fields — confirm.
        val spreadSheetCellDAOStructArray = new Array[String](schemaFields.length)
        val currentSpreadSheetCellDAO: Array[SpreadSheetCellDAO] = excelKeyValueTuple._2.get.asInstanceOf[Array[SpreadSheetCellDAO]]
        // NOTE(review): the cell at `i` is dereferenced without a null check — verify
        // that the loader never yields null entries (e.g. for empty cells).
        spreadSheetCellDAOStructArray(0) = currentSpreadSheetCellDAO(i).getFormattedValue
        spreadSheetCellDAOStructArray(1) = currentSpreadSheetCellDAO(i).getComment
        spreadSheetCellDAOStructArray(2) = currentSpreadSheetCellDAO(i).getFormula
        spreadSheetCellDAOStructArray(3) = currentSpreadSheetCellDAO(i).getAddress
        spreadSheetCellDAOStructArray(4) = currentSpreadSheetCellDAO(i).getSheetName
        // add row representing one Excel row
        rowArray(i) = spreadSheetCellDAOStructArray
        i += 1
      }
      // Always exactly one output row per input row, despite the flatMap.
      Some(Row.fromSeq(rowArray))
    })
  }

}
Example 68
Source File: utils.scala From spark-http-stream with BSD 2-Clause "Simplified" License | 5 votes |
package org.apache.spark.sql.execution.streaming.http

import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.TimestampType
import org.apache.spark.SparkConf
import org.apache.commons.io.IOUtils
import org.apache.spark.serializer.KryoSerializer
import java.io.InputStream
import com.esotericsoftware.kryo.io.Input
import java.io.ByteArrayOutputStream

/** Signals that the argument `name` was given an unacceptable `value`. */
class WrongArgumentException(name: String, value: Any)
  extends RuntimeException(s"wrong argument: $name=$value")

/** Signals that `paramName` is absent from the supplied parameter `map`. */
class MissingRequiredArgumentException(map: Map[String, String], paramName: String)
  extends RuntimeException(s"missing required argument: $paramName, all parameters=$map")

/** Signals that `serializerName` does not name a known serializer. */
class InvalidSerializerNameException(serializerName: String)
  extends RuntimeException(s"invalid serializer name: $serializerName")

object SchemaUtils {
  /**
   * Returns `schema` unchanged, or with a trailing non-nullable timestamp column when
   * `includesTimestamp` is set.
   *
   * @param schema              the base schema
   * @param includesTimestamp   whether to append the timestamp column
   * @param timestampColumnName name of the appended column
   */
  def buildSchema(
      schema: StructType,
      includesTimestamp: Boolean,
      timestampColumnName: String = "_TIMESTAMP_"): StructType =
    if (includesTimestamp) {
      val timestampField = StructField(timestampColumnName, TimestampType, nullable = false)
      StructType(schema.fields.toSeq :+ timestampField)
    } else {
      schema
    }
}

object Params {
  /** Reads one class-tagged object from `bytes` with a fresh Kryo instance. */
  def deserialize(bytes: Array[Byte]): Any = {
    val kryo = kryoSerializer.newKryo()
    val source = new Input()
    source.setBuffer(bytes)
    kryo.readClassAndObject(source)
  }
}
Example 69
Source File: HttpStreamServerClientTest.scala From spark-http-stream with BSD 2-Clause "Simplified" License | 5 votes |
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.streaming.http.HttpStreamClient
import org.junit.Assert
import org.junit.Test
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.BooleanType
import org.apache.spark.sql.types.FloatType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.ByteType
import org.apache.spark.sql.execution.streaming.http.HttpStreamServer
import org.apache.spark.sql.execution.streaming.http.StreamPrinter
import org.apache.spark.sql.execution.streaming.http.HttpStreamServerSideException

/**
 * Round-trip test: starts an HTTP stream server with two topics, then uses a
 * client to fetch schemas, subscribe, send rows and consume them again.
 */
class HttpStreamServerClientTest {
  val ROWS1 = Array(
    Row("hello1", 1, true, 0.1f, 0.1d, 1L, '1'.toByte),
    Row("hello2", 2, false, 0.2f, 0.2d, 2L, '2'.toByte),
    Row("hello3", 3, true, 0.3f, 0.3d, 3L, '3'.toByte))

  val ROWS2 = Array(Row("hello"), Row("world"), Row("bye"), Row("world"))

  @Test
  def testHttpStreamIO(): Unit = {
    // starts a http server
    val server = HttpStreamServer.start("/xxxx", 8080)

    // Everything after the server start runs inside try/finally so the
    // listener on port 8080 is released even when an assertion fails.
    try {
      val spark = SparkSession.builder.appName("testHttpTextSink").master("local[4]")
        .getOrCreate()
      spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/")

      import spark.implicits._

      // add a local message buffer to server, with 2 topics registered
      server.withBuffer()
        .addListener(new StreamPrinter())
        .createTopic[(String, Int, Boolean, Float, Double, Long, Byte)]("topic-1")
        .createTopic[String]("topic-2")

      val client = HttpStreamClient.connect("http://localhost:8080/xxxx")

      // tests schema of topics
      val schema1 = client.fetchSchema("topic-1")
      Assert.assertArrayEquals(
        Array[Object](StringType, IntegerType, BooleanType, FloatType, DoubleType, LongType, ByteType),
        schema1.fields.map(_.dataType).asInstanceOf[Array[Object]])

      val schema2 = client.fetchSchema("topic-2")
      Assert.assertArrayEquals(
        Array[Object](StringType),
        schema2.fields.map(_.dataType).asInstanceOf[Array[Object]])

      // prepare to consume messages
      val sid1 = client.subscribe("topic-1")._1
      val sid2 = client.subscribe("topic-2")._1

      // produces some data
      client.sendRows("topic-1", 1, ROWS1)

      // these subscriptions are created after topic-1 already received data
      val sid4 = client.subscribe("topic-1")._1
      val sid5 = client.subscribe("topic-2")._1

      client.sendRows("topic-2", 1, ROWS2)

      // consumes data
      val fetched = client.fetchStream(sid1).map(_.originalRow)
      Assert.assertArrayEquals(ROWS1.asInstanceOf[Array[Object]], fetched.asInstanceOf[Array[Object]])
      // it is empty now
      Assert.assertArrayEquals(Array[Object](),
        client.fetchStream(sid1).map(_.originalRow).asInstanceOf[Array[Object]])
      Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]],
        client.fetchStream(sid2).map(_.originalRow).asInstanceOf[Array[Object]])
      Assert.assertArrayEquals(Array[Object](),
        client.fetchStream(sid4).map(_.originalRow).asInstanceOf[Array[Object]])
      Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]],
        client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]])
      Assert.assertArrayEquals(Array[Object](),
        client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]])

      client.unsubscribe(sid4)

      // The subscriber id is invalidated, so fetching must fail on the server
      // side. Only that exception type is caught: any other throwable
      // propagates and fails the test on its own.
      val rejected =
        try {
          client.fetchStream(sid4)
          false
        } catch {
          case e: HttpStreamServerSideException =>
            e.printStackTrace()
            true
        }
      Assert.assertTrue("fetching with an unsubscribed id must raise HttpStreamServerSideException", rejected)
    } finally {
      server.stop()
    }
  }
}
Example 70
Source File: VectorSlicerOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.core.types.TensorShape
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.mleap.TypeConverters.sparkToMleapDataShape
import org.apache.spark.sql.types.StructField

/** Bundle serialization support for Spark's [[VectorSlicer]]. */
class VectorSlicerOp extends SimpleSparkOp[VectorSlicer] {
  override val Model: OpModel[SparkBundleContext, VectorSlicer] = new OpModel[SparkBundleContext, VectorSlicer] {
    override val klazz: Class[VectorSlicer] = classOf[VectorSlicer]

    override def opName: String = Bundle.BuiltinOps.feature.vector_slicer

    /**
     * Writes the slicer's indices, its names, the indices resolved from those
     * names, and the input vector size into `model`. Requires a sample dataset
     * in the bundle context.
     */
    override def store(model: Model, obj: VectorSlicer)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz))
      val sample = context.context.dataset.get

      // Names are only resolved against the schema when the slicer has any.
      val namedPairs: Array[(String, Int)] =
        if (obj.getNames.isEmpty) Array()
        else extractNamedIndices(obj.getInputCol, obj.getNames, sample)
      val (names, namedIndices) = namedPairs.unzip

      val inputShape = sparkToMleapDataShape(sample.schema(obj.getInputCol), sample).asInstanceOf[TensorShape]

      model
        .withValue("indices", Value.longList(obj.getIndices.map(_.toLong).toSeq))
        .withValue("names", Value.stringList(names))
        .withValue("named_indices", Value.intList(namedIndices))
        .withValue("input_size", Value.int(inputShape.dimensions.get.head))
    }

    /** Rebuilds a slicer (with an empty uid) from the stored indices and names. */
    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): VectorSlicer = {
      val names = model.value("names").getStringList
      val indices = model.value("indices").getLongList.map(_.toInt).toArray
      new VectorSlicer(uid = "").setIndices(indices).setNames(names.toArray)
    }

    /** Pairs every requested name with its feature index in `inputCol`. */
    private def extractNamedIndices(inputCol: String,
                                    names: Array[String],
                                    dataset: DataFrame): Array[(String, Int)] = {
      val indices = getFeatureIndicesFromNames(dataset.schema(inputCol), names)
      names.zip(indices)
    }

    /**
     * Looks up the index of each named feature in the attribute group of `col`.
     * Fails via `require` when `col` is not a vector column or a name is unknown.
     */
    private def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
      require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
        + s" to be Vector type, but it was type ${col.dataType} instead.")
      val attributes = AttributeGroup.fromStructField(col)
      names.map { featureName =>
        require(attributes.hasAttr(featureName),
          s"getFeatureIndicesFromNames found no feature with name $featureName in column $col.")
        attributes.getAttr(featureName).index.get
      }
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: VectorSlicer): VectorSlicer =
    new VectorSlicer(uid = uid)
      .setIndices(model.getIndices)
      .setNames(model.getNames)

  override def sparkInputs(obj: VectorSlicer): Seq[ParamSpec] = Seq("input" -> obj.inputCol)

  override def sparkOutputs(obj: VectorSlicer): Seq[SimpleParamSpec] = Seq("output" -> obj.outputCol)
}
Example 71
Source File: OneHotEncoderOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import org.apache.spark.ml.attribute.{Attribute, BinaryAttribute, NominalAttribute, NumericAttribute}
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.feature.OneHotEncoderModel
import org.apache.spark.sql.types.StructField

import scala.util.{Failure, Try}

object OneHotEncoderOp {

  /**
   * Derives the number of categories of `field` from its ML attribute metadata.
   *
   * @param field column whose metadata is inspected
   * @return the length of the nominal values, else the nominal `numValues`,
   *         or 2 for a binary attribute
   * @throws RuntimeException for a nominal attribute with neither values nor
   *                          numValues, for a numeric attribute, and for any
   *                          other attribute kind
   */
  def sizeForField(field: StructField): Int =
    Attribute.fromStructField(field) match {
      case nominal: NominalAttribute =>
        // Explicit values win over the declared count, as in the original order of checks.
        nominal.values.map(_.length)
          .orElse(nominal.numValues)
          .getOrElse(throw new RuntimeException(s"invalid nominal value for field ${field.name}"))
      case _: BinaryAttribute =>
        2
      case _: NumericAttribute =>
        throw new RuntimeException(s"invalid numeric attribute for field ${field.name}")
      case _ =>
        // Unknown attribute kinds are rejected rather than guessed at.
        throw new RuntimeException(s"unsupported attribute for field ${field.name}")
    }
}

/** Bundle serialization support for Spark's [[OneHotEncoderModel]]. */
class OneHotEncoderOp extends SimpleSparkOp[OneHotEncoderModel] {
  override val Model: OpModel[SparkBundleContext, OneHotEncoderModel] = new OpModel[SparkBundleContext, OneHotEncoderModel] {
    override val klazz: Class[OneHotEncoderModel] = classOf[OneHotEncoderModel]

    override def opName: String = Bundle.BuiltinOps.feature.one_hot_encoder

    /**
     * Stores the per-input-column category sizes (read from the sample
     * dataset's schema), plus the drop-last and handle-invalid settings.
     */
    override def store(model: Model, obj: OneHotEncoderModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz))

      val df = context.context.dataset.get
      val categorySizes = obj.getInputCols.map { f => OneHotEncoderOp.sizeForField(df.schema(f)) }

      model.withValue("category_sizes", Value.intList(categorySizes))
        .withValue("drop_last", Value.boolean(obj.getDropLast))
        .withValue("handle_invalid", Value.string(obj.getHandleInvalid))
    }

    /** Rebuilds the encoder model (with an empty uid) from the stored values. */
    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): OneHotEncoderModel = {
      new OneHotEncoderModel(uid = "", categorySizes = model.value("category_sizes").getIntList.toArray)
        .setDropLast(model.value("drop_last").getBoolean)
        .setHandleInvalid(model.value("handle_invalid").getString)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: OneHotEncoderModel): OneHotEncoderModel = {
    new OneHotEncoderModel(uid = uid, categorySizes = model.categorySizes)
      .setDropLast(model.getDropLast)
      .setHandleInvalid(model.getHandleInvalid)
  }

  override def sparkInputs(obj: OneHotEncoderModel): Seq[ParamSpec] = Seq(ParamSpec("input", obj.inputCols))

  override def sparkOutputs(obj: OneHotEncoderModel): Seq[ParamSpec] = Seq(ParamSpec("output", obj.outputCols))
}
Example 72
Source File: ReverseStringIndexerOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.core.types.{DataShape, ScalarShape}
import org.apache.spark.ml.attribute.{Attribute, BinaryAttribute, NominalAttribute, NumericAttribute}
import org.apache.spark.ml.bundle._
import org.apache.spark.ml.feature.IndexToString
import org.apache.spark.sql.types.StructField
import ml.combust.mleap.runtime.types.BundleTypeConverters._

import scala.util.{Failure, Try}

object ReverseStringIndexerOp {

  /**
   * Reads the label values of `field` from its ML attribute metadata.
   *
   * @param field column whose metadata is inspected
   * @return the values of the nominal attribute attached to `field`
   * @throws RuntimeException when the attribute is nominal without values,
   *                          binary, numeric, or of any other kind
   */
  def labelsForField(field: StructField): Array[String] =
    Attribute.fromStructField(field) match {
      case nominal: NominalAttribute =>
        nominal.values.getOrElse(
          throw new RuntimeException(s"invalid nominal value for field ${field.name}"))
      case _: BinaryAttribute =>
        throw new RuntimeException(s"invalid binary attribute for field ${field.name}")
      case _: NumericAttribute =>
        throw new RuntimeException(s"invalid numeric attribute for field ${field.name}")
      case _ =>
        // Unknown attribute kinds are rejected rather than guessed at.
        throw new RuntimeException(s"unsupported attribute for field ${field.name}")
    }
}

/** Bundle serialization support for Spark's [[IndexToString]]. */
class ReverseStringIndexerOp extends SimpleSparkOp[IndexToString] {
  override val Model: OpModel[SparkBundleContext, IndexToString] = new OpModel[SparkBundleContext, IndexToString] {
    override val klazz: Class[IndexToString] = classOf[IndexToString]

    override def opName: String = Bundle.BuiltinOps.feature.reverse_string_indexer

    /**
     * Stores the labels and a scalar input shape. Labels come from the
     * transformer's `labels` param when set; otherwise they are read from the
     * input column metadata of the sample dataset, which must then be present.
     */
    override def store(model: Model, obj: IndexToString)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val labels = obj.get(obj.labels).getOrElse {
        assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz))
        val df = context.context.dataset.get
        ReverseStringIndexerOp.labelsForField(df.schema(obj.getInputCol))
      }

      model.withValue("labels", Value.stringList(labels)).
        withValue("input_shape", Value.dataShape(ScalarShape(false)))
    }

    /**
     * Rebuilds the transformer (with an empty uid) from the stored labels.
     * When an `input_shape` value is present it must be scalar.
     */
    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): IndexToString = {
      model.getValue("input_shape").map(_.getDataShape: DataShape).foreach { shape =>
        require(shape.isScalar, "cannot deserialize non-scalar input to Spark IndexToString model")
      }

      new IndexToString(uid = "").setLabels(model.value("labels").getStringList.toArray)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: IndexToString): IndexToString = {
    new IndexToString(uid = uid).setLabels(model.getLabels)
  }

  override def sparkInputs(obj: IndexToString): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: IndexToString): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
}
Example 73
Source File: MathUnary.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation} import org.apache.hadoop.fs.Path import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType} import org.apache.spark.sql.functions.udf private val className = classOf[MathUnary].getName override def load(path: String): MathUnary = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath).select("operation").head() val operation = data.getAs[String](0) val model = MathUnaryModel(UnaryOperation.forName(operation)) val transformer = new MathUnary(metadata.uid, model) metadata.getAndSetParams(transformer) transformer } } }
Example 74
Source File: WrappersSpec.scala From sparksql-scalapb with Apache License 2.0 | 5 votes |
package scalapb.spark

import com.example.protos.wrappers._
import org.apache.spark.sql.SparkSession
import org.apache.hadoop.io.ArrayPrimitiveWritable
import scalapb.GeneratedMessageCompanion
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.ArrayType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.Row
import org.scalatest.BeforeAndAfterAll
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.must.Matchers

/**
 * Checks the DataFrame schema and content produced for `PrimitiveWrappers`
 * messages, once with the primitive-wrapper implicits and once with the
 * default implicits.
 */
class WrappersSpec extends AnyFlatSpec with Matchers with BeforeAndAfterAll {
  val spark: SparkSession = SparkSession
    .builder()
    .appName("ScalaPB Demo")
    .master("local[2]")
    .getOrCreate()

  import spark.implicits.StringToColumn

  // One message with both optional wrappers set and one with both unset.
  val data = Seq(
    PrimitiveWrappers(
      intValue = Option(45),
      stringValue = Option("boo"),
      ints = Seq(17, 19, 25),
      strings = Seq("foo", "bar")
    ),
    PrimitiveWrappers(
      intValue = None,
      stringValue = None,
      ints = Seq(17, 19, 25),
      strings = Seq("foo", "bar")
    )
  )

  "converting df with primitive wrappers" should "work with primitive implicits" in {
    import ProtoSQL.withPrimitiveWrappers.implicits._

    val frame = ProtoSQL.withPrimitiveWrappers.createDataFrame(spark, data)

    // Wrappers are expected to surface as plain primitive columns.
    val expectedTypes = Seq(
      IntegerType,
      StringType,
      ArrayType(IntegerType, false),
      ArrayType(StringType, false)
    )
    val expectedRows = Seq(
      Row(45, "boo", Seq(17, 19, 25), Seq("foo", "bar")),
      Row(null, null, Seq(17, 19, 25), Seq("foo", "bar"))
    )

    frame.schema.fields.map(_.dataType).toSeq must be(expectedTypes)
    frame.collect must contain theSameElementsAs (expectedRows)
  }

  "converting df with primitive wrappers" should "work with default implicits" in {
    import ProtoSQL.implicits._

    val frame = ProtoSQL.createDataFrame(spark, data)

    // Wrappers are expected to surface as single-field structs named "value".
    val intWrapper = StructType(Seq(StructField("value", IntegerType, true)))
    val stringWrapper = StructType(Seq(StructField("value", StringType, true)))
    val expectedTypes = Seq(
      intWrapper,
      stringWrapper,
      ArrayType(intWrapper, false),
      ArrayType(stringWrapper, false)
    )
    val expectedRows = Seq(
      Row(
        Row(45),
        Row("boo"),
        Seq(Row(17), Row(19), Row(25)),
        Seq(Row("foo"), Row("bar"))
      ),
      Row(
        null,
        null,
        Seq(Row(17), Row(19), Row(25)),
        Seq(Row("foo"), Row("bar"))
      )
    )

    frame.schema.fields.map(_.dataType).toSeq must be(expectedTypes)
    frame.collect must contain theSameElementsAs (expectedRows)
  }
}
Example 75
Source File: KustoSourceTests.scala From azure-kusto-spark with Apache License 2.0 | 5 votes |
package com.microsoft.kusto.spark

import com.microsoft.kusto.spark.datasource.KustoSourceOptions
import com.microsoft.kusto.spark.utils.{KustoDataSourceUtils => KDSU}
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.junit.runner.RunWith
import org.scalamock.scalatest.MockFactory
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers}

/** Verifies that the Kusto data source resolves and honours a custom column schema. */
@RunWith(classOf[JUnitRunner])
class KustoSourceTests extends FlatSpec with MockFactory with Matchers with BeforeAndAfterAll {
  // Optional override of the connector's log level, taken from -DlogLevel.
  private val loggingLevel: Option[String] = Option(System.getProperty("logLevel"))
  loggingLevel.foreach(level => KDSU.setLoggingLevel(level))

  private val nofExecutors = 4
  private val spark: SparkSession = SparkSession.builder()
    .appName("KustoSource")
    .master(f"local[$nofExecutors]")
    .getOrCreate()

  private var sc: SparkContext = _
  private var sqlContext: SQLContext = _

  private val cluster: String = "KustoCluster"
  private val database: String = "KustoDatabase"
  private val query: String = "KustoTable"
  private val appId: String = "KustoSinkTestApplication"
  private val appKey: String = "KustoSinkTestKey"
  private val appAuthorityId: String = "KustoSinkAuthorityId"

  override def beforeAll(): Unit = {
    super.beforeAll()

    sc = spark.sparkContext
    sqlContext = spark.sqlContext
  }

  override def afterAll(): Unit = {
    super.afterAll()

    sc.stop()
  }

  "KustoDataSource" should "recognize Kusto and get the correct schema" in {
    val session: SparkSession = SparkSession.builder()
      .appName("KustoSource")
      .master(f"local[$nofExecutors]")
      .getOrCreate()

    val customSchema = "colA STRING, colB INT"

    val reader = session.sqlContext
      .read
      .format("com.microsoft.kusto.spark.datasource")
      .option(KustoSourceOptions.KUSTO_CLUSTER, cluster)
      .option(KustoSourceOptions.KUSTO_DATABASE, database)
      .option(KustoSourceOptions.KUSTO_QUERY, query)
      .option(KustoSourceOptions.KUSTO_AAD_APP_ID, appId)
      .option(KustoSourceOptions.KUSTO_AAD_APP_SECRET, appKey)
      .option(KustoSourceOptions.KUSTO_AAD_AUTHORITY_ID, appAuthorityId)
      .option(KustoSourceOptions.KUSTO_CUSTOM_DATAFRAME_COLUMN_TYPES, customSchema)
    val df = reader.load("src/test/resources/")

    val expected = StructType(Array(
      StructField("colA", StringType, nullable = true),
      StructField("colB", IntegerType, nullable = true)))
    assert(df.schema.equals(expected))
  }
}
Example 76
Source File: DataFrameModifierHelper.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.writer

import com.stratio.sparta.sdk.pipeline.autoCalculations.AutoCalculatedField
import com.stratio.sparta.sdk.pipeline.output.Output
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{Column, DataFrame}

object DataFrameModifierHelper {

  /**
   * Applies every auto-calculated field to `dataFrame`, in sequence order,
   * feeding the result of one step into the next.
   */
  def applyAutoCalculateFields(dataFrame: DataFrame,
                               autoCalculateFields: Seq[AutoCalculatedField],
                               auxSchema: StructType): DataFrame =
    autoCalculateFields.foldLeft(dataFrame) { (current, autoField) =>
      addColumnToDataFrame(current, autoField, auxSchema)
    }

  /**
   * Adds the single column described by `autoCalculateField`. The first
   * defined source wins, in this order: not-null fields, primary-key fields,
   * explicit fields, fixed value. With no source defined the frame is
   * returned untouched.
   */
  private[driver] def addColumnToDataFrame(dataFrame: DataFrame,
                                           autoCalculateField: AutoCalculatedField,
                                           auxSchema: StructType): DataFrame = {
    // Columns of the frame (with metadata taken from auxSchema) that satisfy `keep`.
    def columnsWhere(keep: StructField => Boolean): Seq[Column] =
      fieldsWithAuxMetadata(dataFrame.schema.fields, auxSchema.fields)
        .filter(keep)
        .map(field => col(field.name))
        .toSeq

    autoCalculateField.fromNotNullFields.map { fromNotNullFields =>
      addField(fromNotNullFields.field.name, fromNotNullFields.field.outputType, dataFrame,
        columnsWhere(field => !field.nullable))
    }.orElse(autoCalculateField.fromPkFields.map { fromPkFields =>
      addField(fromPkFields.field.name, fromPkFields.field.outputType, dataFrame,
        columnsWhere(_.metadata.contains(Output.PrimaryKeyMetadataKey)))
    }).orElse(autoCalculateField.fromFields.map { fromFields =>
      addField(fromFields.field.name, fromFields.field.outputType, dataFrame,
        fromFields.fromFields.map(name => col(name)))
    }).orElse(autoCalculateField.fromFixedValue.map { fromFixedValue =>
      addLiteral(fromFixedValue.field.name, fromFixedValue.field.outputType, dataFrame, fromFixedValue.value)
    }).getOrElse(dataFrame)
  }

  /**
   * Adds column `name` combining `fields` according to `outputType`
   * ("string", "array" or "map"); any other type leaves the frame unchanged.
   */
  private[driver] def addField(name: String, outputType: String, dataFrame: DataFrame, fields: Seq[Column]): DataFrame = {
    val combined: Option[Column] = outputType match {
      case "string" => Some(concat_ws(Output.Separator, fields: _*))
      case "array" => Some(array(fields: _*))
      case "map" => Some(struct(fields: _*))
      case _ => None
    }
    combined.fold(dataFrame)(column => dataFrame.withColumn(name, column))
  }

  /**
   * Adds column `name` holding `literal` shaped according to `outputType`
   * ("string", "array" or "map"); any other type leaves the frame unchanged.
   */
  private[driver] def addLiteral(name: String, outputType: String, dataFrame: DataFrame, literal: String): DataFrame = {
    val shaped: Option[Column] = outputType match {
      case "string" => Some(lit(literal))
      case "array" => Some(array(lit(literal)))
      case "map" => Some(struct(lit(literal)))
      case _ => None
    }
    shaped.fold(dataFrame)(column => dataFrame.withColumn(name, column))
  }

  /**
   * Returns `dataFrameFields` where each field that has a same-named entry in
   * `auxFields` carries that entry's metadata; other fields are kept as is.
   */
  private[driver] def fieldsWithAuxMetadata(dataFrameFields: Array[StructField], auxFields: Array[StructField]) =
    dataFrameFields.map { field =>
      auxFields.find(_.name == field.name)
        .fold(field)(auxFound => field.copy(metadata = auxFound.metadata))
    }
}
Example 77
Source File: RawDataWriterHelper.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.writer import com.stratio.sparta.driver.factory.SparkContextFactory import com.stratio.sparta.driver.step.RawData import com.stratio.sparta.sdk.pipeline.output.Output import com.stratio.sparta.sdk.utils.AggregationTime import org.apache.spark.sql.Row import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType} import org.apache.spark.streaming.dstream.DStream object RawDataWriterHelper { def writeRawData(rawData: RawData, outputs: Seq[Output], input: DStream[Row]): Unit = { val RawSchema = StructType(Seq( StructField(rawData.timeField, TimestampType, nullable = false), StructField(rawData.dataField, StringType, nullable = true))) val eventTime = AggregationTime.millisToTimeStamp(System.currentTimeMillis()) input.map(row => Row.merge(Row(eventTime), row)) .foreachRDD(rdd => { if (!rdd.isEmpty()) { val rawDataFrame = SparkContextFactory.sparkSessionInstance.createDataFrame(rdd, RawSchema) WriterHelper.write(rawDataFrame, rawData.writerOptions, Map.empty[String, String], outputs) } }) } }
Example 78
Source File: CubeMakerTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.test.cube

import java.sql.Timestamp

import com.github.nscala_time.time.Imports._
import com.stratio.sparta.driver.step.{Cube, CubeOperations, Trigger}
import com.stratio.sparta.driver.writer.WriterOptions
import com.stratio.sparta.plugin.default.DefaultField
import com.stratio.sparta.plugin.cube.field.datetime.DateTimeField
import com.stratio.sparta.plugin.cube.operator.count.CountOperator
import com.stratio.sparta.sdk.pipeline.aggregation.cube.{Dimension, DimensionValue, DimensionValuesTime, InputFields}
import com.stratio.sparta.sdk.pipeline.schema.TypeOp
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType, TimestampType}
import org.apache.spark.streaming.TestSuiteBase
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class CubeMakerTest extends TestSuiteBase {

  val PreserverOrder = false

  /**
   * Builds the expected cube output for three events ("value1".."value3"):
   * one batch whose entries pair the dimension values (string dimension plus
   * the minute dimension at `timestamp`) with the matching input fields.
   *
   * @param timestamp value used for the time dimension and inside each input row
   * @param millis    not used by this helper
   */
  def getEventOutput(timestamp: Timestamp, millis: Long): Seq[Seq[(DimensionValuesTime, InputFields)]] = {
    val stringDimension = Dimension("dim1", "eventKey", "identity", new DefaultField)
    val timeDimension = Dimension("minute", "minute", "minute", new DateTimeField)
    val timeDimensionValue = DimensionValue(timeDimension, timestamp)

    val batch = Seq("value1", "value2", "value3").map { eventValue =>
      val dimensionValues = Seq(DimensionValue(stringDimension, eventValue), timeDimensionValue)
      (DimensionValuesTime("cubeName", dimensionValues), InputFields(Row(eventValue, timestamp), 1))
    }

    Seq(batch)
  }
}
Example 79
Source File: Parser.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.sdk.pipeline.transformation

import java.io.{Serializable => JSerializable}

import com.stratio.sparta.sdk.pipeline.schema.TypeOp
import com.stratio.sparta.sdk.properties.{CustomProperties, Parameterizable}
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructField, StructType}

import scala.util.{Failure, Success, Try}

/**
 * Base class for row transformations. Concrete parsers implement `parse`;
 * this class provides ordering, error policy and row-assembly helpers.
 *
 * @param order        position of this parser; parsers compare by it
 * @param inputField   name of the field this parser reads, if any
 * @param outputFields names of the fields this parser produces
 * @param schema       schema used to resolve input and output fields
 * @param properties   configuration, merged with the custom properties
 */
abstract class Parser(order: Integer,
                      inputField: Option[String],
                      outputFields: Seq[String],
                      schema: StructType,
                      properties: Map[String, JSerializable])
  extends Parameterizable(properties) with Ordered[Parser] with CustomProperties {

  val customKey = "transformationOptions"
  val customPropertyKey = "transformationOptionsKey"
  val customPropertyValue = "transformationOptionsValue"
  val propertiesWithCustom = properties ++ getCustomProperties

  /** Schema fields whose names are listed in `outputFields`. */
  val outputFieldsSchema = schema.fields.filter(field => outputFields.contains(field.name))

  /** Whether the input field is dropped from the row; false when unset or unreadable. */
  val inputFieldRemoved = Try(propertiesWithCustom.getBoolean("removeInputField")).getOrElse(false)

  /** Index of the input field in `schema`; 0 when absent or not found. */
  val inputFieldIndex = inputField.flatMap(field => Try(schema.fieldIndex(field)).toOption).getOrElse(0)

  /** Error policy; falls back to `WhenError.Error` when unset or unrecognised. */
  val whenErrorDo = Try(WhenError.withName(propertiesWithCustom.getString("whenError")))
    .getOrElse(WhenError.Error)

  def parse(data: Row): Seq[Row]

  def getOrder: Integer = order

  /** Keeps only the entries of `keyMap` whose key is one of the output fields. */
  def checkFields(keyMap: Map[String, JSerializable]): Map[String, JSerializable] =
    keyMap.filter { case (fieldName, _) => outputFields.contains(fieldName) }

  def compare(that: Parser): Int = this.getOrder.compareTo(that.getOrder)

  //scalastyle:off
  /** Returns null under the `WhenError.Null` policy, otherwise rethrows `exception`. */
  def returnWhenError(exception: Exception): Null =
    whenErrorDo match {
      case WhenError.Null => null
      case _ => throw exception
    }

  //scalastyle:on

  /**
   * Converts `inputValue` to the data type of `outSchema`. On conversion
   * failure the outcome is decided by `returnWhenError`.
   */
  def parseToOutputType(outSchema: StructField, inputValue: Any): Any =
    Try(TypeOp.transformValueByTypeOp(outSchema.dataType, inputValue))
      .getOrElse(returnWhenError(new IllegalStateException(
        // Plain interpolation renders a null input as "null"; calling
        // toString on it would throw and hide the configured error policy.
        s"Error parsing to output type the value: $inputValue")))

  /**
   * Appends the parsed values to `prevData` as one row. On failure, yields no
   * rows under the `WhenError.Discard` policy and rethrows otherwise.
   */
  def returnData(newData: Try[Seq[_]], prevData: Seq[_]): Seq[Row] =
    newData match {
      case Success(data) => Seq(Row.fromSeq(prevData ++ data))
      case Failure(e) => whenErrorDo match {
        case WhenError.Discard => Seq.empty[Row]
        case _ => throw e
      }
    }

  /**
   * Merges the parsed row after `prevData`. On failure, yields no rows under
   * the `WhenError.Discard` policy and rethrows otherwise.
   */
  def returnData(newData: Try[Row], prevData: Row): Seq[Row] =
    newData match {
      case Success(data) => Seq(Row.merge(prevData, data))
      case Failure(e) => whenErrorDo match {
        case WhenError.Discard => Seq.empty[Row]
        case _ => throw e
      }
    }

  /** Returns `row` without the element at `inputFieldIndex`; shorter rows are returned as is. */
  def removeIndex(row: Seq[_], inputFieldIndex: Int): Seq[_] =
    if (row.size < inputFieldIndex) row
    else row.take(inputFieldIndex) ++ row.drop(inputFieldIndex + 1)

  /** Row values, minus the input field when removal is enabled and an input field is set. */
  def removeInputField(row: Row): Seq[_] = {
    if (inputFieldRemoved && inputField.isDefined) removeIndex(row.toSeq, inputFieldIndex)
    else row.toSeq
  }

}

object Parser {

  final val ClassSuffix = "Parser"
  final val DefaultOutputType = "string"
  final val TypesFromParserClass = Map("datetime" -> "timestamp")
}
Example 80
Source File: ParserTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.sdk.pipeline.transformation

import java.io.{Serializable => JSerializable}

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/** Exercises the behaviour inherited from `Parser` through a `ParserMock` instance. */
@RunWith(classOf[JUnitRunner])
class ParserTest extends WordSpec with Matchers {

  "Parser" should {

    // Parser under test: order 1, input field "input", single output field "output".
    val parserTest = new ParserMock(
      1,
      Some("input"),
      Seq("output"),
      StructType(Seq(StructField("some", StringType))),
      Map()
    )

    "Order must be " in {
      val expected = 1
      parserTest.getOrder should be(expected)
    }

    "Parse must be " in {
      val event = Row("value")
      val expected = Seq(event)
      parserTest.parse(event) should be(expected)
    }

    "checked fields not be contained in outputs must be " in {
      val keyMap = Map("field" -> "value")
      val expected = Map()
      parserTest.checkFields(keyMap) should be(expected)
    }

    "checked fields are contained in outputs must be " in {
      val keyMap = Map("output" -> "value")
      parserTest.checkFields(keyMap) should be(keyMap)
    }

    "classSuffix must be " in {
      val expected = "Parser"
      Parser.ClassSuffix should be(expected)
    }
  }
}
Example 81
Source File: RedisOutput.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.output.redis

import java.io.Serializable

import com.stratio.sparta.plugin.output.redis.dao.AbstractRedisDAO
import com.stratio.sparta.sdk.pipeline.output.Output._
import com.stratio.sparta.sdk.pipeline.output.{Output, SaveModeEnum}
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row}

/**
 * Output that stores each row as a Redis hash: the hash key is built from the
 * primary-key columns and every measure column becomes one hash field.
 */
class RedisOutput(name: String, properties: Map[String, Serializable])
  extends Output(name, properties) with AbstractRedisDAO with Serializable {

  override val hostname = properties.getString("hostname", DefaultRedisHostname)
  override val port = properties.getString("port", DefaultRedisPort).toInt

  /** Validates the save mode, then issues one `hset` per measure of every row. */
  override def save(dataFrame: DataFrame, saveMode: SaveModeEnum.Value, options: Map[String, String]): Unit = {
    val tableName = getTableNameFromOptions(options)
    val schema = dataFrame.schema

    validateSaveMode(saveMode)

    dataFrame.foreachPartition { partitionRows =>
      partitionRows.foreach { row =>
        val namedValues = getValuesList(row, schema.fieldNames)
        val hashKey = getHashKeyFromRow(namedValues, schema)
        for ((measure, value) <- getMeasuresFromRow(namedValues, schema))
          hset(hashKey, measure.name, value)
      }
    }
  }

  /**
   * Joins "name<IdSeparator>value" for every value whose column is flagged
   * with the primary-key metadata, using IdSeparator between the parts.
   */
  def getHashKeyFromRow(valuesList: Seq[(String, String)], schema: StructType): String = {
    val primaryKeyNames = schema.fields
      .filter(_.metadata.contains(Output.PrimaryKeyMetadataKey))
      .map(_.name)
      .toSet

    valuesList
      .collect { case (key, value) if primaryKeyNames.contains(key) => s"$key$IdSeparator$value" }
      .mkString(IdSeparator)
  }

  /** Pairs each value whose column is flagged with the measure metadata with that column's field. */
  def getMeasuresFromRow(valuesList: Seq[(String, String)], schema: StructType): Seq[(StructField, String)] =
    for {
      (key, value) <- valuesList
      measureField <- schema.fields.find(field =>
        field.metadata.contains(Output.MeasureMetadataKey) && field.name == key)
    } yield (measureField, value)

  /**
   * Zips the field names with the row values rendered as strings.
   *
   * NOTE(review): `toString` is called on every value, so a null column value
   * throws here; confirm whether upstream guarantees non-null values.
   */
  def getValuesList(row: Row, fieldNames: Array[String]): Seq[(String, String)] =
    fieldNames.zip(row.toSeq).map { case (fieldName, value) => fieldName -> value.toString }.toSeq
}
Example 82
Source File: KiteMorphlineImpl.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.transformation.morphline

import com.typesafe.config.Config
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructField
import org.kitesdk.morphline.api.{Command, MorphlineContext, Record}
import org.kitesdk.morphline.base.Compiler

/**
 * Runs a Kite morphline compiled from `config` and exposes the outcome as a Spark `Row`.
 *
 * Both the compiled morphline and the collector at the end of its chain are
 * held in thread-locals, so each thread works with its own pair.
 */
case class KiteMorphlineImpl(config: Config, outputFieldsSchema: Array[StructField]) {

  private val morphlineContext: MorphlineContext = new MorphlineContext.Builder().build()

  private val collector: ThreadLocal[EventCollector] = new ThreadLocal[EventCollector]() {
    override def initialValue(): EventCollector = newCollector()
  }

  private val morphline: ThreadLocal[Command] = new ThreadLocal[Command]() {
    // The pipeline is compiled against the calling thread's own collector.
    override def initialValue(): Command = compileFor(collector.get())
  }

  /** Creates the collector that receives the records emitted by the morphline. */
  private def newCollector(): EventCollector = new EventCollector(outputFieldsSchema)

  /** Compiles the morphline so that its final command is `sink`. */
  private def compileFor(sink: EventCollector): Command =
    new Compiler().compile(config, morphlineContext, sink)

  /** Feeds `inputRecord` through the morphline and returns the row left in the collector. */
  def process(inputRecord: Record): Row = {
    val sink = collector.get()
    sink.reset()
    val pipeline = morphline.get()
    pipeline.process(inputRecord)
    sink.row
  }
}
Example 83
Source File: EventCollector.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.transformation.morphline

import com.stratio.sparta.sdk.pipeline.schema.TypeOp
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructField
import org.kitesdk.morphline.api.{Command, Record}

/**
 * Morphline command that turns the record it receives into a Spark `Row`
 * laid out according to `outputFieldsSchema`.
 *
 * @param outputFieldsSchema fields, in output order, that are read from each record
 */
class EventCollector(outputFieldsSchema: Array[StructField]) extends Command {

  /** Row produced by the last successful `process` call; `Row.empty` after `reset`. */
  @volatile var row: Row = Row.empty

  override def notify(p1: Record): Unit = {}

  /** Discards the previously collected row. */
  def reset(): Unit = row = Row.empty

  //scalastyle:off
  override def getParent: Command = null
  //scalastyle:on

  /**
   * Converts `recordProcess` into `row`.
   *
   * @return `true` when a row was built, `false` when the record is `null`
   *         (in which case `row` is reset to `Row.empty`)
   * @throws IllegalStateException when a field of the schema has no value in the record
   */
  override def process(recordProcess: Record): Boolean =
    Option(recordProcess) match {
      case Some(record) =>
        // `row` is only assigned once every field has been converted.
        row = toRow(record)
        true
      case None =>
        row = Row.empty
        false
    }

  /** Reads the first value of every schema field from `record` and converts it to the field's type. */
  private def toRow(record: Record): Row =
    Row.fromSeq(outputFieldsSchema.map { field =>
      Option(record.getFirstValue(field.name))
        .map(value => TypeOp.transformValueByTypeOp(field.dataType, value: Any))
        .getOrElse(throw new IllegalStateException(s"Impossible to parse field: ${field.name}."))
    }.toList)
}
Example 84
Source File: MorphlinesParser.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.transformation.morphline

import java.io.{ByteArrayInputStream, Serializable => JSerializable}
import java.util.concurrent.ConcurrentHashMap

import com.stratio.sparta.sdk.pipeline.transformation.Parser
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructField, StructType}
import org.kitesdk.morphline.api.Record

import scala.collection.JavaConverters._
import scala.util.Try

/**
 * Parser that runs the value of the input field through the Kite morphline
 * given in the `morphline` property.
 *
 * `String` and `Array[Byte]` inputs are supported; an empty, null or
 * differently typed input is reported through `returnWhenError`.
 */
class MorphlinesParser(order: Integer,
                       inputField: Option[String],
                       outputFields: Seq[String],
                       schema: StructType,
                       properties: Map[String, JSerializable])
  extends Parser(order, inputField, outputFields, schema, properties) {

  assert(inputField.isDefined, "It's necessary to define one inputField in the Morphline Transformation")

  private val config: String = properties.getString("morphline")

  /** Parses the input field of `row` with the morphline and hands the outcome to `returnData`. */
  override def parse(row: Row): Seq[Row] = {
    val inputValue = Option(row.get(inputFieldIndex))
    val newData = Try {
      inputValue match {
        case Some(s: String) =>
          if (s.isEmpty)
            returnWhenError(new IllegalStateException(s"Impossible to parse because value is empty"))
          else parseWithMorphline(new ByteArrayInputStream(s.getBytes("UTF-8")))
        case Some(b: Array[Byte]) =>
          if (b.length == 0)
            returnWhenError(new IllegalStateException(s"Impossible to parse because value is empty"))
          else parseWithMorphline(new ByteArrayInputStream(b))
        case _ =>
          returnWhenError(new IllegalStateException(s"Impossible to parse because value is empty"))
      }
    }
    returnData(newData, removeInputFieldMorphline(row))
  }

  /** Returns `row` without the element at `inputFieldIndex`; rows shorter than the index are returned as-is. */
  private def removeIndex(row: Row, inputFieldIndex: Int): Row =
    if (row.size < inputFieldIndex) row
    else Row.fromSeq(row.toSeq.take(inputFieldIndex) ++ row.toSeq.drop(inputFieldIndex + 1))

  /** Drops the input field from `row` when `inputFieldRemoved` is set. */
  private def removeInputFieldMorphline(row: Row): Row =
    if (inputFieldRemoved && inputField.isDefined) removeIndex(row, inputFieldIndex)
    else row

  /** Wraps `value` in a record under the input field name and runs the shared morphline on it. */
  private def parseWithMorphline(value: ByteArrayInputStream): Row = {
    val record = new Record()
    record.put(inputField.get, value)
    MorphlinesParser(order, config, outputFieldsSchema).process(record)
  }
}

object MorphlinesParser {

  // Cache of compiled morphlines keyed by the raw configuration text.
  // NOTE(review): the key ignores `outputFieldsSchema`, so two parsers sharing a
  // configuration but not a schema get the same instance — confirm this is intended.
  private val instances = new ConcurrentHashMap[String, KiteMorphlineImpl].asScala

  /**
   * Returns the cached morphline for `config`, creating it on first use.
   *
   * When several threads miss the cache at the same time, all of them receive
   * the single instance that ended up stored, never a private duplicate.
   *
   * @param order              not used by the lookup; kept for interface compatibility
   * @param config             morphline configuration text, also used as cache key
   * @param outputFieldsSchema schema handed to a newly created instance
   */
  def apply(order: Integer, config: String, outputFieldsSchema: Array[StructField]): KiteMorphlineImpl =
    instances.get(config) match {
      case Some(cached) => cached
      case None =>
        val created = KiteMorphlineImpl(ConfigFactory.parseString(config), outputFieldsSchema)
        // putIfAbsent yields the value stored by a concurrent winner, if any.
        instances.putIfAbsent(config, created).getOrElse(created)
    }
}
Example 85
Source File: CassandraOutputTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.output.cassandra

import java.io.{Serializable => JSerializable}

import com.datastax.spark.connector.cql.CassandraConnector
import com.stratio.sparta.sdk._
import com.stratio.sparta.sdk.properties.JsoneyString
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.mock.MockitoSugar
import org.scalatest.{FlatSpec, Matchers}

/** Checks how `CassandraOutput.getSparkConfiguration` maps output properties to Spark settings. */
@RunWith(classOf[JUnitRunner])
class CassandraOutputTest extends FlatSpec with Matchers with MockitoSugar with AnswerSugar {

  val s = "sum"
  val properties = Map("connectionHost" -> "127.0.0.1", "connectionPort" -> "9042")

  // JSON list of two Spark properties used by the tests below.
  private val cassandraPropertiesJson =
    """[{"sparkPropertyKey":"spark.cassandra.input.fetch.size_in_rows","sparkPropertyValue":"2000"},""" +
      """{"sparkPropertyKey":"spark.cassandra.input.split.size_in_mb","sparkPropertyValue":"64"}]"""

  private val fetchSizeEntry = "spark.cassandra.input.fetch.size_in_rows" -> "2000"
  private val splitSizeEntry = "spark.cassandra.input.split.size_in_mb" -> "64"
  private val unrelatedEntry = "anotherProperty" -> "true"

  "getSparkConfiguration" should "return a Seq with the configuration" in {
    val connection = Map("connectionHost" -> "127.0.0.1", "connectionPort" -> "9042")
    val expected = List(
      "spark.cassandra.connection.host" -> "127.0.0.1",
      "spark.cassandra.connection.port" -> "9042"
    )

    val cass = CassandraOutput.getSparkConfiguration(connection)

    cass shouldBe expected
  }

  "getSparkConfiguration" should "return all cassandra-spark config" in {
    val config: Map[String, JSerializable] = Map(
      "sparkProperties" -> JsoneyString(cassandraPropertiesJson),
      "anotherProperty" -> "true"
    )

    val sparkConfig = CassandraOutput.getSparkConfiguration(config)

    sparkConfig.exists(entry => entry == fetchSizeEntry) shouldBe true
    sparkConfig.exists(entry => entry == splitSizeEntry) shouldBe true
    sparkConfig.exists(entry => entry == unrelatedEntry) shouldBe false
  }

  "getSparkConfiguration" should "not return cassandra-spark config" in {
    // Same payload, but under a key other than "sparkProperties".
    val config: Map[String, JSerializable] = Map(
      "hadoopProperties" -> JsoneyString(cassandraPropertiesJson),
      "anotherProperty" -> "true"
    )

    val sparkConfig = CassandraOutput.getSparkConfiguration(config)

    sparkConfig.exists(entry => entry == fetchSizeEntry) shouldBe false
    sparkConfig.exists(entry => entry == splitSizeEntry) shouldBe false
    sparkConfig.exists(entry => entry == unrelatedEntry) shouldBe false
  }
}
Example 86
Source File: LastValueOperatorTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.lastValue

import java.util.Date

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/** Exercises the map, reduce and associative phases of `LastValueOperator`. */
@RunWith(classOf[JUnitRunner])
class LastValueOperatorTest extends WordSpec with Matchers {

  "LastValue operator" should {

    // Schema with the three integer columns used by most cases.
    val initSchema = StructType(
      Seq("field1", "field2", "field3").map(name => StructField(name, IntegerType, false)))

    // Schema that lacks "field1".
    val initSchemaFail = StructType(Seq(StructField("field2", IntegerType, false)))

    "processMap must be " in {
      val sampleRow = Row(1, 2)
      val lessThanTwo = """[{"field":"field1", "type": "<", "value":2}]"""
      val greaterThanTwo = """[{"field":"field1", "type": ">", "value":"2"}]"""
      val bothLessThanTwo =
        """[{"field":"field1", "type": "<", "value":"2"},""" +
          """{"field":"field2", "type": "<", "value":"2"}]"""

      val withoutInputField = new LastValueOperator("lastValue", initSchema, Map())
      withoutInputField.processMap(sampleRow) shouldBe None

      val missingInSchema = new LastValueOperator("lastValue", initSchemaFail, Map("inputField" -> "field1"))
      missingInSchema.processMap(sampleRow) shouldBe None

      val plain = new LastValueOperator("lastValue", initSchema, Map("inputField" -> "field1"))
      plain.processMap(sampleRow) shouldBe Some(1)

      val passingFilter = new LastValueOperator("lastValue", initSchema,
        Map("inputField" -> "field1", "filters" -> lessThanTwo))
      passingFilter.processMap(sampleRow) shouldBe Some(1L)

      val rejectingFilter = new LastValueOperator("lastValue", initSchema,
        Map("inputField" -> "field1", "filters" -> greaterThanTwo))
      rejectingFilter.processMap(sampleRow) shouldBe None

      val twoFilters = new LastValueOperator("lastValue", initSchema,
        Map("inputField" -> "field1", "filters" -> bothLessThanTwo))
      twoFilters.processMap(sampleRow) shouldBe None
    }

    "processReduce must be " in {
      val onEmpty = new LastValueOperator("lastValue", initSchema, Map())
      onEmpty.processReduce(Seq()) shouldBe None

      val onNumbers = new LastValueOperator("lastValue", initSchema, Map())
      onNumbers.processReduce(Seq(Some(1), Some(2))) shouldBe Some(2)

      val onStrings = new LastValueOperator("lastValue", initSchema, Map())
      onStrings.processReduce(Seq(Some("a"), Some("b"))) shouldBe Some("b")
    }

    "associative process must be " in {
      val untyped = new LastValueOperator("lastValue", initSchema, Map())
      val oldNewAndEmpty = Seq(
        (Operator.OldValuesKey, Some(1L)),
        (Operator.NewValuesKey, Some(1L)),
        (Operator.NewValuesKey, None))
      untyped.associativity(oldNewAndEmpty) shouldBe Some(1L)

      val asInt = new LastValueOperator("lastValue", initSchema, Map("typeOp" -> "int"))
      val oldAndNewLongs = Seq((Operator.OldValuesKey, Some(1L)), (Operator.NewValuesKey, Some(1L)))
      asInt.associativity(oldAndNewLongs) shouldBe Some(1)

      val nullTypeOp = new LastValueOperator("lastValue", initSchema, Map("typeOp" -> null))
      val oldAndNewInts = Seq((Operator.OldValuesKey, Some(1)), (Operator.NewValuesKey, Some(2)))
      nullTypeOp.associativity(oldAndNewInts) shouldBe Some(2)

      val onNothing = new LastValueOperator("lastValue", initSchema, Map())
      val noValues = Seq()
      onNothing.associativity(noValues) shouldBe None

      val onDate = new LastValueOperator("lastValue", initSchema, Map())
      val date = new Date()
      val onlyNewDate = Seq((Operator.NewValuesKey, Some(date)))
      onDate.associativity(onlyNewDate) shouldBe Some(date)
    }
  }
}
Example 87
Source File: StddevOperatorTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.stddev

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/** Exercises the map and reduce phases of `StddevOperator`, with and without `distinct`. */
@RunWith(classOf[JUnitRunner])
class StddevOperatorTest extends WordSpec with Matchers {

  "Std dev operator" should {

    // Schema with the three integer columns used by most cases.
    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    // Schema that lacks "field1".
    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new StddevOperator("stdev", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new StddevOperator("stdev", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new StddevOperator("stdev", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      // Exercise the operator created for this case (previously inputFields3 was reused by mistake).
      val inputFields4 = new StddevOperator("stdev", initSchema, Map("inputField" -> "field1"))
      inputFields4.processMap(Row("1", 2)) should be(Some(1))

      val inputFields6 = new StddevOperator("stdev", initSchema, Map("inputField" -> "field1"))
      inputFields6.processMap(Row(1.5, 2)) should be(Some(1.5))

      val inputFields7 = new StddevOperator("stdev", initSchema, Map("inputField" -> "field1"))
      inputFields7.processMap(Row(5L, 2)) should be(Some(5L))

      val inputFields8 = new StddevOperator("stdev", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields8.processMap(Row(1, 2)) should be(Some(1L))

      val inputFields9 = new StddevOperator("stdev", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"))
      inputFields9.processMap(Row(1, 2)) should be(None)

      val inputFields10 = new StddevOperator("stdev", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"
        }))
      inputFields10.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new StddevOperator("stdev", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new StddevOperator("stdev", initSchema, Map())
      inputFields2.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) should
        be(Some(2.8284271247461903))

      val inputFields3 = new StddevOperator("stdev", initSchema, Map())
      inputFields3.processReduce(Seq(Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) should
        be(Some(2.850438562747845))

      val inputFields4 = new StddevOperator("stdev", initSchema, Map())
      inputFields4.processReduce(Seq(None)) should be(Some(0d))

      val inputFields5 = new StddevOperator("stdev", initSchema, Map("typeOp" -> "string"))
      inputFields5.processReduce(
        Seq(Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) should be(Some("2.850438562747845"))
    }

    "processReduce distinct must be " in {
      val inputFields = new StddevOperator("stdev", initSchema, Map("distinct" -> "true"))
      inputFields.processReduce(Seq()) should be(Some(0d))

      // NOTE(review): the expected value equals the non-distinct result for the same
      // input even though 7 is duplicated — confirm against StddevOperator.
      val inputFields2 = new StddevOperator("stdev", initSchema, Map("distinct" -> "true"))
      inputFields2.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) should
        be(Some(2.8284271247461903))

      val inputFields3 = new StddevOperator("stdev", initSchema, Map("distinct" -> "true"))
      inputFields3.processReduce(Seq(Some(1), Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) should
        be(Some(2.850438562747845))

      val inputFields4 = new StddevOperator("stdev", initSchema, Map("distinct" -> "true"))
      inputFields4.processReduce(Seq(None)) should be(Some(0d))

      val inputFields5 = new StddevOperator("stdev", initSchema, Map("typeOp" -> "string", "distinct" -> "true"))
      inputFields5.processReduce(
        Seq(Some(1), Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) should be(Some("2.850438562747845"))
    }
  }
}
Example 88
Source File: MedianOperatorTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.median

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/** Exercises the map and reduce phases of `MedianOperator`, with and without `distinct`. */
@RunWith(classOf[JUnitRunner])
class MedianOperatorTest extends WordSpec with Matchers {

  "Median operator" should {

    // Schema with the three integer columns used by most cases.
    val initSchema = StructType(
      Seq("field1", "field2", "field3").map(name => StructField(name, IntegerType, false)))

    // Schema that lacks "field1".
    val initSchemaFail = StructType(Seq(StructField("field2", IntegerType, false)))

    "processMap must be " in {
      val sampleRow = Row(1, 2)
      val lessThanTwo = """[{"field":"field1", "type": "<", "value":2}]"""
      val greaterThanTwo = """[{"field":"field1", "type": ">", "value":"2"}]"""
      val bothLessThanTwo =
        """[{"field":"field1", "type": "<", "value":"2"},""" +
          """{"field":"field2", "type": "<", "value":"2"}]"""

      val withoutInputField = new MedianOperator("median", initSchema, Map())
      withoutInputField.processMap(sampleRow) shouldBe None

      val missingInSchema = new MedianOperator("median", initSchemaFail, Map("inputField" -> "field1"))
      missingInSchema.processMap(sampleRow) shouldBe None

      val onInt = new MedianOperator("median", initSchema, Map("inputField" -> "field1"))
      onInt.processMap(sampleRow) shouldBe Some(1)

      val onNumericString = new MedianOperator("median", initSchema, Map("inputField" -> "field1"))
      onNumericString.processMap(Row("1", 2)) shouldBe Some(1)

      val onDouble = new MedianOperator("median", initSchema, Map("inputField" -> "field1"))
      onDouble.processMap(Row(1.5, 2)) shouldBe Some(1.5)

      val onLong = new MedianOperator("median", initSchema, Map("inputField" -> "field1"))
      onLong.processMap(Row(5L, 2)) shouldBe Some(5L)

      val passingFilter = new MedianOperator("median", initSchema,
        Map("inputField" -> "field1", "filters" -> lessThanTwo))
      passingFilter.processMap(sampleRow) shouldBe Some(1L)

      val rejectingFilter = new MedianOperator("median", initSchema,
        Map("inputField" -> "field1", "filters" -> greaterThanTwo))
      rejectingFilter.processMap(sampleRow) shouldBe None

      val twoFilters = new MedianOperator("median", initSchema,
        Map("inputField" -> "field1", "filters" -> bothLessThanTwo))
      twoFilters.processMap(sampleRow) shouldBe None
    }

    "processReduce must be " in {
      val onEmpty = new MedianOperator("median", initSchema, Map())
      onEmpty.processReduce(Seq()) shouldBe Some(0d)

      val onInts = new MedianOperator("median", initSchema, Map())
      onInts.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) shouldBe Some(3d)

      val onMixed = new MedianOperator("median", initSchema, Map())
      onMixed.processReduce(Seq(Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) shouldBe Some(3)

      val onNone = new MedianOperator("median", initSchema, Map())
      onNone.processReduce(Seq(None)) shouldBe Some(0d)

      val asString = new MedianOperator("median", initSchema, Map("typeOp" -> "string"))
      asString.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) shouldBe Some("3.0")
    }

    "processReduce distinct must be " in {
      val onEmpty = new MedianOperator("median", initSchema, Map("distinct" -> "true"))
      onEmpty.processReduce(Seq()) shouldBe Some(0d)

      val onInts = new MedianOperator("median", initSchema, Map("distinct" -> "true"))
      onInts.processReduce(Seq(Some(1), Some(1), Some(2), Some(3), Some(7), Some(7))) shouldBe Some(2.5)

      val onMixed = new MedianOperator("median", initSchema, Map("distinct" -> "true"))
      onMixed.processReduce(Seq(Some(1), Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) shouldBe Some(3)

      val onNone = new MedianOperator("median", initSchema, Map("distinct" -> "true"))
      onNone.processReduce(Seq(None)) shouldBe Some(0d)

      val asString = new MedianOperator("median", initSchema, Map("typeOp" -> "string", "distinct" -> "true"))
      asString.processReduce(Seq(Some(1), Some(1), Some(2), Some(3), Some(7), Some(7))) shouldBe Some("2.5")
    }
  }
}
Example 89
Source File: ModeOperatorTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.mode

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/** Exercises the map and reduce phases of `ModeOperator`. */
@RunWith(classOf[JUnitRunner])
class ModeOperatorTest extends WordSpec with Matchers {

  "Mode operator" should {

    // Schema with the three integer columns used by most cases.
    val initSchema = StructType(
      Seq("field1", "field2", "field3").map(name => StructField(name, IntegerType, false)))

    // Schema that lacks "field1".
    val initSchemaFail = StructType(Seq(StructField("field2", IntegerType, false)))

    "processMap must be " in {
      val sampleRow = Row(1, 2)
      val lessThanTwo = """[{"field":"field1", "type": "<", "value":2}]"""
      val greaterThanTwo = """[{"field":"field1", "type": ">", "value":"2"}]"""
      val bothLessThanTwo =
        """[{"field":"field1", "type": "<", "value":"2"},""" +
          """{"field":"field2", "type": "<", "value":"2"}]"""

      val withoutInputField = new ModeOperator("mode", initSchema, Map())
      withoutInputField.processMap(sampleRow) shouldBe None

      val missingInSchema = new ModeOperator("mode", initSchemaFail, Map("inputField" -> "field1"))
      missingInSchema.processMap(sampleRow) shouldBe None

      val plain = new ModeOperator("mode", initSchema, Map("inputField" -> "field1"))
      plain.processMap(sampleRow) shouldBe Some(1)

      val passingFilter = new ModeOperator("mode", initSchema,
        Map("inputField" -> "field1", "filters" -> lessThanTwo))
      passingFilter.processMap(sampleRow) shouldBe Some(1L)

      val rejectingFilter = new ModeOperator("mode", initSchema,
        Map("inputField" -> "field1", "filters" -> greaterThanTwo))
      rejectingFilter.processMap(sampleRow) shouldBe None

      val twoFilters = new ModeOperator("mode", initSchema,
        Map("inputField" -> "field1", "filters" -> bothLessThanTwo))
      twoFilters.processMap(sampleRow) shouldBe None
    }

    "processReduce must be " in {
      val onEmpty = new ModeOperator("mode", initSchema, Map())
      onEmpty.processReduce(Seq()) shouldBe Some(List())

      val onWords = new ModeOperator("mode", initSchema, Map())
      onWords.processReduce(Seq(Some("hey"), Some("hey"), Some("hi"))) shouldBe Some(List("hey"))

      val singleWinner = new ModeOperator("mode", initSchema, Map())
      singleWinner.processReduce(Seq(Some("1"), Some("1"), Some("4"))) shouldBe Some(List("1"))

      val laterWinner = new ModeOperator("mode", initSchema, Map())
      val mostlyFours = Seq(Some("1"), Some("1"), Some("4"), Some("4"), Some("4"), Some("4"))
      laterWinner.processReduce(mostlyFours) shouldBe Some(List("4"))

      val threeWayTie = new ModeOperator("mode", initSchema, Map())
      val tiedPairs = Seq(Some("1"), Some("1"), Some("2"), Some("2"), Some("4"), Some("4"))
      threeWayTie.processReduce(tiedPairs) shouldBe Some(List("1", "2", "4"))

      val tieWithOutlier = new ModeOperator("mode", initSchema, Map())
      val tiedPairsAndFive = Seq(Some("1"), Some("1"), Some("2"), Some("2"), Some("4"), Some("4"), Some("5"))
      tieWithOutlier.processReduce(tiedPairsAndFive) shouldBe Some(List("1", "2", "4"))
    }
  }
}
Example 90
Source File: RangeOperatorTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.range

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/** Exercises the map and reduce phases of `RangeOperator`, with and without `distinct`. */
@RunWith(classOf[JUnitRunner])
class RangeOperatorTest extends WordSpec with Matchers {

  "Range operator" should {

    // Schema with the three integer columns used by most cases.
    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    // Schema that lacks "field1".
    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      val inputField = new RangeOperator("range", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      val inputFields2 = new RangeOperator("range", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new RangeOperator("range", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      // Exercise the operator created for this case (previously inputFields3 was reused by mistake).
      val inputFields4 = new RangeOperator("range", initSchema, Map("inputField" -> "field1"))
      inputFields4.processMap(Row("1", 2)) should be(Some(1))

      val inputFields6 = new RangeOperator("range", initSchema, Map("inputField" -> "field1"))
      inputFields6.processMap(Row(1.5, 2)) should be(Some(1.5))

      val inputFields7 = new RangeOperator("range", initSchema, Map("inputField" -> "field1"))
      inputFields7.processMap(Row(5L, 2)) should be(Some(5L))

      val inputFields8 = new RangeOperator("range", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields8.processMap(Row(1, 2)) should be(Some(1L))

      val inputFields9 = new RangeOperator("range", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"))
      inputFields9.processMap(Row(1, 2)) should be(None)

      val inputFields10 = new RangeOperator("range", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"
        }))
      inputFields10.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new RangeOperator("range", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new RangeOperator("range", initSchema, Map())
      inputFields2.processReduce(Seq(Some(1), Some(1))) should be(Some(0))

      val inputFields3 = new RangeOperator("range", initSchema, Map())
      inputFields3.processReduce(Seq(Some(1), Some(2), Some(4))) should be(Some(3))

      val inputFields4 = new RangeOperator("range", initSchema, Map())
      inputFields4.processReduce(Seq(None)) should be(Some(0d))

      val inputFields5 = new RangeOperator("range", initSchema, Map("typeOp" -> "string"))
      inputFields5.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) should be(Some("6.0"))
    }

    "processReduce distinct must be " in {
      val inputFields = new RangeOperator("range", initSchema, Map("distinct" -> "true"))
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new RangeOperator("range", initSchema, Map("distinct" -> "true"))
      inputFields2.processReduce(Seq(Some(1), Some(1))) should be(Some(0))

      val inputFields3 = new RangeOperator("range", initSchema, Map("distinct" -> "true"))
      inputFields3.processReduce(Seq(Some(1), Some(2), Some(4))) should be(Some(3))

      val inputFields4 = new RangeOperator("range", initSchema, Map("distinct" -> "true"))
      inputFields4.processReduce(Seq(None)) should be(Some(0d))

      val inputFields5 = new RangeOperator("range", initSchema, Map("typeOp" -> "string", "distinct" -> "true"))
      inputFields5.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) should be(Some("6.0"))
    }
  }
}
Example 91
Source File: AccumulatorOperatorTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.accumulator

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/** Exercises the map, reduce and associative phases of `AccumulatorOperator`. */
@RunWith(classOf[JUnitRunner])
class AccumulatorOperatorTest extends WordSpec with Matchers {

  "Accumulator operator" should {

    // Schema with the two integer columns used by most cases.
    val initSchema = StructType(
      Seq("field1", "field2").map(name => StructField(name, IntegerType, false)))

    // Schema that lacks "field1".
    val initSchemaFail = StructType(Seq(StructField("field2", IntegerType, false)))

    "processMap must be " in {
      val sampleRow = Row(1, 2)
      val lessThanTwo = """[{"field":"field1", "type": "<", "value":2}]"""
      val greaterThanTwo = """[{"field":"field1", "type": ">", "value":2}]"""
      val bothLessThanTwo =
        """[{"field":"field1", "type": "<", "value":2},""" +
          """{"field":"field2", "type": "<", "value":2}]"""

      val withoutInputField = new AccumulatorOperator("accumulator", initSchema, Map())
      withoutInputField.processMap(sampleRow) shouldBe None

      val missingInSchema = new AccumulatorOperator("accumulator", initSchemaFail, Map("inputField" -> "field1"))
      missingInSchema.processMap(sampleRow) shouldBe None

      val plain = new AccumulatorOperator("accumulator", initSchema, Map("inputField" -> "field1"))
      plain.processMap(sampleRow) shouldBe Some(1)

      val passingFilter = new AccumulatorOperator("accumulator", initSchema,
        Map("inputField" -> "field1", "filters" -> lessThanTwo))
      passingFilter.processMap(sampleRow) shouldBe Some(1L)

      val rejectingFilter = new AccumulatorOperator("accumulator", initSchema,
        Map("inputField" -> "field1", "filters" -> greaterThanTwo))
      rejectingFilter.processMap(sampleRow) shouldBe None

      val twoFilters = new AccumulatorOperator("accumulator", initSchema,
        Map("inputField" -> "field1", "filters" -> bothLessThanTwo))
      twoFilters.processMap(sampleRow) shouldBe None
    }

    "processReduce must be " in {
      val onEmpty = new AccumulatorOperator("accumulator", initSchema, Map())
      onEmpty.processReduce(Seq()) shouldBe Some(Seq())

      val onNumbers = new AccumulatorOperator("accumulator", initSchema, Map())
      onNumbers.processReduce(Seq(Some(1), Some(1))) shouldBe Some(Seq("1", "1"))

      val onStrings = new AccumulatorOperator("accumulator", initSchema, Map())
      onStrings.processReduce(Seq(Some("a"), Some("b"))) shouldBe Some(Seq("a", "b"))
    }

    "associative process must be " in {
      val untyped = new AccumulatorOperator("accumulator", initSchema, Map())
      val oldNewAndEmpty = Seq(
        (Operator.OldValuesKey, Some(Seq(1L))),
        (Operator.NewValuesKey, Some(Seq(2L))),
        (Operator.NewValuesKey, None))
      untyped.associativity(oldNewAndEmpty) shouldBe Some(Seq("1", "2"))

      val asDoubles = new AccumulatorOperator("accumulator", initSchema, Map("typeOp" -> "arraydouble"))
      val oneAndThree = Seq((Operator.OldValuesKey, Some(Seq(1))), (Operator.NewValuesKey, Some(Seq(3))))
      asDoubles.associativity(oneAndThree) shouldBe Some(Seq(1d, 3d))

      val nullTypeOp = new AccumulatorOperator("accumulator", initSchema, Map("typeOp" -> null))
      val oneAndOne = Seq((Operator.OldValuesKey, Some(Seq(1))), (Operator.NewValuesKey, Some(Seq(1))))
      nullTypeOp.associativity(oneAndOne) shouldBe Some(Seq("1", "1"))
    }
  }
}
Example 92
Source File: FirstValueOperatorTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.firstValue

import java.util.Date

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/**
 * Spec for `FirstValueOperator`: checks `processMap`, `processReduce` and
 * `associativity` on small literal inputs.
 */
@RunWith(classOf[JUnitRunner])
class FirstValueOperatorTest extends WordSpec with Matchers {

  "FirstValue operator" should {

    // Schema containing the column the operator is pointed at ("field1").
    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, nullable = false),
      StructField("field2", IntegerType, nullable = false),
      StructField("field3", IntegerType, nullable = false)
    ))

    // Schema lacking "field1".
    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, nullable = false)
    ))

    "processMap must be " in {
      val sampleRow = Row(1, 2)
      val field1Below2 = "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"
      val field1Above2 = "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"
      val bothBelow2 =
        "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
          "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"

      // No inputField property configured.
      val withoutInputField = new FirstValueOperator("firstValue", initSchema, Map())
      withoutInputField.processMap(sampleRow) shouldBe None

      // inputField configured but absent from the schema.
      val fieldNotInSchema =
        new FirstValueOperator("firstValue", initSchemaFail, Map("inputField" -> "field1"))
      fieldNotInSchema.processMap(sampleRow) shouldBe None

      val plainField =
        new FirstValueOperator("firstValue", initSchema, Map("inputField" -> "field1"))
      plainField.processMap(sampleRow) shouldBe Some(1)

      // Filter that the sample row satisfies.
      val passingFilter = new FirstValueOperator("firstValue", initSchema,
        Map("inputField" -> "field1", "filters" -> field1Below2))
      passingFilter.processMap(sampleRow) shouldBe Some(1L)

      // Filter that the sample row does not satisfy.
      val rejectingFilter = new FirstValueOperator("firstValue", initSchema,
        Map("inputField" -> "field1", "filters" -> field1Above2))
      rejectingFilter.processMap(sampleRow) shouldBe None

      // Two filters, the second of which the sample row does not satisfy.
      val twoFilters = new FirstValueOperator("firstValue", initSchema,
        Map("inputField" -> "field1", "filters" -> bothBelow2))
      twoFilters.processMap(sampleRow) shouldBe None
    }

    "processReduce must be " in {
      val onEmpty = new FirstValueOperator("firstValue", initSchema, Map())
      onEmpty.processReduce(Seq()) shouldBe None

      // The first element of the input is expected back.
      val onInts = new FirstValueOperator("firstValue", initSchema, Map())
      onInts.processReduce(Seq(Some(1), Some(2))) shouldBe Some(1)

      val onStrings = new FirstValueOperator("firstValue", initSchema, Map())
      onStrings.processReduce(Seq(Some("a"), Some("b"))) shouldBe Some("a")
    }

    "associative process must be " in {
      val defaultOp = new FirstValueOperator("firstValue", initSchema, Map())
      val oldNewAndEmpty = Seq(
        Operator.OldValuesKey -> Some(1L),
        Operator.NewValuesKey -> Some(1L),
        Operator.NewValuesKey -> None
      )
      defaultOp.associativity(oldNewAndEmpty) shouldBe Some(1L)

      // typeOp = int.
      val asInt = new FirstValueOperator("firstValue", initSchema, Map("typeOp" -> "int"))
      val oldAndNew = Seq(
        Operator.OldValuesKey -> Some(1L),
        Operator.NewValuesKey -> Some(1L)
      )
      asInt.associativity(oldAndNew) shouldBe Some(1)

      // typeOp explicitly null.
      val nullTypeOp = new FirstValueOperator("firstValue", initSchema, Map("typeOp" -> null))
      val intValues = Seq(
        Operator.OldValuesKey -> Some(1),
        Operator.NewValuesKey -> Some(1),
        Operator.NewValuesKey -> None
      )
      nullTypeOp.associativity(intValues) shouldBe Some(1)

      // No values at all.
      val onNothing = new FirstValueOperator("firstValue", initSchema, Map())
      val noValues = Seq()
      onNothing.associativity(noValues) shouldBe None

      // A non-numeric value (a Date) is expected back unchanged.
      val now = new Date()
      val onDate = new FirstValueOperator("firstValue", initSchema, Map())
      val onlyNewDate = Seq(Operator.NewValuesKey -> Some(now))
      onDate.associativity(onlyNewDate) shouldBe Some(now)
    }
  }
}
Example 93
Source File: MeanAssociativeOperatorTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.mean

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/**
 * Spec for `MeanAssociativeOperator`: covers `processMap`, `processReduce`
 * (with and without the "distinct" property) and `associativity`.
 */
@RunWith(classOf[JUnitRunner])
class MeanAssociativeOperatorTest extends WordSpec with Matchers {

  "Mean operator" should {

    // Schema containing the column the operator is pointed at ("field1").
    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, nullable = false),
      StructField("field2", IntegerType, nullable = false),
      StructField("field3", IntegerType, nullable = false)
    ))

    // Schema lacking "field1".
    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, nullable = false)
    ))

    "processMap must be " in {
      val field1Below2 = "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"
      val field1Above2 = "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"
      val bothBelow2 =
        "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
          "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"

      // No inputField property configured.
      val withoutInputField = new MeanAssociativeOperator("avg", initSchema, Map())
      withoutInputField.processMap(Row(1, 2)) shouldBe None

      // inputField configured but absent from the schema.
      val fieldNotInSchema =
        new MeanAssociativeOperator("avg", initSchemaFail, Map("inputField" -> "field1"))
      fieldNotInSchema.processMap(Row(1, 2)) shouldBe None

      // Int, numeric string, Double and Long inputs in turn.
      val onInt = new MeanAssociativeOperator("avg", initSchema, Map("inputField" -> "field1"))
      onInt.processMap(Row(1, 2)) shouldBe Some(1)

      val onNumericString =
        new MeanAssociativeOperator("avg", initSchema, Map("inputField" -> "field1"))
      onNumericString.processMap(Row("1", 2)) shouldBe Some(1)

      val onDouble = new MeanAssociativeOperator("avg", initSchema, Map("inputField" -> "field1"))
      onDouble.processMap(Row(1.5, 2)) shouldBe Some(1.5)

      val onLong = new MeanAssociativeOperator("avg", initSchema, Map("inputField" -> "field1"))
      onLong.processMap(Row(5L, 2)) shouldBe Some(5L)

      // Filter that the row satisfies.
      val passingFilter = new MeanAssociativeOperator("avg", initSchema,
        Map("inputField" -> "field1", "filters" -> field1Below2))
      passingFilter.processMap(Row(1, 2)) shouldBe Some(1L)

      // Filter that the row does not satisfy.
      val rejectingFilter = new MeanAssociativeOperator("avg", initSchema,
        Map("inputField" -> "field1", "filters" -> field1Above2))
      rejectingFilter.processMap(Row(1, 2)) shouldBe None

      // Two filters, the second of which the row does not satisfy.
      val twoFilters = new MeanAssociativeOperator("avg", initSchema,
        Map("inputField" -> "field1", "filters" -> bothBelow2))
      twoFilters.processMap(Row(1, 2)) shouldBe None
    }

    "processReduce must be " in {
      val onEmpty = new MeanAssociativeOperator("avg", initSchema, Map())
      onEmpty.processReduce(Seq()) shouldBe Some(List())

      // None entries are expected to be dropped; values come back as doubles.
      val onRepeated = new MeanAssociativeOperator("avg", initSchema, Map())
      onRepeated.processReduce(Seq(Some(1), Some(1), None)) shouldBe Some(List(1.0, 1.0))

      val onThree = new MeanAssociativeOperator("avg", initSchema, Map())
      onThree.processReduce(Seq(Some(1), Some(2), Some(3), None)) shouldBe
        Some(List(1.0, 2.0, 3.0))

      val onOnlyNone = new MeanAssociativeOperator("avg", initSchema, Map())
      onOnlyNone.processReduce(Seq(None)) shouldBe Some(List())
    }

    "processReduce distinct must be " in {
      val onEmpty = new MeanAssociativeOperator("avg", initSchema, Map("distinct" -> "true"))
      onEmpty.processReduce(Seq()) shouldBe Some(List())

      // Duplicates are expected to collapse when "distinct" is "true".
      val onRepeated = new MeanAssociativeOperator("avg", initSchema, Map("distinct" -> "true"))
      onRepeated.processReduce(Seq(Some(1), Some(1), None)) shouldBe Some(List(1.0))

      val onMixed = new MeanAssociativeOperator("avg", initSchema, Map("distinct" -> "true"))
      onMixed.processReduce(Seq(Some(1), Some(3), Some(1), None)) shouldBe Some(List(1.0, 3.0))

      val onOnlyNone = new MeanAssociativeOperator("avg", initSchema, Map("distinct" -> "true"))
      onOnlyNone.processReduce(Seq(None)) shouldBe Some(List())
    }

    "associative process must be " in {
      // Old aggregate with no new values: expected back unchanged.
      val keepOld = new MeanAssociativeOperator("avg", initSchema, Map())
      val onlyOld = Seq(
        Operator.OldValuesKey -> Some(Map("count" -> 1d, "sum" -> 2d, "mean" -> 2d)),
        Operator.NewValuesKey -> None
      )
      keepOld.associativity(onlyOld) shouldBe
        Some(Map("count" -> 1.0, "sum" -> 2.0, "mean" -> 2.0))

      // Old aggregate plus one new value of 1.0.
      val mergeNew = new MeanAssociativeOperator("avg", initSchema, Map())
      val oldPlusOne = Seq(
        Operator.OldValuesKey -> Some(Map("count" -> 1d, "sum" -> 2d, "mean" -> 2d)),
        Operator.NewValuesKey -> Some(Seq(1d))
      )
      mergeNew.associativity(oldPlusOne) shouldBe
        Some(Map("sum" -> 3.0, "count" -> 2.0, "mean" -> 1.5))
    }
  }
}
Example 94
Source File: MeanOperatorTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.mean

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/**
 * Spec for `MeanOperator`: covers `processMap` and `processReduce`, the latter
 * both with and without the "distinct" property.
 */
@RunWith(classOf[JUnitRunner])
class MeanOperatorTest extends WordSpec with Matchers {

  "Mean operator" should {

    // Schema containing the column the operator is pointed at ("field1").
    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, nullable = false),
      StructField("field2", IntegerType, nullable = false),
      StructField("field3", IntegerType, nullable = false)
    ))

    // Schema lacking "field1".
    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, nullable = false)
    ))

    "processMap must be " in {
      val field1Below2 = "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"
      val field1Above2 = "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"
      val bothBelow2 =
        "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
          "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"

      // No inputField property configured.
      val withoutInputField = new MeanOperator("avg", initSchema, Map())
      withoutInputField.processMap(Row(1, 2)) shouldBe None

      // inputField configured but absent from the schema.
      val fieldNotInSchema = new MeanOperator("avg", initSchemaFail, Map("inputField" -> "field1"))
      fieldNotInSchema.processMap(Row(1, 2)) shouldBe None

      // Int, numeric string, Double and Long inputs in turn.
      val onInt = new MeanOperator("avg", initSchema, Map("inputField" -> "field1"))
      onInt.processMap(Row(1, 2)) shouldBe Some(1)

      val onNumericString = new MeanOperator("avg", initSchema, Map("inputField" -> "field1"))
      onNumericString.processMap(Row("1", 2)) shouldBe Some(1)

      val onDouble = new MeanOperator("avg", initSchema, Map("inputField" -> "field1"))
      onDouble.processMap(Row(1.5, 2)) shouldBe Some(1.5)

      val onLong = new MeanOperator("avg", initSchema, Map("inputField" -> "field1"))
      onLong.processMap(Row(5L, 2)) shouldBe Some(5L)

      // Filter that the row satisfies.
      val passingFilter = new MeanOperator("avg", initSchema,
        Map("inputField" -> "field1", "filters" -> field1Below2))
      passingFilter.processMap(Row(1, 2)) shouldBe Some(1L)

      // Filter that the row does not satisfy.
      val rejectingFilter = new MeanOperator("avg", initSchema,
        Map("inputField" -> "field1", "filters" -> field1Above2))
      rejectingFilter.processMap(Row(1, 2)) shouldBe None

      // Two filters, the second of which the row does not satisfy.
      val twoFilters = new MeanOperator("avg", initSchema,
        Map("inputField" -> "field1", "filters" -> bothBelow2))
      twoFilters.processMap(Row(1, 2)) shouldBe None
    }

    "processReduce must be " in {
      val onEmpty = new MeanOperator("avg", initSchema, Map())
      onEmpty.processReduce(Seq()) shouldBe Some(0d)

      val onRepeated = new MeanOperator("avg", initSchema, Map())
      onRepeated.processReduce(Seq(Some(1), Some(1), None)) shouldBe Some(1)

      val onThree = new MeanOperator("avg", initSchema, Map())
      onThree.processReduce(Seq(Some(1), Some(2), Some(3), None)) shouldBe Some(2)

      val onOnlyNone = new MeanOperator("avg", initSchema, Map())
      onOnlyNone.processReduce(Seq(None)) shouldBe Some(0d)

      // typeOp = string: the mean is expected as its string form.
      val asString = new MeanOperator("avg", initSchema, Map("typeOp" -> "string"))
      asString.processReduce(Seq(Some(1), Some(1))) shouldBe Some("1.0")
    }

    "processReduce distinct must be " in {
      val onEmpty = new MeanOperator("avg", initSchema, Map("distinct" -> "true"))
      onEmpty.processReduce(Seq()) shouldBe Some(0d)

      val onRepeated = new MeanOperator("avg", initSchema, Map("distinct" -> "true"))
      onRepeated.processReduce(Seq(Some(1), Some(1), None)) shouldBe Some(1)

      // With "distinct" the duplicate 1 is expected to count once: mean of 1 and 3.
      val onMixed = new MeanOperator("avg", initSchema, Map("distinct" -> "true"))
      onMixed.processReduce(Seq(Some(1), Some(3), Some(1), None)) shouldBe Some(2)

      val onOnlyNone = new MeanOperator("avg", initSchema, Map("distinct" -> "true"))
      onOnlyNone.processReduce(Seq(None)) shouldBe Some(0d)

      val asString =
        new MeanOperator("avg", initSchema, Map("typeOp" -> "string", "distinct" -> "true"))
      asString.processReduce(Seq(Some(1), Some(1))) shouldBe Some("1.0")
    }
  }
}
Example 95
Source File: OperatorEntityCountTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.entityCount

import java.io.{Serializable => JSerializable}

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/**
 * Spec for the `processMap` behaviour exposed through `OperatorEntityCountMock`,
 * configured to read "inputField" and split it on ",".
 */
@RunWith(classOf[JUnitRunner])
class OperatorEntityCountTest extends WordSpec with Matchers {

  "EntityCount" should {

    // Typed as JSerializable values up front instead of casting each entry.
    val props: Map[String, JSerializable] = Map(
      "inputField" -> "inputField",
      "split" -> ","
    )
    val schema = StructType(Seq(StructField("inputField", StringType)))
    val entityCount = new OperatorEntityCountMock("op1", schema, props)
    val inputFields = Row("hello,bye")

    "Return the associated precision name" in {
      // The comma-separated input is expected back as its individual parts.
      entityCount.processMap(inputFields) shouldBe Some(Seq("hello", "bye"))
    }

    "Return empty list" in {
      // An empty row is expected to yield no value.
      entityCount.processMap(Row()) shouldBe None
    }
  }
}
Example 96
Source File: EntityCountOperatorTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.entityCount

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/**
 * Spec for `EntityCountOperator`: covers `processMap` (including the "split" and
 * "filters" properties), `processReduce` and `associativity`.
 */
@RunWith(classOf[JUnitRunner])
class EntityCountOperatorTest extends WordSpec with Matchers {

  "Entity Count Operator" should {

    // Schema containing the column the operator is pointed at ("field1").
    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, nullable = false),
      StructField("field2", IntegerType, nullable = false),
      StructField("field3", IntegerType, nullable = false)
    ))

    // Schema lacking "field1".
    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, nullable = false)
    ))

    "processMap must be " in {
      val notHola = "[{\"field\":\"field1\", \"type\": \"!=\", \"value\":\"hola\"}]"

      // No inputField property configured.
      val withoutInputField = new EntityCountOperator("entityCount", initSchema, Map())
      withoutInputField.processMap(Row(1, 2)) shouldBe None

      // inputField configured but absent from the schema.
      val fieldNotInSchema =
        new EntityCountOperator("entityCount", initSchemaFail, Map("inputField" -> "field1"))
      fieldNotInSchema.processMap(Row(1, 2)) shouldBe None

      // No split property: the whole text is expected as a single entity.
      val noSplit =
        new EntityCountOperator("entityCount", initSchema, Map("inputField" -> "field1"))
      noSplit.processMap(Row("hola holo", 2)) shouldBe Some(Seq("hola holo"))

      // Split on a separator that does not occur in the text.
      val commaSplitNoMatch = new EntityCountOperator("entityCount", initSchema,
        Map("inputField" -> "field1", "split" -> ","))
      commaSplitNoMatch.processMap(Row("hola holo", 2)) shouldBe Some(Seq("hola holo"))

      val dashSplit = new EntityCountOperator("entityCount", initSchema,
        Map("inputField" -> "field1", "split" -> "-"))
      dashSplit.processMap(Row("hola-holo", 2)) shouldBe Some(Seq("hola", "holo"))

      val commaSplit = new EntityCountOperator("entityCount", initSchema,
        Map("inputField" -> "field1", "split" -> ","))
      commaSplit.processMap(Row("hola,holo adios", 2)) shouldBe Some(Seq("hola", "holo adios"))

      // Filter field1 != "hola" against a row whose field1 is "hola".
      val filteredOut = new EntityCountOperator("entityCount", initSchema,
        Map("inputField" -> "field1", "filters" -> notHola))
      filteredOut.processMap(Row("hola", 2)) shouldBe None

      // Same filter with a row that passes it, split on a space.
      val filteredAndSplit = new EntityCountOperator("entityCount", initSchema,
        Map("inputField" -> "field1", "filters" -> notHola, "split" -> " "))
      filteredAndSplit.processMap(Row("hola holo", 2)) shouldBe Some(Seq("hola", "holo"))
    }

    "processReduce must be " in {
      val onEmpty = new EntityCountOperator("entityCount", initSchema, Map())
      onEmpty.processReduce(Seq()) shouldBe Some(Seq())

      val onTwoWords = new EntityCountOperator("entityCount", initSchema, Map())
      onTwoWords.processReduce(Seq(Some(Seq("hola", "holo")))) shouldBe Some(Seq("hola", "holo"))

      // Repeated words are expected to be kept as they are.
      val onRepeatedWord = new EntityCountOperator("entityCount", initSchema, Map())
      onRepeatedWord.processReduce(Seq(Some(Seq("hola", "holo", "hola")))) shouldBe
        Some(Seq("hola", "holo", "hola"))
    }

    "associative process must be " in {
      // Old counts with no new values: expected back unchanged.
      val keepOld = new EntityCountOperator("entityCount", initSchema, Map())
      val oldAndEmptyNew = Seq(
        Operator.OldValuesKey -> Some(Map("hola" -> 1L, "holo" -> 1L)),
        Operator.NewValuesKey -> None
      )
      keepOld.associativity(oldAndEmptyNew) shouldBe Some(Map("hola" -> 1L, "holo" -> 1L))

      // typeOp = int: an empty map is the expected result.
      val asInt = new EntityCountOperator("entityCount", initSchema, Map("typeOp" -> "int"))
      val oldAndNewWord = Seq(
        Operator.OldValuesKey -> Some(Map("hola" -> 1L, "holo" -> 1L)),
        Operator.NewValuesKey -> Some(Seq("hola"))
      )
      asInt.associativity(oldAndNewWord) shouldBe Some(Map())

      // typeOp explicitly null, old values only.
      val nullTypeOp = new EntityCountOperator("entityCount", initSchema, Map("typeOp" -> null))
      val onlyOld = Seq(Operator.OldValuesKey -> Some(Map("hola" -> 1L, "holo" -> 1L)))
      nullTypeOp.associativity(onlyOld) shouldBe Some(Map("hola" -> 1L, "holo" -> 1L))
    }
  }
}
Example 97
Source File: SumOperatorTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.sum

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/**
 * Spec for `SumOperator`: covers `processMap`, `processReduce` (with and without
 * the "distinct" property) and `associativity`.
 */
@RunWith(classOf[JUnitRunner])
class SumOperatorTest extends WordSpec with Matchers {

  "Sum operator" should {

    // Schema containing the column the operator is pointed at ("field1").
    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, false),
      StructField("field2", IntegerType, false),
      StructField("field3", IntegerType, false)
    ))

    // Schema lacking "field1", for the missing-input-field case.
    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, false)
    ))

    "processMap must be " in {
      // No inputField property configured.
      val inputField = new SumOperator("sum", initSchema, Map())
      inputField.processMap(Row(1, 2)) should be(None)

      // inputField configured but absent from the schema.
      val inputFields2 = new SumOperator("sum", initSchemaFail, Map("inputField" -> "field1"))
      inputFields2.processMap(Row(1, 2)) should be(None)

      val inputFields3 = new SumOperator("sum", initSchema, Map("inputField" -> "field1"))
      inputFields3.processMap(Row(1, 2)) should be(Some(1))

      // Numeric string input. The assertion now runs on inputFields4; it previously
      // reused inputFields3, leaving this operator constructed but never exercised.
      val inputFields4 = new SumOperator("sum", initSchema, Map("inputField" -> "field1"))
      inputFields4.processMap(Row("1", 2)) should be(Some(1))

      val inputFields6 = new SumOperator("sum", initSchema, Map("inputField" -> "field1"))
      inputFields6.processMap(Row(1.5, 2)) should be(Some(1.5))

      val inputFields7 = new SumOperator("sum", initSchema, Map("inputField" -> "field1"))
      inputFields7.processMap(Row(5L, 2)) should be(Some(5L))

      // Filter that the row satisfies.
      val inputFields8 = new SumOperator("sum", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"))
      inputFields8.processMap(Row(1, 2)) should be(Some(1L))

      // Filter that the row does not satisfy.
      val inputFields9 = new SumOperator("sum", initSchema,
        Map("inputField" -> "field1", "filters" -> "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"))
      inputFields9.processMap(Row(1, 2)) should be(None)

      // Two filters, the second of which the row does not satisfy.
      val inputFields10 = new SumOperator("sum", initSchema,
        Map("inputField" -> "field1", "filters" -> {
          "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
            "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"
        }))
      inputFields10.processMap(Row(1, 2)) should be(None)
    }

    "processReduce must be " in {
      val inputFields = new SumOperator("sum", initSchema, Map())
      inputFields.processReduce(Seq()) should be(Some(0d))

      val inputFields2 = new SumOperator("sum", initSchema, Map())
      inputFields2.processReduce(Seq(Some(1), Some(2), Some(3), Some(7), Some(7))) should be(Some(20d))

      // Mixed Int and Double inputs.
      val inputFields3 = new SumOperator("sum", initSchema, Map())
      inputFields3.processReduce(Seq(Some(1), Some(2), Some(3), Some(6.5), Some(7.5))) should be(Some(20d))

      val inputFields4 = new SumOperator("sum", initSchema, Map())
      inputFields4.processReduce(Seq(None)) should be(Some(0d))
    }

    "processReduce distinct must be " in {
      val inputFields = new SumOperator("sum", initSchema, Map("distinct" -> "true"))
      inputFields.processReduce(Seq()) should be(Some(0d))

      // With "distinct" the duplicate 1 is expected to count once: 1 + 2.
      val inputFields2 = new SumOperator("sum", initSchema, Map("distinct" -> "true"))
      inputFields2.processReduce(Seq(Some(1), Some(2), Some(1))) should be(Some(3d))
    }

    "associative process must be " in {
      // NOTE(review): these operators are named "count" while the rest of the spec
      // uses "sum"; this looks like a copy-paste leftover. Confirm before renaming.
      val inputFields = new SumOperator("count", initSchema, Map())
      val resultInput = Seq((Operator.OldValuesKey, Some(1L)),
        (Operator.NewValuesKey, Some(1L)),
        (Operator.NewValuesKey, None))
      inputFields.associativity(resultInput) should be(Some(2d))

      // typeOp = string: the sum is expected as its string form.
      val inputFields2 = new SumOperator("count", initSchema, Map("typeOp" -> "string"))
      val resultInput2 = Seq((Operator.OldValuesKey, Some(1L)),
        (Operator.NewValuesKey, Some(1L)),
        (Operator.NewValuesKey, None))
      inputFields2.associativity(resultInput2) should be(Some("2.0"))
    }
  }
}
Example 98
Source File: FullTextOperatorTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.fullText

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/**
 * Spec for `FullTextOperator`: covers `processMap`, `processReduce` and
 * `associativity`, with expected text joined by `Operator.SpaceSeparator`.
 */
@RunWith(classOf[JUnitRunner])
class FullTextOperatorTest extends WordSpec with Matchers {

  "FullText operator" should {

    // Schema containing the column the operator is pointed at ("field1").
    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, nullable = false),
      StructField("field2", IntegerType, nullable = false),
      StructField("field3", IntegerType, nullable = false)
    ))

    // Schema lacking "field1".
    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, nullable = false)
    ))

    "processMap must be " in {
      val sampleRow = Row(1, 2)
      val field1Below2 = "[{\"field\":\"field1\", \"type\": \"<\", \"value\":2}]"
      val field1Above2 = "[{\"field\":\"field1\", \"type\": \">\", \"value\":\"2\"}]"
      val bothBelow2 =
        "[{\"field\":\"field1\", \"type\": \"<\", \"value\":\"2\"}," +
          "{\"field\":\"field2\", \"type\": \"<\", \"value\":\"2\"}]"

      // No inputField property configured.
      val withoutInputField = new FullTextOperator("fullText", initSchema, Map())
      withoutInputField.processMap(sampleRow) shouldBe None

      // inputField configured but absent from the schema.
      val fieldNotInSchema =
        new FullTextOperator("fullText", initSchemaFail, Map("inputField" -> "field1"))
      fieldNotInSchema.processMap(sampleRow) shouldBe None

      val plainField = new FullTextOperator("fullText", initSchema, Map("inputField" -> "field1"))
      plainField.processMap(sampleRow) shouldBe Some(1)

      // Filter that the sample row satisfies.
      val passingFilter = new FullTextOperator("fullText", initSchema,
        Map("inputField" -> "field1", "filters" -> field1Below2))
      passingFilter.processMap(sampleRow) shouldBe Some(1L)

      // Filter that the sample row does not satisfy.
      val rejectingFilter = new FullTextOperator("fullText", initSchema,
        Map("inputField" -> "field1", "filters" -> field1Above2))
      rejectingFilter.processMap(sampleRow) shouldBe None

      // Two filters, the second of which the sample row does not satisfy.
      val twoFilters = new FullTextOperator("fullText", initSchema,
        Map("inputField" -> "field1", "filters" -> bothBelow2))
      twoFilters.processMap(sampleRow) shouldBe None
    }

    "processReduce must be " in {
      val sep = Operator.SpaceSeparator

      val onEmpty = new FullTextOperator("fullText", initSchema, Map())
      onEmpty.processReduce(Seq()) shouldBe Some("")

      // Values are expected joined with the shared separator.
      val onInts = new FullTextOperator("fullText", initSchema, Map())
      onInts.processReduce(Seq(Some(1), Some(1))) shouldBe Some(s"1${sep}1")

      val onStrings = new FullTextOperator("fullText", initSchema, Map())
      onStrings.processReduce(Seq(Some("a"), Some("b"))) shouldBe Some(s"a${sep}b")
    }

    "associative process must be " in {
      val sep = Operator.SpaceSeparator

      // Old value only: expected back in string form.
      val keepOld = new FullTextOperator("fullText", initSchema, Map())
      val oldAndEmptyNew = Seq(
        Operator.OldValuesKey -> Some(2),
        Operator.NewValuesKey -> None
      )
      keepOld.associativity(oldAndEmptyNew) shouldBe Some("2")

      // typeOp = arraystring: the joined text is expected wrapped in a Seq.
      val asStringArray =
        new FullTextOperator("fullText", initSchema, Map("typeOp" -> "arraystring"))
      val oldAndNew = Seq(
        Operator.OldValuesKey -> Some(2),
        Operator.NewValuesKey -> Some(1)
      )
      asStringArray.associativity(oldAndNew) shouldBe Some(Seq(s"2${sep}1"))

      // typeOp explicitly null, two entries under the old-values key.
      val nullTypeOp = new FullTextOperator("fullText", initSchema, Map("typeOp" -> null))
      val twoOld = Seq(
        Operator.OldValuesKey -> Some(2),
        Operator.OldValuesKey -> Some(3)
      )
      nullTypeOp.associativity(twoOld) shouldBe Some(s"2${sep}3")
    }
  }
}
Example 99
Source File: TotalEntityCountOperatorTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.cube.operator.totalEntityCount

import com.stratio.sparta.sdk.pipeline.aggregation.operator.Operator
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

/**
 * Spec for `TotalEntityCountOperator`: covers `processMap` (including "split" and
 * "filters"), `processReduce` (with and without "distinct") and `associativity`.
 */
@RunWith(classOf[JUnitRunner])
class TotalEntityCountOperatorTest extends WordSpec with Matchers {

  "Entity Count Operator" should {

    // Schema containing the column the operator is pointed at ("field1").
    val initSchema = StructType(Seq(
      StructField("field1", IntegerType, nullable = false),
      StructField("field2", IntegerType, nullable = false),
      StructField("field3", IntegerType, nullable = false)
    ))

    // Schema lacking "field1".
    val initSchemaFail = StructType(Seq(
      StructField("field2", IntegerType, nullable = false)
    ))

    "processMap must be " in {
      val notHola = "[{\"field\":\"field1\", \"type\": \"!=\", \"value\":\"hola\"}]"

      // No inputField property configured.
      val withoutInputField = new TotalEntityCountOperator("totalEntityCount", initSchema, Map())
      withoutInputField.processMap(Row(1, 2)) shouldBe None

      // inputField configured but absent from the schema.
      val fieldNotInSchema = new TotalEntityCountOperator("totalEntityCount", initSchemaFail,
        Map("inputField" -> "field1"))
      fieldNotInSchema.processMap(Row(1, 2)) shouldBe None

      // No split property: the whole text is expected as a single entity.
      val noSplit = new TotalEntityCountOperator("totalEntityCount", initSchema,
        Map("inputField" -> "field1"))
      noSplit.processMap(Row("hola holo", 2)) shouldBe Some(Seq("hola holo"))

      // Split on a separator that does not occur in the text.
      val commaSplitNoMatch = new TotalEntityCountOperator("totalEntityCount", initSchema,
        Map("inputField" -> "field1", "split" -> ","))
      commaSplitNoMatch.processMap(Row("hola holo", 2)) shouldBe Some(Seq("hola holo"))

      val dashSplit = new TotalEntityCountOperator("totalEntityCount", initSchema,
        Map("inputField" -> "field1", "split" -> "-"))
      dashSplit.processMap(Row("hola-holo", 2)) shouldBe Some(Seq("hola", "holo"))

      val commaSplit = new TotalEntityCountOperator("totalEntityCount", initSchema,
        Map("inputField" -> "field1", "split" -> ","))
      commaSplit.processMap(Row("hola,holo adios", 2)) shouldBe Some(Seq("hola", "holo adios"))

      // Filter field1 != "hola" against a row whose field1 is "hola".
      val filteredOut = new TotalEntityCountOperator("totalEntityCount", initSchema,
        Map("inputField" -> "field1", "filters" -> notHola))
      filteredOut.processMap(Row("hola", 2)) shouldBe None

      // Same filter with a row that passes it, split on a space.
      val filteredAndSplit = new TotalEntityCountOperator("totalEntityCount", initSchema,
        Map("inputField" -> "field1", "filters" -> notHola, "split" -> " "))
      filteredAndSplit.processMap(Row("hola holo", 2)) shouldBe Some(Seq("hola", "holo"))
    }

    "processReduce must be " in {
      val onEmpty = new TotalEntityCountOperator("totalEntityCount", initSchema, Map())
      onEmpty.processReduce(Seq()) shouldBe Some(0L)

      val onTwoWords = new TotalEntityCountOperator("totalEntityCount", initSchema, Map())
      onTwoWords.processReduce(Seq(Some(Seq("hola", "holo")))) shouldBe Some(2L)

      // The repeated word is expected to be counted each time.
      val onRepeatedWord = new TotalEntityCountOperator("totalEntityCount", initSchema, Map())
      onRepeatedWord.processReduce(Seq(Some(Seq("hola", "holo", "hola")))) shouldBe Some(3L)

      val onOnlyNone = new TotalEntityCountOperator("totalEntityCount", initSchema, Map())
      onOnlyNone.processReduce(Seq(None)) shouldBe Some(0L)
    }

    "processReduce distinct must be " in {
      val onEmpty = new TotalEntityCountOperator("totalEntityCount", initSchema,
        Map("distinct" -> "true"))
      onEmpty.processReduce(Seq()) shouldBe Some(0L)

      // With "distinct" the repeated word is expected to be counted once.
      val onRepeatedWord = new TotalEntityCountOperator("totalEntityCount", initSchema,
        Map("distinct" -> "true"))
      onRepeatedWord.processReduce(Seq(Some(Seq("hola", "holo", "hola")))) shouldBe Some(2L)
    }

    "associative process must be " in {
      // Old total with no new values: expected back unchanged.
      val keepOld = new TotalEntityCountOperator("totalEntityCount", initSchema, Map())
      val oldAndEmptyNew = Seq(
        Operator.OldValuesKey -> Some(2),
        Operator.NewValuesKey -> None
      )
      keepOld.associativity(oldAndEmptyNew) shouldBe Some(2)

      // typeOp = int: old and new totals are expected to add up.
      val asInt = new TotalEntityCountOperator("totalEntityCount", initSchema,
        Map("typeOp" -> "int"))
      val oldAndNew = Seq(
        Operator.OldValuesKey -> Some(2),
        Operator.NewValuesKey -> Some(1)
      )
      asInt.associativity(oldAndNew) shouldBe Some(3)

      // typeOp explicitly null, old value only.
      val nullTypeOp = new TotalEntityCountOperator("totalEntityCount", initSchema,
        Map("typeOp" -> null))
      val onlyOld = Seq(Operator.OldValuesKey -> Some(2))
      nullTypeOp.associativity(onlyOld) shouldBe Some(2)
    }
  }
}
Example 100
Source File: MorphlinesParserTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.transformation.morphline

import java.io.Serializable

import com.stratio.sparta.sdk.pipeline.input.Input
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, Matchers, WordSpecLike}

/**
 * Spec for `MorphlinesParser`, built once with a morphline configuration that reads
 * JSON and extracts the paths `/col1` and `/col2` into the fields "col1" and "col2".
 *
 * NOTE(review): every test ends with `result should be eq(expected)`. That parses as
 * `(result should be).eq(expected)`, a reference comparison whose Boolean result is
 * discarded, so these tests appear to assert nothing. `result should be(expected)` is
 * probably what was meant. Confirm the expected rows before changing it (see the
 * notes inside the tests).
 */
@RunWith(classOf[JUnitRunner])
class MorphlinesParserTest extends WordSpecLike with Matchers with BeforeAndAfter with BeforeAndAfterAll {

  // Morphline definition handed to the parser through the "morphline" property.
  // NOTE(review): this literal is kept exactly as it appears in this listing, where it
  // sits on a single line; the original file presumably spread it over several lines.
  val morphlineConfig = """ id : test1 importCommands : ["org.kitesdk.**"] commands: [ { readJson {}, } { extractJsonPaths { paths : { col1 : /col1 col2 : /col2 } } } { java { code : "return child.process(record);" } } { removeFields { blacklist:["literal:_attachment_body"] } } ] """

  // Parser inputs: the raw-data key as input field, and the two extracted output fields.
  val inputField = Some(Input.RawDataKey)
  val outputsFields = Seq("col1", "col2")
  val props: Map[String, Serializable] = Map("morphline" -> morphlineConfig)

  val schema = StructType(Seq(StructField("col1", StringType), StructField("col2", StringType)))

  // Single parser instance shared by all tests below.
  val parser = new MorphlinesParser(1, inputField, outputsFields, schema, props)

  "A MorphlinesParser" should {

    "parse a simple json" in {
      val simpleJson = """{ "col1":"hello", "col2":"word" } """
      val input = Row(simpleJson)
      val result = parser.parse(input)

      // NOTE(review): the JSON carries "word" but the expected row says "world"; this
      // would mismatch if the assertion below were effective.
      val expected = Seq(Row(simpleJson, "hello", "world"))

      result should be eq(expected)
    }

    "parse a simple json removing raw" in {
      val simpleJson = """{ "col1":"hello", "col2":"word" } """
      val input = Row(simpleJson)
      // NOTE(review): this uses the same parser and props as the test above; no
      // raw-removal setting is visible here. Confirm how the raw field is dropped.
      val result = parser.parse(input)

      val expected = Seq(Row("hello", "world"))

      result should be eq(expected)
    }

    "exclude not configured fields" in {
      // "col3" is present in the JSON but is not one of the configured output fields.
      val simpleJson = """{ "col1":"hello", "col2":"word", "col3":"!" } """
      val input = Row(simpleJson)
      val result = parser.parse(input)

      val expected = Seq(Row(simpleJson, "hello", "world"))

      result should be eq(expected)
    }
  }
}
Example 101
Source File: DateTimeParserTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.transformation.datetime

import com.stratio.sparta.sdk.properties.JsoneyString
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpecLike}

/**
 * Word-spec for `DateTimeParser`. Each case builds a parser for the "ts" input
 * field with a given `inputFormat` property and checks the rows it produces.
 *
 * Assertions follow ScalaTest's `assertResult(expected)(actual)` argument order so
 * that failure messages report the expected and actual values correctly.
 */
@RunWith(classOf[JUnitRunner])
class DateTimeParserTest extends WordSpecLike with Matchers {

  val inputField = Some("ts")
  val outputsFields = Seq("ts")

  //scalastyle:off
  "A DateTimeParser" should {

    "parse unixMillis to string" in {
      val input = Row(1416330788000L)
      val schema = StructType(Seq(StructField("ts", StringType)))
      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "unixMillis"))
          .parse(input)
      val expected = Seq(Row(1416330788000L, "1416330788000"))
      assertResult(expected)(result)
    }

    "parse unix to string" in {
      val input = Row(1416330788)
      val schema = StructType(Seq(StructField("ts", StringType)))
      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "unix"))
          .parse(input)
      val expected = Seq(Row(1416330788, "1416330788000"))
      assertResult(expected)(result)
    }

    "parse unix to string removing raw" in {
      val input = Row(1416330788)
      val schema = StructType(Seq(StructField("ts", StringType)))
      val result =
        new DateTimeParser(1, inputField, outputsFields, schema,
          Map("inputFormat" -> "unix", "removeInputField" -> JsoneyString.apply("true")))
          .parse(input)
      val expected = Seq(Row("1416330788000"))
      assertResult(expected)(result)
    }

    // The schema has no "ts" field, so parsing is expected to fail.
    "not parse anything if the field does not match" in {
      val input = Row("1212")
      val schema = StructType(Seq(StructField("otherField", StringType)))
      an[IllegalStateException] should be thrownBy new DateTimeParser(1, inputField, outputsFields, schema,
        Map("inputFormat" -> "unixMillis")).parse(input)
    }

    // Only the row width is checked here; the generated value itself is not asserted.
    "not parse anything and generate a new Date" in {
      val input = Row("anything")
      val schema = StructType(Seq(StructField("ts", StringType)))
      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "autoGenerated"))
          .parse(input)
      assertResult(2)(result.head.size)
    }

    "Auto generated if inputFormat does not exist" in {
      val input = Row("1416330788")
      val schema = StructType(Seq(StructField("ts", StringType)))
      val result = new DateTimeParser(1, inputField, outputsFields, schema, Map()).parse(input)
      assertResult(2)(result.head.size)
    }

    "parse dateTime in hive format" in {
      val input = Row("2015-11-08 15:58:58")
      val schema = StructType(Seq(StructField("ts", StringType)))
      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "hive"))
          .parse(input)
      val expected = Seq(Row("2015-11-08 15:58:58", "1446998338000"))
      assertResult(expected)(result)
    }
  }
}
Example 102
Source File: package.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering
import org.apache.spark.sql.execution.datasources.oap.io.ColumnStatistics
import org.apache.spark.sql.execution.datasources.oap.utils.OapUtils
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StructField, StructType}

package object oap {
  type Key = InternalRow

  /** Builds an ordering over single-column keys described by `sf`. */
  def order(sf: StructField): Ordering[Key] = GenerateOrdering.create(StructType(Array(sf)))

  /**
   * Decides whether a row group or file can be skipped using min/max statistics.
   *
   * @param columnStats per-column statistics, indexed by the column's position in `schema`
   * @param filter      the pushed-down filter to test
   * @param schema      schema used to resolve attribute names to column indices
   * @return true when the statistics show that no row can satisfy `filter`;
   *         false when that cannot be established (including unsupported filters
   *         and columns whose min/max bound is unavailable)
   */
  def isSkippedByStatistics(
      columnStats: Array[ColumnStatistics],
      filter: Filter,
      schema: StructType): Boolean = {

    // Shared shape of the single-bound comparisons: resolve the key, the column and
    // its ordering, decode one bound (min or max) and let `outOfRange` decide.
    def skipOnBound(attribute: String, handle: Any, useMin: Boolean)(
        outOfRange: (Ordering[Key], Key, Key) => Boolean): Boolean = {
      val key = OapUtils.keyFromAny(handle)
      val idx = schema.fieldIndex(attribute)
      val stat = columnStats(idx)
      val comp = order(schema(idx))
      val bound = if (useMin) stat.min else stat.max
      OapUtils.keyFromBytes(bound, schema(idx).dataType) match {
        case Some(v) => outOfRange(comp, v, key)
        case None => false
      }
    }

    filter match {
      // A disjunction is skippable only if both sides are.
      case Or(left, right) =>
        isSkippedByStatistics(columnStats, left, schema) &&
          isSkippedByStatistics(columnStats, right, schema)

      // A conjunction is skippable as soon as either side is.
      case And(left, right) =>
        isSkippedByStatistics(columnStats, left, schema) ||
          isSkippedByStatistics(columnStats, right, schema)

      case IsNotNull(attribute) =>
        val idx = schema.fieldIndex(attribute)
        val stat = columnStats(idx)
        !stat.hasNonNullValue

      // Equality needs both bounds: skip when the key lies outside [min, max].
      case EqualTo(attribute, handle) =>
        val key = OapUtils.keyFromAny(handle)
        val idx = schema.fieldIndex(attribute)
        val stat = columnStats(idx)
        val comp = order(schema(idx))
        val lower = OapUtils.keyFromBytes(stat.min, schema(idx).dataType)
        val upper = OapUtils.keyFromBytes(stat.max, schema(idx).dataType)
        (lower, upper) match {
          case (Some(v1), Some(v2)) => comp.gt(v1, key) || comp.lt(v2, key)
          case _ => false
        }

      case LessThan(attribute, handle) =>
        skipOnBound(attribute, handle, useMin = true)((comp, v, key) => comp.gteq(v, key))

      case LessThanOrEqual(attribute, handle) =>
        skipOnBound(attribute, handle, useMin = true)((comp, v, key) => comp.gt(v, key))

      case GreaterThan(attribute, handle) =>
        skipOnBound(attribute, handle, useMin = false)((comp, v, key) => comp.lteq(v, key))

      case GreaterThanOrEqual(attribute, handle) =>
        skipOnBound(attribute, handle, useMin = false)((comp, v, key) => comp.lt(v, key))

      case _ => false
    }
  }
}

/** Exception type for OAP failures; the cause is optional. */
class OapException(message: String, cause: Throwable) extends Exception(message, cause) {
  def this(message: String) = this(message, null)
}
Example 103
Source File: AnnotationUtils.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.vcf

import org.apache.spark.sql.types.{ArrayType, DataType, IntegerType, StringType, StructField, StructType}

// Unified VCF annotation representation, used by SnpEff and VEP
object AnnotationUtils {

  // Delimiter between annotation fields
  val annotationDelimiter = "|"
  val annotationDelimiterRegex = "\\|"

  // Fractional delimiter for struct subfields
  val structDelimiter = "/"
  val structDelimiterRegex = "\\/"

  // Delimiter for array subfields
  val arrayDelimiter = "&"

  // Struct subfield schemas: each is a pair of same-typed fields
  private val rankTotalStruct =
    StructType(Seq("rank", "total").map(StructField(_, IntegerType)))
  private val posLengthStruct =
    StructType(Seq("pos", "length").map(StructField(_, IntegerType)))
  private val referenceVariantStruct =
    StructType(Seq("reference", "variant").map(StructField(_, StringType)))

  // Shorthand for the array-of-strings subfield type used by all three tools
  private val stringArray = ArrayType(StringType)

  // Special schemas for SnpEff subfields
  private val snpEffFieldsToSchema: Map[String, DataType] = Map(
    "Annotation" -> stringArray,
    "Rank" -> rankTotalStruct,
    "cDNA_pos/cDNA_length" -> posLengthStruct,
    "CDS_pos/CDS_length" -> posLengthStruct,
    "AA_pos/AA_length" -> posLengthStruct,
    "Distance" -> IntegerType
  )

  // Special schemas for VEP subfields
  private val vepFieldsToSchema: Map[String, DataType] = Map(
    "Consequence" -> stringArray,
    "EXON" -> rankTotalStruct,
    "INTRON" -> rankTotalStruct,
    "cDNA_position" -> IntegerType,
    "CDS_position" -> IntegerType,
    "Protein_position" -> IntegerType,
    "Amino_acids" -> referenceVariantStruct,
    "Codons" -> referenceVariantStruct,
    "Existing_variation" -> stringArray,
    "DISTANCE" -> IntegerType,
    "STRAND" -> IntegerType,
    "FLAGS" -> stringArray
  )

  // Special schemas for LOFTEE (as VEP plugin) subfields
  private val lofteeFieldsToSchema: Map[String, DataType] = Map(
    "LoF_filter" -> stringArray,
    "LoF_flags" -> stringArray,
    "LoF_info" -> stringArray
  )

  // Union of the maps above (later maps win on duplicate keys); any subfield
  // without a special schema falls back to a plain string.
  val allFieldsToSchema: Map[String, DataType] =
    Seq(snpEffFieldsToSchema, vepFieldsToSchema, lofteeFieldsToSchema)
      .foldLeft(Map.empty[String, DataType])(_ ++ _)
      .withDefaultValue(StringType)
}
Example 104
Source File: DateTimeDataFixture.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.vector

import java.util.TimeZone

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DateType, StructField, StructType, TimestampType}

import com.actian.spark_vector.test.util.DateHelper.{ansiDateFor, timestampFor}

/**
 * Test fixture providing date/time sample rows, the Spark schema describing them,
 * and a helper that (re)creates the matching table through `VectorJDBC`.
 */
object DateTimeDataFixture {

  /** Returns the sample rows as a 2-partition RDD together with their schema. */
  def timeRDD(sparkContext: SparkContext): (RDD[Seq[Any]], StructType) = createTimeRDD(sparkContext, timeData)

  private[vector] val tz = TimeZone.getTimeZone("GMT-06:00")
  private[vector] val utc = TimeZone.getTimeZone("UTC")

  // NOTE(review): the first row holds 9 values and the second 8, while the schema
  // built in createTimeRDD declares 13 fields -- entries look to be missing here;
  // verify against the upstream fixture before relying on this data.
  private[vector] val timeData = Seq(
    Seq[Any](
      timestampFor(1995, 1, 22, 18, 3, 29, 234, tz),
      timestampFor(1996, 2, 22, 18, 3, 29, 234),
      timestampFor(1997, 2, 22, 18, 3, 29, 234),
      timestampFor(1998, 1, 22, 18, 3, 29, 234, tz),
      timestampFor(1999, 2, 22, 18, 3, 29, 234),
      timestampFor(2000, 2, 22, 18, 3, 29, 234),
      timestampFor(2015, 11, 23, 18, 3, 29, 123, tz),
      timestampFor(2015, 11, 23, 18, 3, 29, 123),
      ansiDateFor(1995, 2, 22)),
    Seq[Any](
      timestampFor(2015, 3, 2, 17, 52, 12, 678, tz),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 3, 2, 17, 52, 12, 678, tz),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 11, 13, 17, 52, 12, 123, tz),
      ansiDateFor(2015, 4, 2)))

  /** Pairs `data` (parallelized into 2 partitions) with the 13-field date/time schema. */
  private def createTimeRDD(sparkContext: SparkContext, data: Seq[Seq[Any]]): (RDD[Seq[Any]], StructType) = {
    // NOTE(review): the last field is named "date" here but "dt" in createTimeTable's DDL -- confirm intended.
    val schema = StructType(Seq(
      StructField("tswtz", TimestampType),
      StructField("tsntz", TimestampType),
      StructField("tsltz", TimestampType),
      StructField("tswtz4", TimestampType),
      StructField("tsntz4", TimestampType),
      StructField("tsltz4", TimestampType),
      StructField("tmwtz", TimestampType),
      StructField("tmntz", TimestampType),
      StructField("tmltz", TimestampType),
      StructField("tmwtz3", TimestampType),
      StructField("tmntz3", TimestampType),
      StructField("tmltz3", TimestampType),
      StructField("date", DateType)))
    (sparkContext.parallelize(data, 2), schema)
  }

  /**
   * Drops `tableName` and recreates it with the timestamp/time/date columns below.
   *
   * @param connectionProps connection settings handed to `VectorJDBC.withJDBC`
   * @param tableName       name of the table to drop and create
   */
  def createTimeTable(connectionProps: VectorConnectionProperties)(tableName: String): Unit = {
    VectorJDBC.withJDBC(connectionProps) { cxn =>
      cxn.dropTable(tableName)
      cxn.executeStatement(
        s"""|create table ${tableName} ( | tswtz timestamp with time zone, | tsntz timestamp without time zone, | tsltz timestamp with local time zone, | tswtz4 timestamp(4) with time zone, | tsntz4 timestamp(4) without time zone, | tsltz4 timestamp(4) with local time zone, | tmwtz time with time zone, | tmntz time without time zone, | tmltz time with local time zone, | tmwtz3 time(3) with time zone, | tmntz3 time(3) without time zone, | tmltz3 time(3) with local time zone, | dt date |)""".stripMargin)
    }
  }
}
Example 105
Source File: RDDFixtures.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector

import java.util.Date

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ DateType, IntegerType, StringType, StructField, StructType }

import com.actian.spark_vector.test.util.StructTypeUtil.createSchema

/** Small RDD/schema pairs for tests; every RDD is parallelized into 2 partitions. */
trait RDDFixtures {
  // poor man's fixture, for other approaches see:
  // http://www.scalatest.org/user_guide/sharing_fixtures

  /** Two (id, name) records with an `id: Integer, name: String` schema. */
  def createRecordRdd(sc: SparkContext): (RDD[Seq[Any]], StructType) = {
    val input = Seq(
      Seq(42, "a"),
      Seq(43, "b"))
    val inputRdd = sc.parallelize(input, 2)
    val inputSchema = createSchema("id" -> IntegerType, "name" -> StringType)
    (inputRdd, inputSchema)
  }

  /** Two records carrying an id, a name and date values, with an id/name/date schema. */
  def createRowRDD(sc: SparkContext): (RDD[Seq[Any]], StructType) = {
    // NOTE(review): each record holds four values (two dates) while the schema below
    // declares three fields -- left as-is; confirm which side is intended.
    val input = Seq(
      Seq[Any](42, "a", new Date(), new Date()),
      Seq[Any](43, "b", new Date(), new Date()))
    val inputRdd = sc.parallelize(input, 2)
    val inputSchema = createSchema("id" -> IntegerType, "name" -> StringType, "date" -> DateType)
    (inputRdd, inputSchema)
  }

  /**
   * Builds `rowCount` identical rows, each holding the integers `1 to columnCount`,
   * with a schema of `columnCount` nullable integer fields named `field_1 .. field_N`.
   *
   * The schema width is derived from `columnCount` (it was previously derived from
   * `rowCount`, which only matched the row width when both counts were equal).
   *
   * @param columnCount number of columns per row and of fields in the schema
   * @param rowCount    number of rows to generate
   */
  def wideRDD(sc: SparkContext, columnCount: Int, rowCount: Int = 2): (RDD[Row], StructType) = {
    val data: Row = Row.fromSeq(1 to columnCount)
    val fields = (1 to columnCount).map { i =>
      StructField("field_" + i, IntegerType, nullable = true)
    }
    val inputSchema = StructType(fields)
    val input = Vector.fill(rowCount)(data)
    val inputRDD = sc.parallelize(input, 2)
    (inputRDD, inputSchema)
  }
}
Example 106
Source File: CurrentPersistenceIdsQuerySourceProvider.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package akka.persistence.jdbc.spark.sql.execution.streaming

import org.apache.spark.sql.execution.streaming.{ LongOffset, Offset, Source }
import org.apache.spark.sql.sources.{ DataSourceRegister, StreamSourceProvider }
import org.apache.spark.sql.types.{ StringType, StructField, StructType }
import org.apache.spark.sql.{ SQLContext, _ }

/** Short name and fixed single-column schema of the persistence-id stream source. */
object CurrentPersistenceIdsQuerySourceProvider {
  val name = "current-persistence-id"
  val schema: StructType =
    StructType(StructField("persistence_id", StringType, nullable = false) :: Nil)
}

/** Stream source provider registered under the companion's short name. */
class CurrentPersistenceIdsQuerySourceProvider extends StreamSourceProvider with DataSourceRegister with Serializable {

  // The caller-supplied schema and parameters are ignored: name and schema are fixed.
  override def sourceSchema(
    sqlContext: SQLContext,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): (String, StructType) =
    (CurrentPersistenceIdsQuerySourceProvider.name, CurrentPersistenceIdsQuerySourceProvider.schema)

  // The "path" parameter is mandatory; it is passed on as the read-journal plugin id.
  override def createSource(
    sqlContext: SQLContext,
    metadataPath: String,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): Source = {
    val journalPluginId = parameters("path")
    new CurrentPersistenceIdsQuerySourceImpl(sqlContext, journalPluginId)
  }

  override def shortName(): String = CurrentPersistenceIdsQuerySourceProvider.name
}

/** Source that serves persistence ids obtained through `ReadJournalSource`. */
class CurrentPersistenceIdsQuerySourceImpl(val sqlContext: SQLContext, val readJournalPluginId: String) extends Source with ReadJournalSource {

  override def schema: StructType = CurrentPersistenceIdsQuerySourceProvider.schema

  /** Reports `maxPersistenceIds` as the latest available offset. */
  override def getOffset: Option[Offset] = {
    val highest = maxPersistenceIds
    println("[CurrentPersistenceIdsQuery]: Returning maximum offset: " + highest)
    Some(LongOffset(highest))
  }

  /** Returns the persistence ids between the resolved start and end as a DataFrame. */
  override def getBatch(_start: Option[Offset], _end: Offset): DataFrame = {
    import sqlContext.implicits._
    val (from, until) = getStartEnd(_start, _end)
    println(s"[CurrentPersistenceIdsQuery]: Getting currentPersistenceIds from start: $from, end: $until")
    val ids = persistenceIds(from, until)
    ids.toDF()
  }
}
Example 107
Source File: HelloWorldDataSource.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.datasources.helloworld

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{ BaseRelation, DataSourceRegister, RelationProvider, TableScan }
import org.apache.spark.sql.types.{ StringType, StructField, StructType }
import org.apache.spark.sql.{ Row, SQLContext }

/** Relation provider registered under the short name "helloworld". */
class HelloWorldDataSource extends RelationProvider with DataSourceRegister with Serializable {

  override def shortName(): String = "helloworld"

  // All instances are interchangeable: equality and hash code depend only on the class.
  override def hashCode(): Int = getClass.hashCode()

  override def equals(other: scala.Any): Boolean = other.isInstanceOf[HelloWorldDataSource]

  override def toString: String = "HelloWorldDataSource"

  /**
   * Creates the relation for the given parameters.
   *
   * @throws IllegalArgumentException when the "path" parameter is absent
   */
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = {
    val requiredPath = parameters.getOrElse(
      "path",
      throw new IllegalArgumentException("Path is required for Tickets datasets")
    )
    new HelloWorldRelationProvider(sqlContext, requiredPath, parameters)
  }
}

/** Table-scan relation that exposes a fixed set of key/value pairs built from its parameters. */
class HelloWorldRelationProvider(val sqlContext: SQLContext, path: String, parameters: Map[String, String]) extends BaseRelation with TableScan {
  import sqlContext.implicits._

  override def schema: StructType = StructType(Seq(
    StructField("key", StringType, nullable = false),
    StructField("value", StringType, nullable = true)
  ))

  /** Emits four rows: the path, the optional message, a greeting for the optional name, and a constant. */
  override def buildScan(): RDD[Row] = {
    val message = parameters.getOrElse("message", "")
    val greeting = s"Hello ${parameters.getOrElse("name", "")}"
    val entries = Seq(
      "path" -> path,
      "message" -> message,
      "name" -> greeting,
      "hello_world" -> "Hello World!"
    )
    entries.toDF.rdd
  }
}
Example 108
Source File: TextPiperSuite.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.transformers.pipe

import scala.collection.JavaConverters._

import org.apache.spark.SparkException
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import io.projectglow.Glow
import io.projectglow.sql.GlowBaseTest

/** Tests for piping DataFrames through a subprocess using the text formatters. */
class TextPiperSuite extends GlowBaseTest {

  // Runs the "pipe_cleanup" transform after every test before the base-class teardown.
  override def afterEach(): Unit = {
    Glow.transform("pipe_cleanup", spark.emptyDataFrame)
    super.afterEach()
  }

  /** Pipes `df` through `cat -` with text input and output formatters. */
  def pipeText(df: DataFrame): DataFrame = {
    val catOptions = Map(
      "inputFormatter" -> "text",
      "outputFormatter" -> "text",
      "cmd" -> """["cat", "-"]""")
    new PipeTransformer().transform(df, catOptions)
  }

  test("text input and output") {
    val sess = spark
    import sess.implicits._

    val piped = pipeText(Seq("hello", "world").toDF())
    val textOnlySchema = StructType(Seq(StructField("text", StringType)))
    assert(piped.count() == 2)
    assert(piped.schema == textOnlySchema)
    assert(piped.orderBy("text").as[String].collect.toSeq == Seq("hello", "world"))
  }

  test("text input requires one column") {
    val sess = spark
    import sess.implicits._

    val nested = Seq(Seq("hello", "world"), Seq("foo", "bar")).toDF()
    assertThrows[IllegalArgumentException](pipeText(nested))
  }

  test("text input requires string column") {
    val sess = spark
    import sess.implicits._

    val numeric = Seq(Seq(5), Seq(6)).toDF()
    assertThrows[IllegalArgumentException](pipeText(numeric))
  }

  test("does not break on null row") {
    val sess = spark
    import sess.implicits._

    val withNull = Seq("hello", null, "hello").toDF()
    val piped = pipeText(withNull)
    assert(piped.count() == 2)
    assert(piped.filter("text = 'hello'").count == 2)
  }

  test("command fails") {
    val sess = spark
    import sess.implicits._

    // True when any live thread's name starts with the given prefix.
    def threadRunning(prefix: String): Boolean =
      Thread.getAllStackTraces.asScala.keySet.exists(_.getName.startsWith(prefix))

    val input = Seq("hello", "world").toDF()
    val failingOptions = Map(
      "inputFormatter" -> "text",
      "outputFormatter" -> "text",
      "cmd" -> """["bash", "-c", "exit 1"]""")

    val ex = intercept[SparkException] {
      new PipeTransformer().transform(input, failingOptions)
    }
    assert(ex.getMessage.contains("Subprocess exited with status 1"))

    // threads should still be cleaned up
    eventually {
      assert(!threadRunning(ProcessHelper.STDIN_WRITER_THREAD_PREFIX))
      assert(!threadRunning(ProcessHelper.STDERR_READER_THREAD_PREFIX))
    }
  }
}
Example 109
Source File: MomentAggState.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.sql.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

import io.projectglow.common.GlowLogging

  // NOTE(review): the header of the enclosing class is missing from this excerpt;
  // the two methods below and the closing brace are reproduced unchanged.

  /**
   * Writes mean, standard deviation (sqrt(m2 / (count - 1))), min and max into `row`
   * at positions `offset` to `offset + 3`; every slot is null when `count` is 0.
   */
  def toInternalRow(row: InternalRow, offset: Int = 0): InternalRow = {
    row.update(offset, if (count > 0) mean else null)
    row.update(offset + 1, if (count > 0) Math.sqrt(m2 / (count - 1)) else null)
    row.update(offset + 2, if (count > 0) min else null)
    row.update(offset + 3, if (count > 0) max else null)
    row
  }

  /** Same as above, written into a fresh 4-slot row. */
  def toInternalRow: InternalRow = {
    toInternalRow(new GenericInternalRow(4))
  }
}

object MomentAggState extends GlowLogging {

  /** Output schema matching the four slots written by `toInternalRow`. */
  val schema = StructType(
    Seq("mean", "stdDev", "min", "max").map(StructField(_, DoubleType))
  )

  /**
   * Combines two aggregation states into one.
   *
   * When either state has a zero count the other one is returned as-is; otherwise a
   * new state is produced and neither input is modified.
   */
  def merge(s1: MomentAggState, s2: MomentAggState): MomentAggState = {
    if (s1.count == 0) {
      s2
    } else if (s2.count == 0) {
      s1
    } else {
      val merged = MomentAggState()
      merged.count = s1.count + s2.count
      val delta = s2.mean - s1.mean
      val deltaN = delta / merged.count
      merged.mean = s1.mean + deltaN * s2.count
      // higher order moments computed according to:
      // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics
      merged.m2 = s1.m2 + s2.m2 + delta * deltaN * s1.count * s2.count
      merged.min = Math.min(s1.min, s2.min)
      merged.max = Math.max(s1.max, s2.max)
      merged
    }
  }
}
Example 110
Source File: UTF8TextOutputFormatter.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.transformers.pipe

import java.io.InputStream

import scala.collection.JavaConverters._

import org.apache.commons.io.IOUtils
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String

/** Output formatter that turns each UTF-8 line of the stream into a one-column row. */
class UTF8TextOutputFormatter() extends OutputFormatter {

  /**
   * @param stream the stream to read lines from
   * @return an iterator whose first element is the single-column "text" schema,
   *         followed by one row per line read from `stream`
   */
  override def makeIterator(stream: InputStream): Iterator[Any] = {
    val textSchema = StructType(Seq(StructField("text", StringType)))
    val lines = IOUtils.lineIterator(stream, "UTF-8").asScala
    val rows = lines.map { line =>
      new GenericInternalRow(Array[Any](UTF8String.fromString(line)))
    }
    Iterator(textSchema) ++ rows
  }
}

/** Factory registered under the name "text"; the options are not used. */
class UTF8TextOutputFormatterFactory extends OutputFormatterFactory {
  override def name: String = "text"

  override def makeOutputFormatter(options: Map[String, String]): OutputFormatter =
    new UTF8TextOutputFormatter
}
Example 111
Source File: CSVOutputFormatter.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.transformers.pipe

import java.io.InputStream

import scala.collection.JavaConverters._

import com.univocity.parsers.csv.CsvParser
import org.apache.commons.io.IOUtils
import org.apache.spark.sql.execution.datasources.csv.{CSVDataSourceUtils, CSVUtils, UnivocityParserUtils}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import io.projectglow.SparkShim.{CSVOptions, UnivocityParser}

/** Output formatter that parses the stream as CSV, typing every column as a nullable string. */
class CSVOutputFormatter(parsedOptions: CSVOptions) extends OutputFormatter {

  // Derives the schema from the first parsed record: one nullable string field per header entry.
  private def getSchema(record: Array[String]): StructType = {
    val header = CSVDataSourceUtils.makeSafeHeader(
      record,
      SQLConf.get.caseSensitiveAnalysis,
      parsedOptions
    )
    val stringFields = header.map(StructField(_, StringType, nullable = true))
    StructType(stringFields)
  }

  /**
   * @param stream the stream to read CSV lines from
   * @return an empty iterator when no lines remain after filtering; otherwise
   *         the schema followed by the parsed rows (minus the first row when
   *         the header flag is set)
   */
  override def makeIterator(stream: InputStream): Iterator[Any] = {
    val lines = IOUtils.lineIterator(stream, "UTF-8").asScala
    val filteredLines = CSVUtils.filterCommentAndEmpty(lines, parsedOptions)

    if (filteredLines.isEmpty) {
      Iterator.empty
    } else {
      // The first line is consumed to infer the schema, then put back in front for parsing.
      val firstLine = filteredLines.next()
      val csvParser = new CsvParser(parsedOptions.asParserSettings)
      val schema = getSchema(csvParser.parseLine(firstLine))
      val univocityParser = new UnivocityParser(schema, schema, parsedOptions)

      val parsedIter = UnivocityParserUtils.parseIterator(
        Iterator(firstLine) ++ filteredLines,
        univocityParser,
        schema
      )
      val dataRows = if (parsedOptions.headerFlag) parsedIter.drop(1) else parsedIter

      Iterator(schema) ++ dataRows.map(_.copy)
    }
  }
}

/** Factory registered under the name "csv"; options are parsed with the session's SQL configuration. */
class CSVOutputFormatterFactory extends OutputFormatterFactory {
  override def name: String = "csv"

  override def makeOutputFormatter(
      options: Map[String, String]
  ): OutputFormatter =
    new CSVOutputFormatter(
      new CSVOptions(
        options,
        SQLConf.get.csvColumnPruning,
        SQLConf.get.sessionLocalTimeZone
      )
    )
}
Example 112
Source File: package.scala From sparkpipe-core with Apache License 2.0 | 5 votes |
package software.uncharted.sparkpipe.ops.core.dataframe

import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.types.{StructType, StructField}

  // NOTE(review): the header of the enclosing definition is missing from this excerpt;
  // the trailing closing brace is kept so the surrounding file is unchanged.

  // Can't test because DataFrameWriter is currently marked final
  // $COVERAGE-OFF$
  /**
   * Writes `input` with the given format and options and passes it through.
   *
   * @param path    destination path; when empty, `save()` is called without a path
   * @param format  writer format name
   * @param options writer options
   * @param input   the DataFrame to write
   * @return `input`, unchanged, so the call can sit in the middle of a pipeline
   */
  def write(
    path: String,
    format: String = "parquet",
    options: Map[String, String] = Map[String, String]()
  )(input: DataFrame): DataFrame = {
    val writer = input.write.format(format).options(options)
    if (path.nonEmpty) writer.save(path) else writer.save()
    input
  }
  // $COVERAGE-ON$
}
Example 113
Source File: SQLTransformerSuite.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature

import com.tencent.angel.sona.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.sql.types.{LongType, StructField, StructType}
import org.apache.spark.storage.StorageLevel

/** Tests for `SQLTransformer`: transformation results, persistence and schema derivation. */
class SQLTransformerSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  test("transform numeric data") {
    val input = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
    val transformer = new SQLTransformer()
      .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0))
      .toDF("id", "v1", "v2", "v3", "v4")
    val derivedSchema = transformer.transformSchema(input.schema)

    testTransformerByGlobalCheckFunc[(Int, Double, Double)](
      input,
      transformer,
      "id",
      "v1",
      "v2",
      "v3",
      "v4") { rows =>
      assert(rows.head.schema.toString == derivedSchema.toString)
      assert(derivedSchema == expected.schema)
      assert(rows == expected.collect().toSeq)
      // No table should remain registered in the catalog after the transform.
      assert(input.sparkSession.catalog.listTables().count() == 0)
    }
  }

  test("read/write") {
    val transformer = new SQLTransformer().setStatement("select * from __THIS__")
    testDefaultReadWrite(transformer)
  }

  test("transformSchema") {
    val ids = spark.range(10)
    val transformer = new SQLTransformer().setStatement("SELECT id + 1 AS id1 FROM __THIS__")
    val outputSchema = transformer.transformSchema(ids.schema)
    val expected = StructType(Seq(StructField("id1", LongType, nullable = false)))
    assert(outputSchema === expected)
  }

  // Currently ignored: checks that the cached input stays cached after transforming.
  ignore("SPARK-22538: SQLTransformer should not unpersist given dataset") {
    val cached = spark.range(10).toDF()
    cached.cache()
    cached.count()
    assert(cached.storageLevel != StorageLevel.NONE)

    val transformer = new SQLTransformer().setStatement("SELECT id + 1 AS id1 FROM __THIS__")
    testTransformerByGlobalCheckFunc[Long](cached, transformer, "id1") { _ => }
    assert(cached.storageLevel != StorageLevel.NONE)
  }
}
Example 114
Source File: KCore.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.graph.kcore

import com.tencent.angel.sona.context.PSContext
import org.apache.spark.SparkContext
import com.tencent.angel.sona.graph.params._
import com.tencent.angel.sona.ml.Transformer
import com.tencent.angel.sona.ml.param.ParamMap
import com.tencent.angel.sona.ml.util.Identifiable
import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.storage.StorageLevel

/**
 * Transformer that reads an edge list from the configured source/destination columns,
 * iterates a parameter-server backed model until no messages remain, and returns one
 * (node id, core id) row per saved node.
 */
class KCore(override val uid: String) extends Transformer
  with HasSrcNodeIdCol with HasDstNodeIdCol with HasOutputNodeIdCol with HasOutputCoreIdCol
  with HasStorageLevel with HasPartitionNum with HasPSPartitionNum with HasUseBalancePartition {

  def this() = this(Identifiable.randomUID("KCore"))

  /**
   * @param dataset input whose source and destination node id columns hold Long values
   * @return a DataFrame with the schema produced by `transformSchema`
   */
  override def transform(dataset: Dataset[_]): DataFrame = {
    // Read edges as (src, dst) pairs and drop self-loops.
    val edges = dataset.select($(srcNodeIdCol), $(dstNodeIdCol)).rdd
      .map(row => (row.getLong(0), row.getLong(1)))
      .filter(e => e._1 != e._2)
    edges.persist(StorageLevel.DISK_ONLY)

    // maxId is exclusive (largest id + 1); minId is the smallest id seen.
    val maxId = edges.map(e => math.max(e._1, e._2)).max() + 1
    val minId = edges.map(e => math.min(e._1, e._2)).min()
    val nodes = edges.flatMap(e => Iterator(e._1, e._2))
    val numEdges = edges.count()

    println(s"minId=$minId maxId=$maxId numEdges=$numEdges level=${$(storageLevel)}")

    // Start PS and init the model
    println("start to run ps")
    PSContext.getOrCreate(SparkContext.getOrCreate())

    val model = KCorePSModel.fromMinMax(minId, maxId, nodes, $(psPartitionNum), $(useBalancePartition))

    // Emit every edge in both directions, group by node, and build one partition object per RDD partition.
    var graph = edges.flatMap(e => Iterator((e._1, e._2), (e._2, e._1)))
      .groupByKey($(partitionNum))
      .mapPartitionsWithIndex((index, edgeIter) => Iterator(KCoreGraphPartition.apply(index, edgeIter)))

    graph.persist($(storageLevel))
    // Force materialization of the persisted graph; the function returns the unit
    // value `()` (previously the `Unit` companion object was returned and discarded).
    graph.foreachPartition(_ => ())
    graph.foreach(_.initMsgs(model))

    var curIteration = 0
    var numMsgs = model.numMsgs()
    var prev = graph
    println(s"numMsgs=$numMsgs")

    do {
      curIteration += 1
      graph = prev.map(_.process(model, numMsgs, curIteration == 1))
      graph.persist($(storageLevel))
      // Materialize the new graph before releasing the previous one.
      graph.count()
      prev.unpersist(true)
      prev = graph
      model.resetMsgs()
      numMsgs = model.numMsgs()
      println(s"curIteration=$curIteration numMsgs=$numMsgs")
    } while (numMsgs > 0)

    // Each partition saves parallel collections of nodes and cores; pair them up into rows.
    val retRDD = graph.map(_.save()).flatMap { case (nodes, cores) => nodes.zip(cores) }
      .map(r => Row.fromSeq(Seq[Any](r._1, r._2)))

    dataset.sparkSession.createDataFrame(retRDD, transformSchema(dataset.schema))
  }

  /** Output schema: non-nullable Long node id and non-nullable Integer core id; the input schema is not used. */
  override def transformSchema(schema: StructType): StructType = {
    StructType(Seq(
      StructField(s"${$(outputNodeIdCol)}", LongType, nullable = false),
      StructField(s"${$(outputCoreIdCol)}", IntegerType, nullable = false)
    ))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)
}
Example 115
Source File: Predictor.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.common

import com.tencent.angel.mlcore.conf.{MLCoreConf, SharedConf}
import com.tencent.angel.ml.math2.utils.{DataBlock, LabeledData}
import org.apache.spark.broadcast.Broadcast
import com.tencent.angel.sona.ml.common.MathImplicits._
import com.tencent.angel.sona.core.{AngelGraphModel, ExecutorContext}
import com.tencent.angel.sona.data.LocalMemoryDataBlock
import org.apache.spark.linalg
import org.apache.spark.linalg.Vectors
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{Row, SPKSQLUtils}

import scala.collection.mutable.ListBuffer

/**
 * Runs predictions on executor-side rows with a model borrowed from the broadcast
 * executor context, appending prediction (and optionally probability) columns.
 *
 * @param bcValue        broadcast executor context that lends models
 * @param featIdx        position of the feature vector inside each input row
 * @param predictionCol  name of the appended prediction column
 * @param probabilityCol name of the appended probability column; when empty only
 *                       the prediction column is appended
 * @param bcConf         broadcast configuration used when borrowing a model
 */
class Predictor(bcValue: Broadcast[ExecutorContext],
                featIdx: Int,
                predictionCol: String,
                probabilityCol: String,
                bcConf: Broadcast[SharedConf]) extends Serializable {

  @transient private lazy val executorContext: ExecutorContext = bcValue.value

  @transient private lazy implicit val dim: Long =
    executorContext.conf.getLong(MLCoreConf.ML_FEATURE_INDEX_RANGE)

  // Columns appended to each input row: [probability, prediction] or just [prediction].
  @transient private lazy val appendedSchema: StructType = {
    val columnNames =
      if (probabilityCol.nonEmpty) Array(probabilityCol, predictionCol) else Array(predictionCol)
    new StructType(columnNames.map(StructField(_, DoubleType)))
  }

  /**
   * Predicts all rows of `data` in batches of 1024 and returns the input rows with
   * the appended columns.
   */
  def predictRDD(data: Iterator[Row]): Iterator[Row] = {
    val localModel = executorContext.borrowModel(bcConf.value)
    val batchSize = 1024
    val storage = new LocalMemoryDataBlock(batchSize, batchSize * 1024 * 1024)

    var count = 0
    val cachedRows: Array[Row] = new Array[Row](batchSize)
    val result: ListBuffer[Row] = ListBuffer[Row]()

    data.foreach { row =>
      // A full batch has accumulated: predict it and clear the buffer before adding this row.
      if (count != 0 && count % batchSize == 0) {
        predictInternal(localModel, storage, cachedRows, result)
        storage.clean()
      }
      storage.put(new LabeledData(row.get(featIdx).asInstanceOf[linalg.Vector], 0.0))
      cachedRows(count % batchSize) = row
      count += 1
    }

    // Predict whatever is left in the buffer after the last row.
    predictInternal(localModel, storage, cachedRows, result)
    executorContext.returnModel(localModel)

    result.toIterator
  }

  // Predicts the buffered batch and appends one output row per prediction; the i-th
  // prediction is paired with the i-th cached input row.
  private def predictInternal(model: AngelGraphModel,
                              storage: DataBlock[LabeledData],
                              cachedRows: Array[Row],
                              result: ListBuffer[Row]): Unit = {
    val predicted = model.predict(storage)
    val predictionOnly = appendedSchema.length == 1

    predicted.zipWithIndex.foreach { case (res, idx) =>
      val outputRow =
        if (predictionOnly) {
          SPKSQLUtils.append(cachedRows(idx), appendedSchema, res.pred)
        } else {
          SPKSQLUtils.append(cachedRows(idx), appendedSchema, res.proba, res.predLabel)
        }
      result.append(outputRow)
    }
  }

  /** Predicts a single feature vector and returns the dense vector (pred, -pred). */
  def predictRaw(features: linalg.Vector): linalg.Vector = {
    val localModel = executorContext.borrowModel(bcConf.value)
    val outcome = localModel.predict(new LabeledData(features, 0.0))
    executorContext.returnModel(localModel)
    Vectors.dense(outcome.pred, -outcome.pred)
  }
}
Example 116
Source File: Correlation.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.stat

import org.apache.spark.linalg.{SQLDataTypes, Vector}
import scala.collection.JavaConverters._
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.types.{StructField, StructType}

/**
 * Correlation functions for DataFrames and Datasets whose column holds Vector values.
 */
object Correlation {

  /**
   * :: Experimental ::
   * Computes the correlation matrix of a Vector column using the given method.
   * Supported methods: `pearson` (default), `spearman`.
   *
   * Example:
   * {{{
   *    val data: Dataset[Vector] = ...
   *    val Row(coeff: Matrix) = Correlation.corr(data, "value").head
   *    // coeff now contains the Pearson correlation matrix.
   * }}}
   *
   * @param dataset A dataset or a dataframe
   * @param column  Name of the column of vectors to correlate. It must be a column of the
   *                dataset and must contain Vector objects.
   * @param method  Name of the correlation method: `pearson` (default) or `spearman`
   * @return A dataframe with a single row and a single, non-nullable matrix column named
   *         '$METHODNAME($COLUMN)' that holds the correlation matrix.
   * @throws IllegalArgumentException if the column is not a valid column in the dataset, or if
   *                                  the content of this column is not of type Vector.
   *
   * @note Spearman is a rank correlation: an RDD[Double] has to be built and sorted per column
   *       to obtain the ranks before the columns are joined back into an RDD[Vector], which is
   *       fairly costly. Cache the input Dataset before calling corr with
   *       `method = "spearman"` to avoid recomputing the common lineage.
   */
  def corr(dataset: Dataset[_], column: String, method: String): DataFrame = {
    val vectors = dataset.select(column).rdd.map { case Row(v: Vector) => v }
    val matrixRow = Row(Statistics.corr(vectors, method))
    val resultSchema = StructType(Array(
      StructField(s"$method($column)", SQLDataTypes.MatrixType, nullable = false)))
    dataset.sparkSession.createDataFrame(Seq(matrixRow).asJava, resultSchema)
  }

  /**
   * Computes the Pearson correlation matrix of a Vector column.
   */
  def corr(dataset: Dataset[_], column: String): DataFrame =
    corr(dataset, column, "pearson")
}
Example 117
Source File: SONAMetadataUtils.scala From sona with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.util

import com.tencent.angel.sona.ml.attribute._
import org.apache.spark.linalg.VectorUDT
import org.apache.spark.sql.types.StructField

import scala.collection.immutable.HashMap

/**
 * Helpers for reading ML attribute metadata out of column schemas.
 */
object SONAMetadataUtils {

  /**
   * Reads the number of classes from a label column's metadata.
   *
   * @return `Some(2)` for a binary label, the declared number of values for a nominal label
   *         (may be `None`), and `None` for numeric or unresolved labels.
   */
  def getNumClasses(labelSchema: StructField): Option[Int] =
    Attribute.fromStructField(labelSchema) match {
      case _: BinaryAttribute => Some(2)
      case nominal: NominalAttribute => nominal.getNumValues
      case _: NumericAttribute | UnresolvedAttribute => None
    }

  /**
   * Finds the categorical (binary and nominal) features of a features column.
   *
   * @param featuresSchema Schema of the features column. Features without attribute metadata
   *                       (or with a null attribute) are treated as continuous. A nominal
   *                       feature must declare its number of values.
   * @return Map from feature index to number of categories; its keys are exactly the
   *         categorical feature indices.
   * @throws IllegalArgumentException if a nominal feature does not declare its number of values
   */
  def getCategoricalFeatures(featuresSchema: StructField): Map[Int, Int] =
    AttributeGroup.fromStructField(featuresSchema).attributes match {
      case None =>
        HashMap.empty[Int, Int]
      case Some(attrs) =>
        attrs.zipWithIndex.flatMap {
          case (null, _) =>
            Iterator.empty
          case (_: NumericAttribute, _) | (UnresolvedAttribute, _) =>
            Iterator.empty
          case (_: BinaryAttribute, idx) =>
            Iterator.single(idx -> 2)
          case (nominal: NominalAttribute, idx) =>
            nominal.getNumValues match {
              case Some(numValues: Int) =>
                Iterator.single(idx -> numValues)
              case None =>
                throw new IllegalArgumentException(s"Feature $idx is marked as" +
                  " Nominal (categorical), but it does not have the number of values specified.")
            }
        }.toMap
    }

  /**
   * Resolves feature names to their indices inside a Vector column, keeping the order of `names`.
   *
   * @param col   Vector column whose attributes carry the feature names
   * @param names Feature names to look up
   */
  def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Long] = {
    require(col.dataType.isInstanceOf[VectorUDT],
      s"getFeatureIndicesFromNames expected column $col" +
        s" to be Vector type, but it was type ${col.dataType} instead.")
    val group = AttributeGroup.fromStructField(col)
    for (name <- names) yield {
      require(group.hasAttr(name),
        s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
      group.getAttr(name).index.get
    }
  }
}
Example 118
Source File: SparkSpreadsheetServiceWriteSuite.scala From spark-google-spreadsheets with Apache License 2.0 | 5 votes |
package com.github.potix2.spark.google.spreadsheets

import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService.SparkSpreadsheet
import com.google.api.services.sheets.v4.model.{ExtendedValue, CellData, RowData}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.scalatest.{BeforeAndAfter, FlatSpec}

import scala.collection.JavaConverters._

/**
 * Write-path tests for the spreadsheet service: every test runs against a freshly created,
 * randomly named worksheet that is deleted again afterwards.
 */
class SparkSpreadsheetServiceWriteSuite extends FlatSpec with BeforeAndAfter {
  private val serviceAccountId = "53797494708-ds5v22b6cbpchrv2qih1vg8kru098k9i@developer.gserviceaccount.com"
  private val testCredentialPath = "src/test/resources/spark-google-spreadsheets-test-eb7b191d1e1d.p12"
  private val TEST_SPREADSHEET_NAME = "WriteSuite"
  private val TEST_SPREADSHEET_ID = "163Ja2OWUephWjIa-jpwTlvGcg8EJwCFCfxrF7aI117s"

  private val context: SparkSpreadsheetService.SparkSpreadsheetContext =
    SparkSpreadsheetService.SparkSpreadsheetContext(Some(serviceAccountId), new java.io.File(testCredentialPath))

  var spreadsheet: SparkSpreadsheet = null
  var worksheetName: String = ""

  /** Three-column schema (string, long, string) used for every written worksheet. */
  def definedSchema: StructType = {
    val first = new StructField("col_1", DataTypes.StringType)
    val second = new StructField("col_2", DataTypes.LongType)
    val third = new StructField("col_3", DataTypes.StringType)
    new StructType().add(first).add(second).add(third)
  }

  case class Elem(col_1: String, col_2: Long, col_3: String)

  /** Turns an [[Elem]] into a sheet row with one user-entered cell per field. */
  def extractor(e: Elem): RowData = {
    val cells = List(
      new ExtendedValue().setStringValue(e.col_1),
      new ExtendedValue().setNumberValue(e.col_2.toDouble),
      new ExtendedValue().setStringValue(e.col_3)
    ).map(value => new CellData().setUserEnteredValue(value))
    new RowData().setValues(cells.asJava)
  }

  before {
    spreadsheet = context.findSpreadsheet(TEST_SPREADSHEET_ID)
    worksheetName = scala.util.Random.alphanumeric.take(16).mkString
    val initialData = List(
      Elem("a", 1L, "x"),
      Elem("b", 2L, "y"),
      Elem("c", 3L, "z")
    )
    spreadsheet.addWorksheet(worksheetName, definedSchema, initialData, extractor)
  }

  after {
    spreadsheet.deleteWorksheet(worksheetName)
  }

  behavior of "A Spreadsheet"

  it should "find the new worksheet" in {
    val found = spreadsheet.findWorksheet(worksheetName)
    assert(found.isDefined)

    val sheet = found.get
    assert(sheet.name == worksheetName)
    assert(sheet.headers == Seq("col_1", "col_2", "col_3"))

    val firstRow = sheet.rows.head
    assert(firstRow == Map("col_1" -> "a", "col_2" -> "1", "col_3" -> "x"))
  }

  behavior of "SparkWorksheet#updateCells"

  it should "update values in a worksheet" in {
    val found = spreadsheet.findWorksheet(worksheetName)
    assert(found.isDefined)

    val sheet = found.get
    val replacement = List(
      Elem("f", 5L, "yy"),
      Elem("e", 4L, "xx"),
      Elem("c", 3L, "z"),
      Elem("b", 2L, "y"),
      Elem("a", 1L, "x")
    )
    sheet.updateCells(definedSchema, replacement, extractor)

    val updatedRows = sheet.rows
    assert(updatedRows.head == Map("col_1" -> "f", "col_2" -> "5", "col_3" -> "yy"))
    assert(updatedRows.last == Map("col_1" -> "a", "col_2" -> "1", "col_3" -> "x"))
  }
}
Example 119
Source File: SpreadsheetRelation.scala From spark-google-spreadsheets with Apache License 2.0 | 5 votes |
package com.github.potix2.spark.google.spreadsheets

import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService.SparkSpreadsheetContext
import com.github.potix2.spark.google.spreadsheets.util._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

/**
 * Relation backed by one worksheet of a spreadsheet. Scans read every row of the worksheet;
 * inserts are only accepted in overwrite mode.
 *
 * @param context         spreadsheet service context used for all lookups
 * @param spreadsheetName spreadsheet to look up
 * @param worksheetName   worksheet inside that spreadsheet
 * @param userSchema      explicit schema; when absent, an all-string schema is derived from
 *                        the worksheet headers
 */
case class SpreadsheetRelation protected[spark] (
    context: SparkSpreadsheetContext,
    spreadsheetName: String,
    worksheetName: String,
    userSchema: Option[StructType] = None)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan with InsertableRelation {

  import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService._

  override def schema: StructType = userSchema.getOrElse(inferSchema())

  // Resolved on first use; a failed lookup is rethrown as-is.
  private lazy val aWorksheet: SparkWorksheet =
    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Left(error) => throw error
      case Right(found) => found
    }

  private lazy val rows: Seq[Map[String, String]] = aWorksheet.rows

  /**
   * Looks up the spreadsheet and then the worksheet inside it.
   *
   * @return the worksheet, or a `RuntimeException` naming whichever lookup failed
   */
  private[spreadsheets] def findWorksheet(spreadsheetName: String, worksheetName: String)(implicit ctx: SparkSpreadsheetContext): Either[Throwable, SparkWorksheet] =
    findSpreadsheet(spreadsheetName) match {
      case None =>
        Left(new RuntimeException(s"no such spreadsheet: $spreadsheetName"))
      case Some(sheet) =>
        sheet.findWorksheet(worksheetName) match {
          case None => Left(new RuntimeException(s"no such worksheet: $worksheetName"))
          case Some(worksheet) => Right(worksheet)
        }
    }

  /**
   * Builds one [[Row]] per worksheet row, casting each cell to the schema's field type.
   * Fields missing from a worksheet row become null.
   */
  override def buildScan(): RDD[Row] = {
    // Local copy so the closure below captures the schema rather than this relation.
    val aSchema = schema
    sqlContext.sparkContext.makeRDD(rows).mapPartitions { partition =>
      partition.map { cells =>
        val values = aSchema.fields.map { field =>
          (cells.get(field.name) match {
            case Some(raw) => TypeCast.castTo(raw, field.dataType, field.nullable)
            case None => null
          }): Any
        }
        Row.fromSeq(values)
      }
    }
  }

  /**
   * Replaces the worksheet's cells with the collected content of `data`.
   * Fails when `overwrite` is false.
   */
  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    if (!overwrite) {
      sys.error("Spreadsheet tables only support INSERT OVERWRITE for now.")
    }

    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Left(error) => throw error
      case Right(target) => target.updateCells(data.schema, data.collect().toList, Util.toRowData)
    }
  }

  // Every header becomes a nullable string column.
  private def inferSchema(): StructType = {
    val fields = for (header <- aWorksheet.headers.toList)
      yield StructField(header, StringType, nullable = true)
    StructType(fields)
  }
}
Example 120
Source File: TestData.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.testutils

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

/**
 * Factories for small DataFrames used in tests.
 */
object TestData {

  /** Single non-nullable integer column named "column", one row per number. */
  def makeIntegerDf(spark: SparkSession, numbers: Seq[Int]): DataFrame = {
    val rowRdd = spark.sparkContext.makeRDD(numbers.map(number => Row(number)))
    val schema = StructType(List(StructField("column", IntegerType, nullable = false)))
    spark.createDataFrame(rowRdd, schema)
  }

  /** Single nullable string column named "column", one row per string. */
  def makeNullableStringDf(spark: SparkSession, strings: Seq[String]): DataFrame = {
    val rowRdd = spark.sparkContext.makeRDD(strings.map(string => Row(string)))
    val schema = StructType(List(StructField("column", StringType, nullable = true)))
    spark.createDataFrame(rowRdd, schema)
  }

  /**
   * One row per given sequence, with non-nullable integer columns named "column1" to
   * "columnN", where N is the size of `row1`.
   */
  def makeIntegersDf(spark: SparkSession, row1: Seq[Int], rowN: Seq[Int]*): DataFrame = {
    val allRows = (row1 :: rowN.toList).map(values => Row.fromSeq(values))
    val columnCount = row1.size
    val rowRdd = spark.sparkContext.makeRDD(allRows)
    val fields = (1 to columnCount).map { position =>
      StructField(s"column$position", IntegerType, nullable = false)
    }
    spark.createDataFrame(rowRdd, StructType(fields))
  }
}
Example 121
Source File: SchemaWithInfo.scala From flamy with Apache License 2.0 | 5 votes |
package com.flaminem.flamy.model.metadata

import com.flaminem.flamy.conf.FlamyContext
import com.flaminem.flamy.model.IOFormat
import com.flaminem.flamy.model.metadata.TableWithInfo.getSparkSchema
import com.flaminem.flamy.model.names.{SchemaName, TableName}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

/**
 * Information about a schema: its name, number of tables, and the file statistics passed in
 * at construction time.
 */
class SchemaWithInfo(
  override val creationTime: Option[Long],
  override val location: String,
  val name: SchemaName,
  val numTables: Option[Int],
  fileSize: Option[Long],
  fileCount: Option[Long],
  modificationTime: Option[Long]
) extends ItemWithInfo {

  /** The number of tables as text, or an empty string when it is unknown. */
  def formattedNumTables: String = numTables.fold("")(_.toString)

  /** One formatted value per column: name, table count, size, file count, modification time. */
  override def getFormattedInfo(context: FlamyContext, humanReadable: Boolean): Seq[String] =
    Seq(
      name.toString,
      formattedNumTables,
      formattedFileSize(context, humanReadable),
      formattedFileCount(context),
      formattedModificationTime(context)
    )

  override def getFileSize: Option[Long] = fileSize

  override def getFileCount: Option[Long] = fileCount

  // Returns the value given at construction; `context` and `refresh` are not consulted here.
  override def getModificationTime(context: FlamyContext, refresh: Boolean = false): Option[Long] =
    modificationTime

  override def toString: String = name.toString

}

object SchemaWithInfo {

  /** Spark schema of the schema-info listing: one string column followed by four long columns. */
  val getSparkSchema: StructType = {
    val longColumns = Seq("num_tables", "size", "num_files", "modification_time")
      .map(columnName => StructField(columnName, LongType))
    StructType(StructField("schema", StringType) +: longColumns)
  }

  /** Column names of [[getSparkSchema]], in order. */
  def getInfoHeader: Seq[String] = getSparkSchema.fields.map(field => field.name)

}
Example 122
Source File: ContinuousDistributionBuilderFactory.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.dataframe.report.distribution.continuous

import org.apache.spark.mllib.stat.MultivariateStatisticalSummary
import org.apache.spark.sql.types.StructField

import io.deepsense.deeplang.doperables.dataframe.report.distribution._
import io.deepsense.deeplang.utils.SparkTypeConverter._
import io.deepsense.deeplang.utils.aggregators._

/**
 * Creates the distribution builder for a continuous column.
 */
object ContinuousDistributionBuilderFactory {

  /**
   * Picks the builder for one column: a [[NoDistributionBuilder]] when the column statistics
   * show only nulls, otherwise a [[ContinuousDistributionBuilder]] with a histogram aggregator
   * and a missing-value counter.
   *
   * @param columnIndex   position of the column in the rows and in `multivarStats`
   * @param field         schema field of the column
   * @param multivarStats statistics previously computed over the columns
   */
  def prepareBuilder(
      columnIndex: Int,
      field: StructField,
      multivarStats: MultivariateStatisticalSummary): DistributionBuilder = {
    val columnStats = ColumnStats.fromMultiVarStats(multivarStats, columnIndex)

    // MultivarStats starts with min = Double.MaxValue and max = Double.MinValue; a single
    // value that is neither null nor NaN is guaranteed to move them. Untouched extremes
    // therefore mean the column held nothing but nulls.
    // TODO It's a bit hacky. Find a more elegant solution. Example approaches:
    // - Filter out nulls? Problematic because we operate on vectors for performance.
    // - Remake spark aggregators to return options?
    val untouchedMin = columnStats.min == Double.MaxValue
    val untouchedMax = columnStats.max == Double.MinValue

    if (untouchedMin && untouchedMax) {
      NoDistributionBuilder(field.name, NoDistributionReasons.OnlyNulls)
    } else {
      val buckets = BucketsCalculator.calculateBuckets(field.dataType, columnStats)
      val histogram = HistogramAggregator(buckets, true).mapInput(getColumnAsDouble(columnIndex))
      val missing = CountOccurenceAggregator[Option[Any]](None).mapInput(getOption(columnIndex))
      ContinuousDistributionBuilder(histogram, missing, field, columnStats)
    }
  }
}
Example 123
Source File: DiscreteDistributionBuilder.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.dataframe.report.distribution.discrete

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{BooleanType, StringType, StructField}

import io.deepsense.deeplang.doperables.dataframe.report.DataFrameReportGenerator
import io.deepsense.deeplang.doperables.dataframe.report.distribution.{DistributionBuilder, NoDistributionReasons}
import io.deepsense.deeplang.doperables.report.ReportUtils
import io.deepsense.deeplang.utils.aggregators.Aggregator
import io.deepsense.deeplang.utils.aggregators.AggregatorBatch.BatchedResult
import io.deepsense.reportlib.model.{DiscreteDistribution, Distribution, NoDistribution}

/**
 * Builds the distribution of a discrete (string or boolean) column from aggregated results.
 *
 * @param categories aggregator yielding per-label occurrence counts, or `None`
 * @param missing    aggregator yielding the number of missing values
 * @param field      schema field of the described column
 */
case class DiscreteDistributionBuilder(
    categories: Aggregator[Option[scala.collection.mutable.Map[String, Long]], Row],
    missing: Aggregator[Long, Row],
    field: StructField)
  extends DistributionBuilder {

  def allAggregators: Seq[Aggregator[_, Row]] = Seq(categories, missing)

  /**
   * Produces a [[DiscreteDistribution]] when category counts are available, otherwise a
   * [[NoDistribution]] with the "too many distinct categorical values" reason.
   */
  override def build(results: BatchedResult): Distribution = {
    val categoryCounts = results.forAggregator(categories)
    val nullsCount = results.forAggregator(missing)

    categoryCounts match {
      case None =>
        NoDistribution(
          field.name,
          NoDistributionReasons.TooManyDistinctCategoricalValues
        )

      case Some(occurrences) =>
        val labels: Seq[String] = field.dataType match {
          // We always want two labels, even when all elements are true or false
          case BooleanType => Seq(false.toString, true.toString)
          case StringType => occurrences.keys.toSeq.sorted
        }
        val counts = labels.map(label => occurrences.getOrElse(label, 0L))
        val shortenedLabels = labels.map { label =>
          ReportUtils.shortenLongStrings(label, DataFrameReportGenerator.StringPreviewMaxLength)
        }
        DiscreteDistribution(
          field.name,
          s"Discrete distribution for ${field.name} column",
          nullsCount,
          shortenedLabels,
          counts)
    }
  }
}
Example 124
Source File: GBTClassificationModel.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.models

import org.apache.spark.ml.classification.{GBTClassificationModel => SparkGBTClassificationModel, GBTClassifier => SparkGBTClassifier}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

import io.deepsense.commons.utils.Logging
import io.deepsense.deeplang.doperables.report.CommonTablesGenerators.SparkSummaryEntry
import io.deepsense.deeplang.doperables.report.{CommonTablesGenerators, Report}
import io.deepsense.deeplang.doperables.spark.wrappers.params.common.PredictorParams
import io.deepsense.deeplang.doperables.stringindexingwrapper.StringIndexingWrapperModel
import io.deepsense.deeplang.doperables.{LoadableWithFallback, SparkModelWrapper}
import io.deepsense.deeplang.params.Param
import io.deepsense.sparkutils.ML

/**
 * String-indexing wrapper around [[VanillaGBTClassificationModel]].
 */
class GBTClassificationModel(vanilaModel: VanillaGBTClassificationModel)
  extends StringIndexingWrapperModel[SparkGBTClassificationModel, SparkGBTClassifier](vanilaModel) {

  def this() = this(new VanillaGBTClassificationModel())
}

/**
 * Wrapper of Spark's GBT classification model.
 */
class VanillaGBTClassificationModel()
  extends SparkModelWrapper[SparkGBTClassificationModel, SparkGBTClassifier]
  with LoadableWithFallback[SparkGBTClassificationModel, SparkGBTClassifier]
  with PredictorParams
  with Logging {

  /** Output schema: the input fields followed by a double-typed prediction column. */
  override private[deeplang] def _transformSchema(schema: StructType): Option[StructType] = {
    val predictionField = StructField($(predictionColumn), DoubleType)
    Some(StructType(schema.fields :+ predictionField))
  }

  override val params: Array[Param[_]] = Array(featuresColumn, predictionColumn)

  /**
   * Extends the base report with a name mentioning the number of trees, a model summary table
   * and a decision-tree table.
   */
  override def report: Report = {
    val featureCountEntry = SparkSummaryEntry(
      name = "number of features",
      value = sparkModel.numFeatures,
      description = "Number of features the model was trained on.")

    val namedReport = super.report.withReportName(
      s"${this.getClass.getSimpleName} with ${sparkModel.numTrees} trees")
    val withSummary = namedReport.withAdditionalTable(
      CommonTablesGenerators.modelSummary(List(featureCountEntry)))

    withSummary.withAdditionalTable(
      CommonTablesGenerators.decisionTree(sparkModel.treeWeights, sparkModel.trees),
      2)
  }

  override protected def transformerName: String =
    classOf[GBTClassificationModel].getSimpleName

  override def tryToLoadModel(path: String): Option[SparkGBTClassificationModel] =
    ML.ModelLoading.GBTClassification(path)
}
Example 125
Source File: RandomForestClassificationModel.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.models

import org.apache.spark.ml.classification.{RandomForestClassificationModel => SparkRandomForestClassificationModel, RandomForestClassifier => SparkRandomForestClassifier}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

import io.deepsense.deeplang.doperables.report.CommonTablesGenerators.SparkSummaryEntry
import io.deepsense.deeplang.doperables.report.{CommonTablesGenerators, Report}
import io.deepsense.deeplang.doperables.spark.wrappers.params.common.ProbabilisticClassifierParams
import io.deepsense.deeplang.doperables.stringindexingwrapper.StringIndexingWrapperModel
import io.deepsense.deeplang.doperables.{LoadableWithFallback, SparkModelWrapper}
import io.deepsense.deeplang.params.Param
import io.deepsense.sparkutils.ML

/**
 * String-indexing wrapper around [[VanillaRandomForestClassificationModel]].
 */
class RandomForestClassificationModel(
    vanillaModel: VanillaRandomForestClassificationModel)
  extends StringIndexingWrapperModel[
    SparkRandomForestClassificationModel,
    SparkRandomForestClassifier](vanillaModel) {

  def this() = this(new VanillaRandomForestClassificationModel())
}

/**
 * Wrapper of Spark's random forest classification model.
 */
class VanillaRandomForestClassificationModel
  extends SparkModelWrapper[
    SparkRandomForestClassificationModel,
    SparkRandomForestClassifier]
  with LoadableWithFallback[
    SparkRandomForestClassificationModel,
    SparkRandomForestClassifier]
  with ProbabilisticClassifierParams {

  /**
   * Output schema: the input fields followed by the prediction (double), probability (vector)
   * and raw prediction (vector) columns, in that order.
   */
  override private[deeplang] def _transformSchema(schema: StructType): Option[StructType] = {
    val predictionName = $(predictionColumn)
    val probabilityName = $(probabilityColumn)
    val rawPredictionName = $(rawPredictionColumn)

    val appended = Array(
      StructField(predictionName, DoubleType),
      StructField(probabilityName, new io.deepsense.sparkutils.Linalg.VectorUDT),
      StructField(rawPredictionName, new io.deepsense.sparkutils.Linalg.VectorUDT))

    Some(StructType(schema.fields ++ appended))
  }

  override val params: Array[Param[_]] = Array(
    featuresColumn,
    predictionColumn,
    probabilityColumn,
    rawPredictionColumn) // thresholds

  /** Extends the base report with a model summary table listing the tree weights. */
  override def report: Report = {
    val weightsEntry = SparkSummaryEntry(
      name = "tree weights",
      value = sparkModel.treeWeights,
      description = "Weights for each tree."
    )
    val summaryTable = CommonTablesGenerators.modelSummary(List(weightsEntry))
    super.report.withAdditionalTable(summaryTable)
  }

  override protected def transformerName: String =
    classOf[RandomForestClassificationModel].getSimpleName

  override def tryToLoadModel(path: String): Option[SparkRandomForestClassificationModel] =
    ML.ModelLoading.randomForestClassification(path)
}
Example 126
Source File: CustomCodeColumnTransformer.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.doperations.exceptions.CustomOperationExecutionException
import io.deepsense.deeplang.OperationExecutionDispatcher.Result
import io.deepsense.deeplang.params.{CodeSnippetParam, Param}
import io.deepsense.deeplang.params.choice.ChoiceParam
import org.apache.spark.sql.types.{DataType, StructField, StructType}

/**
 * Base for column transformers that run user-supplied code to produce an output column of a
 * chosen target type. Subclasses provide the code parameter and the way code is composed,
 * validated and run.
 */
abstract class CustomCodeColumnTransformer() extends MultiColumnTransformer {
  import CustomCodeColumnTransformer._

  val targetType = ChoiceParam[TargetTypeChoice](
    name = "target type",
    description = Some("Target type of the columns."))

  def getTargetType: TargetTypeChoice = $(targetType)
  def setTargetType(value: TargetTypeChoice): this.type = set(targetType, value)

  val codeParameter: CodeSnippetParam
  def getCodeParameter: String = $(codeParameter)
  def setCodeParameter(value: String): this.type = set(codeParameter, value)

  def runCode(context: ExecutionContext, code: String): Result

  def isValid(context: ExecutionContext, code: String): Boolean

  def getComposedCode(
    userCode: String,
    inputColumn: String,
    outputColumn: String,
    targetType: DataType): String

  override def getSpecificParams: Array[Param[_]]

  /**
   * Runs the composed code and wraps the DataFrame it stored on the output port, re-created
   * with the schema produced by `transformSingleColumnSchema`.
   *
   * Throws [[CustomOperationExecutionException]] when the code reports an error or stores no
   * output DataFrame.
   */
  private def executeCode(
      code: String,
      inputColumn: String,
      outputColumn: String,
      context: ExecutionContext,
      dataFrame: DataFrame): DataFrame = {
    runCode(context, code) match {
      case Right(_) =>
        val storedOutput = context.dataFrameStorage.getOutputDataFrame(OutputPortNumber)
        val produced = storedOutput.getOrElse {
          throw CustomOperationExecutionException(
            "Operation finished successfully, but did not produce a DataFrame.")
        }

        val reshaped = context.sparkSQLSession.createDataFrame(
          produced.rdd,
          transformSingleColumnSchema(inputColumn, outputColumn, dataFrame.schema.get).get)
        DataFrame.fromSparkDataFrame(reshaped)

      case Left(error) =>
        throw CustomOperationExecutionException(s"Execution exception:\n\n$error")
    }
  }

  /**
   * Composes the code for one column pair, validates it, and executes it with the input
   * DataFrame registered on the input port.
   */
  override def transformSingleColumn(
      inputColumn: String,
      outputColumn: String,
      context: ExecutionContext,
      dataFrame: DataFrame): DataFrame = {
    val composedCode = getComposedCode(
      $(codeParameter),
      inputColumn,
      outputColumn,
      getTargetType.columnType)
    logger.debug(s"Code to be validated and executed:\n$composedCode")

    val codeIsValid = isValid(context, composedCode)
    if (!codeIsValid) {
      throw CustomOperationExecutionException("Code validation failed")
    }

    context.dataFrameStorage.withInputDataFrame(InputPortNumber, dataFrame.sparkDataFrame) {
      executeCode(composedCode, inputColumn, outputColumn, context, dataFrame)
    }
  }

  /**
   * Output schema for one column pair: the input schema plus a nullable output column of the
   * target type. The input column must exist and the output column must not.
   */
  override def transformSingleColumnSchema(
      inputColumn: String,
      outputColumn: String,
      schema: StructType): Option[StructType] = {
    MultiColumnTransformer.assertColumnExist(inputColumn, schema)
    MultiColumnTransformer.assertColumnDoesNotExist(outputColumn, schema)
    val outputField = StructField(outputColumn, getTargetType.columnType, nullable = true)
    Some(schema.add(outputField))
  }
}

object CustomCodeColumnTransformer {
  val InputPortNumber: Int = 0
  val OutputPortNumber: Int = 0
}
Example 127
Source File: CountVectorizerExample.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperations.examples

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType}

import io.deepsense.deeplang.doperables.dataframe.{DataFrame, DataFrameBuilder}
import io.deepsense.deeplang.doperations.spark.wrappers.estimators.CountVectorizer

/**
 * Example of the CountVectorizer operation on a single column of tokenised lines.
 */
class CountVectorizerExample extends AbstractOperationExample[CountVectorizer]{

  /** CountVectorizer reading "lines", writing "lines_out", with a minimum term frequency of 3. */
  override def dOperation: CountVectorizer = {
    val operation = new CountVectorizer()
    operation.estimator
      .setInputColumn("lines")
      .setNoInPlace("lines_out")
      .setMinTF(3)
    operation.set(operation.estimator.extractParamMap())
  }

  /** One DataFrame with a "lines" column holding the space-split tokens of four sample texts. */
  override def inputDataFrames: Seq[DataFrame] = {
    val texts = Seq(
      "a a a b b c c c d ",
      "c c c c c c",
      "a",
      "e e e e e")
    val tokenRows = texts.map(text => Row(text.split(" ").toSeq))
    val rowRdd = sparkContext.parallelize(tokenRows)
    val linesField = StructField("lines", ArrayType(StringType, containsNull = true))
    Seq(DataFrameBuilder(sparkSQLSession).buildDataFrame(StructType(Seq(linesField)), rowRdd))
  }
}
Example 128
Source File: UnionIntegSpec.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperations

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.doperations.exceptions.SchemaMismatchException
import io.deepsense.deeplang.inference.{InferContext, InferenceWarnings}
import io.deepsense.deeplang.{DKnowledge, DeeplangIntegTestSupport}

/**
 * Integration spec for the Union operation: checks execution on matching and mismatching
 * DataFrames, and schema inference on matching and mismatching schemas.
 */
class UnionIntegSpec extends DeeplangIntegTestSupport {

  import DeeplangIntegTestSupport._

  // Two double columns; shared by every execution test below.
  val schema1 = StructType(List(
    StructField("column1", DoubleType),
    StructField("column2", DoubleType)))

  // First input DataFrame's rows, reused across the execution tests.
  val rows1_1 = Seq(
    Row(1.0, 2.0),
    Row(2.0, 3.0)
  )

  "Union" should {
    "return a union of two DataFrames" in {
      val rows1_2 = Seq(
        Row(2.0, 4.0),
        Row(4.0, 6.0)
      )

      val df1 = createDataFrame(rows1_1, schema1)
      val df2 = createDataFrame(rows1_2, schema1)

      val merged = Union()
        .executeUntyped(Vector(df1, df2))(executionContext)
        .head.asInstanceOf[DataFrame]

      // Expected result is the first input's rows followed by the second input's rows.
      assertDataFramesEqual(
        merged, createDataFrame(rows1_1 ++ rows1_2, schema1))
    }

    "throw for mismatching types in DataFrames" in {
      // Same column names as schema1, but column1 is a string instead of a double.
      val schema2 = StructType(List(
        StructField("column1", StringType),
        StructField("column2", DoubleType)))

      val rows2_1 = Seq(
        Row("a", 1.0),
        Row("b", 1.0)
      )

      val df1 = createDataFrame(rows1_1, schema1)
      val df2 = createDataFrame(rows2_1, schema2)

      a [SchemaMismatchException] should be thrownBy {
        Union().executeUntyped(Vector(df1, df2))(executionContext)
      }
    }

    "throw for mismatching column names in DataFrames" in {
      // Same column types as schema1, but the second column has a different name.
      val schema2 = StructType(List(
        StructField("column1", DoubleType),
        StructField("different_column_name", DoubleType)))

      val rows2_1 = Seq(
        Row(1.1, 1.0),
        Row(1.1, 1.0)
      )

      val df1 = createDataFrame(rows1_1, schema1)
      val df2 = createDataFrame(rows2_1, schema2)

      a [SchemaMismatchException] should be thrownBy {
        Union().executeUntyped(Vector(df1, df2))(executionContext)
      }
    }
  }

  it should {
    "propagate schema when both schemas match" in {
      val structType = StructType(Seq(
        StructField("x", DoubleType),
        StructField("y", DoubleType)))
      val knowledgeDF1 = DKnowledge(DataFrame.forInference(structType))
      val knowledgeDF2 = DKnowledge(DataFrame.forInference(structType))
      // Inference is expected to yield the first input's knowledge and no warnings.
      Union().inferKnowledgeUntyped(Vector(knowledgeDF1, knowledgeDF2))(mock[InferContext]) shouldBe
        (Vector(knowledgeDF1), InferenceWarnings())
    }

    "generate error when schemas don't match" in {
      // The two single-column schemas differ only in the column name.
      val structType1 = StructType(Seq(
        StructField("x", DoubleType)))
      val structType2 = StructType(Seq(
        StructField("y", DoubleType)))
      val knowledgeDF1 = DKnowledge(DataFrame.forInference(structType1))
      val knowledgeDF2 = DKnowledge(DataFrame.forInference(structType2))
      an [SchemaMismatchException] shouldBe thrownBy(
        Union().inferKnowledgeUntyped(Vector(knowledgeDF1, knowledgeDF2))(mock[InferContext]))
    }
  }
}
Example 129
Source File: DataFrameSplitterIntegSpec.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperations

import scala.collection.JavaConverters._

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.scalatest.Matchers
import org.scalatest.prop.GeneratorDrivenPropertyChecks

import io.deepsense.deeplang._
import io.deepsense.deeplang.doperables.dataframe.DataFrame

/** Integration tests for the Split operation in its random and conditional modes. */
class DataFrameSplitterIntegSpec
  extends DeeplangIntegTestSupport
  with GeneratorDrivenPropertyChecks
  with Matchers {

  "SplitDataFrame" should {
    "split randomly one df into two df in given range" in {
      val input = Range(1, 100)
      // (split ratio, seed) combinations; the ratios include both extremes 0.0 and 1.0.
      val parameterPairs = List((0.0, 0), (0.3, 1), (0.5, 2), (0.8, 3), (1.0, 4))

      parameterPairs.foreach { case (splitRatio, seed) =>
        val rdd = createData(input)
        val df = executionContext.dataFrameBuilder.buildDataFrame(createSchema, rdd)
        val randomMode = SplitModeChoice.Random()
          .setSplitRatio(splitRatio)
          .setSeed(seed / 2)

        val (df1, df2) = executeOperation(executionContext, new Split().setSplitMode(randomMode))(df)

        validateSplitProperties(df, df1, df2)
      }
    }

    "split conditionally one df into two df in given range" in {
      val input = Range(1, 100)
      // The Scala predicate mirrors the condition string handed to the operation.
      val (expectedDF1, expectedDF2) = input.partition(_ > 20)
      val rdd = createData(input)
      val df = executionContext.dataFrameBuilder.buildDataFrame(createSchema, rdd)
      val conditionalMode = SplitModeChoice.Conditional().setCondition("value > 20")

      val (df1, df2) =
        executeOperation(executionContext, new Split().setSplitMode(conditionalMode))(df)

      firstColumn(df1) should contain theSameElementsAs expectedDF1
      firstColumn(df2) should contain theSameElementsAs expectedDF2
      validateSplitProperties(df, df1, df2)
    }
  }

  /** Collects the DataFrame and returns the first column of every row. */
  private def firstColumn(dataFrame: DataFrame) =
    dataFrame.sparkDataFrame.collect().map(_.get(0))

  /** Single non-nullable integer column named "value". */
  private def createSchema: StructType =
    StructType(List(StructField("value", IntegerType, nullable = false)))

  /** Turns each integer into a one-column Row and parallelizes the result. */
  private def createData(data: Seq[Int]): RDD[Row] =
    sparkContext.parallelize(data.map(Row(_)))

  /** Runs `operation` on `dataFrame` and returns its first and last outputs as DataFrames. */
  private def executeOperation(context: ExecutionContext, operation: DOperation)
      (dataFrame: DataFrame): (DataFrame, DataFrame) = {
    val outputs = operation.executeUntyped(Vector[DOperable](dataFrame))(context)
    (outputs.head.asInstanceOf[DataFrame], outputs.last.asInstanceOf[DataFrame])
  }

  /**
   * Checks that the two outputs are disjoint, that their row counts add up to the input's,
   * and that together they contain exactly the input's distinct rows.
   */
  def validateSplitProperties(inputDF: DataFrame, outputDF1: DataFrame, outputDF2: DataFrame)
      : Unit = {
    val inputCount = inputDF.sparkDataFrame.count()
    val firstCount = outputDF1.sparkDataFrame.count()
    val secondCount = outputDF2.sparkDataFrame.count()

    val inputRows = inputDF.sparkDataFrame.collectAsList().asScala
    val firstRows = outputDF1.sparkDataFrame.collectAsList().asScala
    val secondRows = outputDF2.sparkDataFrame.collectAsList().asScala

    firstRows.intersect(secondRows).size shouldBe 0
    (firstCount + secondCount) shouldBe inputCount
    inputRows.toSet shouldBe firstRows.toSet.union(secondRows.toSet)
  }
}
Example 130
Source File: DataFrameReportPerformanceSpec.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.dataframe

import java.sql.Timestamp
import java.text.{DateFormat, SimpleDateFormat}
import java.util.TimeZone

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StructField, StructType, TimestampType}
import org.scalatest.{BeforeAndAfter, Ignore}

import io.deepsense.commons.utils.{DoubleUtils, Logging}
import io.deepsense.deeplang.{TestFiles, DeeplangIntegTestSupport}

// Ignored because it has no assertions; it only logs report generation time.
@Ignore
class DataFrameReportPerformanceSpec
  extends DeeplangIntegTestSupport
  with BeforeAndAfter
  with TestFiles
  with Logging {

  val testFile = absoluteTestsDirPath.pathWithoutScheme + "/demand_without_header.csv"

  "DataFrame" should {
    "generate report" when {
      "DataFrame has 17K of rows" in {
        val numberOfTries = 10
        // One timing (in seconds) per try; each try rebuilds the DataFrame from the CSV file.
        val results: Seq[Double] = (1 to numberOfTries).map { _ =>
          val dataFrame: DataFrame = demandDataFrame()
          val start = System.nanoTime()
          dataFrame.report // evaluated only to be timed; the result is discarded
          val end = System.nanoTime()
          val elapsedSeconds: Double = (end - start).toDouble / 1000000000.0
          logger.debug("Report generation time: {}", DoubleUtils.double2String(elapsedSeconds))
          elapsedSeconds
        }
        logger.debug(
          "Mean report generation time: {}",
          DoubleUtils.double2String(results.sum / numberOfTries.toDouble))
      }
    }
  }

  /** Reads `testFile` as text and parses each line into a Row matching `demandSchema`. */
  private def demandDataFrame(): DataFrame = {
    val rddString: RDD[String] = executionContext.sparkContext.textFile(testFile)
    val data: RDD[Row] = rddString.map(DataFrameHelpers.demandString2Row)
    executionContext.dataFrameBuilder.buildDataFrame(demandSchema, data)
  }

  /** One timestamp column followed by six double columns. */
  private def demandSchema: StructType = StructType(Seq(
    StructField("datetime", TimestampType),
    StructField("log_count", DoubleType),
    StructField("workingday", DoubleType),
    StructField("holiday", DoubleType),
    StructField("season2", DoubleType),
    StructField("season3", DoubleType),
    StructField("season4", DoubleType)))
}

/** Line-parsing helpers kept in a standalone object, outside the spec class. */
private object DataFrameHelpers {

  /**
   * Parses one comma-separated line into a Row: field 0 as a UTC timestamp,
   * fields 1 to 6 as doubles. Any further fields are ignored.
   */
  def demandString2Row(s: String): Row = {
    val split = s.split(",")
    Row(
      timestamp(split(0)),
      split(1).toDouble,
      split(2).toDouble,
      split(3).toDouble,
      split(4).toDouble,
      split(5).toDouble,
      split(6).toDouble
    )
  }

  /** Parses a "yyyy-MM-dd HH:mm:ss" string, interpreted in UTC, into a Timestamp. */
  private def timestamp(s: String): Timestamp = {
    // A new formatter is built on every call rather than shared.
    val format: DateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    format.setTimeZone(TimeZone.getTimeZone("UTC"))
    new Timestamp(format.parse(s).getTime)
  }
}
Example 131
Source File: AbstractEvaluatorSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.ParamPair
import io.deepsense.deeplang.{DKnowledge, DeeplangIntegTestSupport}
import io.deepsense.sparkutils.Linalg.Vectors

/**
 * Base smoke test for evaluators. Subclasses supply the evaluator, its parameters and a display
 * name; the tests only check that evaluation, inference and report generation run without
 * throwing.
 */
abstract class AbstractEvaluatorSmokeTest extends DeeplangIntegTestSupport {

  /** Name used as the subject of the generated test descriptions. */
  def className: String

  val evaluator: Evaluator

  val evaluatorParams: Seq[ParamPair[_]]

  val inputDataFrameSchema = StructType(Seq(
    StructField("s", StringType),
    StructField("prediction", DoubleType),
    StructField("rawPrediction", new io.deepsense.sparkutils.Linalg.VectorUDT),
    StructField("label", DoubleType)
  ))

  val inputDataFrame: DataFrame = createDataFrame(
    Seq(
      Row("aAa bBb cCc dDd eEe f", 1.0, Vectors.dense(2.1, 2.2, 2.3), 3.0),
      Row("das99213 99721 8i!#@!", 4.0, Vectors.dense(5.1, 5.2, 5.3), 6.0)
    ),
    inputDataFrameSchema)

  /** Hook for subclasses that need to prepare stubs before `_evaluate()` runs; no-op by default. */
  def setUpStubs(): Unit = ()

  /** The evaluator under test with all of `evaluatorParams` applied. */
  private def configuredEvaluator = evaluator.set(evaluatorParams: _*)

  className should {
    "successfully run _evaluate()" in {
      setUpStubs()
      configuredEvaluator._evaluate(executionContext, inputDataFrame)
    }

    "successfully run _infer()" in {
      configuredEvaluator._infer(DKnowledge(inputDataFrame))
    }

    "successfully run report" in {
      configuredEvaluator.report
    }
  }
}
Example 132
Source File: EstimatorModelWrapperFixtures.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.estimators

import scala.language.reflectiveCalls

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml
import org.apache.spark.ml.param.{ParamMap, Param => SparkParam}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.report.Report
import io.deepsense.deeplang.doperables.serialization.SerializableSparkModel
import io.deepsense.deeplang.doperables.{SparkEstimatorWrapper, SparkModelWrapper}
import io.deepsense.deeplang.params.wrappers.spark.SingleColumnCreatorParamWrapper
import io.deepsense.deeplang.params.{Param, Params}
import io.deepsense.sparkutils.ML

/** Minimal Spark model/estimator pair and their wrappers, used as fixtures in wrapper tests. */
object EstimatorModelWrapperFixtures {

  /** Model that appends a constant integer column named after its `predictionCol` param. */
  class SimpleSparkModel private[EstimatorModelWrapperFixtures]()
    extends ML.Model[SimpleSparkModel] {

    def this(x: String) = this()

    override val uid: String = "modelId"

    val predictionCol = new SparkParam[String](uid, "name", "description")

    def setPredictionCol(value: String): this.type = set(predictionCol, value)

    override def copy(extra: ParamMap): this.type = defaultCopy(extra)

    override def transformDF(dataset: DataFrame): DataFrame = {
      // Keeps every input column and adds the literal 1 under the configured column name.
      val constantColumn = s"1 as ${$(predictionCol)}"
      dataset.selectExpr("*", constantColumn)
    }

    @DeveloperApi
    override def transformSchema(schema: StructType): StructType = ???
  }

  /** Estimator whose fitted model simply inherits the estimator's `predictionCol` value. */
  class SimpleSparkEstimator extends ML.Estimator[SimpleSparkModel] {

    def this(x: String) = this()

    override val uid: String = "estimatorId"

    val predictionCol = new SparkParam[String](uid, "name", "description")

    override def fitDF(dataset: DataFrame): SimpleSparkModel = {
      val model = new SimpleSparkModel()
      model.setPredictionCol($(predictionCol))
    }

    override def copy(extra: ParamMap): ML.Estimator[SimpleSparkModel] = defaultCopy(extra)

    @DeveloperApi
    override def transformSchema(schema: StructType): StructType = {
      val predictionField = StructField($(predictionCol), IntegerType, nullable = false)
      schema.add(predictionField)
    }
  }

  /** Shared "prediction column" parameter, with default value "abcdefg". */
  trait HasPredictionColumn extends Params {
    val predictionColumn = new SingleColumnCreatorParamWrapper[
      ml.param.Params { val predictionCol: SparkParam[String] }](
      "prediction column",
      None,
      _.predictionCol)
    setDefault(predictionColumn, "abcdefg")

    def getPredictionColumn(): String = $(predictionColumn)

    def setPredictionColumn(value: String): this.type = set(predictionColumn, value)
  }

  class SimpleSparkModelWrapper
    extends SparkModelWrapper[SimpleSparkModel, SimpleSparkEstimator]
    with HasPredictionColumn {

    override val params: Array[Param[_]] = Array(predictionColumn)

    override def report: Report = ???

    override protected def loadModel(
        ctx: ExecutionContext,
        path: String): SerializableSparkModel[SimpleSparkModel] = ???
  }

  class SimpleSparkEstimatorWrapper
    extends SparkEstimatorWrapper[SimpleSparkModel, SimpleSparkEstimator, SimpleSparkModelWrapper]
    with HasPredictionColumn {

    override val params: Array[Param[_]] = Array(predictionColumn)

    override def report: Report = ???
  }
}
Example 133
Source File: EstimatorModelWrapperIntegSpec.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.estimators

import io.deepsense.deeplang.DeeplangIntegTestSupport
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructType, StructField}

/**
 * Checks that the prediction-column parameter set on an estimator wrapper is inherited by the
 * fitted model wrapper, and that overriding it on the model wrapper takes effect.
 */
class EstimatorModelWrapperIntegSpec extends DeeplangIntegTestSupport {

  import io.deepsense.deeplang.doperables.spark.wrappers.estimators.EstimatorModelWrapperFixtures._

  val inputDF = createDataFrame(
    Seq(Row(1), Row(2), Row(3)),
    StructType(Seq(StructField("x", IntegerType, nullable = false))))

  val estimatorPredictionParamValue = "estimatorPrediction"

  val expectedSchema = schemaWithPrediction(estimatorPredictionParamValue)

  val transformerPredictionParamValue = "modelPrediction"

  val expectedSchemaForTransformerParams = schemaWithPrediction(transformerPredictionParamValue)

  "EstimatorWrapper" should {
    "_fit() and transform() + transformSchema() with parameters inherited" in {
      assertTransformAndInference(createEstimatorAndFit(), expectedSchema)
    }

    "_fit() and transform() + transformSchema() with parameters overwritten" in {
      val transformer =
        createEstimatorAndFit().setPredictionColumn(transformerPredictionParamValue)
      assertTransformAndInference(transformer, expectedSchemaForTransformerParams)
    }

    "_fit_infer().transformSchema() with parameters inherited" in {
      val inferredTransformer = newEstimatorWrapper()._fit_infer(inputDF.schema)
      val outputSchema = inferredTransformer._transformSchema(inputDF.sparkDataFrame.schema)
      outputSchema shouldBe Some(expectedSchema)
    }

    "_fit_infer().transformSchema() with parameters overwritten" in {
      val transformer =
        newEstimatorWrapper()._fit_infer(inputDF.schema).asInstanceOf[SimpleSparkModelWrapper]
      val transformerWithParams = transformer.setPredictionColumn(transformerPredictionParamValue)
      val outputSchema = transformerWithParams._transformSchema(inputDF.sparkDataFrame.schema)
      outputSchema shouldBe Some(expectedSchemaForTransformerParams)
    }
  }

  /** Input column "x" followed by a non-nullable integer column with the given name. */
  private def schemaWithPrediction(predictionColumnName: String): StructType = StructType(Seq(
    StructField("x", IntegerType, nullable = false),
    StructField(predictionColumnName, IntegerType, nullable = false)
  ))

  /** Estimator wrapper with its prediction column set to `estimatorPredictionParamValue`. */
  private def newEstimatorWrapper() =
    new SimpleSparkEstimatorWrapper().setPredictionColumn(estimatorPredictionParamValue)

  /** Checks the schema produced by an actual transform, then the schema produced by inference. */
  private def assertTransformAndInference(
      transformer: SimpleSparkModelWrapper,
      expected: StructType) = {
    val transformOutputSchema =
      transformer._transform(executionContext, inputDF).sparkDataFrame.schema
    transformOutputSchema shouldBe expected

    val inferenceOutputSchema = transformer._transformSchema(inputDF.sparkDataFrame.schema)
    inferenceOutputSchema shouldBe Some(expected)
  }

  /** Fits the estimator wrapper on `inputDF` and verifies the model inherited its parameter. */
  private def createEstimatorAndFit(): SimpleSparkModelWrapper = {
    val fitted = newEstimatorWrapper()
      ._fit(executionContext, inputDF)
      .asInstanceOf[SimpleSparkModelWrapper]
    fitted.getPredictionColumn() shouldBe estimatorPredictionParamValue
    fitted
  }
}
Example 134
Source File: StructFieldJsonProtocol.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.reportlib.model

import org.apache.spark.sql.types.{DataType, StructField}
import spray.json._

import io.deepsense.commons.json.EnumerationSerializer
import io.deepsense.commons.types.{ColumnType, SparkConversions}

/** JSON (de)serialization of Spark `StructField`s for reports. */
trait StructFieldJsonProtocol
  extends DefaultJsonProtocol
  with MetadataJsonProtocol
  with DataTypeJsonProtocol {

  // NOTE(review): despite its name, this format serializes ColumnType values.
  implicit val failureCodeFormat = EnumerationSerializer.jsonEnumFormat(ColumnType)

  // StructField format without metadata, with deeplangType appended
  implicit val structFieldFormat = new RootJsonFormat[StructField] {

    // Three-argument constructor: name, dataType and nullable only (metadata is not passed).
    val c = (s: String, d: DataType, b: Boolean) => StructField(s, d, b)

    implicit val rawFormat = jsonFormat(c, "name", "dataType", "nullable")

    /** Writes name/dataType/nullable plus a "deeplangType" field derived from the data type. */
    override def write(obj: StructField): JsValue = {
      val baseFields = rawFormat.write(obj).asJsObject.fields
      val columnType = SparkConversions.sparkColumnTypeToColumnType(obj.dataType)
      JsObject(baseFields + ("deeplangType" -> columnType.toJson))
    }

    /** Reads through the raw format only, so "deeplangType" is not consulted. */
    override def read(json: JsValue): StructField = rawFormat.read(json)
  }
}
Example 135
Source File: ReportContentTestFactory.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.reportlib.model.factory

import io.deepsense.reportlib.model.{ReportType, ReportContent}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}

/** Mix-in that builds a sample `ReportContent` from the constants in the companion object. */
trait ReportContentTestFactory {

  import ReportContentTestFactory._

  /** Report with one empty table, one categorical and one continuous distribution. */
  def testReport: ReportContent = {
    val tables = Seq(TableTestFactory.testEmptyTable)
    // Each distribution is stored under its own name.
    val distributions = Map(
      categoricalDistName ->
        DistributionTestFactory.testCategoricalDistribution(categoricalDistName),
      continuousDistName ->
        DistributionTestFactory.testContinuousDistribution(continuousDistName)
    )
    ReportContent(reportName, reportType, tables, distributions)
  }
}

/** Constants shared by report tests, plus the factory methods themselves. */
object ReportContentTestFactory extends ReportContentTestFactory {
  val continuousDistName = "continuousDistributionName"
  val categoricalDistName = "categoricalDistributionName"
  val reportName = "TestReportContentName"
  val reportType = ReportType.Empty

  val someReport: ReportContent = ReportContent("empty", ReportType.Empty)
}
Example 136
Source File: ListTablesSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{BooleanType, StringType, StructField, StructType}
import org.apache.spark.sql.catalyst.TableIdentifier

/** Tests for listing tables through `sqlContext.tables()` and the SHOW TABLES statement. */
class ListTablesSuite extends QueryTest with BeforeAndAfter with SharedSQLContext {
  import testImplicits._

  private lazy val df = (1 to 10).map(i => (i, s"str$i")).toDF("key", "value")

  before {
    df.registerTempTable("ListTablesSuiteTable")
  }

  after {
    sqlContext.catalog.unregisterTable(TableIdentifier("ListTablesSuiteTable"))
  }

  /** Asserts that `listing` reports the suite's table exactly once, as temporary. */
  private def assertSuiteTableListed(listing: => DataFrame): Unit =
    checkAnswer(
      listing.filter("tableName = 'ListTablesSuiteTable'"),
      Row("ListTablesSuiteTable", true))

  /** Unregisters the suite's table and asserts that it is no longer listed. */
  private def unregisterAndAssertGone(): Unit = {
    sqlContext.catalog.unregisterTable(TableIdentifier("ListTablesSuiteTable"))
    assert(sqlContext.tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0)
  }

  test("get all tables") {
    assertSuiteTableListed(sqlContext.tables())
    assertSuiteTableListed(sql("SHOW tables"))
    unregisterAndAssertGone()
  }

  test("getting all Tables with a database name has no impact on returned table names") {
    assertSuiteTableListed(sqlContext.tables("DB"))
    assertSuiteTableListed(sql("show TABLES in DB"))
    unregisterAndAssertGone()
  }

  test("query the returned DataFrame of tables") {
    val expectedSchema = StructType(Seq(
      StructField("tableName", StringType, nullable = false),
      StructField("isTemporary", BooleanType, nullable = false)))

    Seq(sqlContext.tables(), sql("SHOW TABLes")).foreach { tableDF =>
      assert(expectedSchema === tableDF.schema)

      // Register the listing itself as a temp table so that it can be queried with SQL.
      tableDF.registerTempTable("tables")
      checkAnswer(
        sql(
          "SELECT isTemporary, tableName from tables WHERE tableName = 'ListTablesSuiteTable'"),
        Row(true, "ListTablesSuiteTable")
      )
      checkAnswer(
        sqlContext.tables().filter("tableName = 'tables'").select("tableName", "isTemporary"),
        Row("tables", true))
      sqlContext.dropTempTable("tables")
    }
  }
}
Example 137
Source File: GroupedDatasetSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.api.python.PythonEvalType
import org.apache.spark.sql.catalyst.plans.logical.AnalysisBarrier
import org.apache.spark.sql.execution.python.PythonUDF
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{LongType, StructField, StructType}

/** SPARK-24373: datasets derived from grouped datasets keep an AnalysisBarrier in their plan. */
class GroupedDatasetSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  private val scalaUDF = udf((x: Long) => { x + 1 })
  private lazy val datasetWithUDF = spark.range(1).toDF("s").select($"s", scalaUDF($"s"))

  /**
   * Asserts that at least one node exactly `atLevel` steps below the root of the dataset's
   * logical plan is an AnalysisBarrier.
   */
  private def assertContainsAnalysisBarrier(ds: Dataset[_], atLevel: Int = 1): Unit = {
    assert(atLevel >= 0)
    // Descend one generation of children per level, starting from the plan root.
    val nodesAtLevel = (1 to atLevel).foldLeft(Seq(ds.queryExecution.logical)) { (nodes, _) =>
      nodes.flatMap(_.children)
    }
    val barriers = nodesAtLevel.collect { case barrier: AnalysisBarrier => barrier }
    assert(barriers.nonEmpty, s"Plan does not contain AnalysisBarrier at level $atLevel:\n" +
      ds.queryExecution.logical)
  }

  test("SPARK-24373: avoid running Analyzer rules twice on RelationalGroupedDataset") {
    val groupByDataset = datasetWithUDF.groupBy()
    val rollupDataset = datasetWithUDF.rollup("s")
    val cubeDataset = datasetWithUDF.cube("s")
    val pivotDataset = datasetWithUDF.groupBy().pivot("s", Seq(1, 2))
    datasetWithUDF.cache()

    Seq(groupByDataset, rollupDataset, cubeDataset, pivotDataset).foreach { grouped =>
      val counted = grouped.count()
      assertContainsAnalysisBarrier(counted)
      assertCached(counted)
    }

    val flatMapGroupsInRDF = datasetWithUDF.groupBy().flatMapGroupsInR(
      Array.emptyByteArray,
      Array.emptyByteArray,
      Array.empty,
      StructType(Seq(StructField("s", LongType))))
    val pandasUDF = PythonUDF(
      "pyUDF",
      null,
      StructType(Seq(StructField("s", LongType))),
      Seq.empty,
      PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
      true)
    val flatMapGroupsInPandasDF = datasetWithUDF.groupBy().flatMapGroupsInPandas(pandasUDF)

    Seq(flatMapGroupsInRDF, flatMapGroupsInPandasDF).foreach { df =>
      assertContainsAnalysisBarrier(df, 2)
      assertCached(df)
    }
    datasetWithUDF.unpersist(blocking = true)
  }

  test("SPARK-24373: avoid running Analyzer rules twice on KeyValueGroupedDataset") {
    val keyedDataset = datasetWithUDF.groupByKey(_.getLong(0))
    datasetWithUDF.cache()

    val mapValuesKVDataset = keyedDataset.mapValues(_.getLong(0)).reduceGroups(_ + _)
    val keysKVDataset = keyedDataset.keys
    val flatMapGroupsKVDataset = keyedDataset.flatMapGroups((k, _) => Seq(k))
    val aggKVDataset = keyedDataset.count()
    val otherKVDataset = spark.range(1).groupByKey(_ + 1)
    val cogroupKVDataset = keyedDataset.cogroup(otherKVDataset)((k, _, _) => Seq(k))

    // Each dataset is paired with the plan depth at which its barrier is expected.
    val datasetsWithBarrierDepth = Seq(
      (mapValuesKVDataset, 1),
      (keysKVDataset, 2),
      (flatMapGroupsKVDataset, 2),
      (aggKVDataset, 1),
      (cogroupKVDataset, 2))

    datasetsWithBarrierDepth.foreach { case (df, analysisBarrierDepth) =>
      assertContainsAnalysisBarrier(df, analysisBarrierDepth)
      assertCached(df)
    }
    datasetWithUDF.unpersist(blocking = true)
  }
}
Example 138
Source File: BlockingSource.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/**
 * Stream source/sink provider whose `createSource` waits on `BlockingSource.latch` before
 * returning a source that yields empty batches. The sink discards every batch.
 */
class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(Seq(StructField("a", IntegerType)))

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) =
    ("dummySource", fakeSchema)

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    // The latch is dereferenced without a null check, so it has to be assigned beforehand.
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema

      override def getOffset: Option[Offset] = Some(LongOffset(0))

      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq.empty[Int].toDS().toDF()
      }

      override def stop(): Unit = ()
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink =
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = ()
    }
}

object BlockingSource {
  /** Latch awaited by `createSource`; initially null. */
  var latch: CountDownLatch = _
}
Example 139
Source File: MockSourceProvider.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.util

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.StreamSourceProvider
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/** Source provider that hands out whatever sources `withMockSources` currently has installed. */
class MockSourceProvider extends StreamSourceProvider {

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) =
    ("dummySource", MockSourceProvider.fakeSchema)

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source =
    MockSourceProvider.sourceProviderFunction()
}

object MockSourceProvider {
  // Function to generate sources. May provide multiple sources if the user implements such a
  // function.
  private var sourceProviderFunction: () => Source = _

  final val fakeSchema = StructType(Seq(StructField("a", IntegerType)))

  /**
   * Runs `f` with the given sources installed. Each `createSource` call made while `f` runs
   * returns the next source in round-robin order; the provider function is reset to null
   * afterwards, even if `f` throws.
   */
  def withMockSources(source: Source, otherSources: Source*)(f: => Unit): Unit = {
    val allSources = source +: otherSources
    var handedOut = 0
    sourceProviderFunction = () => {
      val next = allSources(handedOut % allSources.length)
      handedOut += 1
      next
    }
    try {
      f
    } finally {
      sourceProviderFunction = null
    }
  }
}
Example 140
Source File: SpreadsheetRelation.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark.datasource.google.spreadsheet

import mimir.exec.spark.datasource.google.spreadsheet.SparkSpreadsheetService.SparkSpreadsheetContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

/**
 * Spark relation over one worksheet of a spreadsheet. Scanning reads every worksheet row;
 * inserting is supported only in overwrite mode.
 */
case class SpreadsheetRelation protected[spark] (
    context: SparkSpreadsheetContext,
    spreadsheetName: String,
    worksheetName: String,
    userSchema: Option[StructType] = None)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan with InsertableRelation {

  import mimir.exec.spark.datasource.google.spreadsheet.SparkSpreadsheetService._

  // Sanitized column name -> original worksheet header; filled in by inferSchema().
  private val fieldMap = scala.collection.mutable.Map[String, String]()

  override def schema: StructType = userSchema.getOrElse(inferSchema())

  // Looked up on first use; a failed lookup is rethrown.
  private lazy val aWorksheet: SparkWorksheet =
    findWorksheet(spreadsheetName, worksheetName)(context)
      .fold[SparkWorksheet](e => throw e, identity)

  private lazy val rows: Seq[Map[String, String]] = aWorksheet.rows

  /** Finds the worksheet, or returns a RuntimeException naming whichever part is missing. */
  private[spreadsheet] def findWorksheet(spreadsheetName: String, worksheetName: String)(
      implicit ctx: SparkSpreadsheetContext): Either[Throwable, SparkWorksheet] =
    for {
      sheet <- findSpreadsheet(spreadsheetName)
        .toRight(new RuntimeException(s"no such spreadsheet: $spreadsheetName")).right
      worksheet <- sheet.findWorksheet(worksheetName)
        .toRight(new RuntimeException(s"no such worksheet: $worksheetName")).right
    } yield worksheet

  /**
   * Builds one Row per worksheet row. A cell is looked up by the schema field name first, then
   * by the original header recorded for that name; a field with neither becomes null.
   */
  override def buildScan(): RDD[Row] = {
    // Resolve the schema before snapshotting fieldMap: inferSchema() is what populates it.
    val aSchema = schema
    val schemaMap = fieldMap.toMap
    sqlContext.sparkContext.makeRDD(rows).mapPartitions { iter =>
      iter.map { m =>
        val values: Array[Any] = aSchema.fields.map { field =>
          val cell: Any = field.name match {
            case name if m.contains(name) =>
              TypeCast.castTo(m(name), field.dataType, field.nullable)
            case name if schemaMap.contains(name) && m.contains(schemaMap(name)) =>
              TypeCast.castTo(m(schemaMap(name)), field.dataType, field.nullable)
            case _ =>
              null
          }
          cell
        }
        Row.fromSeq(values)
      }
    }
  }

  /** Replaces the worksheet's cells with `data`; a non-overwrite insert is an error. */
  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    if (!overwrite) {
      sys.error("Spreadsheet tables only support INSERT OVERWRITE for now.")
    }
    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Left(failure) => throw failure
      case Right(worksheet) =>
        worksheet.updateCells(data.schema, data.collect().toList, Util.toRowData)
    }
  }

  /** Reduces a header to letters, digits and single underscores, not starting with a digit or underscore. */
  def sanitizeColumnName(name: String): String = {
    // Replace sequences of non-alphanumeric characters with underscores
    val underscored = name.replaceAll("[^a-zA-Z0-9]+", "_")
    // Strip trailing underscores
    val withoutTrailing = underscored.replaceAll("_+$", "")
    // Strip leading underscores and digits
    withoutTrailing.replaceAll("^[0-9_]+", "")
  }

  /** Treats every worksheet header as a nullable string column, recording the name mapping. */
  private def inferSchema(): StructType = {
    val fields = aWorksheet.headers.toList.map { header =>
      val sanitizedName = sanitizeColumnName(header)
      fieldMap.put(sanitizedName, header)
      StructField(sanitizedName, StringType, true)
    }
    StructType(fields)
  }
}
Example 141
Source File: MimirUDF.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark.udf

import java.sql.{ Timestamp, Date }

import org.apache.spark.sql.types.{ DataType, StructType, StructField }
import mimir.algebra._
import mimir.exec.spark._
import mimir.util.SparkUtils

/** Conversions between Mimir primitive values and the native values passed to and from Spark. */
class MimirUDF {

  /**
   * Wraps a native value in the Mimir primitive for type `t`.
   *
   * A null value becomes `NullPrimitive` whatever the type. Otherwise the value is cast to the
   * native class matching `t` (Long for TInt, Double for TFloat, and so on); any type without
   * an explicit case is cast to String and wrapped in a `StringPrimitive`.
   */
  def getPrimitive(t:Type, value:Any) = value match {
    case null => NullPrimitive()
    case _ => t match {
      //case TInt() => IntPrimitive(value.asInstanceOf[Long])
      case TInt() => IntPrimitive(value.asInstanceOf[Long])
      case TFloat() => FloatPrimitive(value.asInstanceOf[Double])
      case TDate() => SparkUtils.convertDate(value.asInstanceOf[Date])
      case TTimestamp() => SparkUtils.convertTimestamp(value.asInstanceOf[Timestamp])
      case TString() => StringPrimitive(value.asInstanceOf[String])
      case TBool() => BoolPrimitive(value.asInstanceOf[Boolean])
      case TRowId() => RowIdPrimitive(value.asInstanceOf[String])
      case TType() => TypePrimitive(Type.fromString(value.asInstanceOf[String]))
      //case TAny() => NullPrimitive()
      //case TUser(name) => name.toLowerCase
      //case TInterval() => Primitive(value.asInstanceOf[Long])
      case _ => StringPrimitive(value.asInstanceOf[String])
    }
  }

  /**
   * Unwraps a Mimir primitive into a native reference value.
   *
   * Numbers and booleans are boxed with `valueOf` rather than the boxing constructors, which
   * are deprecated. Primitives without an explicit case are returned as their `asString` form.
   */
  def getNative(primitive : PrimitiveValue) : AnyRef =
    primitive match {
      case NullPrimitive() => null
      case RowIdPrimitive(s) => s
      case StringPrimitive(s) => s
      case IntPrimitive(i) => java.lang.Long.valueOf(i)
      case FloatPrimitive(f) => java.lang.Double.valueOf(f)
      case BoolPrimitive(b) => java.lang.Boolean.valueOf(b)
      case ts @ TimestampPrimitive(_, _, _, _, _, _, _) => SparkUtils.convertTimestamp(ts)
      case dt @ DatePrimitive(_, _, _) => SparkUtils.convertDate(dt)
      case x => x.asString
    }

  /** Builds a struct of nullable fields, all with empty names, one per given data type. */
  def getStructType(datatypes:Seq[DataType]): StructType = {
    StructType(datatypes.map(dti => StructField("", RAToSpark.getInternalSparkType(dti), true)))
  }
}
Example 142
Source File: LibSVMResponseRowDeserializer.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
// NOTE(review): this listing has no enclosing class declaration. `accepts`, `parseLibSVMRow` and
// `schema` follow the imports directly and the final `}` is unmatched. `dim`, `labelColumnName`
// and `featuresColumnName` are used but not declared here; presumably they are constructor
// parameters of the missing class. Restore the declaration from the original source file.
//
// parseLibSVMRow: splits a record on spaces and reads the first token as the Double label. Each
// remaining non-empty token is split on ':' into an index (decremented by one) and a Double
// value; these are packed into a SparseVector of size `dim`.
//
// schema: a non-nullable Double label column followed by a non-nullable vector features column.
package com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers import org.apache.spark.ml.linalg.{SparseVector, SQLDataTypes} import org.apache.spark.sql.Row import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import com.amazonaws.services.sagemaker.sparksdk.transformation.{ContentTypes, ResponseRowDeserializer} override val accepts: String = ContentTypes.TEXT_LIBSVM private def parseLibSVMRow(record: String): Row = { val items = record.split(' ') val label = items.head.toDouble val (indices, values) = items.tail.filter(_.nonEmpty).map { item => val entry = item.split(':') val index = entry(0).toInt - 1 val value = entry(1).toDouble (index, value) }.unzip Row(label, new SparseVector(dim, indices.toArray, values.toArray)) } override val schema: StructType = StructType( Array( StructField(labelColumnName, DoubleType, nullable = false), StructField(featuresColumnName, SQLDataTypes.VectorType, nullable = false))) }
Example 143
Source File: ProtobufRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.protobuf.ProtobufConverter

/** Tests for `ProtobufRequestRowSerializer`: serialization round-trips and schema validation. */
class ProtobufRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val labelColumnName = "label"
  val featuresColumnName = "features"
  val schema = StructType(Array(StructField(labelColumnName, DoubleType), StructField(
    featuresColumnName, VectorType)))

  /**
   * Serializes a (1.0, features) row and checks that the RecordIO bytes decode
   * to exactly one protobuf record, equal to the one `ProtobufConverter`
   * builds directly from the row.
   */
  private def assertSerializesToSingleRecord(features: org.apache.spark.ml.linalg.Vector): Unit = {
    val row = new GenericRowWithSchema(values = Seq(1.0, features).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next
    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "serialize a dense vector" in {
    assertSerializesToSingleRecord(new DenseVector(Seq(10.0, -100.0, 2.0).toArray))
  }

  it should "serialize a sparse vector" in {
    assertSerializesToSingleRecord(
      new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray))
  }

  it should "fail to set schema on invalid features name" in {
    // Construction alone must fail; no row is needed for this check.
    intercept[IllegalArgumentException] {
      new ProtobufRequestRowSerializer(Some(schema), featuresColumnName = "doesNotExist")
    }
  }

  it should "fail on invalid types" in {
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new ProtobufRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new ProtobufRequestRowSerializer(Some(validSchema))
  }
}
Example 144
Source File: UnlabeledLibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StringType, StructField, StructType}

/** Tests for `UnlabeledLibSVMRequestRowSerializer`: output format, error cases and schema validation. */
class UnlabeledLibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val schema = StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  "UnlabeledLibSVMRequestRowSerializer" should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert ("0.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert("0.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs =
      new UnlabeledLibSVMRequestRowSerializer(featuresColumnName = "mangoes are not features")
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }

  it should "fail on invalid features type" in {
    // The row carries a double and a string, so no vector needs to be built.
    val row =
      new GenericRowWithSchema(values = Seq(1.0, "FEATURESSSSSZ!1!").toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    // Construction must succeed without throwing.
    new UnlabeledLibSVMRequestRowSerializer(Some(validSchema))
  }

  it should "fail to validate incorrect schema" in {
    val invalidSchema = StructType(Array(
      StructField("features", StringType, nullable = false)))
    intercept[IllegalArgumentException] {
      new UnlabeledLibSVMRequestRowSerializer(Some(invalidSchema))
    }
  }
}
Example 145
Source File: LibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest._
import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers.LibSVMResponseRowDeserializer

/** Tests for `LibSVMRequestRowSerializer`: output format, column selection and schema validation. */
class LibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  // Reuses the schema exposed by the LibSVM response deserializer.
  val schema = new LibSVMResponseRowDeserializer(10).schema

  "LibSVMRequestRowSerializer" should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert ("1.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "ignore other columns" in {
    val schemaWithExtraColumns = StructType(Array(
      StructField("name", StringType, nullable = false),
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false),
      StructField("favorite activity", StringType, nullable = false)))
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq("Elizabeth", 1.0, vec, "Crying").toArray,
      schema = schemaWithExtraColumns)
    val rrs = new LibSVMRequestRowSerializer(Some(schemaWithExtraColumns))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    // Construction alone must fail; no row or vector is needed.
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema), featuresColumnName = "i do not exist dear sir!")
    }
  }

  it should "fail on invalid label column name" in {
    // Construction alone must fail; no row or vector is needed.
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema),
        labelColumnName = "Sir! I must protest! I do not exist!")
    }
  }

  it should "fail on invalid types" in {
    val schemaWithInvalidLabelType = StructType(Array(
      StructField("label", StringType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidLabelType))
    }
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new LibSVMRequestRowSerializer(Some(validSchema))
  }
}
Example 146
Source File: UnlabeledCSVRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package unit.com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.serializers.UnlabeledCSVRequestRowSerializer

/** Checks the CSV text that `UnlabeledCSVRequestRowSerializer` emits for sparse and dense vectors. */
class UnlabeledCSVRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val schema: StructType =
    StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  /** Serializes a single-column row holding `features` and decodes the bytes as text. */
  private def serializedText(features: org.apache.spark.ml.linalg.Vector): String = {
    val singleColumnRow = new GenericRowWithSchema(values = Seq(features).toArray, schema = schema)
    val serializer = new UnlabeledCSVRequestRowSerializer(Some(schema))
    new String(serializer.serializeRow(singleColumnRow))
  }

  it should "serialize sparse vector" in {
    val sparse =
      new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val actual = serializedText(sparse)
    // Positions 0 and 10 carry values; the other 98 cells are written as 0.0.
    val expected = "-100.0," + "0.0," * 9 + "100.1," + "0.0," * 88 + "0.0\n"
    assert (expected == actual)
  }

  it should "serialize dense vector" in {
    val dense = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val actual = serializedText(dense)
    assert("10.0,-100.0,2.0\n" == actual)
  }
}
Example 147
Source File: HashSetRowIterator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.storage.set.hashset

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/** Row iterator over an `ObjectHashSet`; it forwards the set's own iterator unchanged. */
class ObjectHashSetRowIterator(set: ObjectHashSet) extends Iterator[InternalRow] {
  val rawIter = set.iterator()

  override final def hasNext(): Boolean = {
    rawIter.hasNext
  }

  override final def next(): InternalRow = {
    rawIter.next()
  }
}

/**
 * Row iterator over an `IntKeysHashSet`. Each element of the set's iterator is
 * written into field 0 of a one-field `UnsafeRow`.
 *
 * The same `UnsafeRow` instance and buffer are reused for every call to
 * `next()`, so a returned row is overwritten by the following call.
 */
class IntKeysHashSetRowIterator(set: IntKeysHashSet) extends Iterator[InternalRow] {
  val rawIter = set.iterator()
  val uRow = new UnsafeRow()
  val bufferHolder = new BufferHolder()
  val rowWriter = new UnsafeRowWriter()

  override final def hasNext(): Boolean = {
    rawIter.hasNext
  }

  override final def next(): InternalRow = {
    // Reset the shared buffer, write the next key, then point the shared row at it.
    bufferHolder.reset()
    rowWriter.initialize(bufferHolder, 1)
    rowWriter.write(0, rawIter.next())
    uRow.pointTo(bufferHolder.buffer, 1, bufferHolder.totalSize())
    uRow
  }
}

/**
 * Row iterator over a `LongKeysHashSet`. The row has `set.schemaInfo.arity`
 * fields; with two fields each long is split into two ints, otherwise the long
 * is written as a single field.
 *
 * The same `UnsafeRow` instance and buffer are reused for every call to
 * `next()`, so a returned row is overwritten by the following call.
 */
class LongKeysHashSetRowIterator(set: LongKeysHashSet) extends Iterator[InternalRow] {
  val rawIter = set.iterator()
  val numFields = set.schemaInfo.arity
  val uRow = new UnsafeRow()
  val bufferHolder = new BufferHolder()
  val rowWriter = new UnsafeRowWriter()

  override final def hasNext(): Boolean = {
    rawIter.hasNext
  }

  override final def next(): InternalRow = {
    bufferHolder.reset()
    rowWriter.initialize(bufferHolder, numFields)
    val value = rawIter.nextLong()
    if (numFields == 2) {
      // Upper 32 bits go to field 0, lower 32 bits to field 1.
      rowWriter.write(0, (value >> 32).toInt)
      rowWriter.write(1, value.toInt)
    } else {
      rowWriter.write(0, value)
    }
    uRow.pointTo(bufferHolder.buffer, numFields, bufferHolder.totalSize())
    uRow
  }
}

/** Factory that picks the row iterator matching the concrete hash-set class. */
object HashSetRowIterator {
  // Only the three set classes listed below are handled; the match has no default case.
  def create(set: HashSet): Iterator[InternalRow] = {
    set match {
      //case set: UnsafeFixedWidthSet => set.iterator().asScala
      case set: IntKeysHashSet => new IntKeysHashSetRowIterator(set)
      case set: LongKeysHashSet => new LongKeysHashSetRowIterator(set)
      case set: ObjectHashSet => new ObjectHashSetRowIterator(set)
    }
  }
}
Example 148
Source File: LibSVMRelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm

import com.google.common.base.Objects

import org.apache.spark.Logging
import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrameReader, DataFrame, Row, SQLContext}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

/**
 * Data source registered under the short name "libsvm". It builds a
 * `LibSVMRelation` from the `path`, `numFeatures` and `vectorType` options.
 */
@Since("1.6.0")
class DefaultSource extends RelationProvider with DataSourceRegister {

  @Since("1.6.0")
  override def shortName(): String = "libsvm"

  /**
   * Creates the relation for the given options.
   *
   * @param sqlContext context handed to the relation
   * @param parameters options: `path` is mandatory, `numFeatures` defaults to
   *                   -1 and `vectorType` defaults to "sparse"
   * @throws IllegalArgumentException when `path` is missing
   */
  @Since("1.6.0")
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String])
    : BaseRelation = {
    val location = parameters.get("path") match {
      case Some(given) => given
      case None => throw new IllegalArgumentException("'path' must be specified")
    }
    val featureCount = parameters.get("numFeatures").map(_.toInt).getOrElse(-1)
    val vectorKind = parameters.get("vectorType").getOrElse("sparse")
    new LibSVMRelation(location, featureCount, vectorKind)(sqlContext)
  }
}
Example 149
Source File: MetadataUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util

import scala.collection.immutable.HashMap

import org.apache.spark.ml.attribute._
import org.apache.spark.mllib.linalg.VectorUDT
import org.apache.spark.sql.types.StructField

  // NOTE(review): the enclosing declaration is not part of this excerpt; the
  // closing brace at the end belongs to it.

  /**
   * Looks up, for every entry of `names`, the index of the attribute with that
   * name in the attribute group read from `col`.
   *
   * @param col   column whose data type must be a `VectorUDT`
   * @param names feature names to resolve
   * @return the attribute indices, in the order of `names`
   * @note both `require` calls reject bad input: a non-vector column, or a
   *       name that has no attribute in the column
   */
  def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
    require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
      + s" to be Vector type, but it was type ${col.dataType} instead.")
    val inputAttr = AttributeGroup.fromStructField(col)
    names.map { name =>
      require(inputAttr.hasAttr(name),
        s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
      // NOTE(review): `.index.get` presumably throws when the attribute has no index — confirm.
      inputAttr.getAttr(name).index.get
    }
  }
}
Example 150
Source File: LocalRelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, analysis}
import org.apache.spark.sql.types.{StructField, StructType}

/** Factory methods for building a [[LocalRelation]] from attributes, fields or external data. */
object LocalRelation {

  /** Relation with the given output attributes and no data. */
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  /** Relation whose output attributes are derived from one or more struct fields. */
  def apply(output1: StructField, output: StructField*): LocalRelation = {
    val fields = output1 +: output
    new LocalRelation(StructType(fields).toAttributes)
  }

  /** Converts external rows to internal rows using the schema implied by `output`. */
  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation =
    LocalRelation(output, toInternalRows(output, data))

  /** Converts products (e.g. tuples) to internal rows using the schema implied by `output`. */
  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation =
    LocalRelation(output, toInternalRows(output, data))

  /** Shared conversion step behind `fromExternalRows` and `fromProduct`. */
  private def toInternalRows(output: Seq[Attribute], data: Seq[Any]): Seq[InternalRow] = {
    val toCatalyst =
      CatalystTypeConverters.createToCatalystConverter(StructType.fromAttributes(output))
    data.map(external => toCatalyst(external).asInstanceOf[InternalRow])
  }
}

/**
 * Leaf plan node that holds its rows in memory.
 *
 * @param output attributes produced by this relation
 * @param data   the rows, empty by default
 */
case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil)
  extends LeafNode with analysis.MultiInstanceRelation {

  /** Copy with fresh instances of every output attribute and the same data. */
  override final def newInstance(): this.type = {
    val freshOutput = output.map(_.newInstance())
    LocalRelation(freshOutput, data).asInstanceOf[this.type]
  }

  override protected def stringArgs = Iterator(output)

  /** Same result when the other plan is a LocalRelation with equal column types and equal data. */
  override def sameResult(plan: LogicalPlan): Boolean = plan match {
    case LocalRelation(otherOutput, otherData) =>
      val sameTypes = otherOutput.map(_.dataType) == output.map(_.dataType)
      sameTypes && otherData == data
    case _ =>
      false
  }

  /** Size estimate: sum of the default sizes of all column types, times the row count. */
  override lazy val statistics = {
    val rowWidth = output.map(_.dataType.defaultSize).sum
    Statistics(sizeInBytes = rowWidth * data.length)
  }
}
Example 151
Source File: SparkSqlSerializer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import java.nio.ByteBuffer
import java.util.{HashMap => JavaHashMap}

import scala.reflect.ClassTag

import com.esotericsoftware.kryo.io.{Input, Output}
import com.esotericsoftware.kryo.{Kryo, Serializer}
import com.twitter.chill.ResourcePool

import org.apache.spark.serializer.{KryoSerializer, SerializerInstance}
import org.apache.spark.sql.types.{Decimal, StructField, StructType}
import org.apache.spark.util.MutablePair
import org.apache.spark.{SparkConf, SparkEnv}

/**
 * Kryo serializer preconfigured with the row, decimal, map and schema classes
 * registered below. Registration is not required and reference tracking is off.
 */
//private[sql]
class SparkSqlSerializer(conf: SparkConf) extends KryoSerializer(conf) {
  override def newKryo(): Kryo = {
    val kryo = super.newKryo()
    kryo.setRegistrationRequired(false)
    kryo.register(classOf[MutablePair[_, _]])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericRow])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericInternalRow])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericMutableRow])
    kryo.register(classOf[java.math.BigDecimal], new JavaBigDecimalSerializer)
    kryo.register(classOf[BigDecimal], new ScalaBigDecimalSerializer)
    kryo.register(classOf[Decimal])
    kryo.register(classOf[JavaHashMap[_, _]])
    // APS
    kryo.register(classOf[StructType])
    kryo.register(classOf[StructField])
    kryo.setReferences(false)
    kryo
  }
}

/**
 * Pool of serializer instances that all come from one `SparkSqlSerializer`.
 * The serializer uses the active `SparkEnv`'s conf, or a fresh `SparkConf`
 * when `SparkEnv.get` is null.
 */
private[execution] class KryoResourcePool(size: Int)
  extends ResourcePool[SerializerInstance](size) {

  val ser: SparkSqlSerializer = {
    val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf())
    new SparkSqlSerializer(sparkConf)
  }

  def newInstance(): SerializerInstance = ser.newInstance()
}

/** Serialize/deserialize entry points backed by a lazily created pool of 30 instances. */
//private[sql]
object SparkSqlSerializer {
  @transient lazy val resourcePool = new KryoResourcePool(30)

  /** Borrows an instance, runs `fn`, and always returns the instance to the pool. */
  private[this] def acquireRelease[O](fn: SerializerInstance => O): O = {
    val kryo = resourcePool.borrow
    try {
      fn(kryo)
    } finally {
      resourcePool.release(kryo)
    }
  }

  /** Serializes `o` and returns the backing array of the resulting buffer. */
  def serialize[T: ClassTag](o: T): Array[Byte] =
    acquireRelease { k =>
      k.serialize(o).array()
    }

  /** Deserializes a value of type `T` from `bytes`. */
  def deserialize[T: ClassTag](bytes: Array[Byte]): T =
    acquireRelease { k =>
      k.deserialize[T](ByteBuffer.wrap(bytes))
    }
}

/** Kryo serializer that stores a `java.math.BigDecimal` as its string form. */
private[sql] class JavaBigDecimalSerializer extends Serializer[java.math.BigDecimal] {
  def write(kryo: Kryo, output: Output, bd: java.math.BigDecimal): Unit = {
    // TODO: There are probably more efficient representations than strings...
    output.writeString(bd.toString)
  }

  def read(kryo: Kryo, input: Input, tpe: Class[java.math.BigDecimal]): java.math.BigDecimal = {
    new java.math.BigDecimal(input.readString())
  }
}

/** Kryo serializer that stores a Scala `BigDecimal` as its string form. */
private[sql] class ScalaBigDecimalSerializer extends Serializer[BigDecimal] {
  def write(kryo: Kryo, output: Output, bd: BigDecimal): Unit = {
    // TODO: There are probably more efficient representations than strings...
    output.writeString(bd.toString)
  }

  def read(kryo: Kryo, input: Input, tpe: Class[BigDecimal]): BigDecimal = {
    // Explicit wrap; previously this relied on the implicit java -> scala conversion.
    BigDecimal(new java.math.BigDecimal(input.readString()))
  }
}
Example 152
Source File: resources.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * Command that lists the jars known to the Spark context.
 *
 * @param jars optional jar paths; when non-empty, only listed jars whose path
 *             contains the file name of one of these entries are returned
 */
case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {

  // Single non-nullable string column named "Results".
  override val output: Seq[Attribute] =
    List(AttributeReference("Results", StringType, nullable = false)())

  /** Returns one row per matching jar path, or one row per listed jar when no filter is given. */
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val registered = sparkSession.sparkContext.listJars()
    if (jars.isEmpty) {
      registered.map(Row(_))
    } else {
      val wantedNames = jars.map(f => new Path(f).getName)
      // For every requested name, emit each registered path that contains it.
      wantedNames.flatMap { wanted =>
        registered.filter(_.contains(wanted)).map(Row(_))
      }
    }
  }
}
Example 153
Source File: DDLSourceLoadSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// please note that the META-INF/services had to be modified for the test directory for this to work
/** Checks how data sources are resolved from short names and fully qualified class names. */
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name") {
    intercept[RuntimeException] {
      caseInsensitiveContext.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {
    // The comparison is wrapped in assert; without it the result was discarded
    // and the test could never fail.
    assert(caseInsensitiveContext.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false))))
  }

  test("specify full classname with duplicate formats") {
    // Wrapped in assert for the same reason as the alias test.
    assert(caseInsensitiveContext.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false))))
  }

  test("should fail to load ORC without HiveContext") {
    intercept[ClassNotFoundException] {
      caseInsensitiveContext.read.format("orc").load()
    }
  }
}

/** Fake source sharing the short name "Fluet da Bomb" with [[FakeSourceTwo]]. */
class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

/** Fake source sharing the short name "Fluet da Bomb" with [[FakeSourceOne]]. */
class FakeSourceTwo extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

/** Fake source with the short name "gathering quorum". */
class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}
Example 154
Source File: ScalaSparkSQLBySchema.scala From learning-spark with Apache License 2.0 | 5 votes |
package com.javachen.spark.examples.sparksql

import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Example: reads "people.txt", applies a programmatically built two-column
 * string schema, registers the data as table "people" and prints every name.
 */
object ScalaSparkSQLBySchema {

  /**
   * Runs the example.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ScalaSparkSQL"))
    try {
      val sqlContext = new org.apache.spark.sql.SQLContext(sc)

      // Create an RDD
      val people = sc.textFile("people.txt")

      // The schema is encoded in a string
      val schemaString = "name age"

      // Import Spark SQL data types and Row.
      import org.apache.spark.sql._

      // Generate the schema based on the string of schema
      val schema = StructType(
        schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, nullable = true)))

      // Convert records of the RDD (people) to Rows.
      val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim))

      // Apply the schema to the RDD.
      val peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema)

      // Register the DataFrames as a table.
      peopleDataFrame.registerTempTable("people")

      // SQL statements can be run by using the sql methods provided by sqlContext.
      val results = sqlContext.sql("SELECT name FROM people")

      // The results of SQL queries are DataFrames and support all the normal RDD operations.
      // The columns of a row in the result can be accessed by ordinal.
      results.map(t => "Name: " + t(0)).collect().foreach(println)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
Example 155
Source File: MongodbSchemaIT.scala From Spark-MongoDB with Apache License 2.0 | 5 votes |
package com.stratio.datasource.mongodb.schema

import java.text.SimpleDateFormat
import java.util.Locale

import com.stratio.datasource.MongodbTestConstants
import com.stratio.datasource.mongodb.config.{MongodbConfig, MongodbConfigBuilder}
import com.stratio.datasource.mongodb.partitioner.MongodbPartitioner
import com.stratio.datasource.mongodb.rdd.MongodbRDD
import com.stratio.datasource.mongodb._
import org.apache.spark.sql.mongodb.{TemporaryTestSQLContext, TestSQLContext}
import org.apache.spark.sql.types.{ArrayType, StringType, StructField, TimestampType}
import org.junit.runner.RunWith
import org.scalatest._
import org.scalatest.junit.JUnitRunner

/**
 * Integration tests for schema inference with `MongodbSchema`. Each test loads
 * a fixture through `withEmbedMongoFixture`, infers the schema with a sampling
 * ratio of 1.0 and checks the resulting fields.
 */
@RunWith(classOf[JUnitRunner])
class MongodbSchemaIT extends FlatSpec
with Matchers
with MongoEmbedDatabase
with TestBsonData
with MongodbTestConstants {

  private val host: String = "localhost"
  private val collection: String = "testCol"
  private val readPreference = "secondaryPreferred"

  // Connection settings shared by all tests; `mongoPort` and `db` come from the mixed-in traits.
  val testConfig = MongodbConfigBuilder()
    .set(MongodbConfig.Host,List(host + ":" + mongoPort))
    .set(MongodbConfig.Database,db)
    .set(MongodbConfig.Collection,collection)
    .set(MongodbConfig.SamplingRatio,1.0)
    .set(MongodbConfig.ReadPreference, readPreference)
    .build()

  val mongodbPartitioner = new MongodbPartitioner(testConfig)

  val mongodbRDD = new MongodbRDD(TemporaryTestSQLContext, testConfig, mongodbPartitioner)

  behavior of "A schema"

  // Expects 7 fields in total; only the six names listed are checked explicitly.
  it should "be inferred from rdd with primitives" + scalaBinaryVersion in {
    withEmbedMongoFixture(primitiveFieldAndType) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 7
      schema.fieldNames should contain allOf("string", "integer", "long", "double", "boolean", "null")

      schema.printTreeString()
    }
  }

  // Expects 13 fields, with "arrayOfNull" and "arrayEmpty" both inferred as arrays of strings.
  it should "be inferred from rdd with complex fields" + scalaBinaryVersion in {
    withEmbedMongoFixture(complexFieldAndType1) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 13

      schema.fields filter {
        case StructField(name, ArrayType(StringType, _), _, _) => Set("arrayOfNull", "arrayEmpty") contains name
        case _ => false
      } should have size 2

      schema.printTreeString()
    }
  }

  // Only the field count is checked for the conflicting-type fixture.
  it should "resolve type conflicts between fields" + scalaBinaryVersion in {
    withEmbedMongoFixture(primitiveFieldValueTypeConflict) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 7

      schema.printTreeString()
    }
  }

  it should "be inferred from rdd with more complex fields" + scalaBinaryVersion in {
    withEmbedMongoFixture(complexFieldAndType2) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 5

      schema.printTreeString()
    }
  }

  // Inserts one document with a string and a java.util.Date, then checks that
  // the "date" field is inferred as TimestampType.
  it should "read java.util.Date fields as timestamptype" + scalaBinaryVersion in {
    val dfunc = (s: String) => new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy", Locale.ENGLISH).parse(s)
    import com.mongodb.casbah.Imports.DBObject
    val stringAndDate = List(DBObject("string" -> "this is a simple string.", "date" -> dfunc("Mon Aug 10 07:52:49 EDT 2015")))
    withEmbedMongoFixture(stringAndDate) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 3
      schema.fields.filter(_.name == "date").head.dataType should equal(TimestampType)
      schema.printTreeString()
    }
  }
}
Example 156
Source File: LOFSuite.scala From spark-lof with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.outlier

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.apache.spark.sql.functions._

/**
 * Example driver: reads two double columns from "data/outlier.csv", assembles
 * them into a feature vector, runs LOF with minPts = 5 and prints the ten rows
 * with the highest LOF value, followed by the elapsed time.
 */
object LOFSuite {

  /**
   * Runs the example on a local four-thread master.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("LOFExample")
      .master("local[4]")
      .getOrCreate()

    try {
      val schema = new StructType(Array(
        new StructField("col1", DataTypes.DoubleType),
        new StructField("col2", DataTypes.DoubleType)))
      val df = spark.read.schema(schema).csv("data/outlier.csv")

      val assembler = new VectorAssembler()
        .setInputCols(df.columns)
        .setOutputCol("features")
      val data = assembler.transform(df).repartition(4)

      val startTime = System.currentTimeMillis()
      val result = new LOF()
        .setMinPts(5)
        .transform(data)
      // The end timestamp is taken after count(), the action that forces
      // evaluation, so the reported time covers the computation as well.
      result.count()
      val endTime = System.currentTimeMillis()

      // Outliers have much higher LOF value than normal data
      result.sort(desc(LOF.lof)).head(10).foreach { row =>
        println(s"${row.get(0)} | ${row.get(1)} | ${row.get(2)}")
      }
      println(s"Total time = ${(endTime - startTime) / 1000.0}s")
    } finally {
      // Shut the session down even when the job fails.
      spark.stop()
    }
  }
}
Example 157
Source File: CloudPartitionTest.scala From cloud-integration with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.sql._
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/**
 * Writes a hand-built `p1=…/p2=…` partition tree, with `p1` also present as a
 * data column, and then reads the whole tree back through `checkQueries`.
 */
abstract class CloudPartitionTest extends AbstractCloudRelationTest {

  import testImplicits._

  ctest(
    "save-findClass-partitioned-part-columns-in-data",
    "Save sets of files in explicitly set up partition tree; read") {
    withTempPathDir("part-columns", None) { path =>
      // Same visiting order as nested loops: p1 outer, p2 inner.
      val partitionKeys = for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) yield (p1, p2)

      partitionKeys.foreach { case (p1, p2) =>
        val partitionDir = new Path(path, s"p1=$p1/p2=$p2")
        val rows = (1 to 3).map(i => (i, s"val_$i", p1))
        val df = sparkContext.parallelize(rows).toDF("a", "b", "p1")

        df.write
          .format(dataSourceName)
          .mode(SaveMode.ErrorIfExists)
          .save(partitionDir.toString)

        // each of these directories has its own success file; there is
        // none at the root
        resolveSuccessFile(partitionDir, true)
      }

      // The data schema plus the p1 column that was written into the files.
      val extendedFields = dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)
      val dataSchemaWithPartition = StructType(extendedFields)

      val readOptions = Map(
        "path" -> path.toString,
        "dataSchema" -> dataSchemaWithPartition.json)

      checkQueries(spark.read.options(readOptions).format(dataSourceName).load())
    }
  }
}
Example 158
Source File: PileupTestBase.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.tests.pileup

import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext}
import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
import org.apache.spark.sql.types.{IntegerType, ShortType, StringType, StructField, StructType}
import org.scalatest.{BeforeAndAfter, FunSuite}

/**
 * Shared fixture for pileup tests: resolves the test resources, declares the schema
 * of the samtools pileup file and, before each test, (re)creates the BAM and CRAM
 * tables and registers the helper UDFs.
 */
class PileupTestBase extends FunSuite
  with DataFrameSuiteBase
  with BeforeAndAfter
  with SharedSparkContext {

  val sampleId = "NA12878.multichrom.md"
  val samResPath: String = getClass.getResource("/multichrom/mdbam/samtools.pileup").getPath
  val referencePath: String =
    getClass.getResource("/reference/Homo_sapiens_assembly18_chr1_chrM.small.fasta").getPath
  val bamPath: String = getClass.getResource(s"/multichrom/mdbam/${sampleId}.bam").getPath
  val cramPath: String = getClass.getResource(s"/multichrom/mdcram/${sampleId}.cram").getPath
  val tableName = "reads_bam"
  val tableNameCRAM = "reads_cram"

  // Column layout of the samtools pileup reference output.
  val schema: StructType = StructType(
    List(
      StructField("contig", StringType, nullable = true),
      StructField("position", IntegerType, nullable = true),
      StructField("reference", StringType, nullable = true),
      StructField("coverage", ShortType, nullable = true),
      StructField("pileup", StringType, nullable = true),
      StructField("quality", StringType, nullable = true)
    )
  )

  /** Drops `name` if it exists, then runs the supplied CREATE TABLE statement. */
  private def recreateTable(name: String, createStatement: String): Unit = {
    spark.sql(s"DROP TABLE IF EXISTS $name")
    spark.sql(createStatement)
  }

  before {
    System.setProperty(
      "spark.kryo.registrator",
      "org.biodatageeks.sequila.pileup.serializers.CustomKryoRegistrator")
    // FIXME: In order to get orderBy in Samtools tests working - related to exchange partitions stage
    spark.conf.set("spark.sql.shuffle.partitions", 1)

    recreateTable(
      tableName,
      s"""
         |CREATE TABLE $tableName
         |USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource
         |OPTIONS(path "$bamPath")
         |
      """.stripMargin)

    recreateTable(
      tableNameCRAM,
      s"""
         |CREATE TABLE $tableNameCRAM
         |USING org.biodatageeks.sequila.datasources.BAM.CRAMDataSource
         |OPTIONS(path "$cramPath", refPath "$referencePath" )
         |
      """.stripMargin)

    // Renders a byte-keyed map as concatenated "char:count" entries; a null map
    // becomes the literal string "null". Relies on Map's "k -> v" string form.
    val renderMap = (counts: Map[Byte, Short]) =>
      Option(counts).fold("null") { present =>
        present.map { case (key, value) => key.toChar -> value }.mkString.replace(" -> ", ":")
      }
    val renderByte = (value: Byte) => value.toString

    spark.udf.register("mapToString", renderMap)
    spark.udf.register("byteToString", renderByte)
  }
}
Example 159
Source File: A_1_DataFrameTest.scala From wow-spark with MIT License | 5 votes |
package com.sev7e0.wow.sql import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{Row, SparkSession} //定义一个schema val schemaString = "name age" val fields = schemaString.split(" ") .map(filedName => StructField(filedName, StringType, nullable = true)) val structType = StructType(fields) val personRDD = sparkSession.sparkContext.textFile("src/main/resources/sparkresource/people.txt") .map(_.split(",")) //将RDD转换为行 .map(attr => Row(attr(0), attr(1).trim)) //将schema应用于RDD,并创建df sparkSession.createDataFrame(personRDD,structType).createOrReplaceTempView("people1") val dataFrameBySchema = sparkSession.sql("select name,age from people1 where age > 19 ") dataFrameBySchema.show() } }
Example 160
Source File: JDBCUtilityFunctions.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.jdbc.utilities

import scala.collection.immutable.Map

import org.apache.spark.sql.types.StructField

import com.paypal.gimel.common.conf.GimelConstants

object JDBCUtilityFunctions {

  /**
   * Rewrites a `CREATE TABLE ... SELECT ...` statement into a plain CREATE statement
   * with an explicit column list.
   *
   * Steps:
   *  1. keep only the tokens before the `SELECT` keyword (the token immediately
   *     preceding `SELECT` is dropped as well);
   *  2. strip the first three dot-separated segments from every token that contains
   *     the pcatalog marker;
   *  3. build a `(name type,...)` column list from the `Array[StructField]` stored in
   *     `dataSetProps` under `GimelConstants.TABLE_FILEDS`;
   *  4. splice that column list in two tokens after the first token containing `TABLE`.
   *
   * NOTE(review): when the statement contains no `SELECT` token the CREATE portion
   * comes out empty and only the column list is returned — confirm callers never
   * pass such input.
   *
   * @param sql          the original statement, tokens separated by single spaces
   * @param dbtable      not used by this method; kept for interface compatibility
   * @param dataSetProps must contain the Spark schema fields under `TABLE_FILEDS`
   * @return the trimmed CREATE statement with the column definitions injected
   */
  def prepareCreateStatement(sql: String, dbtable: String, dataSetProps: Map[String, Any]): String = {
    // Here we remove the SELECT portion and have only the CREATE portion of the DDL supplied
    // so that we can use that to create the table
    val sqlParts = sql.split(" ")
    val selectIndex = sqlParts.indexWhere(_.toUpperCase() == "SELECT")
    val createOnly: String = sqlParts.slice(0, selectIndex - 1).mkString(" ")

    // Here we remove the PCATALOG prefix =>
    // we replace pcatalog.storagetype.storagesystem.DB.Table with DB.Table
    val pcatSQL = createOnly
      .split(" ")
      .map { element =>
        if (element.toLowerCase().contains(GimelConstants.PCATALOG_STRING)) {
          element.split('.').drop(3).mkString(".")
        } else {
          element
        }
      }
      .mkString(" ")

    val sparkSchema = dataSetProps(GimelConstants.TABLE_FILEDS).asInstanceOf[Array[StructField]]

    // From the dataframe schema, translate them into Teradata data types
    val gimelSchema: Array[com.paypal.gimel.common.catalog.Field] = sparkSchema.map { field =>
      com.paypal.gimel.common.catalog.Field(
        field.name,
        SparkToJavaConverter.getTeradataDataType(field.dataType),
        field.nullable)
    }
    val conCatColumns = gimelSchema.map(f => s"${f.fieldName} ${f.fieldType}").mkString(",")
    val colQulifier = s"""(${conCatColumns})"""

    // Here we inject back the columns with data types back in the SQL statement
    val newSqlParts = pcatSQL.split(" ")
    val tableIndex = newSqlParts.indexWhere(_.toUpperCase().contains("TABLE"))
    val catPrefix = newSqlParts.slice(0, tableIndex + 2).mkString(" ")
    val catSuffix = newSqlParts.slice(tableIndex + 2, newSqlParts.length).mkString(" ")

    s"""${catPrefix} ${colQulifier} ${catSuffix}""".trim()
  }
}
Example 161
Source File: JdbcConstants.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.jdbc.conf

import org.apache.spark.sql.types.{StringType, StructField, StructType}

import com.paypal.gimel.common.conf.GimelConstants

/** Constant values used by the gimel JDBC connector. */
object JdbcConstants {

  // ---- basic variable references ----
  val HDFS_PREFIX: String = "hdfs:///user"
  val TD_PASS_FILENAME_DEFAULT: String = "pass.dat"
  val P_FILEPATH =
    s"$HDFS_PREFIX/${GimelConstants.USER_NAME}/password/teradata/$TD_PASS_FILENAME_DEFAULT"

  // Supported storage system identifiers.
  val MYSQL = "MYSQL"
  val TERADATA = "TERADATA"
  val ORCALE = "ORACLE"
  val POSTGRESQL = "POSTGRESQL"

  // Password file location and password strategies.
  val HDFS_FILE_SOURCE = "hdfs"
  val LOCAL_FILE_SOURCE = "local"
  val DEFAULT_P_FILE_SOURCE = HDFS_FILE_SOURCE
  val JDBC_FILE_PASSWORD_STRATEGY = "file"
  val JDBC_INLINE_PASSWORD_STRATEGY = "inline"
  val JDBC_PROXY_USERNAME = "gimelproxyuser"
  val JDBC_CUSTOM_PASSWORD_STRATEGY = "custom"
  val JDBC_DEFAULT_PASSWORD_STRATEGY = JDBC_CUSTOM_PASSWORD_STRATEGY
  val JDBC_AUTH_REQUEST_TYPE = "JDBC"

  // ---- default TD properties ----
  val DEFAULT_TD_SESSIONS = 5
  val DEFAULT_CHARSET = "UTF16"
  val DEFAULT_SESSIONS = "6"
  val TD_FASTLOAD_KEY: String = "FASTLOAD"
  val TD_FASTEXPORT_KEY: String = "FASTEXPORT"
  val TD_FASTLOAD_KEY_LC: String = TD_FASTLOAD_KEY.toLowerCase
  val TD_FASTEXPORT_KEY_LC: String = TD_FASTEXPORT_KEY.toLowerCase

  // ---- JDBC READ configs ----
  val MAX_TD_JDBC_READ_PARTITIONS = 24
  val MAX_FAST_EXPORT_READ_PARTITIONS = 2
  val DEFAULT_READ_FETCH_SIZE = 1000
  val DEFAULT_LOWER_BOUND = 0
  val DEFAULT_UPPER_BOUND = 20
  val DEFAULT_READ_TYPE = "BATCH"
  val READ_OPERATION = "read"

  // ---- JDBC write configs ----
  val GIMEL_TEMP_PARTITION = "GIMEL_TEMP_PARTITION"
  val DEFAULT_WRITE_BATCH_SIZE = 10000
  val MAX_TD_JDBC_WRITE_PARTITIONS: Int = 24
  val MAX_FAST_LOAD_WRITE_PARTITIONS: Int = 2
  val DEFAULT_INSERT_STRATEGY = "insert"
  val DEFAULT_WRITE_TYPE = "BATCH"
  val WRITE_OPERATION = "write"
  val REPARTITION_METHOD = "repartition"
  val COALESCE_METHOD = "coalesce"

  // ---- partitions for Systems other than Teradata ----
  val DEFAULT_JDBC_READ_PARTITIONS = 100
  val DEFAULT_JDBC_WRTIE_PARTITIONS = 100
  val DEFAULT_JDBC_PER_PROCESS_MAX_ROWS_LIMIT: Long = 1000000000L
  val DEFAULT_JDBC_PER_PROCESS_MAX_ROWS_LIMIT_STRING: String =
    DEFAULT_JDBC_PER_PROCESS_MAX_ROWS_LIMIT.toString

  // ---- pushdown constants ----
  // Single non-nullable string column named QUERY_EXECUTION.
  val DEF_JDBC_PUSH_DOWN_SCHEMA: StructType = new StructType(
    Array(StructField("QUERY_EXECUTION", StringType, nullable = false)))
}
Example 162
Source File: RestApiConsumer.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.restapi.reader

import scala.language.implicitConversions

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import com.paypal.gimel.common.gimelservices.GimelServiceUtilities
import com.paypal.gimel.logger.Logger
import com.paypal.gimel.restapi.conf.RestApiClientConfiguration

object RestApiConsumer {

  val logger: Logger = Logger()
  val utils: GimelServiceUtilities = GimelServiceUtilities()

  /**
   * Fetches the payload from the configured URL (over HTTPS when `conf.httpsFlag`
   * is set) and exposes it as a DataFrame.
   *
   * With `conf.parsePayloadFlag` set the payload is parsed as JSON; otherwise the
   * raw payload becomes the single row of a one-column string DataFrame whose
   * column is named `conf.payloadFieldName`.
   */
  def consume(sparkSession: SparkSession, conf: RestApiClientConfiguration): DataFrame = {
    // Must stay a nested def invoked directly from this method: frame 1 of the
    // stack trace is then the enclosing method.
    def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName()
    logger.info(" @Begin --> " + MethodName)

    val url = conf.resolvedUrl.toString
    val responsePayload = if (conf.httpsFlag) utils.httpsGet(url) else utils.get(url)

    // Single-element RDD carrying the whole response body.
    def payloadRdd: RDD[String] = sparkSession.sparkContext.parallelize(Seq(responsePayload))

    if (conf.parsePayloadFlag) {
      logger.info("Parsing payload to fields - as requested.")
      sparkSession.sqlContext.read.json(payloadRdd)
    } else {
      logger.info("NOT Parsing payload.")
      val rows: RDD[Row] = payloadRdd.map(Row(_))
      val payloadSchema = StructType(Seq(StructField(conf.payloadFieldName, StringType)))
      sparkSession.sqlContext.createDataFrame(rows, payloadSchema)
    }
  }
}
Example 163
Source File: TransformerSpec.scala From modelmatrix with Apache License 2.0 | 5 votes |
package com.collective.modelmatrix.transform

import com.collective.modelmatrix.{ModelFeature, ModelMatrix, TestSparkContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalatest.FlatSpec

/** Checks that feature extraction reports every feature whose expression is invalid. */
class TransformerSpec extends FlatSpec with TestSparkContext {

  val sqlContext = ModelMatrix.sqlContext(sc)

  val schema = StructType(
    StructField("adv_site", StringType) ::
      StructField("adv_id", IntegerType) ::
      Nil)

  val input = Seq(
    Row("cnn.com", 1),
    Row("bbc.com", 2),
    Row("hbo.com", 1),
    Row("mashable.com", 3)
  )

  val isActive = true
  val withAllOther = true

  // Can't call 'day_of_week' with String
  val badFunctionType = ModelFeature(
    isActive, "advertisement", "f1", "day_of_week(adv_site, 'UTC')", Top(95.0, allOther = false))

  // Not enough parameters for 'concat'
  val wrongParametersCount = ModelFeature(
    isActive, "advertisement", "f2", "concat(adv_site)", Top(95.0, allOther = false))

  val df = sqlContext.createDataFrame(sc.parallelize(input), schema)

  "Transformer" should "report failed feature extraction" in {
    val requested = Seq(badFunctionType, wrongParametersCount)
    val extraction = Transformer.extractFeatures(df, requested)
    assert(extraction.isLeft)

    // One error per requested feature, in request order.
    val failures = extraction.fold(identity, _ => sys.error("Should not be here"))
    assert(failures.length == 2)
    assert(failures(0).feature == badFunctionType)
    assert(failures(1).feature == wrongParametersCount)
  }
}
Example 164
Source File: IdentityTransformerSpec.scala From modelmatrix with Apache License 2.0 | 5 votes |
package com.collective.modelmatrix.transform

import com.collective.modelmatrix.{ModelFeature, ModelMatrix, TestSparkContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalatest.FlatSpec

import scalaz.syntax.either._
import scalaz.{-\/, \/-}

/** Validation behaviour of [[IdentityTransformer]] for integer, missing and string columns. */
class IdentityTransformerSpec extends FlatSpec with TestSparkContext {

  val sqlContext = ModelMatrix.sqlContext(sc)

  val schema = StructType(
    StructField("adv_site", StringType) ::
      StructField("adv_id", IntegerType) ::
      Nil)

  val input = Seq(
    Row("cnn.com", 1),
    Row("bbc.com", 2),
    Row("hbo.com", 1),
    Row("mashable.com", 3)
  )

  val isActive = true
  val withAllOther = true

  val adSite = ModelFeature(isActive, "Ad", "ad_site", "adv_site", Identity)
  val adId = ModelFeature(isActive, "Ad", "ad_id", "adv_id", Identity)

  val df = sqlContext.createDataFrame(sc.parallelize(input), schema)

  // Feature extraction must succeed for the fixture; abort construction otherwise.
  val transformer = {
    val extracted = Transformer.extractFeatures(df, Seq(adSite, adId)) match {
      case \/-(suc) => suc
      case -\/(err) => sys.error(s"Can't extract features: $err")
    }
    new IdentityTransformer(extracted)
  }

  "Identity Transformer" should "support integer typed model feature" in {
    val expected = TypedModelFeature(adId, IntegerType).right
    assert(transformer.validate(adId) == expected)
  }

  it should "fail if feature column doesn't exists" in {
    val unknownFeature = adSite.copy(feature = "adv_site")
    val expected = FeatureTransformationError.FeatureColumnNotFound("adv_site").left
    assert(transformer.validate(unknownFeature) == expected)
  }

  it should "fail if column type is not supported" in {
    val expected =
      FeatureTransformationError.UnsupportedTransformDataType("ad_site", StringType, Identity).left
    assert(transformer.validate(adSite) == expected)
  }
}
Example 165
Source File: DataFrameNaFunctionsSpec.scala From spark-spec with MIT License | 5 votes |
package com.github.mrpowers.spark.spec.sql

import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.apache.spark.sql.Row
import org.scalatest._
import com.github.mrpowers.spark.daria.sql.SparkSessionExt._
import com.github.mrpowers.spark.spec.SparkSessionTestWrapper
import com.github.mrpowers.spark.fast.tests.DatasetComparer

/** Executable examples for the `DataFrame.na` functions. */
class DataFrameNaFunctionsSpec
    extends FunSpec
    with SparkSessionTestWrapper
    with DatasetComparer {

  import spark.implicits._

  describe("#drop") {

    it("drops rows that contains null values") {
      // Source and expected frames share the same two nullable integer columns.
      val twoNullableInts = StructType(
        List(
          StructField("num1", IntegerType, nullable = true),
          StructField("num2", IntegerType, nullable = true)
        )
      )

      def frameOf(rows: List[Row]) =
        spark.createDataFrame(spark.sparkContext.parallelize(rows), twoNullableInts)

      val sourceDF = frameOf(
        List(
          Row(1, null),
          Row(null, null),
          Row(3, 30),
          Row(10, 20)
        )
      )

      val actualDF = sourceDF.na.drop()

      val expectedDF = frameOf(
        List(
          Row(3, 30),
          Row(10, 20)
        )
      )

      assertSmallDatasetEquality(actualDF, expectedDF)
    }

  }

  describe("#fill") {

    it("Returns a new DataFrame that replaces null or NaN values in numeric columns with value") {
      val sourceDF = spark.createDF(
        List(
          (1, null),
          (null, null),
          (3, 30),
          (10, 20)
        ),
        List(
          ("num1", IntegerType, true),
          ("num2", IntegerType, true)
        )
      )

      val actualDF = sourceDF.na.fill(77)

      // After the fill the columns are expected to be non-nullable.
      val expectedDF = spark.createDF(
        List(
          (1, 77),
          (77, 77),
          (3, 30),
          (10, 20)
        ),
        List(
          ("num1", IntegerType, false),
          ("num2", IntegerType, false)
        )
      )

      assertSmallDatasetEquality(actualDF, expectedDF)
    }

  }

  describe("#replace") {
    pending
  }

}
Example 166
Source File: N1qlSpec.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.n1ql

import com.couchbase.client.core.CouchbaseException
import com.couchbase.client.java.error.QueryExecutionException
import com.couchbase.client.java.query.N1qlQuery
import org.apache.spark.{SparkConf, SparkContext, SparkException}
import org.apache.spark.sql.sources.EqualTo
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.scalatest._
import com.couchbase.spark._
import com.couchbase.spark.connection.CouchbaseConnection
import com.couchbase.spark.sql.N1QLRelation
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import scala.util.control.NonFatal

/**
 * Integration tests for N1QL access. They expect a local Couchbase cluster with the
 * `default` and `travel-sample` buckets, as configured in `beforeAll`.
 */
class N1qlSpec extends FunSuite with Matchers with BeforeAndAfterAll {

  private val master = "local[2]"
  private val appName = "cb-int-specs1"

  private var spark: SparkSession = _

  override def beforeAll(): Unit = {
    spark = SparkSession
      .builder()
      .master(master)
      .appName(appName)
      .config("spark.couchbase.username", "Administrator")
      .config("spark.couchbase.password", "password")
      // Open 2 buckets as tests below rely on it
      .config("com.couchbase.bucket.default", "")
      .config("com.couchbase.bucket.travel-sample", "")
      .getOrCreate()
  }

  override def afterAll(): Unit = {
    CouchbaseConnection().stop()
    spark.stop()
  }

  test("Creating N1QLRelation with default bucket, when two buckets exist, should fail") {
    assertThrows[IllegalStateException] {
      spark.read
        .format("com.couchbase.spark.sql.DefaultSource")
        // NOTE(review): "schemaFilter" is set twice; presumably the later value wins
        // and the first call is redundant — confirm before removing either.
        .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline")))
        .option("schemaFilter", "`type` = 'airline'")
        .schema(StructType(StructField("name", StringType) :: Nil))
        .load()
    }
  }

  test("Creating N1QLRelation with non-default bucket, when two buckets exist, should succeed") {
    spark.read
      .format("com.couchbase.spark.sql.DefaultSource")
      .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline")))
      .option("schemaFilter", "`type` = 'airline'")
      .option("bucket", "travel-sample")
      .schema(StructType(StructField("name", StringType) :: Nil))
      .load()
  }

  test("N1QL failures should fail the Observable") {
    // intercept fails the test with a descriptive message when nothing (or a
    // different exception type) is thrown, instead of funnelling the test-failure
    // exception through a generic catch-all handler.
    val thrown = intercept[SparkException] {
      spark.sparkContext
        .couchbaseQuery(N1qlQuery.simple("BAD QUERY"), bucketName = "default")
        .collect()
        .foreach(println)
    }

    thrown.getCause match {
      case err: QueryExecutionException =>
        assert(err.getMessage == "syntax error - at QUERY")
      case other =>
        fail(s"Expected a QueryExecutionException as the cause but got: $other")
    }
  }
}
Example 167
Source File: CouchbaseDataFrameSpec.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.sql

import com.couchbase.spark.connection.CouchbaseConnection
import org.apache.avro.generic.GenericData.StringType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession}
import org.apache.spark.sql.sources.EqualTo
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest._
import org.scalatest.junit.JUnitRunner

/**
 * Integration tests for the Couchbase DataFrame API. They expect a Couchbase node on
 * 127.0.0.1 with the `default` and `travel-sample` buckets, as configured in `beforeAll`.
 */
@RunWith(classOf[JUnitRunner])
class CouchbaseDataFrameSpec extends FlatSpec with Matchers with BeforeAndAfterAll {

  private val master = "local[2]"
  private val appName = "cb-int-specs1"

  private var spark: SparkSession = _

  override def beforeAll(): Unit = {
    val settings = Seq(
      "spark.couchbase.nodes" -> "127.0.0.1",
      "com.couchbase.username" -> "Administrator",
      "com.couchbase.password" -> "password",
      "com.couchbase.bucket.default" -> "",
      "com.couchbase.bucket.travel-sample" -> ""
    )
    val conf = settings.foldLeft(new SparkConf().setMaster(master).setAppName(appName)) {
      case (acc, (key, value)) => acc.set(key, value)
    }
    spark = SparkSession.builder().config(conf).getOrCreate()
    loadData()
  }

  override def afterAll(): Unit = {
    CouchbaseConnection().stop()
    spark.stop()
  }

  def loadData(): Unit = {
  }

  /** Reads all travel-sample documents whose `type` field equals `docType`. */
  private def readTravelSample(docType: String): DataFrame =
    spark.sqlContext.read.couchbase(EqualTo("type", docType), Map("bucket" -> "travel-sample"))

  "If two buckets are used and the bucket is specified the API" should "not fail" in {
    readTravelSample("airline")
  }

  "The DataFrame API" should "infer the schemas" in {
    import com.couchbase.spark.sql._

    val airline = readTravelSample("airline")
    // The remaining document types are only read here; nothing is asserted on them.
    val airport = readTravelSample("airport")
    val route = readTravelSample("route")
    val landmark = readTravelSample("landmark")

    airline
      .limit(10)
      .write
      .mode(SaveMode.Overwrite)
      .couchbase(Map("bucket" -> "default"))

    // TODO: validate schemas which are inferred on a field and type basis
  }

  it should "write and ignore" in {
    import com.couchbase.spark.sql._

    // create df, write it twice
    val person = ("Michael", 28, true)
    val df = spark.sqlContext.createDataFrame(spark.sparkContext.parallelize(Seq(person)))
    val writeOptions = Map("idField" -> "_1", "bucket" -> "default")

    df.write
      .mode(SaveMode.Ignore)
      .couchbase(options = writeOptions)

    df.write
      .mode(SaveMode.Ignore)
      .couchbase(options = writeOptions)
  }

  it should "filter based on a function" in {
    import com.couchbase.spark.sql._

    val airlineBySubstrCountry: DataFrame = spark.sqlContext.read.couchbase(
      EqualTo("'substr(country, 0, 6)'", "United"),
      Map("bucket" -> "travel-sample"))

    airlineBySubstrCountry.count() should equal(6797)
  }
}
Example 168
Source File: StatisticsTest.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.statistics

import java.io.ByteArrayOutputStream

import scala.collection.mutable.ArrayBuffer

import org.scalatest.BeforeAndAfterEach

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.BaseOrdering
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering
import org.apache.spark.sql.execution.datasources.oap.filecache.FiberCache
import org.apache.spark.sql.execution.datasources.oap.index.RangeInterval
import org.apache.spark.sql.execution.datasources.oap.utils.{NonNullKeyReader, NonNullKeyWriter}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.unsafe.Platform
import org.apache.spark.unsafe.memory.MemoryBlock
import org.apache.spark.unsafe.types.UTF8String

/**
 * Base class for statistics tests: provides a two-column (int, string) schema, row
 * generation, key reader/writer and orderings, plus a per-test output buffer and a
 * reusable interval list.
 */
abstract class StatisticsTest extends SparkFunSuite with BeforeAndAfterEach {

  /** Builds the row `(i, "test#i")`. */
  protected def rowGen(i: Int): InternalRow =
    InternalRow(i, UTF8String.fromString(s"test#$i"))

  protected lazy val schema: StructType =
    StructType(Seq(StructField("a", IntegerType), StructField("b", StringType)))

  @transient
  protected lazy val nnkw: NonNullKeyWriter = new NonNullKeyWriter(schema)

  @transient
  protected lazy val nnkr: NonNullKeyReader = new NonNullKeyReader(schema)

  @transient
  protected lazy val ordering: BaseOrdering = GenerateOrdering.create(schema)

  // Ordering over the schema with its last column removed.
  @transient
  protected lazy val partialOrdering: BaseOrdering =
    GenerateOrdering.create(StructType(schema.dropRight(1)))

  protected var out: ByteArrayOutputStream = _

  protected var intervalArray: ArrayBuffer[RangeInterval] = new ArrayBuffer[RangeInterval]()

  override def beforeEach(): Unit = {
    out = new ByteArrayOutputStream(8000)
  }

  override def afterEach(): Unit = {
    out.close()
    intervalArray.clear()
  }

  /** Resets `intervalArray` so that it holds exactly the one described interval. */
  protected def generateInterval(
      start: InternalRow,
      end: InternalRow,
      startInclude: Boolean,
      endInclude: Boolean): Unit = {
    intervalArray.clear()
    intervalArray += new RangeInterval(start, end, startInclude, endInclude)
  }

  /** Asserts that both rows compare equal with `==`. */
  protected def checkInternalRow(row1: InternalRow, row2: InternalRow): Unit = {
    val res = row1 == row2 // it works..
    assert(res, s"row1: $row1 does not match $row2")
  }

  /** Wraps the bytes written to `out` so far in a [[FiberCache]]. */
  protected def wrapToFiberCache(out: ByteArrayOutputStream): FiberCache =
    FiberCache(out.toByteArray)
}
Example 169
Source File: ProxyFeedback.scala From oni-ml with Apache License 2.0 | 5 votes |
package org.opennetworkinsight.proxy

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StructType, StructField, StringType}

import scala.io.Source

import org.opennetworkinsight.proxy.ProxySchema._

object ProxyFeedback {

  /**
   * Loads analyst feedback for proxy records as a DataFrame.
   *
   * The feedback file is tab separated with one header line. Only rows whose value
   * at column index 22 equals 3 are kept, and each kept row is repeated
   * `duplicationFactor` times. When the file does not exist an empty DataFrame with
   * the same schema is returned.
   *
   * NOTE(review): a row with fewer than 23 columns, or a non-numeric value at index
   * 22, makes the Spark job fail — confirm the file format guarantees both.
   *
   * @param sc                Spark context used to distribute the file lines
   * @param sqlContext        SQL context used to build the DataFrame
   * @param feedbackFile      path of the feedback file on the local file system
   * @param duplicationFactor number of copies emitted for every kept row
   * @return DataFrame with the nine feedback columns, all nullable strings
   */
  def loadFeedbackDF(sc: SparkContext,
                     sqlContext: SQLContext,
                     feedbackFile: String,
                     duplicationFactor: Int): DataFrame = {

    val feedbackSchema = StructType(
      List(StructField(Date, StringType, nullable = true),
        StructField(Time, StringType, nullable = true),
        StructField(ClientIP, StringType, nullable = true),
        StructField(Host, StringType, nullable = true),
        StructField(ReqMethod, StringType, nullable = true),
        StructField(UserAgent, StringType, nullable = true),
        StructField(ResponseContentType, StringType, nullable = true),
        StructField(RespCode, StringType, nullable = true),
        StructField(FullURI, StringType, nullable = true)))

    if (new java.io.File(feedbackFile).exists) {

      // Column positions inside the tab separated feedback file.
      val dateIndex = 0
      val timeIndex = 1
      val clientIpIndex = 2
      val hostIndex = 3
      val reqMethodIndex = 4
      val userAgentIndex = 5
      val resContTypeIndex = 6
      val respCodeIndex = 11
      val fullURIIndex = 18
      val fullURISeverityIndex = 22

      // Read everything eagerly (skipping the header) and always close the file
      // handle, which was previously left open.
      val source = Source.fromFile(feedbackFile)
      val lines =
        try source.getLines().toArray.drop(1)
        finally source.close()

      val feedback: RDD[String] = sc.parallelize(lines)

      sqlContext.createDataFrame(feedback.map(_.split("\t"))
        .filter(row => row(fullURISeverityIndex).trim.toInt == 3)
        .map(row => Row.fromSeq(List(row(dateIndex),
          row(timeIndex),
          row(clientIpIndex),
          row(hostIndex),
          row(reqMethodIndex),
          row(userAgentIndex),
          row(resContTypeIndex),
          row(respCodeIndex),
          row(fullURIIndex))))
        .flatMap(row => List.fill(duplicationFactor)(row)), feedbackSchema)
        .select(Date, Time, ClientIP, Host, ReqMethod, UserAgent, ResponseContentType, RespCode, FullURI)
    } else {
      sqlContext.createDataFrame(sc.emptyRDD[Row], feedbackSchema)
    }
  }
}
Example 170
Source File: CountByRatingChart.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples import java.awt.Font import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.{IntegerType, StructField, StructType} import org.jfree.chart.axis.CategoryLabelPositions import scalax.chart.module.ChartFactories val customSchema = StructType(Array( StructField("user_id", IntegerType, true), StructField("movie_id", IntegerType, true), StructField("rating", IntegerType, true), StructField("timestamp", IntegerType, true))) val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp") val spark = SparkSession .builder() .appName("SparkRatingData").config(spConfig) .getOrCreate() val rating_df = spark.read.format("com.databricks.spark.csv") .option("delimiter", "\t").schema(customSchema) .load("../../data/ml-100k/u.data") val rating_df_count = rating_df.groupBy("rating").count().sort("rating") //val rating_df_count_sorted = rating_df_count.sort("count") rating_df_count.show() val rating_df_count_collection = rating_df_count.collect() val ds = new org.jfree.data.category.DefaultCategoryDataset val mx = scala.collection.immutable.ListMap() for( x <- 0 until rating_df_count_collection.length) { val occ = rating_df_count_collection(x)(0) val count = Integer.parseInt(rating_df_count_collection(x)(1).toString) ds.addValue(count,"UserAges", occ.toString) } //val sorted = ListMap(ratings_count.toSeq.sortBy(_._1):_*) //val ds = new org.jfree.data.category.DefaultCategoryDataset //sorted.foreach{ case (k,v) => ds.addValue(v,"Rating Values", k)} val chart = ChartFactories.BarChart(ds) val font = new Font("Dialog", Font.PLAIN,5); chart.peer.getCategoryPlot.getDomainAxis(). setCategoryLabelPositions(CategoryLabelPositions.UP_90); chart.peer.getCategoryPlot.getDomainAxis.setLabelFont(font) chart.show() Util.sc.stop() } }
Example 171
Source File: UserRatingsChart.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import scalax.chart.module.ChartFactories

/**
 * Plots, as a bar chart, how many ratings each user submitted in the tab separated
 * ratings file.
 */
object UserRatingsChart {

  def main(args: Array[String]): Unit = {
    val customSchema = StructType(Array(
      StructField("user_id", IntegerType, true),
      StructField("movie_id", IntegerType, true),
      StructField("rating", IntegerType, true),
      StructField("timestamp", IntegerType, true)))

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder()
      .appName("SparkRatingData").config(spConfig)
      .getOrCreate()

    val rating_df = spark.read.format("com.databricks.spark.csv")
      .option("delimiter", "\t").schema(customSchema)
      .load("../../data/ml-100k/u.data")

    // Number of ratings per user, ascending by count.
    val rating_nos_by_user = rating_df.groupBy("user_id").count().sort("count")

    // Collect once and reuse the rows for both the row count passed to show() and
    // the chart data, instead of running the same job twice.
    val rating_nos_by_user_collect = rating_nos_by_user.collect()
    rating_nos_by_user.show(rating_nos_by_user_collect.length)

    val ds = new org.jfree.data.category.DefaultCategoryDataset
    rating_nos_by_user_collect.foreach { row =>
      val user_id = Integer.parseInt(row(0).toString)
      val count = Integer.parseInt(row(1).toString)
      ds.addValue(count, "Ratings", user_id)
    }

    val chart = ChartFactories.BarChart(ds)
    chart.peer.getCategoryPlot.getDomainAxis().setVisible(false)
    chart.show()

    // NOTE(review): this stops a context owned by Util rather than the session
    // created above — confirm Util.sc is the context this job actually runs on.
    Util.sc.stop()
  }
}
Example 172
Source File: UserData.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.df

//import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * Prints simple statistics about the pipe separated user file: first record, number
 * of users, distinct genders, occupations and zip codes, and the per-occupation
 * distribution.
 */
package object UserData {

  def main(args: Array[String]): Unit = {
    val customSchema = StructType(Array(
      StructField("no", IntegerType, true),
      StructField("age", StringType, true),
      StructField("gender", StringType, true),
      StructField("occupation", StringType, true),
      StructField("zipCode", StringType, true)))

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder()
      .appName("SparkUserData").config(spConfig)
      .getOrCreate()

    try {
      val user_df = spark.read.format("com.databricks.spark.csv")
        .option("delimiter", "|").schema(customSchema)
        .load("/home/ubuntu/work/ml-resources/spark-ml/data/ml-100k/u.user")

      val first = user_df.first()
      println("First Record : " + first)

      // Counting the grouped frame yields the number of distinct values.
      val num_genders = user_df.groupBy("gender").count().count()
      val num_occupations = user_df.groupBy("occupation").count().count()
      val num_zipcodes = user_df.groupBy("zipCode").count().count()

      println("num_users : " + user_df.count())
      println("num_genders : " + num_genders)
      println("num_occupations : " + num_occupations)
      println("num_zipcodes: " + num_zipcodes)

      println("Distribution by Occupation")
      // show() prints the table itself and returns Unit; wrapping it in println
      // only appended a stray "()" line to the output.
      user_df.groupBy("occupation").count().show()
    } finally {
      spark.stop()
    }
  }
}
Example 173
Source File: Util.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object Util {

  val PATH = "/home/ubuntu/work/spark-2.0.0-bin-hadoop2.7/"
  val DATA_PATH = "../../../data/ml-100k"
  val PATH_MOVIES = DATA_PATH + "/u.item"

  /**
   * Collapses a vector to two numbers: the sum of its first half and the sum of its
   * second half, rendered as `"<firstSum>,<secondSum>"`.
   *
   * The first half holds `size / 2` elements; for an odd size the middle element
   * belongs to the second half. The earlier loop started the second half one index
   * too late and silently dropped the element at index `size / 2`.
   *
   * @param x the vector to reduce
   * @return the two partial sums joined by a comma
   */
  def reduceDimension2(x: Vector): String = {
    val values = x.toArray
    val (firstHalf, secondHalf) = values.splitAt(values.length / 2)
    s"${firstHalf.sum},${secondHalf.sum}"
  }

  /**
   * Reads the pipe separated movie file at [[PATH_MOVIES]], keeping the first four
   * columns (id, name, date, url) as nullable strings.
   */
  def getMovieDataDF(spark: SparkSession): DataFrame = {
    //1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
    // |0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
    val customSchema = StructType(Array(
      StructField("id", StringType, true),
      StructField("name", StringType, true),
      StructField("date", StringType, true),
      StructField("url", StringType, true)))

    spark.read.format("com.databricks.spark.csv")
      .option("delimiter", "|").schema(customSchema)
      .load(PATH_MOVIES)
  }
}
Example 174
Source File: PrettifyTest.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalacheck.Gen
import org.scalacheck.Prop._
import org.scalacheck.util.Pretty
import org.scalatest.FunSuite
import org.scalatest.exceptions.GeneratorDrivenPropertyCheckFailedException
import org.scalatest.prop.Checkers

/**
 * Checks the text that a failing property check reports for generated DataFrames, RDDs
 * and Datasets.
 */
class PrettifyTest extends FunSuite with SharedSparkContext with Checkers with Prettify {
  implicit val propertyCheckConfig = PropertyCheckConfig(minSize = 2, maxSize = 2)

  /** Constant (name, age) pair shared by the RDD and Dataset tests. */
  private def personGen: Gen[(String, Int)] =
    for {
      name <- Gen.const("Holden Hanafy")
      age <- Gen.const(20)
    } yield name -> age

  test("pretty output of DataFrame's check") {
    val sqlContext = new SQLContext(sc)
    val schema = StructType(List(
      StructField("name", StringType),
      StructField("age", IntegerType)))
    val nameGenerator = new Column("name", Gen.const("Holden Hanafy"))
    val ageGenerator = new Column("age", Gen.const(20))
    val dataframeGen = DataframeGenerator
      .arbitraryDataFrameWithCustomFields(sqlContext, schema)(nameGenerator, ageGenerator)

    val reported = runFailingCheck(dataframeGen.arbitrary)
    val wanted =
      Some("arg0 = <DataFrame: schema = [name: string, age: int], size = 2, values = ([Holden Hanafy,20], [Holden Hanafy,20])>")
    assert(reported == wanted)
  }

  test("pretty output of RDD's check") {
    val rddGen = RDDGenerator.genRDD[(String, Int)](sc)(personGen)

    val reported = runFailingCheck(rddGen)
    val wanted =
      Some("""arg0 = <RDD: size = 2, values = ((Holden Hanafy,20), (Holden Hanafy,20))>""")
    assert(reported == wanted)
  }

  test("pretty output of Dataset's check") {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    val datasetGen = DatasetGenerator.genDataset[(String, Int)](sqlContext)(personGen)

    val reported = runFailingCheck(datasetGen)
    val wanted =
      Some("""arg0 = <Dataset: schema = [_1: string, _2: int], size = 2, values = ((Holden Hanafy,20), (Holden Hanafy,20))>""")
    assert(reported == wanted)
  }

  /**
   * Runs a property that is false for every generated value and returns the trimmed
   * second-to-last line of the resulting failure message.
   */
  private def runFailingCheck[T](genUnderTest: Gen[T])(implicit p: T => Pretty) = {
    val alwaysFalse = forAll(genUnderTest)(_ => false)
    val failure = intercept[GeneratorDrivenPropertyCheckFailedException] {
      check(alwaysFalse)
    }
    takeSecondToLastLine(failure.message)
  }

  /** Trimmed second-to-last line of `msg`, if the message has one. */
  private def takeSecondToLastLine(msg: Option[String]) =
    msg.flatMap { text =>
      val linesLastFirst = text.split("\n").toList.reverse
      linesLastFirst.tail.headOption.map(_.trim)
    }
}
Example 175
Source File: MLScalaCheckTest.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing

import org.apache.spark.ml.linalg.SQLDataTypes.{MatrixType, VectorType}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{StructField, StructType}
import org.scalacheck.Prop.forAll
import org.scalatest.FunSuite
import org.scalatest.prop.Checkers

/** Property checks that DataFrames can be generated for ML vector and matrix columns. */
class MLScalaCheckTest extends FunSuite with SharedSparkContext with Checkers {
  // This suite sets the context re-use flag to false.
  override implicit def reuseContextIfPossible: Boolean = false

  test("vector generation") {
    checkGeneratedFrames(StructType(List(StructField("vector", VectorType))))
  }

  test("matrix generation") {
    checkGeneratedFrames(StructType(List(StructField("matrix", MatrixType))))
  }

  /**
   * Generates arbitrary DataFrames for `schema` and checks that each one reports exactly
   * that schema and a non-negative row count.
   */
  private def checkGeneratedFrames(schema: StructType): Unit = {
    val sqlContext = new SQLContext(sc)
    val frames = DataframeGenerator.arbitraryDataFrame(sqlContext, schema)
    val property = forAll(frames.arbitrary) { frame =>
      frame.schema === schema && frame.count >= 0
    }
    check(property)
  }
}
Example 176
Source File: PartitionAndSleepWorkload.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.workload.exercise

import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions._
import com.ibm.sparktc.sparkbench.utils.SaveModes
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

/** Defaults and config-map constructor for the "timedsleep" workload. */
object PartitionAndSleepWorkload extends WorkloadDefaults {
  val name = "timedsleep"
  val partitions: Int = 48
  val sleepms: Long = 12000L

  /** Builds the workload from a config map, using the defaults above for missing keys. */
  def apply(m: Map[String, Any]) = {
    val partitionCount = getOrDefault[Int](m, "partitions", partitions)
    val sleepMillis = getOrDefault[Long](m, "sleepms", sleepms, any2Long)
    new PartitionAndSleepWorkload(
      input = None,
      output = None,
      partitions = partitionCount,
      sleepMS = sleepMillis)
  }
}

/**
 * Workload that sleeps `sleepMS` milliseconds for every element of a
 * `partitions * 100`-element RDD spread over `partitions` partitions, then reduces by key.
 */
case class PartitionAndSleepWorkload(input: Option[String] = None,
                                     output: Option[String] = None,
                                     saveMode: String = SaveModes.error,
                                     partitions: Int,
                                     sleepMS: Long) extends Workload {

  /** Runs the sleep-and-reduce job and returns its duration as measured by `time`. */
  def doStuff(spark: SparkSession): (Long, Unit) = time {
    // Local copy of the field; presumably so the task closure captures the value only.
    val ms = sleepMS
    spark.sparkContext
      .parallelize(0 until partitions * 100, partitions)
      .map { i =>
        Thread.sleep(ms)
        (i % 10, i + 42)
      }
      .reduceByKey(_ + _)
      .collect()
  }

  /** Executes the job and returns a one-row DataFrame: name, timestamp, runtime. */
  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {
    val (runtime, _) = doStuff(spark)

    val resultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("runtime", LongType, nullable = false)
      )
    )

    // The timestamp is taken after the job has finished, as before.
    val resultRow = Row("timedsleep", System.currentTimeMillis(), runtime)
    spark.createDataFrame(spark.sparkContext.parallelize(Seq(resultRow)), resultSchema)
  }
}
Example 177
Source File: GraphDataGen.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.datageneration

import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import com.ibm.sparktc.sparkbench.utils.{SaveModes, SparkBenchException}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions.{any2Long, getOrDefault, getOrThrow, time}
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.apache.spark.graphx.util.GraphGenerators

/** Defaults and config-map constructor for the log-normal graph data generator. */
object GraphDataGen extends WorkloadDefaults {
  val name = "graph-data-generator"
  val defaultMu = 4.0
  val defaultSigma = 1.3
  val defaultSeed = -1L
  val defaultNumOfPartitions = 0

  /**
   * Builds the generator from a config map.
   *
   * "vertices" and "output" are mandatory; "mu", "sigma", "partitions", "seed" and
   * "save-mode" fall back to the defaults above.
   */
  override def apply(m: Map[String, Any]): GraphDataGen = {
    val numVertices = getOrThrow(m, "vertices").asInstanceOf[Int]
    val mu = getOrDefault[Double](m, "mu", defaultMu)
    val sigma = getOrDefault[Double](m, "sigma", defaultSigma)
    val numPartitions = getOrDefault[Int](m, "partitions", defaultNumOfPartitions)
    val seed = getOrDefault[Long](m, "seed", defaultSeed, any2Long)
    val output = {
      val str = getOrThrow(m, "output").asInstanceOf[String]
      Some(verifySuitabilityOfOutputFileFormat(str))
    }
    val saveMode = getOrDefault[String](m, "save-mode", SaveModes.error)

    new GraphDataGen(
      numVertices = numVertices,
      input = None,
      output = output,
      saveMode = saveMode,
      mu = mu,
      sigma = sigma,
      seed = seed,
      numPartitions = numPartitions
    )
  }

  /**
   * Accepts `str` only when splitting it on '.' yields exactly two parts and the second
   * part is "txt".
   *
   * @return `str` unchanged when it is acceptable
   * @throws SparkBenchException when there is no extension or the name is otherwise unsuitable
   */
  private[datageneration] def verifySuitabilityOfOutputFileFormat(str: String): String =
    str.split('.') match {
      case Array(_, "txt") => str
      // length 0 happens for inputs made only of dots (e.g. "."), which previously crashed
      // on `.last`; treat it like a missing extension.
      case parts if parts.length <= 1 =>
        throw SparkBenchException("Output file for GraphDataGen must have \".txt\" as the file extension. " +
          "Please modify your config file.")
      case _ =>
        throw SparkBenchException("Due to limitations of the GraphX GraphLoader, " +
          "the graph data generators may only save files as \".txt\". " +
          "Please modify your config file.")
    }
}

/**
 * Generates a log-normal graph, writes its edges as "src dst" text lines to `output`, and
 * reports how long each phase took.
 */
case class GraphDataGen (
                          numVertices: Int,
                          input: Option[String] = None,
                          output: Option[String],
                          saveMode: String,
                          mu: Double = 4.0,
                          sigma: Double = 1.3,
                          seed: Long = 1,
                          numPartitions: Int = 0
                        ) extends Workload {

  /**
   * @return one-row DataFrame with the generate / convert / save timings and their total
   */
  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {
    val timestamp = System.currentTimeMillis()

    // `seed` is forwarded so that a configured seed actually takes effect; it used to be
    // parsed and then ignored.
    val (generateTime, graph) = time(
      GraphGenerators.logNormalGraph(spark.sparkContext, numVertices, numPartitions, mu, sigma, seed))
    val (convertTime, out) = time(graph.edges.map(e => s"${e.srcId.toString} ${e.dstId}"))
    val (saveTime, _) = time(out.saveAsTextFile(output.get))

    val timeResultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("generate", LongType, nullable = true),
        StructField("convert", LongType, nullable = true),
        StructField("save", LongType, nullable = true),
        StructField("total_runtime", LongType, nullable = false)
      )
    )

    val total = generateTime + convertTime + saveTime
    val timeList = spark.sparkContext.parallelize(
      Seq(Row(GraphDataGen.name, timestamp, generateTime, convertTime, saveTime, total)))
    spark.createDataFrame(timeList, timeResultSchema)
  }
}
Example 178
Source File: LinearRegressionDataGen.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.datageneration.mlgenerator

import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import com.ibm.sparktc.sparkbench.utils.{SaveModes, SparkBenchException}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions.{getOrDefault, getOrThrow, time}
import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

/** Defaults and config-map constructor for the linear-regression data generator. */
object LinearRegressionDataGen extends WorkloadDefaults {
  val name = "data-generation-lr"
  // Application parameters #1million points have 200M data size
  val numOfExamples: Int = 40000
  val numOfFeatures: Int = 4
  val eps: Double = 0.5
  val intercepts: Double = 0.1
  val numOfPartitions: Int = 10
  val maxIteration: Int = 3

  /**
   * Builds the generator from a config map. "rows", "cols" and "output" are mandatory; the
   * remaining keys fall back to the defaults above.
   */
  override def apply(m: Map[String, Any]) = new LinearRegressionDataGen(
    numRows = getOrThrow(m, "rows").asInstanceOf[Int],
    numCols = getOrThrow(m, "cols").asInstanceOf[Int],
    output = Some(getOrThrow(m, "output").asInstanceOf[String]),
    saveMode = getOrDefault[String](m, "save-mode", SaveModes.error),
    eps = getOrDefault[Double](m, "eps", eps),
    intercepts = getOrDefault[Double](m, "intercepts", intercepts),
    numPartitions = getOrDefault[Int](m, "partitions", numOfPartitions)
  )
}

/**
 * Generates labeled points with `LinearDataGenerator`, writes them to `output`, and reports
 * how long each phase took.
 */
case class LinearRegressionDataGen (
                                     numRows: Int,
                                     numCols: Int,
                                     input: Option[String] = None,
                                     output: Option[String],
                                     saveMode: String,
                                     eps: Double,
                                     intercepts: Double,
                                     numPartitions: Int
                                   ) extends Workload {

  /**
   * @return one-row DataFrame with the generate / convert / save timings and their total
   * @throws SparkBenchException when `output` ends with ".csv"
   */
  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {
    // Reject the unsupported target before any data is generated instead of after.
    //TODO you can't output this to CSV. Parquet is fine
    if (output.exists(_.endsWith(".csv"))) {
      throw SparkBenchException("LabeledPoints cannot be saved to CSV. Please try outputting to Parquet instead.")
    }

    val timestamp = System.currentTimeMillis()

    val (generateTime, data): (Long, RDD[LabeledPoint]) = time {
      LinearDataGenerator.generateLinearRDD(
        spark.sparkContext,
        numRows,
        numCols,
        eps,
        numPartitions,
        intercepts
      )
    }

    import spark.implicits._
    val (convertTime, dataDF) = time {
      data.toDF
    }

    val (saveTime, _) = time {
      writeToDisk(output.get, saveMode, dataDF, spark)
    }

    val timeResultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("generate", LongType, nullable = true),
        StructField("convert", LongType, nullable = true),
        StructField("save", LongType, nullable = true),
        StructField("total_runtime", LongType, nullable = false)
      )
    )

    val total = generateTime + convertTime + saveTime
    // Label the row with this workload's own name; it used to carry a copy-pasted "kmeans".
    val timeList = spark.sparkContext.parallelize(
      Seq(Row(LinearRegressionDataGen.name, timestamp, generateTime, convertTime, saveTime, total)))
    spark.createDataFrame(timeList, timeResultSchema)
  }
}
Example 179
Source File: KMeansDataGen.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.datageneration.mlgenerator

import com.ibm.sparktc.sparkbench.workload.ml.KMeansWorkload
import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions._
import com.ibm.sparktc.sparkbench.utils.SaveModes
import org.apache.spark.mllib.util.KMeansDataGenerator
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types._

/** Config-map constructor for the k-means data generator. */
object KMeansDataGen extends WorkloadDefaults {
  val name = "data-generation-kmeans"

  /**
   * Builds the generator from a config map. "rows", "cols" and "output" are mandatory; "k",
   * "scaling" and "partitions" default to the values defined on `KMeansWorkload`.
   */
  override def apply(m: Map[String, Any]) = new KMeansDataGen(
    numRows = getOrThrow(m, "rows").asInstanceOf[Int],
    numCols = getOrThrow(m, "cols").asInstanceOf[Int],
    output = Some(getOrThrow(m, "output").asInstanceOf[String]),
    saveMode = getOrDefault[String](m, "save-mode", SaveModes.error),
    k = getOrDefault[Int](m, "k", KMeansWorkload.numOfClusters),
    scaling = getOrDefault[Double](m, "scaling", KMeansWorkload.scaling),
    numPartitions = getOrDefault[Int](m, "partitions", KMeansWorkload.numOfPartitions)
  )
}

/**
 * Generates k-means input with `KMeansDataGenerator`, converts it to a DataFrame with
 * columns `c0, c1, ...`, writes it to `output`, and reports how long each phase took.
 */
case class KMeansDataGen(
                          numRows: Int,
                          numCols: Int,
                          input: Option[String] = None,
                          output: Option[String],
                          saveMode: String,
                          k: Int,
                          scaling: Double,
                          numPartitions: Int
                        ) extends Workload {

  /**
   * @return one-row DataFrame with the generate / convert / save timings and their total
   */
  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {
    val timestamp = System.currentTimeMillis()

    val (generateTime, data): (Long, RDD[Array[Double]]) = time {
      KMeansDataGenerator.generateKMeansRDD(
        spark.sparkContext,
        numRows,
        k,
        numCols,
        scaling,
        numPartitions
      )
    }

    val (convertTime, dataDF) = time {
      // One non-nullable double column per array position, named c0, c1, ...
      // Built directly from the indices; the previous join-on-space / split-on-space detour
      // produced a single ""-named field when the rows had no elements.
      val fields = data.first().indices.map { i =>
        StructField("c" + i.toString, DoubleType, nullable = false)
      }
      val schema = StructType(fields)
      val rowRDD = data.map(arr => Row(arr: _*))
      spark.createDataFrame(rowRDD, schema)
    }

    val (saveTime, _) = time {
      writeToDisk(output.get, saveMode, dataDF, spark)
    }

    val timeResultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("generate", LongType, nullable = true),
        StructField("convert", LongType, nullable = true),
        StructField("save", LongType, nullable = true),
        StructField("total_runtime", LongType, nullable = false)
      )
    )

    val total = generateTime + convertTime + saveTime
    // NOTE(review): the row is labelled "kmeans" rather than KMeansDataGen.name
    // ("data-generation-kmeans"); left unchanged, confirm which one consumers expect.
    val timeList = spark.sparkContext.parallelize(
      Seq(Row("kmeans", timestamp, generateTime, convertTime, saveTime, total)))
    spark.createDataFrame(timeList, timeResultSchema)
  }
}
Example 180
Source File: BuildAndTeardownData.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.testfixtures

import java.io.File

import com.holdenkarau.spark.testing.Utils
import com.ibm.sparktc.sparkbench.utils.SaveModes
import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk
import com.ibm.sparktc.sparkbench.workload.ml.KMeansWorkload
import org.apache.spark.mllib.util.KMeansDataGenerator
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

/**
 * Test fixture that owns a scratch directory tree under /tmp/spark-bench-scalatest and can
 * fill it with generated k-means data.
 *
 * @param dirname sub-directory name; defaults to the current time in milliseconds
 */
class BuildAndTeardownData(dirname: String = System.currentTimeMillis.toString) {
  val prefix = "/tmp/spark-bench-scalatest/" + dirname
  val sparkBenchTestFolder = s"$prefix/spark-bench-test"
  val kmeansFile = s"$sparkBenchTestFolder/kmeans-data.parquet"
  val sparkBenchDemoFolder = s"$prefix/spark-bench-demo"
  val spark = SparkSessionProvider.spark

  /** Creates the test and demo folders, including missing parents. */
  def createFolders(): Unit = {
    Seq(sparkBenchTestFolder, sparkBenchDemoFolder).foreach(path => new File(path).mkdirs())
  }

  /** Removes everything below `prefix`. */
  def deleteFolders(): Unit = {
    Utils.deleteRecursively(new File(prefix))
  }

  /**
   * Generates k-means data with `rows` rows and `cols` columns and writes it to
   * `outputFile`, overwriting any existing data. Columns are named "0", "1", ...
   */
  def generateKMeansData(rows: Int, cols: Int, outputFile: String): Unit = {
    val data: RDD[Array[Double]] = KMeansDataGenerator.generateKMeansRDD(
      spark.sparkContext,
      rows,
      KMeansWorkload.numOfClusters,
      cols,
      KMeansWorkload.scaling,
      KMeansWorkload.numOfPartitions
    )

    // Build the fields straight from the indices; joining the names with spaces and
    // splitting them again yielded one ""-named field for zero-width rows.
    val fields = data.first().indices.map { i =>
      StructField(i.toString, DoubleType, nullable = false)
    }
    val schema = StructType(fields)
    val rowRDD = data.map(arr => Row(arr: _*))
    val df = spark.createDataFrame(rowRDD, schema)

    writeToDisk(outputFile, SaveModes.overwrite, df, spark)
  }
}
Example 181
Source File: KMeansWorkloadTest.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.workload.ml

import java.io.File

import com.holdenkarau.spark.testing.Utils
import com.ibm.sparktc.sparkbench.testfixtures.SparkSessionProvider
import com.ibm.sparktc.sparkbench.utils.SaveModes
import com.ibm.sparktc.sparkbench.utils.SparkFuncs.{load, writeToDisk}
import org.apache.spark.mllib.util.KMeansDataGenerator
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers}

/** Exercises schema reconciliation, loading and execution of `KMeansWorkload`. */
class KMeansWorkloadTest extends FlatSpec with Matchers with BeforeAndAfterEach {
  private val spark = SparkSessionProvider.spark
  private val fileName = s"/tmp/spark-bench-scalatest/kmeans-${java.util.UUID.randomUUID.toString}.csv"

  // Explicit `: Unit =` replaces the deprecated procedure syntax.
  override def afterEach(): Unit = {
    Utils.deleteRecursively(new File(fileName))
  }

  /** Generates a 1x1 k-means dataset as a DataFrame with double columns "0", "1", ... */
  def makeDataFrame(): DataFrame = {
    val data: RDD[Array[Double]] = KMeansDataGenerator.generateKMeansRDD(
      spark.sparkContext, 1, 1, 1, KMeansWorkload.scaling, KMeansWorkload.numOfPartitions
    )
    // Fields are derived directly from the indices rather than through a
    // join-on-space / split-on-space round trip.
    val fields = data.first().indices.map { i =>
      StructField(i.toString, DoubleType, nullable = false)
    }
    val schema = StructType(fields)
    val rowRDD = data.map(arr => Row(arr: _*))
    spark.createDataFrame(rowRDD, schema)
  }

  "reconcileSchema" should "handle a StringType schema and turn it into a DoubleType Schema" in {
    val df2Disk = makeDataFrame()
    writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv"))
    val conf = Map("name" -> "kmeans", "input" -> fileName)
    val work = KMeansWorkload(conf)
    val df = load(spark, fileName)
    val ddf = work.reconcileSchema(df)
    ddf.schema.head.dataType shouldBe DoubleType
  }

  "The load function" should "parse the DataFrame it's given into an RDD[Vector]" in {
    val df = makeDataFrame()
    val conf = Map("name" -> "kmeans", "input" -> "")
    val work = KMeansWorkload(conf)
    val ddf = work.reconcileSchema(df)
    val (_, rdd) = work.loadToCache(ddf, spark)
    rdd.first()
  }

  it should "work even when we've pulled the data from disk" in {
    val df2Disk = makeDataFrame()
    writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv"))
    val conf = Map("name" -> "kmeans", "input" -> fileName)
    val work = KMeansWorkload(conf)
    val df = load(spark, fileName)
    val ddf = work.reconcileSchema(df)
    val (_, rdd) = work.loadToCache(ddf, spark)
    rdd.first()
  }

  "doWorkload" should "work" in {
    val df2Disk = makeDataFrame()
    writeToDisk(fileName, SaveModes.error, df2Disk, spark, Some("csv"))
    val conf = Map("name" -> "kmeans", "input" -> fileName)
    val work = KMeansWorkload(conf)
    val df = load(spark, fileName)
    val ddf = work.reconcileSchema(df)
    work.doWorkload(Some(ddf), spark)
  }
}
Example 182
Source File: MetadataUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util

import scala.collection.immutable.HashMap

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.types.StructField

  /**
   * Resolves feature names to their indices using the attribute metadata stored on a
   * vector column.
   *
   * @param col   column whose data type must be a `VectorUDT`
   * @param names feature names to look up
   * @return the index of each name, in the order the names were given
   */
  def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
    require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
      + s" to be Vector type, but it was type ${col.dataType} instead.")
    val attrGroup = AttributeGroup.fromStructField(col)
    names.map { featureName =>
      require(attrGroup.hasAttr(featureName),
        s"getFeatureIndicesFromNames found no feature with name $featureName in column $col.")
      val attr = attrGroup.getAttr(featureName)
      attr.index.get
    }
  }
// Presumably closes the enclosing object, whose header is not part of this excerpt.
}
Example 183
Source File: VectorSlicerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{StructField, StructType}

/** Tests for `VectorSlicer`: parameter defaults, validity helpers, slicing and persistence. */
class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    val slicer = new VectorSlicer().setInputCol("feature")
    ParamsSuite.checkParams(slicer)
    assert(slicer.getIndices.length === 0)
    assert(slicer.getNames.length === 0)

    val inputSchema = StructType(Seq(StructField("feature", new VectorUDT, true)))
    withClue("VectorSlicer should not have any features selected by default") {
      intercept[IllegalArgumentException] {
        slicer.transformSchema(inputSchema)
      }
    }
  }

  test("feature validity checks") {
    import VectorSlicer._

    Seq(Array(0, 1, 8, 2), Array.empty[Int]).foreach(ok => assert(validIndices(ok)))
    Seq(Array(-1), Array(1, 2, 1)).foreach(bad => assert(!validIndices(bad)))

    Seq(Array("a", "b"), Array.empty[String]).foreach(ok => assert(validNames(ok)))
    Seq(Array("", "b"), Array("a", "b", "a")).foreach(bad => assert(!validNames(bad)))
  }

  test("Test vector slicer") {
    val data = Array(
      Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3),
      Vectors.sparse(5, Seq())
    )

    // Expected after selecting indices 1, 4
    val expected = Array(
      Vectors.sparse(2, Seq((0, 2.3))),
      Vectors.dense(2.3, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(-1.1, 3.3),
      Vectors.sparse(2, Seq())
    )

    val defaultAttr = NumericAttribute.defaultAttr

    // Attribute group called `groupName` with one default numeric attribute per feature name.
    def attrGroupOf(groupName: String, featureNames: Array[String]): AttributeGroup = {
      val namedAttrs = featureNames.map(defaultAttr.withName)
      new AttributeGroup(groupName, namedAttrs.asInstanceOf[Array[Attribute]])
    }

    val attrGroup = attrGroupOf("features", Array("f0", "f1", "f2", "f3", "f4"))
    val resultAttrGroup = attrGroupOf("expected", Array("f1", "f4"))

    val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) }
    val schema = StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField()))
    val df = spark.createDataFrame(rdd, schema)

    val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result")

    // Compares the "result" column with "expected", both the values and the attribute metadata.
    def validateResults(df: DataFrame): Unit = {
      df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) =>
        assert(vec1 === vec2)
      }
      val resultMetadata = AttributeGroup.fromStructField(df.schema("result"))
      val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected"))
      assert(resultMetadata.numAttributes === expectedMetadata.numAttributes)
      resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) =>
        assert(a === b)
      }
    }

    // The same two features selected by index only, by a mix, and by name only.
    val selections: Seq[(Array[Int], Array[String])] = Seq(
      (Array(1, 4), Array.empty[String]),
      (Array(1), Array("f4")),
      (Array.empty[Int], Array("f1", "f4"))
    )
    selections.foreach { case (indices, names) =>
      vectorSlicer.setIndices(indices).setNames(names)
      validateResults(vectorSlicer.transform(df))
    }
  }

  test("read/write") {
    val t = new VectorSlicer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setIndices(Array(1, 3))
      .setNames(Array("a", "d"))
    testDefaultReadWrite(t)
  }
}
Example 184
Source File: SQLTransformerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.types.{LongType, StructField, StructType}

/** Tests for `SQLTransformer`: params, transformation, persistence and schema derivation. */
class SQLTransformerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new SQLTransformer())
  }

  test("transform numeric data") {
    val input = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
    val transformer = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

    val transformed = transformer.transform(input)
    val derivedSchema = transformer.transformSchema(input.schema)
    val wanted = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0))
      .toDF("id", "v1", "v2", "v3", "v4")

    assert(transformed.schema.toString == derivedSchema.toString)
    assert(derivedSchema == wanted.schema)
    assert(transformed.collect().toSeq == wanted.collect().toSeq)
    // The catalog must list no tables once the transform has run.
    assert(input.sparkSession.catalog.listTables().count() == 0)
  }

  test("read/write") {
    val t = new SQLTransformer()
      .setStatement("select * from __THIS__")
    testDefaultReadWrite(t)
  }

  test("transformSchema") {
    val inputSchema = spark.range(10).schema
    val transformer = new SQLTransformer().setStatement("SELECT id + 1 AS id1 FROM __THIS__")
    val derived = transformer.transformSchema(inputSchema)
    val wanted = StructType(Seq(StructField("id1", LongType, nullable = false)))
    assert(derived === wanted)
  }
}
Example 185
Source File: MetastoreRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/** Regression tests around `MetastoreRelation` and CTAS into Hive serde tables. */
class MetastoreRelationSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  test("makeCopy and toJSON should work") {
    val singleIntColumn = StructType(Seq(StructField("a", IntegerType, nullable = true)))
    val table = CatalogTable(
      identifier = TableIdentifier("test", Some("db")),
      tableType = CatalogTableType.VIEW,
      storage = CatalogStorageFormat.empty,
      schema = singleIntColumn)
    val relation = MetastoreRelation("db", "test")(table, null)

    // No exception should be thrown
    relation.makeCopy(Array("db", "test"))
    // No exception should be thrown
    relation.toJSON
  }

  test("SPARK-17409: Do Not Optimize Query in CTAS (Hive Serde Table) More Than Once") {
    withTable("bar") {
      withTempView("foo") {
        sql("select 0 as id").createOrReplaceTempView("foo")
        // If we optimize the query in CTAS more than once, the following saveAsTable will fail
        // with the error: `GROUP BY position 0 is not in select list (valid range is [1, 1])`
        sql("CREATE TABLE bar AS SELECT * FROM foo group by id")
        checkAnswer(spark.table("bar"), Row(0) :: Nil)

        val barMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier("bar"))
        assert(barMetadata.provider == Some("hive"), "the expected table is a Hive serde table")
      }
    }
  }
}
Example 186
Source File: SparkExecuteStatementOperationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{NullType, StructField, StructType}

/** Checks how `SparkExecuteStatementOperation.getTableSchema` maps `NullType` columns. */
class SparkExecuteStatementOperationSuite extends SparkFunSuite {

  test("SPARK-17112 `select null` via JDBC triggers IllegalArgumentException in ThriftServer") {
    val nullColumns = Seq("NULL", "(IF(true, NULL, NULL))").map(StructField(_, NullType))
    val tableSchema = StructType(nullColumns)

    val columns = SparkExecuteStatementOperation.getTableSchema(tableSchema).getColumnDescriptors()

    assert(columns.size() == 2)
    // Both NullType fields must be described with the Hive NULL_TYPE.
    assert(columns.get(0).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
    assert(columns.get(1).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
  }
}
Example 187
Source File: LocalRelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.types.{StructField, StructType}

/** Factory methods for [[LocalRelation]]. */
object LocalRelation {

  /** Relation without data, described by the given attributes. */
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  /** Relation without data whose attributes are derived from one or more struct fields. */
  def apply(output1: StructField, output: StructField*): LocalRelation = {
    val allFields = output1 +: output
    new LocalRelation(StructType(allFields).toAttributes)
  }

  /** Relation over external `Row`s, converted to internal rows. */
  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation =
    fromExternal(output, data)

  /** Relation over `Product`s, converted to internal rows. */
  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation =
    fromExternal(output, data)

  /** Shared conversion path: runs every element through the Catalyst converter for `output`. */
  private def fromExternal(output: Seq[Attribute], data: Seq[Any]): LocalRelation = {
    val toCatalyst =
      CatalystTypeConverters.createToCatalystConverter(StructType.fromAttributes(output))
    val internalRows = data.map(item => toCatalyst(item).asInstanceOf[InternalRow])
    LocalRelation(output, internalRows)
  }
}

/**
 * Leaf plan node backed by rows held in memory.
 *
 * @param output attributes describing the rows; all of them must be resolved
 * @param data   the rows, empty by default
 */
case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil)
  extends LeafNode with analysis.MultiInstanceRelation {

  // A local relation must have resolved output.
  require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.")

  /** Copy of this relation with the same data and a fresh instance of every attribute. */
  override final def newInstance(): this.type = {
    val freshOutput = output.map(_.newInstance())
    LocalRelation(freshOutput, data).asInstanceOf[this.type]
  }

  /** Arguments shown in the plan string; an "<empty>" marker is prepended when there is no data. */
  override protected def stringArgs: Iterator[Any] = data match {
    case rows if rows.isEmpty => Iterator("<empty>", output)
    case _ => Iterator(output)
  }

  /** True when `plan` canonicalizes to a local relation with equal column types and equal data. */
  override def sameResult(plan: LogicalPlan): Boolean = {
    plan.canonicalized match {
      case LocalRelation(otherOutput, otherData) =>
        val sameTypes = otherOutput.map(_.dataType) == output.map(_.dataType)
        sameTypes && otherData == data
      case _ =>
        false
    }
  }

  /** Size estimate: sum of the default sizes of the column types times the number of rows. */
  override lazy val statistics = {
    val defaultRowSize = output.map(n => BigInt(n.dataType.defaultSize)).sum
    Statistics(sizeInBytes = defaultRowSize * data.length)
  }

  /**
   * Renders the relation as an inline `VALUES ... AS name(cols)` SQL fragment.
   * Requires at least one row.
   */
  def toSQL(inlineTableName: String): String = {
    require(data.nonEmpty)
    val types = output.map(_.dataType)
    val rows = data.map { row =>
      val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql }
      cells.mkString("(", ", ", ")")
    }
    val columnList = output.map(_.name).mkString("(", ", ", ")")
    s"VALUES ${rows.mkString(", ")} AS $inlineTableName$columnList"
  }
}
Example 188
Source File: ResolveInlineTables.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import scala.util.control.NonFatal

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Cast
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.types.{StructField, StructType}

  /**
   * Turns an unresolved inline table into a [[LocalRelation]].
   *
   * Each column gets the wider common type of its expressions (analysis fails if there is
   * none) and is nullable if any of its expressions is nullable. Every expression is then
   * evaluated, with a cast when its type differs from the column type.
   */
  private[analysis] def convert(table: UnresolvedInlineTable): LocalRelation = {
    // For each column, traverse all the values and find a common data type and nullability.
    val columns = table.rows.transpose.zip(table.names)
    val fields = columns.map { case (cells, columnName) =>
      val widened = TypeCoercion.findWiderTypeWithoutStringPromotion(cells.map(_.dataType))
      val columnType = widened.getOrElse {
        table.failAnalysis(s"incompatible types found in column $columnName for inline table")
      }
      StructField(columnName, columnType, nullable = cells.exists(_.nullable))
    }
    val attributes = StructType(fields).toAttributes
    assert(fields.size == table.names.size)

    val newRows: Seq[InternalRow] = table.rows.map { row =>
      val values = row.zipWithIndex.map { case (expr, columnIndex) =>
        val targetType = fields(columnIndex).dataType
        try {
          // Only cast when the expression's type differs from the column type.
          val toEvaluate = if (expr.dataType.sameType(targetType)) expr else Cast(expr, targetType)
          toEvaluate.eval()
        } catch {
          case NonFatal(ex) =>
            table.failAnalysis(s"failed to evaluate expression ${expr.sql}: ${ex.getMessage}")
        }
      }
      InternalRow.fromSeq(values)
    }

    LocalRelation(attributes, newRows)
  }
// Presumably closes the enclosing object, whose header is not part of this excerpt.
}
Example 189
Source File: resources.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * Command that lists jars known to the Spark context.
 *
 * @param jars optional jar paths; when non-empty, only registered jars whose path contains the
 *             file name of one of these entries are returned
 */
case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {

  /** Single non-nullable string column named "Results". */
  override val output: Seq[Attribute] =
    List(AttributeReference("Results", StringType, nullable = false)())

  /**
   * Returns one row per matching jar path.
   *
   * With no `jars` given every registered jar is returned. Otherwise, for each requested jar
   * (in order) every registered path containing its file name is emitted, so a registered path
   * may appear more than once if several requested names match it.
   */
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val registered = sparkSession.sparkContext.listJars()
    if (jars.isEmpty) {
      registered.map(Row(_))
    } else {
      // Only the last path component of each requested jar is used for matching.
      val requestedNames = jars.map(location => new Path(location).getName)
      requestedNames.flatMap { name =>
        registered.filter(_.contains(name)).map(Row(_))
      }
    }
  }
}
Example 190
Source File: DDLSourceLoadSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// please note that the META-INF/services had to be modified for the test directory for this to work
/**
 * Checks how data sources are resolved from short names (aliases) and fully qualified class
 * names, using the fake sources declared below.
 */
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  // FakeSourceOne and FakeSourceTwo both register this alias, so resolving it must fail.
  test("data sources with the same name") {
    intercept[RuntimeException] {
      spark.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {
    // The comparison must be asserted; a bare `==` expression is discarded and never fails.
    assert(spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false))))
  }

  test("specify full classname with duplicate formats") {
    // A fully qualified class name bypasses the ambiguous alias shared with FakeSourceTwo.
    assert(spark.read.format("org.apache.spark.sql.sources.FakeSourceOne").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false))))
  }

  test("should fail to load ORC without Hive Support") {
    val e = intercept[AnalysisException] {
      spark.read.format("orc").load()
    }
    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
  }
}

/** Fake source registered under the alias "Fluet da Bomb" (same alias as [[FakeSourceTwo]]). */
class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

/** Fake source registered under the alias "Fluet da Bomb" (same alias as [[FakeSourceOne]]). */
class FakeSourceTwo extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

/** Fake source registered under the alias "gathering quorum". */
class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}
Example 191
Source File: BlockingSource.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/**
 * Streaming source/sink provider whose `createSource` waits on `BlockingSource.latch` before
 * returning. The source reports a fixed offset and empty batches; the sink discards its input.
 */
class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  // Schema reported by both sourceSchema and the created Source: one integer column "a".
  private val fakeSchema = StructType(Seq(StructField("a", IntegerType)))

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) =
    "dummySource" -> fakeSchema

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    // Wait until the shared latch is released before handing out the source.
    BlockingSource.latch.await()

    new Source {
      override def schema: StructType = fakeSchema

      override def getOffset: Option[Offset] = Some(LongOffset(0))

      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq.empty[Int].toDS().toDF()
      }

      override def stop(): Unit = {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink =
    new Sink {
      // Batches are intentionally ignored.
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
}

object BlockingSource {
  /** Latch awaited by `createSource`; must be assigned before a source is created. */
  var latch: CountDownLatch = null
}
Example 192
Source File: XGBoost.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import eleflow.uberdata.models.UberXGBOOSTModel
import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{ArrayType, DoubleType, StructField, StructType}
import scala.reflect.ClassTag

/**
 * Applies a per-key XGBoost model to the rows of a data set.
 *
 * @param uid    identifier of this stage
 * @param models RDD keyed by group id; each value pairs a model with its evaluation metrics
 * @tparam I type of the group key read from the group-by column
 */
class XGBoost[I](override val uid: String, val models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))])(
  implicit kt: ClassTag[I], ord: Ordering[I] = null)
    extends ForecastBaseModel[XGBoostSmallModel[I]]
    with HasInputCol
    with HasOutputCol
    with DefaultParamsWritable
    with HasFeaturesCol
    with HasNFutures
    with HasGroupByCol {

  /** Builds the stage with a randomly generated uid prefixed with "xgboost". */
  def this( models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))] )(implicit kt: ClassTag[I], ord: Ordering[I] ) =
    this(Identifiable.randomUID("xgboost"), models)

  /**
   * Joins each row with the model stored under the row's group-by key and appends the
   * prediction results to the row.
   *
   * Rows whose key has no entry in `models` are dropped by the (inner) join.
   */
  override def transform(dataSet: Dataset[_]): DataFrame = {
    val schema = dataSet.schema
    val predSchema = transformSchema(schema)
    // Key every row by the value of the group-by column so it can be joined with `models`.
    val joined = models.join(dataSet.rdd.map{case (r: Row) => (r.getAs[I]($(groupByCol).get), r)})
    val predictions = joined.map { case (id, ((bestModel, metrics), row)) =>
      val features = row.getAs[Array[org.apache.spark.ml.linalg.Vector]]( IUberdataForecastUtil.FEATURES_COL_NAME )
      // The label is read from the column named by `featuresCol` and shared by every point.
      val label = DataTransformer.toFloat(row.getAs($(featuresCol)))
      val labelPoint = features.map { vec =>
        val array = vec.toArray.map(_.toFloat)
        LabeledPoint(label, null, array)
      }
      val matrix = new DMatrix(labelPoint.toIterator)
      // The first `features.length` predicted values go to `ownFeaturesPrediction`,
      // everything after them to `forecast`.
      val (ownFeaturesPrediction, forecast) = bestModel.boosterInstance .predict(matrix)
        .flatMap(_.map(_.toDouble)) .splitAt(features.length)
      // Appended after the original values, in order: forecast vector, algorithm name,
      // model params rendered as strings, prediction for the row's own features.
      // NOTE(review): four values are appended here, while transformSchema below adds a single
      // field to the schema — confirm the row arity matches predSchema.
      Row( row.toSeq :+ Vectors .dense(forecast) :+ SupportedAlgorithm.XGBoostAlgorithm.toString :+ bestModel.params .map(f => f._1 -> f._2.toString) :+ Vectors.dense(ownFeaturesPrediction): _* )
    }
    dataSet.sqlContext.createDataFrame(predictions, predSchema)
  }

  /** Appends the output column, typed as an array of doubles, to the input schema. */
  @DeveloperApi override def transformSchema(schema: StructType): StructType = {
    schema.add(StructField($(outputCol), ArrayType(DoubleType)))
  }

  override def copy(extra: ParamMap): XGBoostSmallModel[I] = defaultCopy(extra)
}
Example 193
Source File: XGBoostBaseBestModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.evaluation.TimeSeriesEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasGroupByCol
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType}

/** Shared helpers for best-model finders that evaluate XGBoost boosters. */
trait BaseXGBoostBestModelFinder[G, M <: org.apache.spark.ml.ForecastBaseModel[M]]
    extends BestModelFinder[G, M]
    with HasGroupByCol {

  /**
   * Broadcasts the training schema: a float group-by column followed by an array-of-vectors
   * features column.
   */
  protected def buildTrainSchema(sparkContext: SparkContext): Broadcast[StructType] = {
    val keyField = StructField($(groupByCol).get, FloatType)
    val featuresField =
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, ArrayType(new VectorUDT))
    sparkContext.broadcast(StructType(Seq(keyField, featuresField)))
  }

  /**
   * Scores `model` against the feature vectors stored in `row`.
   *
   * Each vector is turned into a labeled point whose label is the vector's first element and
   * whose features are the remaining elements. The first predictions (one per point) are
   * compared with those labels through the broadcast evaluator.
   *
   * @return the evaluation result for `id`, carrying the metric, its name and `parameters`
   */
  protected def xGBoostEvaluation(row: Row,
                                  model: Booster,
                                  broadcastEvaluator: Broadcast[TimeSeriesEvaluator[G]],
                                  id: G,
                                  parameters: ParamMap): ModelParamEvaluation[G] = {
    val vectors = row.getAs[Array[org.apache.spark.ml.linalg.Vector]](
      IUberdataForecastUtil.FEATURES_COL_NAME)
    val labeledPoints = vectors.map { vec =>
      val floats = vec.toArray.map(DataTransformer.toFloat)
      LabeledPoint(floats.head, null, floats.tail)
    }
    val matrix = new DMatrix(labeledPoints.toIterator)

    log.warn(s"Evaluating forecast for id $id, with xgboost")

    // Only the leading predictions, one per labeled point, take part in the validation.
    val validated = model.predict(matrix).flatten.take(labeledPoints.length)
    val labelAndPrediction = labeledPoints.zip(validated).map {
      case (point, predicted) => (point.label.toDouble, predicted.toDouble)
    }

    val metric = broadcastEvaluator.value.evaluate(labelAndPrediction)
    val metricName = broadcastEvaluator.value.getMetricName
    new ModelParamEvaluation[G](
      id,
      metric,
      parameters,
      Some(metricName),
      SupportedAlgorithm.XGBoostAlgorithm)
  }
}
Example 194
Source File: TimeSeriesGenerator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasGroupByCol import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, StructType} import scala.reflect.ClassTag def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataSet: Dataset[_]): DataFrame = { val rdd = dataSet.rdd val sparkContext = dataSet.sqlContext.sparkContext val index = sparkContext.broadcast(dataSet.schema.fieldIndex($(timeCol).get)) val labelColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(groupByCol).get)) val featuresColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(featuresCol))) val grouped = rdd.map { case (row: Row) => val timeColRow = IUberdataForecastUtil.convertColumnToLong(row, index.value) convertColumnToDouble(timeColRow, featuresColIndex) }.groupBy { row => row.getAs[L](labelColIndex.value) }.map { case (key, values) => val toBeUsed = values.toArray.sortBy(row => row.getAs[Long](index.value)) (key, toBeUsed) } val toBeTrained = grouped.map { case (key, values) => org.apache.spark.sql.Row( key, Vectors.dense(values.map(_.getAs[Double](featuresColIndex.value))) ) } val trainSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(toBeTrained, trainSchema) } override def transformSchema(schema: StructType): StructType = { val labelIndex = schema.fieldIndex($(groupByCol).get) StructType( Seq( schema.fields(labelIndex), StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT) ) ) } override def copy(extra: ParamMap): TimeSeriesGenerator[L] = defaultCopy(extra) } object TimeSeriesGenerator extends DefaultParamsReadable[TimeSeriesGenerator[_]] { override def load(path: String): TimeSeriesGenerator[_] = 
super.load(path) }
Example 195
Source File: XGBoostBigModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import com.cloudera.sparkts.models.UberXGBoostModel
import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.feature.{LabeledPoint => SparkLabeledPoint}
import org.apache.spark.ml.param.shared.{HasIdCol, HasLabelCol}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}

/**
 * Model that scores every row of a data set with the first XGBoost model in `models`.
 *
 * @param uid    identifier of this stage
 * @param models candidate models with their parameters; only the head is used for prediction
 */
class XGBoostBigModel[I](val uid: String, val models: Seq[(ParamMap, XGBoostModel)])
    extends ForecastBaseModel[XGBoostBigModel[I]]
    with HasLabelCol
    with HasIdCol {

  def setLabelcol(label: String): this.type = set(labelCol, label)

  def setIdcol(id: String): this.type = set(idCol, id)

  /** Creates a new instance with the same uid and models; `extra` is not applied. */
  override def copy(extra: ParamMap): XGBoostBigModel[I] = new XGBoostBigModel[I](uid, models)

  /**
   * Produces rows of (id, features, algorithm name, prediction) by joining the input rows,
   * keyed by their id converted to float, with the predictions returned by `predict`.
   */
  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)

    val featuresById = dataSet.rdd.map {
      case (row: Row) =>
        val key = DataTransformer.toFloat(row.getAs($(idCol)))
        val vector = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
        (key, vector)
    }

    val rows = featuresById.join(prediction).map {
      case (id, (features, predictValue)) =>
        Row(id, features, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
    }

    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }

  /**
   * Builds labeled points (label = id as float, features = the row's feature vector), caches
   * them and runs the head model over their dense feature vectors.
   */
  protected def predict(dataSet: Dataset[_]) = {
    val labeledPoints = dataSet.rdd.map {
      case (row: Row) =>
        val vector = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
        val id = row.getAs[I]($(idCol))
        SparkLabeledPoint(DataTransformer.toFloat(id), vector)
    }.cache()

    // Only the first candidate is used; its ParamMap is irrelevant here.
    val model = models.head._2
    UberXGBoostModel.labelPredict(
      labeledPoints.map(point => point.features.toDense),
      booster = model)
  }

  /** The output schema is fixed and does not depend on the input schema. */
  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(getPredictionSchema)

  /** Fields of the prediction output: id, features, algorithm name, prediction. */
  protected def getPredictionSchema: Array[StructField] = {
    val idField = StructField($(idCol), FloatType)
    val featuresField = StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT)
    val algorithmField = StructField(IUberdataForecastUtil.ALGORITHM, StringType)
    val predictionField = StructField("prediction", FloatType)
    Array(idField, featuresField, algorithmField, predictionField)
  }
}
Example 196
Source File: VectorizeEncoder.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import eleflow.uberdata.core.data.DataTransformer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, StructType}

/**
 * Packs the configured input columns of every row into a single sparse vector column.
 *
 * Columns that are not input columns are carried over unchanged, as are the group-by, id,
 * label and time columns even when they are also listed as inputs. The vector is appended as
 * the last column, named by `outputCol`.
 */
class VectorizeEncoder(override val uid: String)
    extends Transformer
    with HasIdCol
    with HasTimeCol
    with HasInputCols
    with HasLabelCol
    with HasGroupByCol
    with HasOutputCol
    with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("vectorizer"))

  def setIdCol(input: String) = set(idCol, input)

  def setLabelCol(input: String) = set(labelCol, input)

  def setGroupByCol(toGroupBy: String) = set(groupByCol, Some(toGroupBy))

  def setInputCol(input: Array[String]) = set(inputCols, input)

  def setTimeCol(time: String) = set(timeCol, Some(time))

  def setOutputCol(output: String) = set(outputCol, output)

  /**
   * Whether a column of the input survives in the output next to the vector column.
   *
   * Used by both `transform` and `transformSchema` so the produced rows always have exactly
   * the columns the produced schema declares.
   */
  private def isRetainedColumn(name: String): Boolean =
    !$(inputCols).contains(name) ||
      name == $(groupByCol).getOrElse("") ||
      name == $(idCol) ||
      name == $(labelCol) ||
      name == $(timeCol).getOrElse("")

  /**
   * Builds one output row per input row: the retained column values followed by a sparse
   * vector with one slot per input column. Null and zero-valued inputs are left out of the
   * vector's explicit entries.
   */
  override def transform(dataSet: Dataset[_]): DataFrame = {
    val context = dataSet.sqlContext.sparkContext
    val input = context.broadcast($(inputCols))
    val allColumnNames = dataSet.schema.map(_.name)
    // Resolved on the driver with the same predicate as transformSchema.
    val nonInputColumnIndexes = context.broadcast(
      allColumnNames.zipWithIndex.filter { case (name, _) => isRetainedColumn(name) })

    val result = dataSet.rdd.map {
      case (row: Row) =>
        val rowSeq = row.toSeq
        val nonInputColumns = nonInputColumnIndexes.value.map {
          case (_, index) => rowSeq(index)
        }
        val size = input.value.length
        // Positions are attached before nulls are dropped so that every value keeps the slot
        // of its own input column instead of sliding into the slot of a skipped one.
        val (values, indices) = input.value.zipWithIndex
          .filter { case (col, _) => row.getAs(col) != null }
          .map { case (column, index) => (DataTransformer.toDouble(row.getAs(column)), index) }
          .filter { case (value, _) => value != 0d }
          .unzip
        Row(
          nonInputColumns :+ org.apache.spark.ml.linalg.Vectors
            .sparse(size, indices.toArray, values.toArray): _*
        )
    }

    val newSchema = transformSchema(dataSet.schema)
    dataSet.sqlContext.createDataFrame(result, newSchema)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  /** Retained input fields, in their original order, followed by the vector output field. */
  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.filter(col => isRetainedColumn(col.name)))
      .add(StructField($(outputCol), new VectorUDT))
}
Example 197
Source File: AllColumnsTimeSeriesGenerator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import scala.reflect.ClassTag def setOutputCol(value: String): this.type = set(outputCol, value) // override def transform(dataSet: DataFrame): DataFrame = { override def transform(dataSet: Dataset[_] ): DataFrame = { val rdd = dataSet.rdd val sparkContext = dataSet.sqlContext.sparkContext val labelColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(labelCol))) val keyValueDataSet = rdd.map { case (row: Row) => Row( row.getAs[T](labelColIndex.value), row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol)) ) } val trainSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(keyValueDataSet, trainSchema) } override def transformSchema(schema: StructType): StructType = { StructType( schema.filter(_.name == $(labelCol)).head +: Seq( StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT) ) ) } override def copy(extra: ParamMap): AllColumnsTimeSeriesGenerator[T, U] = defaultCopy(extra) } object AllColumnsTimeSeriesGenerator extends DefaultParamsReadable[AllColumnsTimeSeriesGenerator[_, _]] { override def load(path: String): AllColumnsTimeSeriesGenerator[_, _] = super.load(path) }
Example 198
Source File: ForecastPipelineStage.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import org.apache.spark.ml.param.shared.{HasNFutures, HasPredictionCol, HasValidationCol}
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.types.{StructType, StringType, StructField, MapType}

/** Pipeline stage whose output schema carries validation, algorithm and params columns. */
trait ForecastPipelineStage
    extends PipelineStage
    with HasNFutures
    with HasPredictionCol
    with HasValidationCol {

  def setValidationCol(value: String): this.type = set(validationCol, value)

  /**
   * Extends `schema` with, in order: the validation vector column, the algorithm name column
   * and the string-to-string params map column.
   */
  override def transformSchema(schema: StructType): StructType = {
    val appendedFields = Seq(
      StructField($(validationCol), new VectorUDT),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField(IUberdataForecastUtil.PARAMS, MapType(StringType, StringType))
    )
    appendedFields.foldLeft(schema)((current, field) => current.add(field))
  }
}
Example 199
Source File: XGBoostBigModelTimeSeries.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import java.sql.Timestamp

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasTimeCol
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}

/**
 * Variant of [[XGBoostBigModel]] whose output rows additionally carry the value of the time
 * column.
 */
class XGBoostBigModelTimeSeries[I](override val uid: String,
                                   override val models: Seq[(ParamMap, XGBoostModel)])
    extends XGBoostBigModel[I](uid, models)
    with HasTimeCol {

  def setTimecol(time: String): this.type = set(timeCol, Some(time))

  /**
   * Produces rows of (id, features, time, algorithm name, prediction) by joining the input
   * rows, keyed by their id converted to float, with the predictions from `predict`.
   */
  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)

    val featuresAndTimeById = dataSet.rdd.map {
      case (row: Row) =>
        val key = DataTransformer.toFloat(row.getAs($(idCol)))
        val vector = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
        val timestamp = row.getAs[java.sql.Timestamp]($(timeCol).get)
        (key, (vector, timestamp))
    }

    val rows = featuresAndTimeById.join(prediction).map {
      case (id, ((features, time), predictValue)) =>
        Row(id, features, time, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
    }

    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }

  /** Fixed output schema: id, features, time, algorithm name, prediction. */
  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    val outputFields = Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField($(timeCol).get, TimestampType),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    )
    StructType(outputFields)
  }
}
Example 200
Source File: CarbonHiveMetastoreListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.hive

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.metastore.MetaStorePreEventListener
import org.apache.hadoop.hive.metastore.api.{FieldSchema, MetaException}
import org.apache.hadoop.hive.metastore.events._
import org.apache.hadoop.hive.metastore.events.PreEventContext.PreEventType._
import org.apache.spark.sql.types.{DataType, StructField, StructType}

/**
 * Metastore pre-event listener that rewrites the Hive column list of carbon tables from the
 * Spark schema stored in the table properties.
 *
 * On CREATE_TABLE it also sets the carbon input/output formats, the carbon serde and, when the
 * serde parameters contain "tablePath", the table location. On ALTER_TABLE only the columns are
 * refreshed. Other events, non-carbon tables and tables without a stored schema are untouched.
 */
class CarbonHiveMetastoreListener(conf: Configuration) extends MetaStorePreEventListener(conf) {

  /**
   * @throws MetaException if the stored schema declares more parts than are present
   */
  override def onEvent(preEventContext: PreEventContext): Unit = {
    preEventContext.getEventType match {
      case CREATE_TABLE =>
        val table = preEventContext.asInstanceOf[PreCreateTableEvent].getTable
        readSparkSchema(table.getParameters).foreach { schema =>
          val sd = table.getSd
          sd.setCols(schema.map(toHiveColumn).asJava)
          sd.setInputFormat("org.apache.carbondata.hive.MapredCarbonInputFormat")
          sd.setOutputFormat("org.apache.carbondata.hive.MapredCarbonOutputFormat")
          val serdeInfo = sd.getSerdeInfo
          serdeInfo.setSerializationLib("org.apache.carbondata.hive.CarbonHiveSerDe")
          // Both the parameter map and the entry may be absent; leave the location alone then.
          Option(serdeInfo.getParameters)
            .flatMap(params => Option(params.get("tablePath")))
            .foreach(tablePath => sd.setLocation(tablePath))
        }
      case ALTER_TABLE =>
        val table = preEventContext.asInstanceOf[PreAlterTableEvent].getNewTable
        readSparkSchema(table.getParameters).foreach { schema =>
          table.getSd.setCols(schema.map(toHiveColumn).asJava)
        }
      case _ =>
      // do nothing
    }
  }

  /** True when the provider property names the carbon source (class name or alias). */
  private def isCarbonTable(tableProps: java.util.Map[String, String]): Boolean = {
    val provider = tableProps.get("spark.sql.sources.provider")
    // The literal is the receiver so that a missing (null) provider yields false instead of a
    // NullPointerException, which would otherwise hit every table lacking this property.
    provider == "org.apache.spark.sql.CarbonSource" || "carbondata".equalsIgnoreCase(provider)
  }

  /**
   * Reassembles the Spark schema that is stored split across numbered table properties.
   *
   * @return the schema, or None when the properties are missing, the table is not a carbon
   *         table, or no part count is recorded
   */
  private def readSparkSchema(tableProps: java.util.Map[String, String]): Option[StructType] = {
    if (tableProps == null || !isCarbonTable(tableProps)) {
      None
    } else {
      val numSchemaParts = tableProps.get("spark.sql.sources.schema.numParts")
      if (numSchemaParts == null || numSchemaParts.isEmpty) {
        None
      } else {
        val parts = (0 until numSchemaParts.toInt).map { index =>
          val part = tableProps.get(s"spark.sql.sources.schema.part.$index")
          if (part == null) {
            throw new MetaException(s"spark.sql.sources.schema.part.$index is missing!")
          }
          part
        }
        // Stick all parts back to a single schema string.
        Some(DataType.fromJson(parts.mkString).asInstanceOf[StructType])
      }
    }
  }

  /**
   * Converts a Spark field to a Hive column, preferring the original Hive type string recorded
   * in the field metadata over the catalog string of the Spark type.
   */
  private def toHiveColumn(c: StructField): FieldSchema = {
    val typeString = if (c.metadata.contains("HIVE_TYPE_STRING")) {
      c.metadata.getString("HIVE_TYPE_STRING")
    } else {
      c.dataType.catalogString
    }
    new FieldSchema(c.name, typeString, c.getComment().orNull)
  }
}