org.apache.spark.sql Scala Examples
The following examples show how to use the org.apache.spark.sql package from Scala. Each example notes the open-source project and license it was taken from.
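Most of the examples below import the package itself (import org.apache.spark.sql) so that types such as sql.DataFrame and sql.Row can be referenced through a short prefix. Before the project listings, here is a minimal, self-contained sketch of that pattern; the session settings, object name and sample data are illustrative only and are not taken from any of the projects below.

import org.apache.spark.sql
import org.apache.spark.sql.SparkSession

object SqlPackageImportSketch {
  def main(args: Array[String]): Unit = {
    // Local session purely for illustration.
    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("sql-package-import-sketch")
      .getOrCreate()

    import spark.implicits._

    // Refer to DataFrame and Row through the imported package prefix,
    // as the project examples below do.
    val df: sql.DataFrame = Seq((1, "a"), (2, "b")).toDF("id", "value")
    val first: sql.Row = df.head()
    println(first)

    spark.stop()
  }
}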
Example 1
Source File: EvaluatorWrapper.scala, from the seahorse project (Apache License 2.0)

package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.evaluation
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.Evaluator
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class EvaluatorWrapper(
    context: ExecutionContext,
    evaluator: Evaluator)
  extends ML.Evaluator {

  override def evaluateDF(dataset: sql.DataFrame): Double = {
    evaluator.evaluate(context)(())(DataFrame.fromSparkDataFrame(dataset.toDF())).value
  }

  override def copy(extra: ParamMap): evaluation.Evaluator = {
    val params = ParamTransformer.transform(extra)
    val evaluatorCopy = evaluator.replicate().set(params: _*)
    new EvaluatorWrapper(context, evaluatorCopy)
  }

  override lazy val params: Array[Param[_]] = {
    evaluator.params.map(new ParamWrapper(uid, _))
  }

  override def isLargerBetter: Boolean = evaluator.isLargerBetter

  override val uid: String = Identifiable.randomUID("EvaluatorWrapper")
}
Example 2
Source File: DeeplangTestSupport.scala, from the seahorse-workflow-executor project (Apache License 2.0)

package io.deepsense.deeplang

import org.apache.spark.sql
import org.apache.spark.sql.types.StructType
import org.mockito.Mockito._
import org.scalatest.mockito.MockitoSugar

import io.deepsense.deeplang.catalogs.doperable.DOperableCatalog
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.inference.InferContext

trait DeeplangTestSupport extends MockitoSugar {

  protected def createInferContext(
      dOperableCatalog: DOperableCatalog): InferContext = MockedInferContext(dOperableCatalog)

  protected def createSchema(fields: Array[String] = Array[String]()): StructType = {
    val schemaMock = mock[StructType]
    when(schemaMock.fieldNames).thenReturn(fields)
    schemaMock
  }

  protected def createSparkDataFrame(schema: StructType = createSchema()) = {
    val sparkDataFrameMock = mock[sql.DataFrame]
    when(sparkDataFrameMock.schema).thenReturn(schema)
    when(sparkDataFrameMock.toDF).thenReturn(sparkDataFrameMock)
    sparkDataFrameMock
  }

  protected def createDataFrame(fields: Array[String] = Array[String]()): DataFrame = {
    val schema = createSchema(fields)
    createDataFrame(schema)
  }

  protected def createDataFrame(schema: StructType): DataFrame = {
    val sparkDataFrameMock = createSparkDataFrame(schema)
    val dataFrameMock = mock[DataFrame]
    when(dataFrameMock.sparkDataFrame).thenReturn(sparkDataFrameMock)
    when(dataFrameMock.schema).thenReturn(Some(schema))
    dataFrameMock
  }
}
Example 3
Source File: StatisticsForContinuousIntegSpec.scala, from the seahorse-workflow-executor project (Apache License 2.0)

package io.deepsense.deeplang.doperables.dataframe.report.distribution

import java.sql.Timestamp

import org.apache.spark.rdd.RDD
import org.apache.spark.sql
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

import io.deepsense.commons.datetime.DateTimeConverter
import io.deepsense.deeplang.DeeplangIntegTestSupport
import io.deepsense.deeplang.doperables.dataframe.{DataFrame, DataFrameTestFactory}
import io.deepsense.reportlib.model._

class StatisticsForContinuousIntegSpec extends DeeplangIntegTestSupport with DataFrameTestFactory {

  "Statistics (Min, max and mean values)" should {
    "be calculated for each continuous column in distribution" when {
      "data is of type int" in {
        val distribution = distributionForInt(1, 2, 3, 4, 5)
        distribution.statistics.min shouldEqual Some("1")
        distribution.statistics.max shouldEqual Some("5")
        distribution.statistics.mean shouldEqual Some("3")
      }
      "data is of type Timestamp" in {
        val distribution =
          distributionForTimestamps(new Timestamp(1000), new Timestamp(2000), new Timestamp(3000))
        distribution.statistics.min shouldEqual Some(formatDate(1000))
        distribution.statistics.max shouldEqual Some(formatDate(3000))
        distribution.statistics.mean shouldEqual Some(formatDate(2000))
      }
    }
  }

  "Null value in data" should {
    val distribution = distributionForDouble(1, 2, 3, 4, Double.NaN, 5)

    "not be skipped in calculating min and max" in {
      distribution.statistics.min shouldEqual Some("1")
      distribution.statistics.max shouldEqual Some("5")
    }
    "result in mean value NaN" in {
      distribution.statistics.mean shouldEqual Some("NaN")
    }
  }

  lazy val columnName = "column_name"

  private def distributionForDouble(data: Double*): ContinuousDistribution = {
    distributionFor(data, DoubleType)
  }

  private def distributionForInt(data: Int*): ContinuousDistribution = {
    distributionFor(data, IntegerType)
  }

  private def distributionForTimestamps(data: Timestamp*): ContinuousDistribution = {
    distributionFor(data, TimestampType)
  }

  private def distributionFor(data: Seq[Any], dataType: DataType): ContinuousDistribution = {
    val schema = StructType(Array(
      StructField(columnName, dataType)
    ))
    val rows = data.map(v => Row(v))
    val dataFrame = createDataFrame(rows, schema)

    val report = dataFrame.report
    report.content.distributions(columnName).asInstanceOf[ContinuousDistribution]
  }

  def buildDataFrame(schema: StructType, data: RDD[Row]): DataFrame = {
    val dataFrame: sql.DataFrame = sparkSQLSession.createDataFrame(data, schema)
    DataFrame.fromSparkDataFrame(dataFrame)
  }

  def formatDate(millis: Long): String = {
    DateTimeConverter.toString(DateTimeConverter.fromMillis(millis))
  }
}
Example 4
Source File: EstimatorWrapper.scala, from the seahorse-workflow-executor project (Apache License 2.0)

package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.StructType
import org.apache.spark.{ml, sql}

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.{Transformer, Estimator}
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class EstimatorWrapper(
    executionContext: ExecutionContext,
    estimator: Estimator[Transformer])
  extends ML.Estimator[TransformerWrapper] {

  override def fitDF(dataset: sql.DataFrame): TransformerWrapper = {
    new TransformerWrapper(
      executionContext,
      estimator._fit(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF())))
  }

  override def copy(extra: ParamMap): EstimatorWrapper = {
    val params = ParamTransformer.transform(extra)
    val estimatorCopy = estimator.replicate().set(params: _*)
    new EstimatorWrapper(executionContext, estimatorCopy)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override lazy val params: Array[ml.param.Param[_]] = {
    estimator.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("EstimatorWrapper")
}
Example 5
Source File: EvaluatorWrapper.scala, from the seahorse-workflow-executor project (Apache License 2.0)

package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.evaluation
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.Evaluator
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class EvaluatorWrapper(
    context: ExecutionContext,
    evaluator: Evaluator)
  extends ML.Evaluator {

  override def evaluateDF(dataset: sql.DataFrame): Double = {
    evaluator.evaluate(context)(())(DataFrame.fromSparkDataFrame(dataset.toDF())).value
  }

  override def copy(extra: ParamMap): evaluation.Evaluator = {
    val params = ParamTransformer.transform(extra)
    val evaluatorCopy = evaluator.replicate().set(params: _*)
    new EvaluatorWrapper(context, evaluatorCopy)
  }

  override lazy val params: Array[Param[_]] = {
    evaluator.params.map(new ParamWrapper(uid, _))
  }

  override def isLargerBetter: Boolean = evaluator.isLargerBetter

  override val uid: String = Identifiable.randomUID("EvaluatorWrapper")
}
Example 6
Source File: TransformerWrapper.scala, from the seahorse-workflow-executor project (Apache License 2.0)

package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.Transformer
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class TransformerWrapper(
    executionContext: ExecutionContext,
    transformer: Transformer)
  extends ML.Model[TransformerWrapper] {

  override def copy(extra: ParamMap): TransformerWrapper = {
    val params = ParamTransformer.transform(extra)
    val transformerCopy = transformer.replicate().set(params: _*)
    new TransformerWrapper(executionContext, transformerCopy)
  }

  override def transformDF(dataset: sql.DataFrame): sql.DataFrame = {
    transformer._transform(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF()))
      .sparkDataFrame
  }

  override def transformSchema(schema: StructType): StructType = {
    transformer._transformSchema(schema).get
  }

  override lazy val params: Array[Param[_]] = {
    transformer.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("TransformerWrapper")
}
Example 7
Source File: SqlTransformer.scala, from the seahorse-workflow-executor project (Apache License 2.0)

package io.deepsense.deeplang.doperables

import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.doperations.exceptions.SqlExpressionException
import io.deepsense.deeplang.inference.{SqlInferenceWarning, SqlSchemaInferrer}
import io.deepsense.deeplang.params.{CodeSnippetLanguage, CodeSnippetParam, Param, StringParam}
import io.deepsense.sparkutils.SparkSQLSession
import io.deepsense.sparkutils.SQL

class SqlTransformer extends Transformer {

  val dataFrameId = StringParam(
    name = "dataframe id",
    description = Some("An identifier that can be used in " +
      "the SQL expression to refer to the input DataFrame."))
  setDefault(dataFrameId -> "df")

  def getDataFrameId: String = $(dataFrameId)
  def setDataFrameId(value: String): this.type = set(dataFrameId, value)

  val expression = CodeSnippetParam(
    name = "expression",
    description = Some("SQL Expression to be executed on the DataFrame."),
    language = CodeSnippetLanguage(CodeSnippetLanguage.sql))
  setDefault(expression -> "SELECT * FROM df")

  def getExpression: String = $(expression)
  def setExpression(value: String): this.type = set(expression, value)

  override val params: Array[Param[_]] = Array(dataFrameId, expression)

  override private[deeplang] def _transform(ctx: ExecutionContext, df: DataFrame): DataFrame = {
    logger.debug(s"SqlExpression(expression = '$getExpression'," +
      s" dataFrameId = '$getDataFrameId')")
    val localSparkSQLSession = ctx.sparkSQLSession.newSession()
    val localDataFrame = moveToSparkSQLSession(df.sparkDataFrame, localSparkSQLSession)
    SQL.registerTempTable(localDataFrame, getDataFrameId)
    try {
      logger.debug(s"Table '$dataFrameId' registered. Executing the expression")
      val sqlResult = moveToSparkSQLSession(localSparkSQLSession.sql(getExpression),
        ctx.sparkSQLSession)
      DataFrame.fromSparkDataFrame(sqlResult)
    } finally {
      logger.debug("Unregistering the temporary table " + getDataFrameId)
      localSparkSQLSession.dropTempTable(getDataFrameId)
    }
  }

  override private[deeplang] def _transformSchema(schema: StructType): Option[StructType] = {
    val (resultSchema, warnings) =
      new SqlSchemaInferrer().inferSchema(getExpression, (getDataFrameId, schema))
    // We throw/log as there is no way to pass warnings further at this point.
    warnings.warnings.foreach {
      case SqlInferenceWarning(sqlExpression, warningText) =>
        throw SqlExpressionException(sqlExpression, warningText)
      case other => logger.warn(s"Inference warning not reported: ${other.message}")
    }
    Some(resultSchema)
  }

  private def moveToSparkSQLSession(
      df: sql.DataFrame, destinationSession: SparkSQLSession): sql.DataFrame =
    destinationSession.createDataFrame(df.rdd, df.schema)
}
Example 8
Source File: SerializableSparkEstimator.scala, from the seahorse-workflow-executor project (Apache License 2.0)

package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
}
Example 9
Source File: DeeplangTestSupport.scala, from the seahorse project (Apache License 2.0)

package ai.deepsense.deeplang

import org.apache.spark.sql
import org.apache.spark.sql.types.StructType
import org.mockito.Mockito._
import org.scalatest.mockito.MockitoSugar

import ai.deepsense.deeplang.catalogs.doperable.DOperableCatalog
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.inference.InferContext

trait DeeplangTestSupport extends MockitoSugar {

  protected def createInferContext(
      dOperableCatalog: DOperableCatalog): InferContext = MockedInferContext(dOperableCatalog)

  protected def createExecutionContext: ExecutionContext = {
    val mockedExecutionContext = mock[ExecutionContext]
    val mockedInferContext = mock[InferContext]
    when(mockedExecutionContext.inferContext).thenReturn(mockedInferContext)
    mockedExecutionContext
  }

  protected def createSchema(fields: Array[String] = Array[String]()): StructType = {
    val schemaMock = mock[StructType]
    when(schemaMock.fieldNames).thenReturn(fields)
    schemaMock
  }

  protected def createSparkDataFrame(schema: StructType = createSchema()) = {
    val sparkDataFrameMock = mock[sql.DataFrame]
    when(sparkDataFrameMock.schema).thenReturn(schema)
    when(sparkDataFrameMock.toDF).thenReturn(sparkDataFrameMock)
    sparkDataFrameMock
  }

  protected def createDataFrame(fields: Array[String] = Array[String]()): DataFrame = {
    val schema = createSchema(fields)
    createDataFrame(schema)
  }

  protected def createDataFrame(schema: StructType): DataFrame = {
    val sparkDataFrameMock = createSparkDataFrame(schema)
    val dataFrameMock = mock[DataFrame]
    when(dataFrameMock.sparkDataFrame).thenReturn(sparkDataFrameMock)
    when(dataFrameMock.schema).thenReturn(Some(schema))
    dataFrameMock
  }
}
Example 10
Source File: StatisticsForContinuousIntegSpec.scala, from the seahorse project (Apache License 2.0)

package ai.deepsense.deeplang.doperables.dataframe.report.distribution

import java.sql.Timestamp

import org.apache.spark.rdd.RDD
import org.apache.spark.sql
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

import ai.deepsense.commons.datetime.DateTimeConverter
import ai.deepsense.deeplang.DeeplangIntegTestSupport
import ai.deepsense.deeplang.doperables.dataframe.{DataFrame, DataFrameTestFactory}
import ai.deepsense.reportlib.model._

class StatisticsForContinuousIntegSpec extends DeeplangIntegTestSupport with DataFrameTestFactory {

  "Statistics (Min, max and mean values)" should {
    "be calculated for each continuous column in distribution" when {
      "data is of type int" in {
        val distribution = distributionForInt(1, 2, 3, 4, 5)
        distribution.statistics.min shouldEqual Some("1")
        distribution.statistics.max shouldEqual Some("5")
        distribution.statistics.mean shouldEqual Some("3")
      }
      "data is of type Timestamp" in {
        val distribution =
          distributionForTimestamps(new Timestamp(1000), new Timestamp(2000), new Timestamp(3000))
        distribution.statistics.min shouldEqual Some(formatDate(1000))
        distribution.statistics.max shouldEqual Some(formatDate(3000))
        distribution.statistics.mean shouldEqual Some(formatDate(2000))
      }
    }
  }

  "Null value in data" should {
    val distribution = distributionForDouble(1, 2, 3, 4, Double.NaN, 5)

    "not be skipped in calculating min and max" in {
      distribution.statistics.min shouldEqual Some("1")
      distribution.statistics.max shouldEqual Some("5")
    }
    "result in mean value NaN" in {
      distribution.statistics.mean shouldEqual Some("NaN")
    }
  }

  lazy val columnName = "column_name"

  private def distributionForDouble(data: Double*): ContinuousDistribution = {
    distributionFor(data, DoubleType)
  }

  private def distributionForInt(data: Int*): ContinuousDistribution = {
    distributionFor(data, IntegerType)
  }

  private def distributionForTimestamps(data: Timestamp*): ContinuousDistribution = {
    distributionFor(data, TimestampType)
  }

  private def distributionFor(data: Seq[Any], dataType: DataType): ContinuousDistribution = {
    val schema = StructType(Array(
      StructField(columnName, dataType)
    ))
    val rows = data.map(v => Row(v))
    val dataFrame = createDataFrame(rows, schema)

    val report = dataFrame.report()
    report.content.distributions(columnName).asInstanceOf[ContinuousDistribution]
  }

  def buildDataFrame(schema: StructType, data: RDD[Row]): DataFrame = {
    val dataFrame: sql.DataFrame = sparkSQLSession.createDataFrame(data, schema)
    DataFrame.fromSparkDataFrame(dataFrame)
  }

  def formatDate(millis: Long): String = {
    DateTimeConverter.toString(DateTimeConverter.fromMillis(millis))
  }
}
Example 11
Source File: EstimatorWrapper.scala, from the seahorse project (Apache License 2.0)

package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.StructType
import org.apache.spark.{ml, sql}

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.{Transformer, Estimator}
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class EstimatorWrapper(
    executionContext: ExecutionContext,
    estimator: Estimator[Transformer])
  extends ML.Estimator[TransformerWrapper] {

  override def fitDF(dataset: sql.DataFrame): TransformerWrapper = {
    new TransformerWrapper(
      executionContext,
      estimator._fit(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF())))
  }

  override def copy(extra: ParamMap): EstimatorWrapper = {
    val params = ParamTransformer.transform(extra)
    val estimatorCopy = estimator.replicate().set(params: _*)
    new EstimatorWrapper(executionContext, estimatorCopy)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override lazy val params: Array[ml.param.Param[_]] = {
    estimator.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("EstimatorWrapper")
}
Example 12
Source File: FieldPointer.scala, from the ArchiveSpark project (MIT License)

package org.archive.archivespark.model.pointers

import org.apache.spark.sql
import org.apache.spark.sql.Column

import org.archive.archivespark.model._
import org.archive.archivespark.util.SelectorUtil

trait GenericFieldPointer[+R <: EnrichRoot, +T] extends Serializable { this: FieldPointer[_, _] => }

trait FieldPointer[Root <: EnrichRoot, T] extends GenericFieldPointer[Root, T] {
  def path[R <: Root](root: EnrichRootCompanion[R]): Seq[String]

  def get(root: Root): Option[T] = enrichable(root).map(_.get)

  def exists(root: Root): Boolean = root[T](path(root)).isDefined

  def enrichable(root: Root): Option[TypedEnrichable[T]] = {
    val initialized = init(root, excludeFromOutput = false)
    initialized[T](path(initialized))
  }

  def multi: MultiFieldPointer[Root, T] = new SingleToMultiFieldPointer[Root, T](this)

  def init[R <: Root](root: R, excludeFromOutput: Boolean): R = root

  def pathTo[R <: Root](root: EnrichRootCompanion[R], field: String): Seq[String] =
    path(root) ++ SelectorUtil.parse(field)

  def col(root: EnrichRootCompanion[Root]): Column =
    sql.functions.col(SelectorUtil.toString(path(root).filter(f => f != "*" && !f.startsWith("["))))

  def parent[A]: FieldPointer[Root, A] = new RelativeFieldPointer(this, 1, Seq.empty)

  def child[A](field: String): FieldPointer[Root, A] = new RelativeFieldPointer(this, 0, Seq(field))

  def sibling[A](field: String): FieldPointer[Root, A] = new RelativeFieldPointer(this, 1, Seq(field))

  def mapEnrichable[A](field: String)(f: TypedEnrichable[T] => A): EnrichFunc[Root, T, A] = {
    val sourcePointer = this
    new EnrichFunc[Root, T, A] {
      override def source: FieldPointer[Root, T] = sourcePointer
      override def fields: Seq[String] = Seq(field)
      override def derive(source: TypedEnrichable[T], derivatives: Derivatives): Unit = {
        derivatives << f(source)
      }
    }
  }

  def map[A](field: String)(f: T => A): EnrichFunc[Root, T, A] = mapEnrichable(field)(e => f(e.get))

  def mapMultiEnrichable[A](field: String)(f: TypedEnrichable[T] => Seq[A]): MultiEnrichFunc[Root, T, A] = {
    val sourcePointer = this
    new MultiEnrichFunc[Root, T, A] {
      override def source: FieldPointer[Root, T] = sourcePointer
      override def fields: Seq[String] = Seq(field)
      override def derive(source: TypedEnrichable[T], derivatives: Derivatives): Unit = {
        derivatives.setNext(MultiValueEnrichable(f(source)))
      }
    }
  }

  def mapMulti[A](field: String)(f: T => Seq[A]): MultiEnrichFunc[Root, T, A] =
    mapMultiEnrichable(field)(e => f(e.get))

  def mapIdentity(field: String): EnrichFunc[Root, T, T] = {
    val sourcePointer = this
    new EnrichFunc[Root, T, T] {
      override def source: FieldPointer[Root, T] = sourcePointer
      override def fields: Seq[String] = Seq(field)
      override def derive(source: TypedEnrichable[T], derivatives: Derivatives): Unit = {
        derivatives.setNext(IdentityField[T])
      }
    }
  }
}

object FieldPointer {
  def apply[Root <: EnrichRoot, T](path: String): FieldPointer[Root, T] = apply(SelectorUtil.parse(path))
  def apply[Root <: EnrichRoot, T](path: Seq[String]): FieldPointer[Root, T] = new PathFieldPointer(path)

  def multi[Root <: EnrichRoot, T](path: String): MultiFieldPointer[Root, T] = multi(SelectorUtil.parse(path))
  def multi[Root <: EnrichRoot, T](path: Seq[String]): MultiFieldPointer[Root, T] = apply(path).multi

  def root[Root <: TypedEnrichRoot[T], T]: FieldPointer[Root, T] = new PathFieldPointer(Seq.empty)
}
Example 13
Source File: TransformerWrapper.scala, from the seahorse project (Apache License 2.0)

package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.Transformer
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class TransformerWrapper(
    executionContext: ExecutionContext,
    transformer: Transformer)
  extends ML.Model[TransformerWrapper] {

  override def copy(extra: ParamMap): TransformerWrapper = {
    val params = ParamTransformer.transform(extra)
    val transformerCopy = transformer.replicate().set(params: _*)
    new TransformerWrapper(executionContext, transformerCopy)
  }

  override def transformDF(dataset: sql.DataFrame): sql.DataFrame = {
    transformer._transform(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF()))
      .sparkDataFrame
  }

  override def transformSchema(schema: StructType): StructType = {
    transformer._transformSchema(schema).get
  }

  override lazy val params: Array[Param[_]] = {
    transformer.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("TransformerWrapper")
}
Example 14
Source File: SqlTransformer.scala, from the seahorse project (Apache License 2.0)

package ai.deepsense.deeplang.doperables

import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.doperations.exceptions.SqlExpressionException
import ai.deepsense.deeplang.inference.{SqlInferenceWarning, SqlSchemaInferrer}
import ai.deepsense.deeplang.params.{CodeSnippetLanguage, CodeSnippetParam, Param, StringParam}
import ai.deepsense.sparkutils.SparkSQLSession
import ai.deepsense.sparkutils.SQL

class SqlTransformer extends Transformer {

  val dataFrameId = StringParam(
    name = "dataframe id",
    description = Some("An identifier that can be used in " +
      "the SQL expression to refer to the input DataFrame."))
  setDefault(dataFrameId -> "df")

  def getDataFrameId: String = $(dataFrameId)
  def setDataFrameId(value: String): this.type = set(dataFrameId, value)

  val expression = CodeSnippetParam(
    name = "expression",
    description = Some("SQL Expression to be executed on the DataFrame."),
    language = CodeSnippetLanguage(CodeSnippetLanguage.sql))
  setDefault(expression -> "SELECT * FROM df")

  def getExpression: String = $(expression)
  def setExpression(value: String): this.type = set(expression, value)

  override val params: Array[Param[_]] = Array(dataFrameId, expression)

  override protected def applyTransform(ctx: ExecutionContext, df: DataFrame): DataFrame = {
    logger.debug(s"SqlExpression(expression = '$getExpression'," +
      s" dataFrameId = '$getDataFrameId')")
    val localSparkSQLSession = ctx.sparkSQLSession.newSession()
    val localDataFrame = moveToSparkSQLSession(df.sparkDataFrame, localSparkSQLSession)
    SQL.registerTempTable(localDataFrame, getDataFrameId)
    try {
      logger.debug(s"Table '$dataFrameId' registered. Executing the expression")
      val sqlResult = moveToSparkSQLSession(localSparkSQLSession.sql(getExpression),
        ctx.sparkSQLSession)
      DataFrame.fromSparkDataFrame(sqlResult)
    } finally {
      logger.debug("Unregistering the temporary table " + getDataFrameId)
      localSparkSQLSession.dropTempTable(getDataFrameId)
    }
  }

  override protected def applyTransformSchema(schema: StructType): Option[StructType] = {
    val (resultSchema, warnings) =
      new SqlSchemaInferrer().inferSchema(getExpression, (getDataFrameId, schema))
    // We throw/log as there is no way to pass warnings further at this point.
    warnings.warnings.foreach {
      case SqlInferenceWarning(sqlExpression, warningText) =>
        throw SqlExpressionException(sqlExpression, warningText)
      case other => logger.warn(s"Inference warning not reported: ${other.message}")
    }
    Some(resultSchema)
  }

  private def moveToSparkSQLSession(
      df: sql.DataFrame, destinationSession: SparkSQLSession): sql.DataFrame =
    destinationSession.createDataFrame(df.rdd, df.schema)
}
Example 15
Source File: SerializableSparkEstimator.scala, from the seahorse project (Apache License 2.0)

package ai.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
}
Example 16
Source File: RowSyntax.scala, from the spark-hyperloglog project (MIT License)

package com.collective.analytics.schema

import org.apache.spark.sql
import org.apache.spark.sql.Row
import org.apache.spark.sql.hyperloglog.MergeHyperLogLog

object RowSyntax {

  sealed trait ColumnType

  trait IntColumn extends ColumnType
  trait LongColumn extends ColumnType
  trait StringColumn extends ColumnType
  trait BinaryColumn extends ColumnType
  trait StringArrayColumn extends ColumnType
  trait HLLColumn extends ColumnType

  sealed trait ColumnReader[C <: ColumnType] { self =>
    type Out
    def read(row: sql.Row)(idx: Int): Out

    def map[Out1](f: Out => Out1): ColumnReader[C] { type Out = Out1 } = new ColumnReader[C] {
      type Out = Out1
      def read(row: Row)(idx: Int): Out = {
        f(self.read(row)(idx))
      }
    }
  }

  implicit class RowOps(val row: Row) extends AnyVal {
    def read[C <: ColumnType](idx: Int)(implicit reader: ColumnReader[C]): reader.Out = {
      reader.read(row)(idx)
    }
  }

  class IntReader[C <: ColumnType] extends ColumnReader[C] {
    type Out = Int
    def read(row: Row)(idx: Int): Out = row.getInt(idx)
  }

  class LongReader[C <: ColumnType] extends ColumnReader[C] {
    type Out = Long
    def read(row: Row)(idx: Int): Out = row.getLong(idx)
  }

  class StringReader[C <: ColumnType] extends ColumnReader[C] {
    type Out = String
    def read(row: Row)(idx: Int): Out = row(idx) match {
      case null => ""
      case str: String => str
      case arr: Array[_] => new String(arr.asInstanceOf[Array[Byte]])
    }
  }

  class StringArrayReader[C <: ColumnType] extends ColumnReader[C] {
    type Out = Array[String]
    def read(row: Row)(idx: Int): Out = row(idx) match {
      case null => Array.empty[String]
      case arr: Array[_] => arr.map(_.toString)
    }
  }

  class BinaryReader[C <: ColumnType] extends ColumnReader[C] {
    type Out = Array[Byte]
    def read(row: Row)(idx: Int): Out = {
      row.getAs[Array[Byte]](idx)
    }
  }

  // Implicit Column Readers
  implicit val intReader = new IntReader[IntColumn]
  implicit val longReader = new LongReader[LongColumn]
  implicit val stringReader = new StringReader[StringColumn]
  implicit val stringArrayReader = new StringArrayReader[StringArrayColumn]
  implicit val binaryReader = new BinaryReader[BinaryColumn]
  implicit val cardinalityReader = new BinaryReader[HLLColumn] map { bytes =>
    MergeHyperLogLog.readHLLWritable(bytes).get()
  }
}
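A brief usage sketch of the typed reader API above, assuming the RowSyntax object is on the classpath; the row values and the object name RowSyntaxUsageSketch are made up for illustration.

import org.apache.spark.sql.Row
import com.collective.analytics.schema.RowSyntax._

object RowSyntaxUsageSketch {
  def main(args: Array[String]): Unit = {
    // A hand-built Row standing in for a real query result (illustrative values).
    val row = Row(42, 10L, "impressions")

    val count: Int    = row.read[IntColumn](0)    // resolved by intReader
    val total: Long   = row.read[LongColumn](1)   // resolved by longReader
    val label: String = row.read[StringColumn](2) // resolved by stringReader

    println(s"$label: count=$count total=$total")
  }
}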
Example 17
Source File: FeatureExtraction.scala, from the Machine-Learning-with-Spark-Second-Edition project (MIT License)

package com.spark.recommendation

import org.apache.spark.{sql, SparkConf}
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.{Dataset, SparkSession}

// Note: this listing is truncated at the source; the enclosing object declaration,
// its SparkSession (`spark`) and the `parseRating` helper are not shown here.

  def getFeatures(): sql.DataFrame = {
    import spark.implicits._
    //val ratings = spark.read.textFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_05/data/ml-100k 2/u.data").map(parseRating).toDF()
    val ratings = spark.read.textFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_05/2.0.0/scala-spark-app/src/main/scala/com/spark/recommendation/sample_movielens_ratings.txt").map(parseRating).toDF()
    println(ratings.first())

//    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))
//    println(training.first())

    return ratings
  }

  def getSpark(): SparkSession = {
    return spark
  }

  def main(args: Array[String]) {
    getFeatures()
  }
}
Example 18
Source File: ModelEstimator.scala, from the Scala-for-Machine-Learning-Second-Edition project (MIT License)

package org.scalaml.spark.mlpipeline

import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegressionModel}
import org.apache.spark.ml._
import org.apache.spark.sql
import sql._

// Note: this listing is truncated at the source; the declaration of the enclosing class,
// whose apply(trainDf, stages) method fits the pipeline used below, is not shown here.

  @throws(classOf[IllegalArgumentException])
  final def trainWithSummary(
    trainDf: DataFrame,
    stages: Array[PipelineStage]
  ): Option[(Double, Double)] = {
    require(stages.size > 0, "Cannot process a pipeline without stages")

    // Print the training set data frame
    trainDf.printSchema

    this(trainDf, stages).stages.last match {
      case lrModel: LogisticRegressionModel =>
        val binarySummary = lrModel.summary.asInstanceOf[BinaryLogisticRegressionSummary]

        // Set the model threshold to maximize F-Measure
        val f1: Double = binarySummary.fMeasureByThreshold.select("F-Measure").head.getDouble(0)
        Some(f1, binarySummary.areaUnderROC)
      case _ => None
    }
  }
}

// ------------------------------ EOF --------------------------------------------------------
Example 19
Source File: SparkLeapFrame.scala, from the mleap project (Apache License 2.0)

package ml.combust.mleap.spark

import ml.combust.mleap.core.types.{StructField, StructType}
import ml.combust.mleap.runtime.frame.{FrameBuilder, Row, RowUtil}
import ml.combust.mleap.runtime.function.{Selector, UserDefinedFunction}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql
import org.apache.spark.sql.mleap.TypeConverters
import org.apache.spark.sql.{DataFrame, SQLContext, types}

import scala.util.Try

case class SparkLeapFrame(schema: StructType,
                          dataset: RDD[Row],
                          sqlContext: SQLContext) extends FrameBuilder[SparkLeapFrame] {

  override def withColumn(output: String, inputs: Selector *)
                         (udf: UserDefinedFunction): Try[SparkLeapFrame] = {
    RowUtil.createRowSelectors(schema, inputs: _*)(udf).flatMap { rowSelectors =>
      val field = StructField(output, udf.outputTypes.head)

      schema.withField(field).map { schema2 =>
        val dataset2 = dataset.map { row =>
          row.withValue(rowSelectors: _*)(udf)
        }

        copy(schema = schema2, dataset = dataset2)
      }
    }
  }

  override def withColumns(outputs: Seq[String], inputs: Selector*)
                          (udf: UserDefinedFunction): Try[SparkLeapFrame] = {
    RowUtil.createRowSelectors(schema, inputs: _*)(udf).flatMap { rowSelectors =>
      val fields = outputs.zip(udf.outputTypes).map {
        case (name, dt) => StructField(name, dt)
      }

      schema.withFields(fields).map { schema2 =>
        val dataset2 = dataset.map { row =>
          row.withValues(rowSelectors: _*)(udf)
        }

        copy(schema = schema2, dataset = dataset2)
      }
    }
  }

  override def select(fieldNames: String *): Try[SparkLeapFrame] = {
    for(indices <- schema.indicesOf(fieldNames: _*);
        schema2 <- schema.selectIndices(indices: _*)) yield {
      val dataset2 = dataset.map(row => row.selectIndices(indices: _*))

      copy(schema = schema2, dataset = dataset2)
    }
  }

  override def drop(names: String*): Try[SparkLeapFrame] = {
    for(indices <- schema.indicesOf(names: _*);
        schema2 <- schema.dropIndices(indices: _*)) yield {
      val dataset2 = dataset.map(row => row.dropIndices(indices: _*))

      copy(schema = schema2, dataset = dataset2)
    }
  }

  override def filter(selectors: Selector*)
                     (udf: UserDefinedFunction): Try[SparkLeapFrame] = {
    RowUtil.createRowSelectors(schema, selectors: _*)(udf).map { rowSelectors =>
      val dataset2 = dataset.filter(row => row.shouldFilter(rowSelectors: _*)(udf))

      copy(schema = schema, dataset = dataset2)
    }
  }

  def toSpark: DataFrame = {
    val spec = schema.fields.map(TypeConverters.mleapToSparkConverter)
    val fields = spec.map(_._1)
    val converters = spec.map(_._2)
    val sparkSchema = new types.StructType(fields.toArray)
    val data = dataset.map { r =>
      val values = r.zip(converters).map {
        case (v, c) => c(v)
      }

      sql.Row(values.toSeq: _*)
    }

    sqlContext.createDataFrame(data, sparkSchema)
  }
}
Example 20
Source File: RedisSourceConfig.scala, from the spark-redis project (BSD 3-Clause "New" or "Revised" License)

package org.apache.spark.sql.redis.stream

import org.apache.spark.sql
import org.apache.spark.sql.redis._

case class RedisSourceConfig(consumerConfigs: Seq[RedisConsumerConfig],
                             start: Option[RedisSourceOffset])

object RedisSourceConfig {

  def fromMap(config: Map[String, String]): RedisSourceConfig = {
    val streamKeys = config.getOrElse(StreamOptionStreamKeys,
      throw new IllegalArgumentException(s"Please specify '$StreamOptionStreamKeys'"))
    val start = config.get(StreamOptionStreamOffsets).map(RedisSourceOffset.fromJson)
    val parallelism = config.get(sql.redis.StreamOptionParallelism).map(_.toInt).getOrElse(1)
    val groupName = config.getOrElse(StreamOptionGroupName, "spark-source")
    val consumerPrefix = config.getOrElse(StreamOptionConsumerPrefix, "consumer")
    val batchSize = config.get(StreamOptionReadBatchSize).map(_.toInt)
      .getOrElse(StreamOptionReadBatchSizeDefault)
    val block = config.get(StreamOptionReadBlock).map(_.toInt).getOrElse(StreamOptionReadBlockDefault)
    val consumerConfigs = streamKeys.split(",").flatMap { streamKey =>
      (1 to parallelism).map { consumerIndex =>
        RedisConsumerConfig(streamKey, s"$groupName", s"$consumerPrefix-$consumerIndex", batchSize, block)
      }
    }
    RedisSourceConfig(consumerConfigs, start)
  }
}

case class RedisConsumerConfig(streamKey: String,
                               groupName: String,
                               consumerName: String,
                               batchSize: Int,
                               block: Int)
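A hypothetical sketch of building a RedisSourceConfig from an options map, assuming the option-name constants imported from org.apache.spark.sql.redis (as in the listing above); the stream names, settings and object name are invented for illustration.

import org.apache.spark.sql.redis._
import org.apache.spark.sql.redis.stream.RedisSourceConfig

object RedisSourceConfigSketch {
  def main(args: Array[String]): Unit = {
    // Illustrative option map; the keys are the constants fromMap reads above.
    val options = Map(
      StreamOptionStreamKeys -> "sensors,clicks", // two streams, comma-separated
      StreamOptionParallelism -> "2",             // two consumers per stream
      StreamOptionGroupName -> "spark-source"
    )

    val config = RedisSourceConfig.fromMap(options)
    // Expect 4 consumer configs: 2 streams x parallelism 2.
    config.consumerConfigs.foreach(println)
  }
}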
Example 21
Source File: TiConverter.scala, from the tispark project (Apache License 2.0)

package org.apache.spark.sql.execution

import com.pingcap.tikv.exception.TiBatchWriteException
import com.pingcap.tikv.types._
import org.apache.spark.sql

object TiConverter {
  type TiDataType = com.pingcap.tikv.types.DataType
  type SparkSQLDataType = org.apache.spark.sql.types.DataType

  def fromSparkType(tp: SparkSQLDataType): TiDataType =
    // TODO: review type system
    // pending: https://internal.pingcap.net/jira/browse/TISPARK-99
    tp match {
      case _: sql.types.BinaryType => BytesType.BLOB
      case _: sql.types.StringType => StringType.VARCHAR
      case _: sql.types.LongType => IntegerType.BIGINT
      case _: sql.types.IntegerType => IntegerType.INT
      case _: sql.types.DoubleType => RealType.DOUBLE
      case _: sql.types.FloatType => RealType.FLOAT
      case sql.types.DecimalType.Fixed(prec, scale) => new DecimalType(prec, scale)
      case _: sql.types.TimestampType => TimestampType.TIMESTAMP
      case _: sql.types.DateType => DateType.DATE
    }

  def sparkSQLObjectToJavaObject(value: Any): Object = {
    if (value == null) {
      return null
    }

    import scala.collection.JavaConversions._
    val result: java.lang.Object = value match {
      case v: java.lang.Boolean => v
      case v: java.lang.Byte => v
      case v: java.lang.Short => v
      case v: java.lang.Integer => v
      case v: java.lang.Long => v
      case v: java.lang.Float => v
      case v: java.lang.Double => v
      case v: java.lang.String => v
      case v: java.math.BigDecimal => v
      case v: java.sql.Date => v
      case v: java.sql.Timestamp => v
      case v: Array[Byte] =>
        val r: java.util.List[java.lang.Byte] = v.toList.map(b => java.lang.Byte.valueOf(b))
        r
      // TODO: to support following types
      //case v: scala.collection.Seq[_] =>
      //case v: scala.collection.Map[_, _] =>
      //case v: org.apache.spark.sql.Row =>
      case _ =>
        throw new TiBatchWriteException(
          s"do not support converting SparkSQL Data Type ${value.getClass} to TiDB Data Type!")
    }
    result
  }
}
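A hypothetical usage sketch of the converter above, assuming the TiSpark and TiKV client classes are on the classpath; the chosen types, values and object name are for illustration only.

import org.apache.spark.sql
import org.apache.spark.sql.execution.TiConverter

object TiConverterUsageSketch {
  def main(args: Array[String]): Unit = {
    // Map a few Spark SQL types to their TiDB counterparts.
    println(TiConverter.fromSparkType(sql.types.StringType))         // VARCHAR
    println(TiConverter.fromSparkType(sql.types.DecimalType(10, 2))) // DECIMAL(10, 2)

    // Convert a Spark SQL value into the Java representation used for batch writes.
    val converted = TiConverter.sparkSQLObjectToJavaObject(java.sql.Date.valueOf("2020-01-01"))
    println(converted)
  }
}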