org.apache.spark.sql Scala Examples

The following examples show how to use org.apache.spark.sql. The project and source file each example comes from are noted above it, so you can refer back to the original code for full context.
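Most of the snippets below import the package itself (import org.apache.spark.sql) and then refer to types such as sql.DataFrame through the package alias. The following is a minimal, self-contained sketch of that pattern; the object name and sample data are illustrative and not taken from any of the projects listed here.

import org.apache.spark.sql
import org.apache.spark.sql.SparkSession

object SqlPackageAliasExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[*]")
      .appName("sql-package-alias")
      .getOrCreate()
    import spark.implicits._

    // Refer to DataFrame through the package alias, as the wrapper classes below do.
    val df: sql.DataFrame = Seq((1, "a"), (2, "b")).toDF("id", "value")
    df.printSchema()

    spark.stop()
  }
}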
Example 1
Source File: EvaluatorWrapper.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.evaluation
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.Evaluator
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class EvaluatorWrapper(
    context: ExecutionContext,
    evaluator: Evaluator)
  extends ML.Evaluator {

  override def evaluateDF(dataset: sql.DataFrame): Double = {
    evaluator.evaluate(context)(())(DataFrame.fromSparkDataFrame(dataset.toDF())).value
  }

  override def copy(extra: ParamMap): evaluation.Evaluator = {
    val params = ParamTransformer.transform(extra)
    val evaluatorCopy = evaluator.replicate().set(params: _*)
    new EvaluatorWrapper(context, evaluatorCopy)
  }

  override lazy val params: Array[Param[_]] = {
    evaluator.params.map(new ParamWrapper(uid, _))
  }

  override def isLargerBetter: Boolean = evaluator.isLargerBetter

  override val uid: String = Identifiable.randomUID("EvaluatorWrapper")
} 
Example 2
Source File: DeeplangTestSupport.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang

import org.apache.spark.sql
import org.apache.spark.sql.types.StructType
import org.mockito.Mockito._
import org.scalatest.mockito.MockitoSugar

import io.deepsense.deeplang.catalogs.doperable.DOperableCatalog
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.inference.InferContext

trait DeeplangTestSupport extends MockitoSugar {

  protected def createInferContext(
      dOperableCatalog: DOperableCatalog): InferContext = MockedInferContext(dOperableCatalog)

  protected def createSchema(fields: Array[String] = Array[String]()): StructType = {
    val schemaMock = mock[StructType]
    when(schemaMock.fieldNames).thenReturn(fields)
    schemaMock
  }

  protected def createSparkDataFrame(schema: StructType = createSchema()) = {
    val sparkDataFrameMock = mock[sql.DataFrame]
    when(sparkDataFrameMock.schema).thenReturn(schema)
    when(sparkDataFrameMock.toDF).thenReturn(sparkDataFrameMock)
    sparkDataFrameMock
  }

  protected def createDataFrame(fields: Array[String] = Array[String]()): DataFrame = {
    val schema = createSchema(fields)
    createDataFrame(schema)
  }

  protected def createDataFrame(schema: StructType): DataFrame = {
    val sparkDataFrameMock = createSparkDataFrame(schema)
    val dataFrameMock = mock[DataFrame]
    when(dataFrameMock.sparkDataFrame).thenReturn(sparkDataFrameMock)
    when(dataFrameMock.schema).thenReturn(Some(schema))
    dataFrameMock
  }
} 
Example 3
Source File: StatisticsForContinuousIntegSpec.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperables.dataframe.report.distribution

import java.sql.Timestamp

import org.apache.spark.rdd.RDD
import org.apache.spark.sql
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

import io.deepsense.commons.datetime.DateTimeConverter
import io.deepsense.deeplang.DeeplangIntegTestSupport
import io.deepsense.deeplang.doperables.dataframe.{DataFrame, DataFrameTestFactory}
import io.deepsense.reportlib.model._

class StatisticsForContinuousIntegSpec extends DeeplangIntegTestSupport with DataFrameTestFactory {

  "Statistics (Min, max and mean values)" should {
    "be calculated for each continuous column in distribution" when {
      "data is of type int" in {
        val distribution = distributionForInt(1, 2, 3, 4, 5)
        distribution.statistics.min shouldEqual Some("1")
        distribution.statistics.max shouldEqual Some("5")
        distribution.statistics.mean shouldEqual Some("3")
      }
      "data is of type Timestamp" in {
        val distribution =
          distributionForTimestamps(new Timestamp(1000), new Timestamp(2000), new Timestamp(3000))
        distribution.statistics.min shouldEqual Some(formatDate(1000))
        distribution.statistics.max shouldEqual Some(formatDate(3000))
        distribution.statistics.mean shouldEqual Some(formatDate(2000))
      }
    }
  }
  "Null value in data" should {
    val distribution = distributionForDouble(1, 2, 3, 4, Double.NaN, 5)
    "not be skipped in calculating min and max" in {
      distribution.statistics.min shouldEqual Some("1")
      distribution.statistics.max shouldEqual Some("5")
    }
    "result in mean value NaN" in {
      distribution.statistics.mean shouldEqual Some("NaN")
    }
  }

  lazy val columnName = "column_name"

  private def distributionForDouble(data: Double*): ContinuousDistribution = {
    distributionFor(data, DoubleType)
  }

  private def distributionForInt(data: Int*): ContinuousDistribution = {
    distributionFor(data, IntegerType)
  }

  private def distributionForTimestamps(data: Timestamp*): ContinuousDistribution = {
    distributionFor(data, TimestampType)
  }

  private def distributionFor(data: Seq[Any], dataType: DataType): ContinuousDistribution = {
    val schema = StructType(Array(
      StructField(columnName, dataType)
    ))

    val rows = data.map(v => Row(v))
    val dataFrame = createDataFrame(rows, schema)

    val report = dataFrame.report
    report.content.distributions(columnName).asInstanceOf[ContinuousDistribution]
  }

  def buildDataFrame(schema: StructType, data: RDD[Row]): DataFrame = {
    val dataFrame: sql.DataFrame = sparkSQLSession.createDataFrame(data, schema)
    DataFrame.fromSparkDataFrame(dataFrame)
  }

  def formatDate(millis: Long): String = {
    DateTimeConverter.toString(DateTimeConverter.fromMillis(millis))
  }

} 
Example 4
Source File: EstimatorWrapper.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.StructType
import org.apache.spark.{ml, sql}

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.{Transformer, Estimator}
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class EstimatorWrapper(
    executionContext: ExecutionContext,
    estimator: Estimator[Transformer])
  extends ML.Estimator[TransformerWrapper] {

  override def fitDF(dataset: sql.DataFrame): TransformerWrapper = {
    new TransformerWrapper(
      executionContext,
      estimator._fit(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF())))
  }

  override def copy(extra: ParamMap): EstimatorWrapper = {
    val params = ParamTransformer.transform(extra)
    val estimatorCopy = estimator.replicate().set(params: _*)
    new EstimatorWrapper(executionContext, estimatorCopy)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override lazy val params: Array[ml.param.Param[_]] = {
    estimator.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("EstimatorWrapper")
} 
Example 5
Source File: EvaluatorWrapper.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.evaluation
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.Evaluator
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class EvaluatorWrapper(
    context: ExecutionContext,
    evaluator: Evaluator)
  extends ML.Evaluator {

  override def evaluateDF(dataset: sql.DataFrame): Double = {
    evaluator.evaluate(context)(())(DataFrame.fromSparkDataFrame(dataset.toDF())).value
  }

  override def copy(extra: ParamMap): evaluation.Evaluator = {
    val params = ParamTransformer.transform(extra)
    val evaluatorCopy = evaluator.replicate().set(params: _*)
    new EvaluatorWrapper(context, evaluatorCopy)
  }

  override lazy val params: Array[Param[_]] = {
    evaluator.params.map(new ParamWrapper(uid, _))
  }

  override def isLargerBetter: Boolean = evaluator.isLargerBetter

  override val uid: String = Identifiable.randomUID("EvaluatorWrapper")
} 
Example 6
Source File: TransformerWrapper.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.Transformer
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class TransformerWrapper(
    executionContext: ExecutionContext,
    transformer: Transformer)
  extends ML.Model[TransformerWrapper] {

  override def copy(extra: ParamMap): TransformerWrapper = {
    val params = ParamTransformer.transform(extra)
    val transformerCopy = transformer.replicate().set(params: _*)
    new TransformerWrapper(executionContext, transformerCopy)
  }

  override def transformDF(dataset: sql.DataFrame): sql.DataFrame = {
    transformer._transform(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF()))
      .sparkDataFrame
  }

  override def transformSchema(schema: StructType): StructType = {
    transformer._transformSchema(schema).get
  }

  override lazy val params: Array[Param[_]] = {
    transformer.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("TransformerWrapper")
} 
Example 7
Source File: SqlTransformer.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperables

import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.doperations.exceptions.SqlExpressionException
import io.deepsense.deeplang.inference.{SqlInferenceWarning, SqlSchemaInferrer}
import io.deepsense.deeplang.params.{CodeSnippetLanguage, CodeSnippetParam, Param, StringParam}
import io.deepsense.sparkutils.SparkSQLSession
import io.deepsense.sparkutils.SQL

class SqlTransformer extends Transformer {

  val dataFrameId = StringParam(
    name = "dataframe id",
    description = Some("An identifier that can be used in " +
      "the SQL expression to refer to the input DataFrame."))
  setDefault(dataFrameId -> "df")
  def getDataFrameId: String = $(dataFrameId)
  def setDataFrameId(value: String): this.type = set(dataFrameId, value)

  val expression = CodeSnippetParam(
    name = "expression",
    description = Some("SQL Expression to be executed on the DataFrame."),
    language = CodeSnippetLanguage(CodeSnippetLanguage.sql))
  setDefault(expression -> "SELECT * FROM df")
  def getExpression: String = $(expression)
  def setExpression(value: String): this.type = set(expression, value)

  override val params: Array[Param[_]] = Array(dataFrameId, expression)

  override private[deeplang] def _transform(ctx: ExecutionContext, df: DataFrame): DataFrame = {
    logger.debug(s"SqlExpression(expression = '$getExpression'," +
      s" dataFrameId = '$getDataFrameId')")

    val localSparkSQLSession = ctx.sparkSQLSession.newSession()
    val localDataFrame = moveToSparkSQLSession(df.sparkDataFrame, localSparkSQLSession)

    SQL.registerTempTable(localDataFrame, getDataFrameId)
    try {
      logger.debug(s"Table '$dataFrameId' registered. Executing the expression")
      val sqlResult = moveToSparkSQLSession(localSparkSQLSession.sql(getExpression), ctx.sparkSQLSession)
      DataFrame.fromSparkDataFrame(sqlResult)
    } finally {
      logger.debug("Unregistering the temporary table " + getDataFrameId)
      localSparkSQLSession.dropTempTable(getDataFrameId)
    }
  }

  override private[deeplang] def _transformSchema(schema: StructType): Option[StructType] = {
    val (resultSchema, warnings) =
      new SqlSchemaInferrer().inferSchema(getExpression, (getDataFrameId, schema))
    // We throw/log as there is no way to pass warnings further at this point.
    warnings.warnings.foreach {
      case SqlInferenceWarning(sqlExpression, warningText) =>
        throw SqlExpressionException(sqlExpression, warningText)
      case other => logger.warn(s"Inference warning not reported: ${other.message}")
    }
    Some(resultSchema)
  }

  private def moveToSparkSQLSession(df: sql.DataFrame, destinationSession: SparkSQLSession): sql.DataFrame =
    destinationSession.createDataFrame(df.rdd, df.schema)
} 
Example 8
Source File: SerializableSparkEstimator.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
} 
Example 9
Source File: DeeplangTestSupport.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang

import org.apache.spark.sql
import org.apache.spark.sql.types.StructType
import org.mockito.Mockito._
import org.scalatest.mockito.MockitoSugar

import ai.deepsense.deeplang.catalogs.doperable.DOperableCatalog
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.inference.InferContext

trait DeeplangTestSupport extends MockitoSugar {

  protected def createInferContext(
      dOperableCatalog: DOperableCatalog): InferContext = MockedInferContext(dOperableCatalog)

  protected def createExecutionContext: ExecutionContext = {
    val mockedExecutionContext = mock[ExecutionContext]
    val mockedInferContext = mock[InferContext]
    when(mockedExecutionContext.inferContext).thenReturn(mockedInferContext)
    mockedExecutionContext
  }

  protected def createSchema(fields: Array[String] = Array[String]()): StructType = {
    val schemaMock = mock[StructType]
    when(schemaMock.fieldNames).thenReturn(fields)
    schemaMock
  }

  protected def createSparkDataFrame(schema: StructType = createSchema()) = {
    val sparkDataFrameMock = mock[sql.DataFrame]
    when(sparkDataFrameMock.schema).thenReturn(schema)
    when(sparkDataFrameMock.toDF).thenReturn(sparkDataFrameMock)
    sparkDataFrameMock
  }

  protected def createDataFrame(fields: Array[String] = Array[String]()): DataFrame = {
    val schema = createSchema(fields)
    createDataFrame(schema)
  }

  protected def createDataFrame(schema: StructType): DataFrame = {
    val sparkDataFrameMock = createSparkDataFrame(schema)
    val dataFrameMock = mock[DataFrame]
    when(dataFrameMock.sparkDataFrame).thenReturn(sparkDataFrameMock)
    when(dataFrameMock.schema).thenReturn(Some(schema))
    dataFrameMock
  }
} 
Example 10
Source File: StatisticsForContinuousIntegSpec.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang.doperables.dataframe.report.distribution

import java.sql.Timestamp

import org.apache.spark.rdd.RDD
import org.apache.spark.sql
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

import ai.deepsense.commons.datetime.DateTimeConverter
import ai.deepsense.deeplang.DeeplangIntegTestSupport
import ai.deepsense.deeplang.doperables.dataframe.{DataFrame, DataFrameTestFactory}
import ai.deepsense.reportlib.model._

class StatisticsForContinuousIntegSpec extends DeeplangIntegTestSupport with DataFrameTestFactory {

  "Statistics (Min, max and mean values)" should {
    "be calculated for each continuous column in distribution" when {
      "data is of type int" in {
        val distribution = distributionForInt(1, 2, 3, 4, 5)
        distribution.statistics.min shouldEqual Some("1")
        distribution.statistics.max shouldEqual Some("5")
        distribution.statistics.mean shouldEqual Some("3")
      }
      "data is of type Timestamp" in {
        val distribution =
          distributionForTimestamps(new Timestamp(1000), new Timestamp(2000), new Timestamp(3000))
        distribution.statistics.min shouldEqual Some(formatDate(1000))
        distribution.statistics.max shouldEqual Some(formatDate(3000))
        distribution.statistics.mean shouldEqual Some(formatDate(2000))
      }
    }
  }
  "Null value in data" should {
    val distribution = distributionForDouble(1, 2, 3, 4, Double.NaN, 5)
    "not be skipped in calculating min and max" in {
      distribution.statistics.min shouldEqual Some("1")
      distribution.statistics.max shouldEqual Some("5")
    }
    "result in mean value NaN" in {
      distribution.statistics.mean shouldEqual Some("NaN")
    }
  }

  lazy val columnName = "column_name"

  private def distributionForDouble(data: Double*): ContinuousDistribution = {
    distributionFor(data, DoubleType)
  }

  private def distributionForInt(data: Int*): ContinuousDistribution = {
    distributionFor(data, IntegerType)
  }

  private def distributionForTimestamps(data: Timestamp*): ContinuousDistribution = {
    distributionFor(data, TimestampType)
  }

  private def distributionFor(data: Seq[Any], dataType: DataType): ContinuousDistribution = {
    val schema = StructType(Array(
      StructField(columnName, dataType)
    ))

    val rows = data.map(v => Row(v))
    val dataFrame = createDataFrame(rows, schema)

    val report = dataFrame.report()
    report.content.distributions(columnName).asInstanceOf[ContinuousDistribution]
  }

  def buildDataFrame(schema: StructType, data: RDD[Row]): DataFrame = {
    val dataFrame: sql.DataFrame = sparkSQLSession.createDataFrame(data, schema)
    DataFrame.fromSparkDataFrame(dataFrame)
  }

  def formatDate(millis: Long): String = {
    DateTimeConverter.toString(DateTimeConverter.fromMillis(millis))
  }

} 
Example 11
Source File: EstimatorWrapper.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.StructType
import org.apache.spark.{ml, sql}

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.{Transformer, Estimator}
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class EstimatorWrapper(
    executionContext: ExecutionContext,
    estimator: Estimator[Transformer])
  extends ML.Estimator[TransformerWrapper] {

  override def fitDF(dataset: sql.DataFrame): TransformerWrapper = {
    new TransformerWrapper(
      executionContext,
      estimator._fit(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF())))
  }

  override def copy(extra: ParamMap): EstimatorWrapper = {
    val params = ParamTransformer.transform(extra)
    val estimatorCopy = estimator.replicate().set(params: _*)
    new EstimatorWrapper(executionContext, estimatorCopy)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override lazy val params: Array[ml.param.Param[_]] = {
    estimator.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("EstimatorWrapper")
} 
Example 12
Source File: FieldPointer.scala    From ArchiveSpark   with MIT License
package org.archive.archivespark.model.pointers

import org.apache.spark.sql
import org.apache.spark.sql.Column
import org.archive.archivespark.model._
import org.archive.archivespark.util.SelectorUtil

trait GenericFieldPointer[+R <: EnrichRoot, +T] extends Serializable { this: FieldPointer[_, _] =>
}

trait FieldPointer[Root <: EnrichRoot, T] extends GenericFieldPointer[Root, T] {
  def path[R <: Root](root: EnrichRootCompanion[R]): Seq[String]

  def get(root: Root): Option[T] = enrichable(root).map(_.get)

  def exists(root: Root): Boolean = root[T](path(root)).isDefined

  def enrichable(root: Root): Option[TypedEnrichable[T]] = {
    val initialized = init(root, excludeFromOutput = false)
    initialized[T](path(initialized))
  }

  def multi: MultiFieldPointer[Root, T] = new SingleToMultiFieldPointer[Root, T](this)

  def init[R <: Root](root: R, excludeFromOutput: Boolean): R = root

  def pathTo[R <: Root](root: EnrichRootCompanion[R], field: String): Seq[String] = path(root) ++ SelectorUtil.parse(field)

  def col(root: EnrichRootCompanion[Root]): Column = sql.functions.col(SelectorUtil.toString(path(root).filter(f => f != "*" && !f.startsWith("["))))

  def parent[A]: FieldPointer[Root, A] = new RelativeFieldPointer(this, 1, Seq.empty)

  def child[A](field: String): FieldPointer[Root, A] = new RelativeFieldPointer(this, 0, Seq(field))

  def sibling[A](field: String): FieldPointer[Root, A] = new RelativeFieldPointer(this, 1, Seq(field))

  def mapEnrichable[A](field: String)(f: TypedEnrichable[T] => A): EnrichFunc[Root, T, A] = {
    val sourcePointer = this
    new EnrichFunc[Root, T, A] {
      override def source: FieldPointer[Root, T] = sourcePointer
      override def fields: Seq[String] = Seq(field)
      override def derive(source: TypedEnrichable[T], derivatives: Derivatives): Unit = {
        derivatives << f(source)
      }
    }
  }

  def map[A](field: String)(f: T => A): EnrichFunc[Root, T, A] = mapEnrichable(field)(e => f(e.get))

  def mapMultiEnrichable[A](field: String)(f: TypedEnrichable[T] => Seq[A]): MultiEnrichFunc[Root, T, A] = {
    val sourcePointer = this
    new MultiEnrichFunc[Root, T, A] {
      override def source: FieldPointer[Root, T] = sourcePointer
      override def fields: Seq[String] = Seq(field)
      override def derive(source: TypedEnrichable[T], derivatives: Derivatives): Unit = {
        derivatives.setNext(MultiValueEnrichable(f(source)))
      }
    }
  }

  def mapMulti[A](field: String)(f: T => Seq[A]): MultiEnrichFunc[Root, T, A] = mapMultiEnrichable(field)(e => f(e.get))

  def mapIdentity(field: String): EnrichFunc[Root, T, T] = {
    val sourcePointer = this
    new EnrichFunc[Root, T, T] {
      override def source: FieldPointer[Root, T] = sourcePointer
      override def fields: Seq[String] = Seq(field)
      override def derive(source: TypedEnrichable[T], derivatives: Derivatives): Unit = {
        derivatives.setNext(IdentityField[T])
      }
    }
  }
}

object FieldPointer {
  def apply[Root <: EnrichRoot, T](path: String): FieldPointer[Root, T] = apply(SelectorUtil.parse(path))
  def apply[Root <: EnrichRoot, T](path: Seq[String]): FieldPointer[Root, T] = new PathFieldPointer(path)

  def multi[Root <: EnrichRoot, T](path: String): MultiFieldPointer[Root, T] = multi(SelectorUtil.parse(path))
  def multi[Root <: EnrichRoot, T](path: Seq[String]): MultiFieldPointer[Root, T] = apply(path).multi

  def root[Root <: TypedEnrichRoot[T], T]: FieldPointer[Root, T] = new PathFieldPointer(Seq.empty)
} 
Example 13
Source File: TransformerWrapper.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.Transformer
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class TransformerWrapper(
    executionContext: ExecutionContext,
    transformer: Transformer)
  extends ML.Model[TransformerWrapper] {

  override def copy(extra: ParamMap): TransformerWrapper = {
    val params = ParamTransformer.transform(extra)
    val transformerCopy = transformer.replicate().set(params: _*)
    new TransformerWrapper(executionContext, transformerCopy)
  }

  override def transformDF(dataset: sql.DataFrame): sql.DataFrame = {
    transformer._transform(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF()))
      .sparkDataFrame
  }

  override def transformSchema(schema: StructType): StructType = {
    transformer._transformSchema(schema).get
  }

  override lazy val params: Array[Param[_]] = {
    transformer.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("TransformerWrapper")
} 
Example 14
Source File: SqlTransformer.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang.doperables

import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.doperations.exceptions.SqlExpressionException
import ai.deepsense.deeplang.inference.{SqlInferenceWarning, SqlSchemaInferrer}
import ai.deepsense.deeplang.params.{CodeSnippetLanguage, CodeSnippetParam, Param, StringParam}
import ai.deepsense.sparkutils.SparkSQLSession
import ai.deepsense.sparkutils.SQL

class SqlTransformer extends Transformer {

  val dataFrameId = StringParam(
    name = "dataframe id",
    description = Some("An identifier that can be used in " +
      "the SQL expression to refer to the input DataFrame."))
  setDefault(dataFrameId -> "df")
  def getDataFrameId: String = $(dataFrameId)
  def setDataFrameId(value: String): this.type = set(dataFrameId, value)

  val expression = CodeSnippetParam(
    name = "expression",
    description = Some("SQL Expression to be executed on the DataFrame."),
    language = CodeSnippetLanguage(CodeSnippetLanguage.sql))
  setDefault(expression -> "SELECT * FROM df")
  def getExpression: String = $(expression)
  def setExpression(value: String): this.type = set(expression, value)

  override val params: Array[Param[_]] = Array(dataFrameId, expression)

  override protected def applyTransform(ctx: ExecutionContext, df: DataFrame): DataFrame = {
    logger.debug(s"SqlExpression(expression = '$getExpression'," +
      s" dataFrameId = '$getDataFrameId')")

    val localSparkSQLSession = ctx.sparkSQLSession.newSession()
    val localDataFrame = moveToSparkSQLSession(df.sparkDataFrame, localSparkSQLSession)

    SQL.registerTempTable(localDataFrame, getDataFrameId)
    try {
      logger.debug(s"Table '$dataFrameId' registered. Executing the expression")
      val sqlResult = moveToSparkSQLSession(localSparkSQLSession.sql(getExpression), ctx.sparkSQLSession)
      DataFrame.fromSparkDataFrame(sqlResult)
    } finally {
      logger.debug("Unregistering the temporary table " + getDataFrameId)
      localSparkSQLSession.dropTempTable(getDataFrameId)
    }
  }

  override protected def applyTransformSchema(schema: StructType): Option[StructType] = {
    val (resultSchema, warnings) =
      new SqlSchemaInferrer().inferSchema(getExpression, (getDataFrameId, schema))
    // We throw/log as there is no way to pass warnings further at this point.
    warnings.warnings.foreach {
      case SqlInferenceWarning(sqlExpression, warningText) =>
        throw SqlExpressionException(sqlExpression, warningText)
      case other => logger.warn(s"Inference warning not reported: ${other.message}")
    }
    Some(resultSchema)
  }

  private def moveToSparkSQLSession(df: sql.DataFrame, destinationSession: SparkSQLSession): sql.DataFrame =
    destinationSession.createDataFrame(df.rdd, df.schema)
} 
Example 15
Source File: SerializableSparkEstimator.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
} 
Example 16
Source File: RowSyntax.scala    From spark-hyperloglog   with MIT License
package com.collective.analytics.schema

import org.apache.spark.sql
import org.apache.spark.sql.Row
import org.apache.spark.sql.hyperloglog.MergeHyperLogLog

object RowSyntax {

  sealed trait ColumnType

  trait IntColumn extends ColumnType
  trait LongColumn extends ColumnType
  trait StringColumn extends ColumnType
  trait BinaryColumn extends ColumnType
  trait StringArrayColumn extends ColumnType
  trait HLLColumn extends ColumnType

  sealed trait ColumnReader[C <: ColumnType] { self =>
    type Out

    def read(row: sql.Row)(idx: Int): Out

    def map[Out1](f: Out => Out1): ColumnReader[C] {type Out = Out1} =
      new ColumnReader[C] {
        type Out = Out1

        def read(row: Row)(idx: Int): Out = {
          f(self.read(row)(idx))
        }
      }
  }

  implicit class RowOps(val row: Row) extends AnyVal {
    def read[C <: ColumnType](idx: Int)(implicit reader: ColumnReader[C]): reader.Out = {
      reader.read(row)(idx)
    }
  }

  class IntReader[C <: ColumnType] extends ColumnReader[C] {
    type Out = Int
    def read(row: Row)(idx: Int): Out = row.getInt(idx)
  }

  class LongReader[C <: ColumnType] extends ColumnReader[C] {
    type Out = Long
    def read(row: Row)(idx: Int): Out = row.getLong(idx)
  }

  class StringReader[C <: ColumnType] extends ColumnReader[C] {
    type Out = String
    def read(row: Row)(idx: Int): Out = row(idx) match {
      case null => ""
      case str: String => str
      case arr: Array[_] => new String(arr.asInstanceOf[Array[Byte]])
    }
  }

  class StringArrayReader[C <: ColumnType] extends ColumnReader[C] {
    type Out = Array[String]
    def read(row: Row)(idx: Int): Out = row(idx) match {
      case null => Array.empty[String]
      case arr: Array[_] => arr.map(_.toString)
    }
  }

  class BinaryReader[C <: ColumnType] extends ColumnReader[C] {
    type Out = Array[Byte]

    def read(row: Row)(idx: Int): Out = {
      row.getAs[Array[Byte]](idx)
    }
  }

  // Implicit Column Readers

  implicit val intReader = new IntReader[IntColumn]
  implicit val longReader = new LongReader[LongColumn]
  implicit val stringReader = new StringReader[StringColumn]
  implicit val stringArrayReader = new StringArrayReader[StringArrayColumn]
  implicit val binaryReader = new BinaryReader[BinaryColumn]

  implicit val cardinalityReader = new BinaryReader[HLLColumn] map { bytes =>
    MergeHyperLogLog.readHLLWritable(bytes).get()
  }

} 
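The readers above are resolved through the implicit ColumnReader instance for each phantom column type, so the element type of the result follows from the type parameter passed to read. A brief usage sketch follows; the row contents are illustrative assumptions.

import com.collective.analytics.schema.RowSyntax._
import org.apache.spark.sql.Row

// Hypothetical row holding an Int column followed by a String column.
val row = Row(42, "advertiser-1")
val impressions: Int = row.read[IntColumn](0)       // resolved via intReader
val advertiser: String = row.read[StringColumn](1)  // resolved via stringReader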
Example 17
package com.spark.recommendation

import org.apache.spark.{sql, SparkConf}
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.{Dataset, SparkSession}

// Note: the enclosing object, the SparkSession and the Rating/parseRating helpers were
// truncated in the original listing; they are reconstructed below as a plausible sketch.
object FeatureExtraction {

  case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long)

  // Parses one "userId::movieId::rating::timestamp" line of the MovieLens sample data.
  def parseRating(str: String): Rating = {
    val fields = str.split("::")
    assert(fields.size == 4)
    Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
  }

  val spark = SparkSession.builder
    .master("local[2]")
    .appName("FeatureExtraction")
    .getOrCreate()

  def getFeatures(): sql.DataFrame = {
    import spark.implicits._
    //val ratings = spark.read.textFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_05/data/ml-100k 2/u.data").map(parseRating).toDF()
    val ratings = spark.read.textFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_05/2.0.0/scala-spark-app/src/main/scala/com/spark/recommendation/sample_movielens_ratings.txt").map(parseRating).toDF()
    println(ratings.first())

//    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))
//    println(training.first())

    ratings
  }

  def getSpark(): SparkSession = {
    spark
  }

  def main(args: Array[String]) {
    getFeatures()
  }

}
Example 18
Source File: ModelEstimator.scala    From Scala-for-Machine-Learning-Second-Edition   with MIT License
package org.scalaml.spark.mlpipeline


import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegressionModel}
import org.apache.spark.ml._
import org.apache.spark.sql
import sql._

// Note: the trait declaration and its pipeline-building apply method were truncated in the
// original listing; the version below is a plausible reconstruction, not the book's exact code.
trait ModelEstimator[T <: Model[T]] {
  val estimator: Estimator[T]

  // Assembles the preprocessing stages and the estimator into a pipeline and fits it
  // on the training data frame.
  def apply(trainDf: DataFrame, stages: Array[PipelineStage]): PipelineModel =
    new Pipeline().setStages(stages :+ estimator).fit(trainDf)

  @throws(classOf[IllegalArgumentException])
  final def trainWithSummary(
    trainDf: DataFrame,
    stages: Array[PipelineStage]
  ): Option[(Double, Double)] = {
    require(stages.size > 0, "Cannot process a pipeline without stages")

    // Print the training set data frame
    trainDf.printSchema

    this(trainDf, stages).stages.last match {
      case lrModel: LogisticRegressionModel =>
        val binarySummary = lrModel.summary.asInstanceOf[BinaryLogisticRegressionSummary]

        // Set the model threshold to maximize F-Measure
        val f1: Double = binarySummary.fMeasureByThreshold.select("F-Measure").head.getDouble(0)
        Some(f1, binarySummary.areaUnderROC)
      case _ => None
    }
  }
}

// ------------------------------  EOF -------------------------------------------------------- 
Example 19
Source File: SparkLeapFrame.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.spark

import ml.combust.mleap.core.types.{StructField, StructType}
import ml.combust.mleap.runtime.frame.{FrameBuilder, Row, RowUtil}
import ml.combust.mleap.runtime.function.{Selector, UserDefinedFunction}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql
import org.apache.spark.sql.mleap.TypeConverters
import org.apache.spark.sql.{DataFrame, SQLContext, types}

import scala.util.Try


case class SparkLeapFrame(schema: StructType,
                          dataset: RDD[Row],
                          sqlContext: SQLContext) extends FrameBuilder[SparkLeapFrame] {
  override def withColumn(output: String, inputs: Selector *)
                         (udf: UserDefinedFunction): Try[SparkLeapFrame] = {
    RowUtil.createRowSelectors(schema, inputs: _*)(udf).flatMap {
      rowSelectors =>
        val field = StructField(output, udf.outputTypes.head)

        schema.withField(field).map {
          schema2 =>
            val dataset2 = dataset.map {
              row => row.withValue(rowSelectors: _*)(udf)
            }
            copy(schema = schema2, dataset = dataset2)
        }
    }
  }

  override def withColumns(outputs: Seq[String], inputs: Selector*)
                          (udf: UserDefinedFunction): Try[SparkLeapFrame] = {
    RowUtil.createRowSelectors(schema, inputs: _*)(udf).flatMap {
      rowSelectors =>
        val fields = outputs.zip(udf.outputTypes).map {
          case (name, dt) => StructField(name, dt)
        }

        schema.withFields(fields).map {
          schema2 =>
            val dataset2 = dataset.map {
              row => row.withValues(rowSelectors: _*)(udf)
            }
            copy(schema = schema2, dataset = dataset2)
        }
    }
  }

  override def select(fieldNames: String *): Try[SparkLeapFrame] = {
    for(indices <- schema.indicesOf(fieldNames: _*);
      schema2 <- schema.selectIndices(indices: _*)) yield {
      val dataset2 = dataset.map(row => row.selectIndices(indices: _*))

      copy(schema = schema2, dataset = dataset2)
    }
  }

  override def drop(names: String*): Try[SparkLeapFrame] = {
    for(indices <- schema.indicesOf(names: _*);
        schema2 <- schema.dropIndices(indices: _*)) yield {
      val dataset2 = dataset.map(row => row.dropIndices(indices: _*))

      copy(schema = schema2, dataset = dataset2)
    }
  }

  override def filter(selectors: Selector*)
                     (udf: UserDefinedFunction): Try[SparkLeapFrame] = {
    RowUtil.createRowSelectors(schema, selectors: _*)(udf).map {
      rowSelectors =>
        val dataset2 = dataset.filter(row => row.shouldFilter(rowSelectors: _*)(udf))
        copy(schema = schema, dataset = dataset2)
    }
  }

  def toSpark: DataFrame = {
    val spec = schema.fields.map(TypeConverters.mleapToSparkConverter)
    val fields = spec.map(_._1)
    val converters = spec.map(_._2)
    val sparkSchema = new types.StructType(fields.toArray)
    val data = dataset.map {
      r =>
        val values = r.zip(converters).map {
          case (v, c) => c(v)
        }
        sql.Row(values.toSeq: _*)
    }

    sqlContext.createDataFrame(data, sparkSchema)
  }
} 
Example 20
Source File: RedisSourceConfig.scala    From spark-redis   with BSD 3-Clause "New" or "Revised" License
package org.apache.spark.sql.redis.stream

import org.apache.spark.sql
import org.apache.spark.sql.redis._


case class RedisSourceConfig(consumerConfigs: Seq[RedisConsumerConfig],
                             start: Option[RedisSourceOffset])

object RedisSourceConfig {

  def fromMap(config: Map[String, String]): RedisSourceConfig = {
    val streamKeys = config.getOrElse(StreamOptionStreamKeys,
      throw new IllegalArgumentException(s"Please specify '$StreamOptionStreamKeys'"))
    val start = config.get(StreamOptionStreamOffsets).map(RedisSourceOffset.fromJson)
    val parallelism = config.get(sql.redis.StreamOptionParallelism).map(_.toInt).getOrElse(1)
    val groupName = config.getOrElse(StreamOptionGroupName, "spark-source")
    val consumerPrefix = config.getOrElse(StreamOptionConsumerPrefix, "consumer")
    val batchSize = config.get(StreamOptionReadBatchSize).map(_.toInt).getOrElse(StreamOptionReadBatchSizeDefault)
    val block = config.get(StreamOptionReadBlock).map(_.toInt).getOrElse(StreamOptionReadBlockDefault)
    val consumerConfigs = streamKeys.split(",").flatMap { streamKey =>
      (1 to parallelism).map { consumerIndex =>
        RedisConsumerConfig(streamKey, s"$groupName", s"$consumerPrefix-$consumerIndex", batchSize, block)
      }
    }
    RedisSourceConfig(consumerConfigs, start)
  }
}

case class RedisConsumerConfig(streamKey: String, groupName: String, consumerName: String,
                               batchSize: Int, block: Int) 
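A brief sketch of building the source config from stream options, using the option-name constants referenced in the code above; the stream key and parallelism values are illustrative assumptions.

import org.apache.spark.sql.redis._
import org.apache.spark.sql.redis.stream.RedisSourceConfig

val sourceConfig = RedisSourceConfig.fromMap(Map(
  StreamOptionStreamKeys -> "sensors",
  StreamOptionParallelism -> "2"
))
// One RedisConsumerConfig is created per stream key and consumer index.
sourceConfig.consumerConfigs.foreach(println)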
Example 21
Source File: TiConverter.scala    From tispark   with Apache License 2.0
package org.apache.spark.sql.execution

import com.pingcap.tikv.exception.TiBatchWriteException
import com.pingcap.tikv.types._
import org.apache.spark.sql

object TiConverter {
  type TiDataType = com.pingcap.tikv.types.DataType
  type SparkSQLDataType = org.apache.spark.sql.types.DataType

  def fromSparkType(tp: SparkSQLDataType): TiDataType =
    // TODO: review type system
    // pending: https://internal.pingcap.net/jira/browse/TISPARK-99
    tp match {
      case _: sql.types.BinaryType => BytesType.BLOB
      case _: sql.types.StringType => StringType.VARCHAR
      case _: sql.types.LongType => IntegerType.BIGINT
      case _: sql.types.IntegerType => IntegerType.INT
      case _: sql.types.DoubleType => RealType.DOUBLE
      case _: sql.types.FloatType => RealType.FLOAT
      case sql.types.DecimalType.Fixed(prec, scale) =>
        new DecimalType(prec, scale)
      case _: sql.types.TimestampType => TimestampType.TIMESTAMP
      case _: sql.types.DateType => DateType.DATE
    }

  
  def sparkSQLObjectToJavaObject(value: Any): Object = {
    if (value == null) {
      return null
    }

    import scala.collection.JavaConversions._
    val result: java.lang.Object = value match {
      case v: java.lang.Boolean => v
      case v: java.lang.Byte => v
      case v: java.lang.Short => v
      case v: java.lang.Integer => v
      case v: java.lang.Long => v
      case v: java.lang.Float => v
      case v: java.lang.Double => v
      case v: java.lang.String => v
      case v: java.math.BigDecimal => v
      case v: java.sql.Date => v
      case v: java.sql.Timestamp => v
      case v: Array[Byte] =>
        val r: java.util.List[java.lang.Byte] = v.toList.map(b => java.lang.Byte.valueOf(b))
        r
      // TODO: to support following types
      //case v: scala.collection.Seq[_] =>
      //case v: scala.collection.Map[_, _] =>
      //case v: org.apache.spark.sql.Row   =>
      case _ =>
        throw new TiBatchWriteException(
          s"do not support converting SparkSQL Data Type ${value.getClass} to TiDB Data Type!")
    }
    result
  }
}
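A short usage sketch of the two conversions above; the sample values are illustrative.

import org.apache.spark.sql
import org.apache.spark.sql.execution.TiConverter

// Spark's LongType maps to TiDB's BIGINT.
val tiType = TiConverter.fromSparkType(sql.types.LongType)

// Boxed JVM primitives pass through unchanged; unsupported types throw TiBatchWriteException.
val javaValue = TiConverter.sparkSQLObjectToJavaObject(java.lang.Long.valueOf(42L))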