org.apache.spark.sql.execution.LogicalRDD Scala Examples
The following examples show how to use org.apache.spark.sql.execution.LogicalRDD.
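All of the examples below follow the same core pattern: build an RDD of Catalyst InternalRows, wrap it in a LogicalRDD plan node, and turn that plan into a DataFrame with Dataset.ofRows. Below is a minimal sketch of that pattern, distilled from Example 3; the object name, session, schema, and data are illustrative, the code is placed under the org.apache.spark.sql package because LogicalRDD and Dataset.ofRows are internal APIs, and RowEncoder(...).toRow follows the Spark 2.x-era API these projects use.

package org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object LogicalRDDPatternSketch {

  // Illustrative helper: wraps an RDD[Row] in a LogicalRDD and materializes it as a DataFrame.
  def toDataFrame(spark: SparkSession, rows: RDD[Row], schema: StructType): DataFrame = {
    // LogicalRDD expects Catalyst InternalRows, so encode the external Rows first.
    val encoder = RowEncoder(schema)
    val internalRows: RDD[InternalRow] = rows.map(encoder.toRow)

    // Wrap the RDD in a LogicalRDD leaf node and hand the plan to Dataset.ofRows.
    val plan = LogicalRDD(schema.toAttributes, internalRows)(spark)
    Dataset.ofRows(spark, plan)
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("logical-rdd-sketch").getOrCreate()
    val schema = StructType(Seq(
      StructField("name", StringType, nullable = true),
      StructField("age", IntegerType, nullable = true)))
    val rows = spark.sparkContext.parallelize(Seq(Row("alice", 34), Row("bob", 29)))

    toDataFrame(spark, rows, schema).show()
    spark.stop()
  }
}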
Example 1
Source File: RawCsvRDDToDataframe.scala, from the seahorse project (Apache License 2.0)
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{StructType, _}

object RawCsvRDDToDataframe {

  def parse(
      rdd: RDD[String],
      sparkSession: SparkSession,
      options: Map[String, String]): DataFrame = {
    val csvOptions = MapToCsvOptions(options, sparkSession.sessionState.conf)
    val csvReader = SparkCsvReader.create(csvOptions)
    val firstLine = findFirstLine(csvOptions, rdd)
    val firstRow = csvReader.parseLine(firstLine)

    val header = if (csvOptions.headerFlag) {
      firstRow.zipWithIndex.map { case (value, index) =>
        if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value
      }
    } else {
      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    }

    val parsedRdd = tokenRdd(rdd, header, csvOptions)

    // TODO Migrate to Spark's schema inferencer eventually
    // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions)
    val schema = {
      val schemaFields = header.map { fieldName =>
        StructField(fieldName.toString, StringType, nullable = true)
      }
      StructType(schemaFields)
    }

    val ignoreMalformedRows = 0
    val internalRows = parsedRdd.flatMap { row =>
      val parser = CSVRelation.csvParser(schema, header, csvOptions)
      parser(row, ignoreMalformedRows)
    }

    Dataset.ofRows(
      sparkSession,
      LogicalRDD(
        schema.toAttributes,
        internalRows)(sparkSession))
  }

  private def tokenRdd(
      rdd: RDD[String],
      header: Array[String],
      options: CSVOptions): RDD[Array[String]] = {
    // Make sure firstLine is materialized before sending to executors
    val firstLine = if (options.headerFlag) findFirstLine(options, rdd) else null
    SparkCsvReader.univocityTokenizer(rdd, header, firstLine, options)
  }

  private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = {
    if (options.isCommentSet) {
      val comment = options.comment.toString
      rdd.filter { line =>
        line.trim.nonEmpty && !line.startsWith(comment)
      }.first()
    } else {
      rdd.filter { line =>
        line.trim.nonEmpty
      }.first()
    }
  }
}
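A usage sketch for the parse method above. It assumes an active SparkSession and raw CSV lines already loaded as an RDD[String]; the option keys mirror Spark's CSV reader options, though the exact set honored by the seahorse-specific MapToCsvOptions shim is project-specific, and every resulting column is typed as a string.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.csv.RawCsvRDDToDataframe

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Raw CSV content as plain text lines; in practice this would come from sc.textFile(...).
val rawLines = spark.sparkContext.parallelize(Seq(
  "name,age",
  "alice,34",
  "bob,29"))

// "header" and "delimiter" are assumed to follow Spark's CSV option names.
val df = RawCsvRDDToDataframe.parse(rawLines, spark, Map("header" -> "true", "delimiter" -> ","))
df.show()  // two rows, columns "name" and "age", both read as strings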
Example 2
Source File: RawCsvRDDToDataframe.scala, from the seahorse project (Apache License 2.0)
package org.apache.spark.sql.execution.datasources.csv

import scala.util.Try

import com.univocity.parsers.csv.CsvParser

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{StructType, _}

object RawCsvRDDToDataframe {

  def parse(
      rdd: RDD[String],
      sparkSession: SparkSession,
      options: Map[String, String]): DataFrame = {
    val csvOptions = new CSVOptions(options, sparkSession.sessionState.conf.sessionLocalTimeZone)
    val csvReader = new CsvParser(csvOptions.asParserSettings)
    val firstLine = findFirstLine(csvOptions, rdd)
    val firstRow = csvReader.parseLine(firstLine)

    val header = if (csvOptions.headerFlag) {
      firstRow.zipWithIndex.map { case (value, index) =>
        if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value
      }
    } else {
      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    }

    // TODO Migrate to Spark's schema inferencer eventually
    // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions)
    val schema = {
      val schemaFields = header.map { fieldName =>
        StructField(fieldName.toString, StringType, nullable = true)
      }
      StructType(schemaFields)
    }

    val withoutHeader = if (csvOptions.headerFlag) {
      rdd.zipWithIndex()
        .filter { case (row, index) => index != 0 }
        .map { case (row, index) => row }
    } else {
      rdd
    }

    val internalRows = withoutHeader.filter(row => row.trim.nonEmpty)
      .flatMap { row =>
        val univocityParser = new UnivocityParser(schema, csvOptions)
        Try(univocityParser.parse(row)).toOption
      }

    Dataset.ofRows(
      sparkSession,
      LogicalRDD(
        schema.toAttributes,
        internalRows)(sparkSession))
  }

  private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = {
    if (options.isCommentSet) {
      val comment = options.comment.toString
      rdd.filter { line =>
        line.trim.nonEmpty && !line.startsWith(comment)
      }.first()
    } else {
      rdd.filter { line =>
        line.trim.nonEmpty
      }.first()
    }
  }
}
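Calling this variant looks the same as Example 1; the difference is internal: the header row is dropped with zipWithIndex, each remaining line is parsed by Spark's UnivocityParser, and lines that fail to parse are silently discarded via Try(...).toOption rather than reported. A short usage sketch under the same assumptions as before, with illustrative names:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.csv.RawCsvRDDToDataframe

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val rawLines = spark.sparkContext.parallelize(Seq(
  "name,age",
  "alice,34",
  "bob,29"))

// Rows that UnivocityParser cannot parse are dropped, not surfaced as errors.
val df = RawCsvRDDToDataframe.parse(rawLines, spark, Map("header" -> "true"))
df.show()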
Example 3
Source File: SqlUtils.scala, from the spark-acid project (Apache License 2.0)
package org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression}
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.types.StructType

object SqlUtils {

  def convertToDF(sparkSession: SparkSession, plan: LogicalPlan): DataFrame = {
    Dataset.ofRows(sparkSession, plan)
  }

  def resolveReferences(sparkSession: SparkSession,
                        expr: Expression,
                        planContaining: LogicalPlan,
                        failIfUnresolved: Boolean,
                        exprName: Option[String] = None): Expression = {
    resolveReferences(sparkSession, expr, Seq(planContaining), failIfUnresolved, exprName)
  }

  def resolveReferences(sparkSession: SparkSession,
                        expr: Expression,
                        planContaining: Seq[LogicalPlan],
                        failIfUnresolved: Boolean,
                        exprName: Option[String]): Expression = {
    val newPlan = FakeLogicalPlan(expr, planContaining)
    val resolvedExpr = sparkSession.sessionState.analyzer.execute(newPlan) match {
      case FakeLogicalPlan(resolvedExpr: Expression, _) =>
        // Return even if it did not successfully resolve
        resolvedExpr
      case _ =>
        // This is unexpected
        expr
    }
    if (failIfUnresolved) {
      resolvedExpr.flatMap(_.references).filter(!_.resolved).foreach { attr =>
        val failedMsg = exprName match {
          case Some(name) =>
            s"${attr.sql} resolution in $name given these columns: " +
              planContaining.flatMap(_.output).map(_.name).mkString(",")
          case _ =>
            s"${attr.sql} resolution failed given these columns: " +
              planContaining.flatMap(_.output).map(_.name).mkString(",")
        }
        attr.failAnalysis(failedMsg)
      }
    }
    resolvedExpr
  }

  def hasSparkStopped(sparkSession: SparkSession): Boolean = {
    sparkSession.sparkContext.stopped.get()
  }

  def createDataFrameUsingAttributes(sparkSession: SparkSession,
                                     rdd: RDD[Row],
                                     schema: StructType,
                                     attributes: Seq[Attribute]): DataFrame = {
    val encoder = RowEncoder(schema)
    val catalystRows = rdd.map(encoder.toRow)
    val logicalPlan = LogicalRDD(
      attributes,
      catalystRows,
      isStreaming = false)(sparkSession)
    Dataset.ofRows(sparkSession, logicalPlan)
  }

  def analysisException(cause: String): Throwable = {
    new AnalysisException(cause)
  }
}

case class FakeLogicalPlan(expr: Expression, children: Seq[LogicalPlan])
  extends LogicalPlan {
  override def output: Seq[Attribute] =
    children.foldLeft(Seq[Attribute]())((out, child) => out ++ child.output)
}
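Of these helpers, createDataFrameUsingAttributes is the one built on LogicalRDD: it rebuilds a DataFrame from an RDD[Row] while reusing a caller-supplied set of attribute references, so the new plan's output keeps the same expression IDs as an existing plan. A hedged usage sketch, assuming an existing DataFrame whose attributes should be preserved; all names are illustrative:

import org.apache.spark.sql.{SparkSession, SqlUtils}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// An existing DataFrame whose schema and resolved attributes we want to reuse.
val original = Seq(("alice", 34), ("bob", 29)).toDF("name", "age")
val attributes = original.queryExecution.analyzed.output

// Rebuild a DataFrame from a plain RDD[Row]; the result's columns carry the same
// expression IDs as `original`, which matters when splicing it into another plan.
val rebuilt = SqlUtils.createDataFrameUsingAttributes(
  spark, original.rdd, original.schema, attributes)
rebuilt.show()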
Example 4
Source File: RawCsvRDDToDataframe.scala, from the seahorse-workflow-executor project (Apache License 2.0)
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{StructType, _}

import io.deepsense.sparkutils.SparkSQLSession

object RawCsvRDDToDataframe {

  def parse(
      rdd: RDD[String],
      sparkSQLSession: SparkSQLSession,
      options: Map[String, String]): DataFrame = {
    val csvOptions = new CSVOptions(options)
    val lineCsvReader = new LineCsvReader(csvOptions)
    val firstLine = findFirstLine(csvOptions, rdd)
    val firstRow = lineCsvReader.parseLine(firstLine)

    val header = if (csvOptions.headerFlag) {
      firstRow.zipWithIndex.map { case (value, index) =>
        if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value
      }
    } else {
      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    }

    val parsedRdd = tokenRdd(rdd, csvOptions, header)

    // TODO Migrate to Spark's schema inferencer eventually
    // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions)
    val schema = {
      val schemaFields = header.map { fieldName =>
        StructField(fieldName.toString, StringType, nullable = true)
      }
      StructType(schemaFields)
    }

    val ignoreMalformedRows = 0
    val internalRows = parsedRdd.flatMap { row =>
      val parser = CSVRelation.csvParser(schema, header, csvOptions)
      parser(row, ignoreMalformedRows)
    }

    val sparkSession = sparkSQLSession.getSparkSession
    Dataset.ofRows(
      sparkSession,
      LogicalRDD(
        schema.toAttributes,
        internalRows)(sparkSession))
  }

  private def tokenRdd(
      rdd: RDD[String],
      options: CSVOptions,
      header: Array[String]): RDD[Array[String]] = {
    // Make sure firstLine is materialized before sending to executors
    val firstLine = if (options.headerFlag) findFirstLine(options, rdd) else null
    CSVRelation.univocityTokenizer(rdd, header, firstLine, options)
  }

  private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = {
    if (options.isCommentSet) {
      val comment = options.comment.toString
      rdd.filter { line =>
        line.trim.nonEmpty && !line.startsWith(comment)
      }.first()
    } else {
      rdd.filter { line =>
        line.trim.nonEmpty
      }.first()
    }
  }
}
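Usage mirrors Example 1, except the caller passes Seahorse's SparkSQLSession wrapper instead of a bare SparkSession. A hedged sketch, assuming a SparkSQLSession instance is already available from the workflow executor context; how that instance is constructed is specific to seahorse-workflow-executor, and the option names are assumed to follow Spark's CSV reader:

import io.deepsense.sparkutils.SparkSQLSession
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.execution.datasources.csv.RawCsvRDDToDataframe

// sparkSQLSession is assumed to come from the executor's execution context.
def buildDataFrame(sparkSQLSession: SparkSQLSession): DataFrame = {
  val rawLines = sparkSQLSession.getSparkSession.sparkContext.parallelize(Seq(
    "name,age",
    "alice,34",
    "bob,29"))
  RawCsvRDDToDataframe.parse(rawLines, sparkSQLSession, Map("header" -> "true"))
}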