org.apache.spark.sql.execution.LogicalRDD Scala Examples
The following examples show how to use org.apache.spark.sql.execution.LogicalRDD.
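All of the examples below follow the same core pattern: build an RDD of Catalyst InternalRows, wrap it in a LogicalRDD plan node, and turn that plan into a DataFrame with Dataset.ofRows. Below is a minimal sketch of that pattern, distilled from Example 3; the object name, session, schema, and data are illustrative, the code is placed under the org.apache.spark.sql package because LogicalRDD and Dataset.ofRows are internal APIs, and RowEncoder(...).toRow follows the Spark 2.x-era API these projects use.

package org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object LogicalRDDPatternSketch {

  // Illustrative helper: wraps an RDD[Row] in a LogicalRDD and materializes it as a DataFrame.
  def toDataFrame(spark: SparkSession, rows: RDD[Row], schema: StructType): DataFrame = {
    // LogicalRDD expects Catalyst InternalRows, so encode the external Rows first.
    val encoder = RowEncoder(schema)
    val internalRows: RDD[InternalRow] = rows.map(encoder.toRow)

    // Wrap the RDD in a LogicalRDD leaf node and hand the plan to Dataset.ofRows.
    val plan = LogicalRDD(schema.toAttributes, internalRows)(spark)
    Dataset.ofRows(spark, plan)
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("logical-rdd-sketch").getOrCreate()
    val schema = StructType(Seq(
      StructField("name", StringType, nullable = true),
      StructField("age", IntegerType, nullable = true)))
    val rows = spark.sparkContext.parallelize(Seq(Row("alice", 34), Row("bob", 29)))

    toDataFrame(spark, rows, schema).show()
    spark.stop()
  }
}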
Example 1
Source File: RawCsvRDDToDataframe.scala, from the seahorse project (Apache License 2.0)
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{StructType, _}

object RawCsvRDDToDataframe {

  def parse(
      rdd: RDD[String],
      sparkSession: SparkSession,
      options: Map[String, String]): DataFrame = {
    val csvOptions = MapToCsvOptions(options, sparkSession.sessionState.conf)
    val csvReader = SparkCsvReader.create(csvOptions)
    val firstLine = findFirstLine(csvOptions, rdd)
    val firstRow = csvReader.parseLine(firstLine)

    val header = if (csvOptions.headerFlag) {
      firstRow.zipWithIndex.map { case (value, index) =>
        if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value
      }
    } else {
      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    }

    val parsedRdd = tokenRdd(rdd, header, csvOptions)

    // TODO Migrate to Spark's schema inferencer eventually
    // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions)
    val schema = {
      val schemaFields = header.map { fieldName =>
        StructField(fieldName.toString, StringType, nullable = true)
      }
      StructType(schemaFields)
    }

    val ignoreMalformedRows = 0
    val internalRows = parsedRdd.flatMap { row =>
      val parser = CSVRelation.csvParser(schema, header, csvOptions)
      parser(row, ignoreMalformedRows)
    }

    Dataset.ofRows(
      sparkSession,
      LogicalRDD(
        schema.toAttributes,
        internalRows)(sparkSession))
  }

  private def tokenRdd(
      rdd: RDD[String],
      header: Array[String],
      options: CSVOptions): RDD[Array[String]] = {
    // Make sure firstLine is materialized before sending to executors
    val firstLine = if (options.headerFlag) findFirstLine(options, rdd) else null
    SparkCsvReader.univocityTokenizer(rdd, header, firstLine, options)
  }

  private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = {
    if (options.isCommentSet) {
      val comment = options.comment.toString
      rdd.filter { line =>
        line.trim.nonEmpty && !line.startsWith(comment)
      }.first()
    } else {
      rdd.filter { line =>
        line.trim.nonEmpty
      }.first()
    }
  }
}
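A usage sketch for the parse method above. It assumes an active SparkSession and raw CSV lines already loaded as an RDD[String]; the option keys mirror Spark's CSV reader options, though the exact set honored by the seahorse-specific MapToCsvOptions shim is project-specific, and every resulting column is typed as a string.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.csv.RawCsvRDDToDataframe

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Raw CSV content as plain text lines; in practice this would come from sc.textFile(...).
val rawLines = spark.sparkContext.parallelize(Seq(
  "name,age",
  "alice,34",
  "bob,29"))

// "header" and "delimiter" are assumed to follow Spark's CSV option names.
val df = RawCsvRDDToDataframe.parse(rawLines, spark, Map("header" -> "true", "delimiter" -> ","))
df.show()  // two rows, columns "name" and "age", both read as strings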
Example 2
Source File: RawCsvRDDToDataframe.scala, from the seahorse project (Apache License 2.0)
package org.apache.spark.sql.execution.datasources.csv

import scala.util.Try

import com.univocity.parsers.csv.CsvParser

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{StructType, _}

object RawCsvRDDToDataframe {

  def parse(
      rdd: RDD[String],
      sparkSession: SparkSession,
      options: Map[String, String]): DataFrame = {
    val csvOptions = new CSVOptions(options, sparkSession.sessionState.conf.sessionLocalTimeZone)
    val csvReader = new CsvParser(csvOptions.asParserSettings)
    val firstLine = findFirstLine(csvOptions, rdd)
    val firstRow = csvReader.parseLine(firstLine)

    val header = if (csvOptions.headerFlag) {
      firstRow.zipWithIndex.map { case (value, index) =>
        if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value
      }
    } else {
      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    }

    // TODO Migrate to Spark's schema inferencer eventually
    // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions)
    val schema = {
      val schemaFields = header.map { fieldName =>
        StructField(fieldName.toString, StringType, nullable = true)
      }
      StructType(schemaFields)
    }

    val withoutHeader = if (csvOptions.headerFlag) {
      rdd.zipWithIndex()
        .filter { case (row, index) => index != 0 }
        .map { case (row, index) => row }
    } else {
      rdd
    }

    val internalRows = withoutHeader.filter(row => row.trim.nonEmpty)
      .flatMap { row =>
        val univocityParser = new UnivocityParser(schema, csvOptions)
        Try(univocityParser.parse(row)).toOption
      }

    Dataset.ofRows(
      sparkSession,
      LogicalRDD(
        schema.toAttributes,
        internalRows)(sparkSession))
  }

  private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = {
    if (options.isCommentSet) {
      val comment = options.comment.toString
      rdd.filter { line =>
        line.trim.nonEmpty && !line.startsWith(comment)
      }.first()
    } else {
      rdd.filter { line =>
        line.trim.nonEmpty
      }.first()
    }
  }
}
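Calling this variant looks the same as Example 1; the difference is internal: the header row is dropped with zipWithIndex, each remaining line is parsed by Spark's UnivocityParser, and lines that fail to parse are silently discarded via Try(...).toOption rather than reported. A short usage sketch under the same assumptions as before, with illustrative names:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.csv.RawCsvRDDToDataframe

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val rawLines = spark.sparkContext.parallelize(Seq(
  "name,age",
  "alice,34",
  "bob,29"))

// Rows that UnivocityParser cannot parse are dropped, not surfaced as errors.
val df = RawCsvRDDToDataframe.parse(rawLines, spark, Map("header" -> "true"))
df.show()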
Example 3
Source File: SqlUtils.scala, from the spark-acid project (Apache License 2.0)
package org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression}
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.types.StructType

object SqlUtils {

  def convertToDF(sparkSession: SparkSession, plan: LogicalPlan): DataFrame = {
    Dataset.ofRows(sparkSession, plan)
  }

  def resolveReferences(sparkSession: SparkSession,
                        expr: Expression,
                        planContaining: LogicalPlan,
                        failIfUnresolved: Boolean,
                        exprName: Option[String] = None): Expression = {
    resolveReferences(sparkSession, expr, Seq(planContaining), failIfUnresolved, exprName)
  }

  def resolveReferences(sparkSession: SparkSession,
                        expr: Expression,
                        planContaining: Seq[LogicalPlan],
                        failIfUnresolved: Boolean,
                        exprName: Option[String]): Expression = {
    val newPlan = FakeLogicalPlan(expr, planContaining)
    val resolvedExpr = sparkSession.sessionState.analyzer.execute(newPlan) match {
      case FakeLogicalPlan(resolvedExpr: Expression, _) =>
        // Return even if it did not successfully resolve
        resolvedExpr
      case _ =>
        // This is unexpected
        expr
    }
    if (failIfUnresolved) {
      resolvedExpr.flatMap(_.references).filter(!_.resolved).foreach { attr =>
        val failedMsg = exprName match {
          case Some(name) =>
            s"${attr.sql} resolution in $name given these columns: " +
              planContaining.flatMap(_.output).map(_.name).mkString(",")
          case _ =>
            s"${attr.sql} resolution failed given these columns: " +
              planContaining.flatMap(_.output).map(_.name).mkString(",")
        }
        attr.failAnalysis(failedMsg)
      }
    }
    resolvedExpr
  }

  def hasSparkStopped(sparkSession: SparkSession): Boolean = {
    sparkSession.sparkContext.stopped.get()
  }

  def createDataFrameUsingAttributes(sparkSession: SparkSession,
                                     rdd: RDD[Row],
                                     schema: StructType,
                                     attributes: Seq[Attribute]): DataFrame = {
    val encoder = RowEncoder(schema)
    val catalystRows = rdd.map(encoder.toRow)
    val logicalPlan = LogicalRDD(
      attributes,
      catalystRows,
      isStreaming = false)(sparkSession)
    Dataset.ofRows(sparkSession, logicalPlan)
  }

  def analysisException(cause: String): Throwable = {
    new AnalysisException(cause)
  }
}

case class FakeLogicalPlan(expr: Expression, children: Seq[LogicalPlan])
  extends LogicalPlan {
  override def output: Seq[Attribute] =
    children.foldLeft(Seq[Attribute]())((out, child) => out ++ child.output)
}
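Of these helpers, createDataFrameUsingAttributes is the one built on LogicalRDD: it rebuilds a DataFrame from an RDD[Row] while reusing a caller-supplied set of attribute references, so the new plan's output keeps the same expression IDs as an existing plan. A hedged usage sketch, assuming an existing DataFrame whose attributes should be preserved; all names are illustrative:

import org.apache.spark.sql.{SparkSession, SqlUtils}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// An existing DataFrame whose schema and resolved attributes we want to reuse.
val original = Seq(("alice", 34), ("bob", 29)).toDF("name", "age")
val attributes = original.queryExecution.analyzed.output

// Rebuild a DataFrame from a plain RDD[Row]; the result's columns carry the same
// expression IDs as `original`, which matters when splicing it into another plan.
val rebuilt = SqlUtils.createDataFrameUsingAttributes(
  spark, original.rdd, original.schema, attributes)
rebuilt.show()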
Example 4
Source File: RawCsvRDDToDataframe.scala, from the seahorse-workflow-executor project (Apache License 2.0)
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{StructType, _}

import io.deepsense.sparkutils.SparkSQLSession

object RawCsvRDDToDataframe {

  def parse(
      rdd: RDD[String],
      sparkSQLSession: SparkSQLSession,
      options: Map[String, String]): DataFrame = {
    val csvOptions = new CSVOptions(options)
    val lineCsvReader = new LineCsvReader(csvOptions)
    val firstLine = findFirstLine(csvOptions, rdd)
    val firstRow = lineCsvReader.parseLine(firstLine)

    val header = if (csvOptions.headerFlag) {
      firstRow.zipWithIndex.map { case (value, index) =>
        if (value == null || value.isEmpty || value == csvOptions.nullValue) s"_c$index" else value
      }
    } else {
      firstRow.zipWithIndex.map { case (value, index) => s"_c$index" }
    }

    val parsedRdd = tokenRdd(rdd, csvOptions, header)

    // TODO Migrate to Spark's schema inferencer eventually
    // val schema = CSVInferSchema.infer(parsedRdd, header, csvOptions)
    val schema = {
      val schemaFields = header.map { fieldName =>
        StructField(fieldName.toString, StringType, nullable = true)
      }
      StructType(schemaFields)
    }

    val ignoreMalformedRows = 0
    val internalRows = parsedRdd.flatMap { row =>
      val parser = CSVRelation.csvParser(schema, header, csvOptions)
      parser(row, ignoreMalformedRows)
    }

    val sparkSession = sparkSQLSession.getSparkSession
    Dataset.ofRows(
      sparkSession,
      LogicalRDD(
        schema.toAttributes,
        internalRows)(sparkSession))
  }

  private def tokenRdd(
      rdd: RDD[String],
      options: CSVOptions,
      header: Array[String]): RDD[Array[String]] = {
    // Make sure firstLine is materialized before sending to executors
    val firstLine = if (options.headerFlag) findFirstLine(options, rdd) else null
    CSVRelation.univocityTokenizer(rdd, header, firstLine, options)
  }

  private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = {
    if (options.isCommentSet) {
      val comment = options.comment.toString
      rdd.filter { line =>
        line.trim.nonEmpty && !line.startsWith(comment)
      }.first()
    } else {
      rdd.filter { line =>
        line.trim.nonEmpty
      }.first()
    }
  }
}
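Usage mirrors Example 1, except the caller passes Seahorse's SparkSQLSession wrapper instead of a bare SparkSession. A hedged sketch, assuming a SparkSQLSession instance is already available from the workflow executor context; how that instance is constructed is specific to seahorse-workflow-executor, and the option names are assumed to follow Spark's CSV reader:

import io.deepsense.sparkutils.SparkSQLSession
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.execution.datasources.csv.RawCsvRDDToDataframe

// sparkSQLSession is assumed to come from the executor's execution context.
def buildDataFrame(sparkSQLSession: SparkSQLSession): DataFrame = {
  val rawLines = sparkSQLSession.getSparkSession.sparkContext.parallelize(Seq(
    "name,age",
    "alice,34",
    "bob,29"))
  RawCsvRDDToDataframe.parse(rawLines, sparkSQLSession, Map("header" -> "true"))
}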