org.apache.spark.sql.DataFrameWriter Scala Examples
The following examples show how to use org.apache.spark.sql.DataFrameWriter.
You can go to the original project or source file by following the links above each example.
Example 1
Source File: SolrDataFrameImplicits.scala From spark-solr with Apache License 2.0
package com.lucidworks.spark.util

import org.apache.spark.sql.{DataFrameReader, DataFrameWriter, Row, SaveMode}

object SolrDataFrameImplicits {

  implicit class SolrReader(reader: DataFrameReader) {
    def solr(collection: String, query: String = "*:*") =
      reader.format("solr").option("collection", collection).option("query", query).load()

    def solr(collection: String, options: Map[String, String]) =
      reader.format("solr").option("collection", collection).options(options).load()
  }

  implicit class SolrWriter(writer: DataFrameWriter[Row]) {
    def solr(collectionName: String, softCommitSecs: Int = 10, overwrite: Boolean = false, format: String = "solr") = {
      writer
        .format(format)
        .option("collection", collectionName)
        .option("soft_commit_secs", softCommitSecs.toString)
        .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append)
        .save()
    }
  }
}
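Usage sketch (not part of the original file): assuming a SparkSession named spark, the spark-solr data source on the classpath, and placeholder collection names and ZooKeeper address, the implicits above could be exercised roughly like this.

import org.apache.spark.sql.SparkSession
import com.lucidworks.spark.util.SolrDataFrameImplicits._

val spark = SparkSession.builder().appName("solr-example").getOrCreate()

// Read from Solr via the implicit DataFrameReader extension ("zkhost" value is a placeholder).
val booksDF = spark.read.option("zkhost", "localhost:9983").solr("books")

// Write back via the implicit DataFrameWriter extension, overwriting the target collection.
booksDF.write.option("zkhost", "localhost:9983").solr("books_copy", softCommitSecs = 5, overwrite = true)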
Example 2
Source File: SparkExtension.scala From azure-kusto-spark with Apache License 2.0
package com.microsoft.kusto.spark.sql.extension

import com.microsoft.azure.kusto.data.ClientRequestProperties
import com.microsoft.kusto.spark.datasink.{KustoSinkOptions, SparkIngestionProperties}
import com.microsoft.kusto.spark.datasource.KustoSourceOptions
import org.apache.spark.sql.streaming.DataStreamWriter
import org.apache.spark.sql.{DataFrameWriter, _}

object SparkExtension {

  implicit class DataFrameReaderExtension(df: DataFrameReader) {
    def kusto(kustoCluster: String, database: String, query: String,
              conf: Map[String, String] = Map.empty[String, String],
              cpr: Option[ClientRequestProperties] = None): DataFrame = {
      if (cpr.isDefined) {
        df.option(KustoSourceOptions.KUSTO_CLIENT_REQUEST_PROPERTIES_JSON, cpr.get.toString)
      }

      df.format("com.microsoft.kusto.spark.datasource")
        .option(KustoSourceOptions.KUSTO_CLUSTER, kustoCluster)
        .option(KustoSourceOptions.KUSTO_DATABASE, database)
        .option(KustoSourceOptions.KUSTO_QUERY, query)
        .options(conf)
        .load()
    }
  }

  implicit class DataFrameWriterExtension(df: DataFrameWriter[Row]) {
    def kusto(kustoCluster: String, database: String, table: String,
              conf: Map[String, String] = Map.empty[String, String],
              sparkIngestionProperties: Option[SparkIngestionProperties] = None): Unit = {
      if (sparkIngestionProperties.isDefined) {
        df.option(KustoSinkOptions.KUSTO_SPARK_INGESTION_PROPERTIES_JSON, sparkIngestionProperties.get.toString)
      }

      df.format("com.microsoft.kusto.spark.datasource")
        .option(KustoSinkOptions.KUSTO_CLUSTER, kustoCluster)
        .option(KustoSinkOptions.KUSTO_DATABASE, database)
        .option(KustoSinkOptions.KUSTO_TABLE, table)
        .options(conf)
        .mode(SaveMode.Append)
        .save()
    }
  }

  implicit class DataStreamWriterExtension(df: DataStreamWriter[Row]) {
    def kusto(kustoCluster: String, database: String, table: String,
              conf: Map[String, String] = Map.empty[String, String],
              sparkIngestionProperties: Option[SparkIngestionProperties] = None): Unit = {
      if (sparkIngestionProperties.isDefined) {
        df.option(KustoSinkOptions.KUSTO_SPARK_INGESTION_PROPERTIES_JSON, sparkIngestionProperties.get.toString)
      }

      df.format("com.microsoft.kusto.spark.datasource")
        .option(KustoSinkOptions.KUSTO_CLUSTER, kustoCluster)
        .option(KustoSinkOptions.KUSTO_DATABASE, database)
        .option(KustoSinkOptions.KUSTO_TABLE, table)
        .options(conf)
    }
  }
}
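Usage sketch (not part of the original file): a minimal call to the batch writer extension, assuming the azure-kusto-spark connector is on the classpath. The cluster, database, table, and input path are placeholders, and the required authentication options (supplied through conf) are omitted here.

import org.apache.spark.sql.SparkSession
import com.microsoft.kusto.spark.sql.extension.SparkExtension._

val spark = SparkSession.builder().appName("kusto-example").getOrCreate()
val events = spark.read.parquet("/data/events") // hypothetical input path

// Append the DataFrame to a Kusto table; authentication options would normally be passed via `conf`.
events.write.kusto(
  kustoCluster = "mycluster.westeurope", // placeholder cluster
  database = "mydb",                     // placeholder database
  table = "Events"                       // placeholder table
)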
Example 3
Source File: Output.scala From sparta with Apache License 2.0
package com.stratio.sparta.sdk.pipeline.output

import java.io.{Serializable => JSerializable}

import akka.event.slf4j.SLF4JLogging
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import com.stratio.sparta.sdk.properties.{CustomProperties, Parameterizable}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row, SaveMode}

abstract class Output(val name: String, properties: Map[String, JSerializable])
  extends Parameterizable(properties) with SLF4JLogging with CustomProperties {

  val customKey = "saveOptions"
  val customPropertyKey = "saveOptionsKey"
  val customPropertyValue = "saveOptionsValue"
  val propertiesWithCustom = properties ++ getCustomProperties

  def setUp(options: Map[String, String] = Map.empty[String, String]): Unit = {}

  def cleanUp(options: Map[String, String] = Map.empty[String, String]): Unit = {}

  def save(dataFrame: DataFrame, saveMode: SaveModeEnum.Value, options: Map[String, String]): Unit

  def supportedSaveModes: Seq[SaveModeEnum.Value] = SaveModeEnum.allSaveModes

  def validateSaveMode(saveMode: SaveModeEnum.Value): Unit = {
    if (!supportedSaveModes.contains(saveMode))
      log.info(s"Save mode $saveMode selected not supported by the output $name." +
        s" Using the default mode ${SaveModeEnum.Append}")
  }
}

object Output extends SLF4JLogging {

  final val ClassSuffix = "Output"
  final val SparkConfigurationMethod = "getSparkConfiguration"
  final val Separator = "_"
  final val FieldsSeparator = ","
  final val PrimaryKey = "primaryKey"
  final val TableNameKey = "tableName"
  final val PartitionByKey = "partitionBy"
  final val TimeDimensionKey = "timeDimension"
  final val MeasureMetadataKey = "measure"
  final val PrimaryKeyMetadataKey = "pk"

  def getSparkSaveMode(saveModeEnum: SaveModeEnum.Value): SaveMode =
    saveModeEnum match {
      case SaveModeEnum.Append => SaveMode.Append
      case SaveModeEnum.ErrorIfExists => SaveMode.ErrorIfExists
      case SaveModeEnum.Overwrite => SaveMode.Overwrite
      case SaveModeEnum.Ignore => SaveMode.Ignore
      case SaveModeEnum.Upsert => SaveMode.Append
      case _ =>
        log.warn(s"Save Mode $saveModeEnum not supported, using default save mode ${SaveModeEnum.Append}")
        SaveMode.Append
    }

  def getTimeFromOptions(options: Map[String, String]): Option[String] =
    options.get(TimeDimensionKey).notBlank

  def getPrimaryKeyOptions(options: Map[String, String]): Option[String] =
    options.get(PrimaryKey).notBlank

  def getTableNameFromOptions(options: Map[String, String]): String =
    options.getOrElse(TableNameKey, {
      log.error("Table name not defined")
      throw new NoSuchElementException("tableName not found in options")
    })

  def applyPartitionBy(options: Map[String, String],
                       dataFrame: DataFrameWriter[Row],
                       schemaFields: Array[StructField]): DataFrameWriter[Row] = {
    options.get(PartitionByKey).notBlank.fold(dataFrame)(partitions => {
      val fieldsInDataFrame = schemaFields.map(field => field.name)
      val partitionFields = partitions.split(",")
      if (partitionFields.forall(field => fieldsInDataFrame.contains(field)))
        dataFrame.partitionBy(partitionFields: _*)
      else {
        log.warn(s"Impossible to execute partition by fields: $partitionFields because the dataFrame not contain all" +
          s" fields. The dataFrame only contains: ${fieldsInDataFrame.mkString(",")}")
        dataFrame
      }
    })
  }

  def defaultTimeStampField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, TimestampType, nullable, metadata)

  def defaultDateField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, DateType, nullable, metadata)

  def defaultStringField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, StringType, nullable, metadata)

  def defaultGeoField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, ArrayType(DoubleType), nullable, metadata)

  def defaultLongField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, LongType, nullable, metadata)
}
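Usage sketch (not part of the original file): a hypothetical helper showing how Output.applyPartitionBy plugs a partition-by option into a DataFrameWriter before saving. The DataFrame, option values, and output path are illustrative.

import org.apache.spark.sql.{DataFrame, SaveMode}
import com.stratio.sparta.sdk.pipeline.output.Output

// Hypothetical write helper; "partitionBy" must name columns that exist in the DataFrame schema.
def writePartitioned(dataFrame: DataFrame, path: String): Unit = {
  val options = Map("tableName" -> "events", "partitionBy" -> "year,month")

  // applyPartitionBy only adds partitioning when every listed field is present in the schema.
  val writer = Output.applyPartitionBy(
    options,
    dataFrame.write.mode(SaveMode.Append),
    dataFrame.schema.fields)

  writer.parquet(path)
}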
Example 4
Source File: ALSModeling.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package com.spark.recommendation

import java.util

import com.spark.recommendation.FeatureExtraction.{Rating, parseRating}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.{Row, DataFrame, DataFrameWriter}

object ALSModeling {

  def createALSModel() {
    val ratings = FeatureExtraction.getFeatures();
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))
    println(training.first())

    // Build the recommendation model using ALS on the training data
    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")

    val model = als.fit(training)
    println(model.userFactors.count())
    println(model.itemFactors.count())

    val predictions = model.transform(test)
    println(predictions.printSchema())

    val evaluator = new RegressionEvaluator()
      .setMetricName("rmse")
      .setLabelCol("rating")
      .setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions)

    println(s"Root-mean-square error = $rmse")
  }

  def main(args: Array[String]) {
    createALSModel()
  }
}
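Note that this file imports DataFrameWriter but never uses it directly. A minimal sketch (not part of the original file) of persisting the ALS predictions through a DataFrameWriter could look like this; the output path is a placeholder.

import org.apache.spark.sql.{DataFrame, SaveMode}

// Hypothetical helper: write the predictions partitioned by user, overwriting previous runs.
def savePredictions(predictions: DataFrame, outputPath: String): Unit = {
  predictions.write            // DataFrame.write returns a DataFrameWriter[Row]
    .mode(SaveMode.Overwrite)
    .partitionBy("userId")
    .parquet(outputPath)
}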
Example 5
Source File: CsvOptions.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv

import org.apache.spark.sql.{DataFrameReader, Row, DataFrameWriter}

import ai.deepsense.deeplang.doperations.inout.CsvParameters
import ai.deepsense.deeplang.doperations.inout.CsvParameters.ColumnSeparatorChoice

object CsvOptions {

  def map(
      namesIncluded: Boolean,
      columnSeparator: ColumnSeparatorChoice): Map[String, String] = {
    val headerFlag = if (namesIncluded) "true" else "false"
    Map(
      "header" -> headerFlag,
      "delimiter" -> CsvParameters.determineColumnSeparatorOf(columnSeparator).toString,
      "inferSchema" -> "false"
    )
  }

  // Unfortunately, making analogous RichDataFrameWriter is awkward, if not impossible.
  // This is because between Spark 1.6 and 2.0 DataFrameWriter became parametrized
  implicit class RichDataFrameReader(self: DataFrameReader) {
    def setCsvOptions(
        namesIncluded: Boolean,
        columnSeparator: ColumnSeparatorChoice): DataFrameReader = {
      val paramMap = map(namesIncluded, columnSeparator)
      paramMap.foldLeft(self) { case (reader, (key, value)) => reader.option(key, value) }
    }
  }
}
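As the comment above notes, the original project avoided a writer-side extension because DataFrameWriter became parameterized between Spark 1.6 and 2.0. Under Spark 2.x only, a sketch (not part of the original file) reusing the same option map on a DataFrameWriter could look like this; the DataFrame, separator choice, and path are illustrative.

import org.apache.spark.sql.DataFrame
import ai.deepsense.deeplang.doperations.inout.CsvParameters.ColumnSeparatorChoice
import ai.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv.CsvOptions

// Hypothetical write helper: apply the shared CSV option map to the writer side.
def writeCsv(df: DataFrame, separator: ColumnSeparatorChoice, path: String): Unit =
  df.write
    .options(CsvOptions.map(namesIncluded = true, columnSeparator = separator))
    .csv(path)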
Example 6
Source File: package.scala From spark-iqmulus with Apache License 2.0
package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame, Row }
import org.apache.spark.sql.types.StructType

package object ply {

  implicit class PlyDataFrameReader(reader: DataFrameReader) {
    def ply: String => DataFrame = reader.format("fr.ign.spark.iqmulus.ply").load
  }

  implicit class PlyDataFrame(df: DataFrame) {
    def saveAsPly(location: String, littleEndian: Boolean = true) = {
      val df_id = df.drop("pid").drop("fid")
      val schema = df_id.schema
      val saver = (key: Int, iter: Iterator[Row]) =>
        Iterator(iter.saveAsPly(s"$location/$key.ply", schema, littleEndian))
      df_id.rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class PlyRowIterator(iter: Iterator[Row]) {
    def saveAsPly(
      filename: String,
      schema: StructType,
      littleEndian: Boolean
    ) = {
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val rows = iter.toArray
      val count = rows.size.toLong
      val header = new PlyHeader(filename, littleEndian, Map("vertex" -> ((count, schema))))
      val dos = new java.io.DataOutputStream(f)
      dos.write(header.toString.getBytes)
      val ros = new RowOutputStream(dos, littleEndian, schema)
      rows.foreach(ros.write)
      dos.close
      header
    }
  }
}
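Usage sketch (not part of the original file): assuming the spark-iqmulus data source is on the classpath and a SparkSession named spark; the paths are placeholders.

import org.apache.spark.sql.SparkSession
import fr.ign.spark.iqmulus.ply._

val spark = SparkSession.builder().appName("ply-example").getOrCreate()

// Read a PLY point cloud through the implicit reader extension ...
val cloud = spark.read.ply("/data/cloud.ply")

// ... and write one PLY file per partition under the target directory.
cloud.saveAsPly("/data/out", littleEndian = true)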
Example 7
Source File: package.scala From spark-iqmulus with Apache License 2.0
package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame, Row }
import org.apache.spark.sql.types.{ FloatType, StructType }

package object xyz {

  implicit class XyzDataFrameReader(reader: DataFrameReader) {
    def xyz: String => DataFrame = reader.format("fr.ign.spark.iqmulus.xyz").load
  }

  implicit class XyzDataFrame(df: DataFrame) {
    def saveAsXyz(location: String) = {
      val df_id = df.drop("id")
      require(df_id.schema.fieldNames.take(3) sameElements Array("x", "y", "z"))
      require(df_id.schema.fields.map(_.dataType).take(3).forall(_ == FloatType))
      val saver = (key: Int, iter: Iterator[Row]) => Iterator(iter.saveXyz(s"$location/$key.xyz"))
      df_id.rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class XyzRowIterator(iter: Iterator[Row]) {
    def saveXyz(filename: String) = {
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val dos = new java.io.DataOutputStream(f)
      var count = 0L
      iter.foreach(row => { count += 1; dos.writeBytes(row.mkString("", "\t", "\n")) })
      dos.close
      (filename, count)
    }
  }
}
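Usage sketch (not part of the original file): the paths are placeholders, and the input DataFrame is expected to start with float columns x, y, z as the requires above enforce.

import org.apache.spark.sql.SparkSession
import fr.ign.spark.iqmulus.xyz._

val spark = SparkSession.builder().appName("xyz-example").getOrCreate()

// Read an XYZ file through the implicit reader extension.
val points = spark.read.xyz("/data/points.xyz")

// Write one .xyz file per partition under the target directory.
points.saveAsXyz("/data/points_out")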
Example 8
Source File: package.scala From spark-iqmulus with Apache License 2.0
package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame }
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.Row

package object las {

  implicit class LasDataFrameReader(reader: DataFrameReader) {
    def las: String => DataFrame = reader.format("fr.ign.spark.iqmulus.las").load
  }

  implicit class LasDataFrame(df: DataFrame) {
    def saveAsLas(
      location: String,
      formatOpt: Option[Byte] = None,
      version: Version = Version(),
      scale: Array[Double] = Array(0.01, 0.01, 0.01),
      offset: Array[Double] = Array(0, 0, 0)
    ) = {
      val format = formatOpt.getOrElse(LasHeader.formatFromSchema(df.schema))
      val schema = LasHeader.schema(format) // no user types for now
      val cols = schema.fieldNames.intersect(df.schema.fieldNames)
      val saver = (key: Int, iter: Iterator[Row]) =>
        Iterator(iter.saveAsLas(s"$location/$key.las", schema, format, scale, offset, version))
      df.select(cols.head, cols.tail: _*).rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class LasRowIterator(iter: Iterator[Row]) {
    def saveAsLas(
      filename: String, schema: StructType, format: Byte,
      scale: Array[Double], offset: Array[Double], version: Version = Version()
    ) = {
      // materialize the partition to access it in a single pass, TODO workaround that
      val rows = iter.toArray
      val count = rows.length.toLong
      val pmin = Array.fill[Double](3)(Double.PositiveInfinity)
      val pmax = Array.fill[Double](3)(Double.NegativeInfinity)
      val countByReturn = Array.fill[Long](15)(0)
      rows.foreach { row =>
        val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble
        val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble
        val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble
        val ret = row.getAs[Byte]("flags") & 0x3
        countByReturn(ret) += 1
        pmin(0) = Math.min(pmin(0), x)
        pmin(1) = Math.min(pmin(1), y)
        pmin(2) = Math.min(pmin(2), z)
        pmax(0) = Math.max(pmax(0), x)
        pmax(1) = Math.max(pmax(1), y)
        pmax(2) = Math.max(pmax(2), z)
      }
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val header = new LasHeader(filename, format, count, pmin, pmax, scale, offset,
        version = version, pdr_return_nb = countByReturn)
      val dos = new java.io.DataOutputStream(f)
      header.write(dos)
      val ros = new RowOutputStream(dos, littleEndian = true, schema)
      rows.foreach(ros.write)
      dos.close
      header
    }
  }
}
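Usage sketch (not part of the original file): paths are placeholders; the LAS point format is inferred from the DataFrame schema unless overridden via formatOpt.

import org.apache.spark.sql.SparkSession
import fr.ign.spark.iqmulus.las._

val spark = SparkSession.builder().appName("las-example").getOrCreate()

// Read a LAS point cloud through the implicit reader extension.
val lidar = spark.read.las("/data/tile.las")

// Write one .las file per partition, letting the point format be inferred from the schema.
lidar.saveAsLas("/data/tile_out")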
Example 9
Source File: CsvOptions.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv

import org.apache.spark.sql.{DataFrameReader, Row, DataFrameWriter}

import io.deepsense.deeplang.doperations.inout.CsvParameters
import io.deepsense.deeplang.doperations.inout.CsvParameters.ColumnSeparatorChoice

object CsvOptions {

  def map(
      namesIncluded: Boolean,
      columnSeparator: ColumnSeparatorChoice): Map[String, String] = {
    val headerFlag = if (namesIncluded) "true" else "false"
    Map(
      "header" -> headerFlag,
      "delimiter" -> CsvParameters.determineColumnSeparatorOf(columnSeparator).toString,
      "inferSchema" -> "false"
    )
  }

  // Unfortunately, making analogous RichDataFrameWriter is awkward, if not impossible.
  // This is because between Spark 1.6 and 2.0 DataFrameWriter became parametrized
  implicit class RichDataFrameReader(self: DataFrameReader) {
    def setCsvOptions(
        namesIncluded: Boolean,
        columnSeparator: ColumnSeparatorChoice): DataFrameReader = {
      val paramMap = map(namesIncluded, columnSeparator)
      paramMap.foldLeft(self) { case (reader, (key, value)) => reader.option(key, value) }
    }
  }
}
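Usage sketch (not part of the original file): a read-side example of the implicit extension above; the SparkSession, separator choice, and path are supplied by the caller and are illustrative.

import org.apache.spark.sql.{DataFrame, SparkSession}
import io.deepsense.deeplang.doperations.inout.CsvParameters.ColumnSeparatorChoice
import io.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv.CsvOptions._

// Hypothetical read helper: setCsvOptions folds the header/delimiter/inferSchema options onto the reader.
def readCsv(spark: SparkSession, separator: ColumnSeparatorChoice, path: String): DataFrame =
  spark.read
    .setCsvOptions(namesIncluded = true, columnSeparator = separator)
    .csv(path)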