org.apache.spark.sql.DataFrameReader Scala Examples
The following examples show how to use org.apache.spark.sql.DataFrameReader.
Example 1
Source File: SolrDataFrameImplicits.scala From spark-solr with Apache License 2.0

package com.lucidworks.spark.util

import org.apache.spark.sql.{DataFrameReader, DataFrameWriter, Row, SaveMode}

object SolrDataFrameImplicits {

  implicit class SolrReader(reader: DataFrameReader) {
    def solr(collection: String, query: String = "*:*") =
      reader.format("solr").option("collection", collection).option("query", query).load()
    def solr(collection: String, options: Map[String, String]) =
      reader.format("solr").option("collection", collection).options(options).load()
  }

  implicit class SolrWriter(writer: DataFrameWriter[Row]) {
    def solr(collectionName: String, softCommitSecs: Int = 10, overwrite: Boolean = false, format: String = "solr") = {
      writer
        .format(format)
        .option("collection", collectionName)
        .option("soft_commit_secs", softCommitSecs.toString)
        .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append)
        .save()
    }
  }
}
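For context, a minimal usage sketch of these implicits, assuming a SparkSession named spark, an existing DataFrame df, and a placeholder collection name:

import com.lucidworks.spark.util.SolrDataFrameImplicits._

// Read a Solr collection (the collection name is a placeholder).
val docs = spark.read.solr("techproducts")

// Append df to the same collection, soft-committing every 5 seconds.
df.write.solr("techproducts", softCommitSecs = 5)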
Example 2
Source File: DataFrameReaderFunctions.scala From spark-bigquery with Apache License 2.0

package com.samelamin.spark.bigquery

import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, DataFrameReader}

  // Excerpt: the enclosing class, which wraps a DataFrameReader as `dfr` and defines the
  // data source name `source`, is omitted here.
  private def buildFrame(options: Map[String, String] = null, schema: StructType = null): DataFrame = {
    val builder = dfr
      .format(source)
      .schema(schema)

    if (options != null) {
      builder.options(options)
    }

    builder.load()
  }
}
Example 3
Source File: package.scala From spark-dynamodb with Apache License 2.0

package com.github.traviscrawford.spark

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.DataFrameReader

package object dynamodb {

  implicit class DynamoDBDataFrameReader(reader: DataFrameReader) {
    def dynamodb(region: String, table: String): DataFrame =
      reader
        .format("com.github.traviscrawford.spark.dynamodb")
        .option("region", region)
        .option("table", table)
        .load
  }
}
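A minimal usage sketch, assuming a SparkSession named spark; the region and table name are placeholders:

import com.github.traviscrawford.spark.dynamodb._

// Region and table name are placeholders.
val users = spark.read.dynamodb("us-west-2", "users")
users.printSchema()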
Example 4
Source File: streaming_coffee.scala From odsc-west-streaming-trends with GNU General Public License v3.0

import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.DataFrameReader

///////////////// NOTE: Open another terminal window (nc -lk 9999) FIRST OR THIS WON'T WORK /////////////////

case class Coffee(
  name: String,
  roast: Int,
  region: String,
  bean: String,
  acidity: Int = 1,
  bitterness: Int = 1,
  flavors: Seq[String]
)

case class CoffeeRating(
  coffeeName: String,
  score: Int,
  notes: Option[String] = None
)

// add coffeeStand
// would go in your main() function
//val sparkSession = SparkSession.builder.appName("coffeeShop").getOrCreate()
val sparkSession = spark

val availableCoffee = Seq(
  Coffee(name = "folgers", roast = 2, region = "US", bean = "robusta", acidity = 7, bitterness = 10, flavors = Seq("nutty")),
  Coffee(name = "yuban", roast = 2, region = "Mexico", bean = "robusta", acidity = 6, bitterness = 7, flavors = Seq("nutty")),
  Coffee(name = "nespresso", roast = 2, region = "Cuba", bean = "arabica", acidity = 5, bitterness = 3, flavors = Seq("nutty", "chocolate")),
  Coffee(name = "ritual", roast = 1, region = "Brazil", bean = "arabica", acidity = 2, bitterness = 1, flavors = Seq("fruity", "floral", "chocolate")),
  Coffee(name = "four barrel", roast = 1, region = "Columbia", bean = "arabica", flavors = Seq("nutty", "fruity")),
  Coffee(name = "french collection", roast = 3, region = "France", bean = "arabica", flavors = Seq("nutty", "fruity"))
)

import spark.implicits._

def asCoffeeRating(input: String): CoffeeRating = {
  val data = input.split(",")
  val coffeeName = data(0)
  val score = data(1).toInt
  val note = if (data.size > 2) Some(data(2)) else None
  CoffeeRating(coffeeName, score, note)
}

val coffeeStandDF = sparkSession.sparkContext.parallelize(availableCoffee, 3).toDF

val coffeeRatingsReader = sparkSession.readStream.format("socket").option("host", "localhost").option("port", 9999).load()
val rawRatingsData: Dataset[String] = coffeeRatingsReader.as[String]
val coffeeRatingsInput = rawRatingsData.map { asCoffeeRating }.toDF

val coffeeAndRatingsDF = coffeeStandDF.join(coffeeRatingsInput, coffeeStandDF("name") === coffeeRatingsInput("coffeeName"))
val averageRatings = coffeeAndRatingsDF.groupBy(col("name")).agg(avg("score") as "rating").sort(desc("rating"))

val query = averageRatings.writeStream.outputMode("complete").format("console").start()

// nc -lk 9999
//folgers,1
//folgers,2,"gross"
//ritual,5,"awesome"
Example 5
Source File: CsvOptions.scala From seahorse with Apache License 2.0

package ai.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv

import org.apache.spark.sql.{DataFrameReader, Row, DataFrameWriter}

import ai.deepsense.deeplang.doperations.inout.CsvParameters
import ai.deepsense.deeplang.doperations.inout.CsvParameters.ColumnSeparatorChoice

object CsvOptions {

  def map(
      namesIncluded: Boolean,
      columnSeparator: ColumnSeparatorChoice): Map[String, String] = {
    val headerFlag = if (namesIncluded) "true" else "false"
    Map(
      "header" -> headerFlag,
      "delimiter" -> CsvParameters.determineColumnSeparatorOf(columnSeparator).toString,
      "inferSchema" -> "false"
    )
  }

  // Unfortunately, making analogous RichDataFrameWriter is awkward, if not impossible.
  // This is because between Spark 1.6 and 2.0 DataFrameWriter became parametrized
  implicit class RichDataFrameReader(self: DataFrameReader) {
    def setCsvOptions(
        namesIncluded: Boolean,
        columnSeparator: ColumnSeparatorChoice): DataFrameReader = {
      val paramMap = map(namesIncluded, columnSeparator)
      paramMap.foldLeft(self) { case (reader, (key, value)) =>
        reader.option(key, value)
      }
    }
  }
}
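A minimal usage sketch of setCsvOptions, assuming a SparkSession named spark, Spark's built-in csv source, and a ColumnSeparatorChoice value (here called separatorChoice) obtained from the operation's parameters; the file path is a placeholder:

import ai.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv.CsvOptions._

// separatorChoice: ColumnSeparatorChoice, assumed to be in scope.
val reader = spark.read.setCsvOptions(namesIncluded = true, columnSeparator = separatorChoice)
val df = reader.format("csv").load("/tmp/input.csv")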
Example 6
Source File: DataFrameReaderConfigurator.scala From sparkling-graph with BSD 2-Clause "Simplified" License

package ml.sparkling.graph.loaders.csv

import org.apache.spark.sql.DataFrameReader

object DataFrameReaderConfigurator {

  implicit class addAbilityToConfigureDataFrameReader(reader: DataFrameReader) {
    def applyConfiguration(cSVLoaderConfig: CsvLoaderConfig): DataFrameReader = {
      reader.option("header", cSVLoaderConfig.header.toString)
      reader.option("delimiter", cSVLoaderConfig.delimiter)
      reader.option("quote", cSVLoaderConfig.quote)
      cSVLoaderConfig.schema match {
        case None => reader.option("inferSchema", cSVLoaderConfig.inferSchema.toString)
        case Some(schema) => reader.schema(schema)
      }
      reader
    }
  }
}
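A minimal usage sketch, assuming a SparkSession named spark, Spark's built-in csv source, and a CsvLoaderConfig instance (here csvConfig) built elsewhere; the path is a placeholder:

import ml.sparkling.graph.loaders.csv.DataFrameReaderConfigurator._

// csvConfig: CsvLoaderConfig, assumed to be in scope.
val edges = spark.read.applyConfiguration(csvConfig).format("csv").load("/data/graph/edges.csv")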
Example 7
Source File: LibSVMRelation.scala From BigDatalog with Apache License 2.0

package org.apache.spark.ml.source.libsvm

import com.google.common.base.Objects

import org.apache.spark.Logging
import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrameReader, DataFrame, Row, SQLContext}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

@Since("1.6.0")
class DefaultSource extends RelationProvider with DataSourceRegister {

  @Since("1.6.0")
  override def shortName(): String = "libsvm"

  @Since("1.6.0")
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String])
    : BaseRelation = {
    val path = parameters.getOrElse("path",
      throw new IllegalArgumentException("'path' must be specified"))
    val numFeatures = parameters.getOrElse("numFeatures", "-1").toInt
    val vectorType = parameters.getOrElse("vectorType", "sparse")
    new LibSVMRelation(path, numFeatures, vectorType)(sqlContext)
  }
}
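For reference, a minimal sketch of reading through the registered "libsvm" short name, using the Spark 1.6-style SQLContext this example targets; the path and numFeatures value are placeholders:

val training = sqlContext.read
  .format("libsvm")
  .option("numFeatures", "780")
  .load("data/sample_libsvm_data.txt")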
Example 8
Source File: package.scala From spark-iqmulus with Apache License 2.0

package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame, Row }
import org.apache.spark.sql.types.StructType

package object ply {

  implicit class PlyDataFrameReader(reader: DataFrameReader) {
    def ply: String => DataFrame = reader.format("fr.ign.spark.iqmulus.ply").load
  }

  implicit class PlyDataFrame(df: DataFrame) {
    def saveAsPly(location: String, littleEndian: Boolean = true) = {
      val df_id = df.drop("pid").drop("fid")
      val schema = df_id.schema
      val saver = (key: Int, iter: Iterator[Row]) =>
        Iterator(iter.saveAsPly(s"$location/$key.ply", schema, littleEndian))
      df_id.rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class PlyRowIterator(iter: Iterator[Row]) {
    def saveAsPly(
      filename: String,
      schema: StructType,
      littleEndian: Boolean
    ) = {
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val rows = iter.toArray
      val count = rows.size.toLong
      val header = new PlyHeader(filename, littleEndian, Map("vertex" -> ((count, schema))))
      val dos = new java.io.DataOutputStream(f);
      dos.write(header.toString.getBytes)
      val ros = new RowOutputStream(dos, littleEndian, schema)
      rows.foreach(ros.write)
      dos.close
      header
    }
  }
}
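A minimal usage sketch of the ply reader and writer implicits, assuming a SparkSession named spark; the paths are placeholders:

import fr.ign.spark.iqmulus.ply._

// Paths are placeholders.
val cloud = spark.read.ply("/data/lidar/tile.ply")
cloud.saveAsPly("/data/lidar/out")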
Example 9
Source File: package.scala From spark-iqmulus with Apache License 2.0

package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame, Row }
import org.apache.spark.sql.types.{ FloatType, StructType }

package object xyz {

  implicit class XyzDataFrameReader(reader: DataFrameReader) {
    def xyz: String => DataFrame = reader.format("fr.ign.spark.iqmulus.xyz").load
  }

  implicit class XyzDataFrame(df: DataFrame) {
    def saveAsXyz(location: String) = {
      val df_id = df.drop("id")
      require(df_id.schema.fieldNames.take(3) sameElements Array("x", "y", "z"))
      require(df_id.schema.fields.map(_.dataType).take(3).forall(_ == FloatType))
      val saver = (key: Int, iter: Iterator[Row]) => Iterator(iter.saveXyz(s"$location/$key.xyz"))
      df_id.rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class XyzRowIterator(iter: Iterator[Row]) {
    def saveXyz(filename: String) = {
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val dos = new java.io.DataOutputStream(f)
      var count = 0L
      iter.foreach(row => { count += 1; dos.writeBytes(row.mkString("", "\t", "\n")) })
      dos.close
      (filename, count)
    }
  }
}
Example 10
Source File: package.scala From spark-iqmulus with Apache License 2.0

package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame }
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.Row

package object las {

  implicit class LasDataFrameReader(reader: DataFrameReader) {
    def las: String => DataFrame = reader.format("fr.ign.spark.iqmulus.las").load
  }

  implicit class LasDataFrame(df: DataFrame) {
    def saveAsLas(
      location: String,
      formatOpt: Option[Byte] = None,
      version: Version = Version(),
      scale: Array[Double] = Array(0.01, 0.01, 0.01),
      offset: Array[Double] = Array(0, 0, 0)
    ) = {
      val format = formatOpt.getOrElse(LasHeader.formatFromSchema(df.schema))
      val schema = LasHeader.schema(format) // no user types for now
      val cols = schema.fieldNames.intersect(df.schema.fieldNames)
      val saver = (key: Int, iter: Iterator[Row]) =>
        Iterator(iter.saveAsLas(s"$location/$key.las", schema, format, scale, offset, version))
      df.select(cols.head, cols.tail: _*).rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class LasRowIterator(iter: Iterator[Row]) {
    def saveAsLas(
      filename: String, schema: StructType, format: Byte,
      scale: Array[Double], offset: Array[Double], version: Version = Version()
    ) = {
      // materialize the partition to access it in a single pass, TODO workaround that
      val rows = iter.toArray
      val count = rows.length.toLong
      val pmin = Array.fill[Double](3)(Double.PositiveInfinity)
      val pmax = Array.fill[Double](3)(Double.NegativeInfinity)
      val countByReturn = Array.fill[Long](15)(0)
      rows.foreach { row =>
        val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble
        val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble
        val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble
        val ret = row.getAs[Byte]("flags") & 0x3
        countByReturn(ret) += 1
        pmin(0) = Math.min(pmin(0), x)
        pmin(1) = Math.min(pmin(1), y)
        pmin(2) = Math.min(pmin(2), z)
        pmax(0) = Math.max(pmax(0), x)
        pmax(1) = Math.max(pmax(1), y)
        pmax(2) = Math.max(pmax(2), z)
      }
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val header = new LasHeader(filename, format, count, pmin, pmax, scale, offset,
        version = version, pdr_return_nb = countByReturn)
      val dos = new java.io.DataOutputStream(f);
      header.write(dos)
      val ros = new RowOutputStream(dos, littleEndian = true, schema)
      rows.foreach(ros.write)
      dos.close
      header
    }
  }
}
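A minimal usage sketch of the las implicits, assuming a SparkSession named spark and a DataFrame whose columns match a LAS point format; the paths and millimetre scale are placeholders:

import fr.ign.spark.iqmulus.las._

// Paths are placeholders.
val points = spark.read.las("/data/lidar/tile.las")
points.saveAsLas("/data/lidar/out", scale = Array(0.001, 0.001, 0.001))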
Example 11
Source File: DataFrameReaderFunctions.scala From couchbase-spark-connector with Apache License 2.0

package com.couchbase.spark.sql

import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, DataFrameReader}

class DataFrameReaderFunctions(@transient val dfr: DataFrameReader) extends Serializable {

  // Excerpt: the data source name `source` and this class's public read methods are
  // omitted here.
  private def buildFrame(options: Map[String, String] = null, schema: StructType = null,
    schemaFilter: Option[Filter] = null): DataFrame = {
    val builder = dfr
      .format(source)
      .schema(schema)

    val filter = schemaFilter.map(N1QLRelation.filterToExpression)
    if (filter.isDefined) {
      builder.option("schemaFilter", filter.get)
    }

    if (options != null) {
      builder.options(options)
    }

    builder.load()
  }
}
Example 12
Source File: AngelTestUtils.scala From sona with Apache License 2.0

package com.tencent.angel.sona.ml.util

import com.tencent.angel.sona.core.DriverContext
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.sql.{DataFrameReader, SparkSession}

class AngelTestUtils extends SparkFunSuite {
  protected var spark: SparkSession = _
  protected var libsvm: DataFrameReader = _
  protected var dummy: DataFrameReader = _
  protected var sparkConf: SparkConf = _
  protected var driverCtx: DriverContext = _

  protected override def beforeAll(): Unit = {
    super.beforeAll()
    spark = SparkSession.builder()
      .master("local[2]")
      .appName("AngelClassification")
      .getOrCreate()

    libsvm = spark.read.format("libsvmex")
    dummy = spark.read.format("dummy")

    sparkConf = spark.sparkContext.getConf
    driverCtx = DriverContext.get(sparkConf)
    driverCtx.startAngelAndPSAgent()
  }

  protected override def afterAll(): Unit = {
    super.afterAll()
    driverCtx.stopAngelAndPSAgent()
  }
}
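A minimal sketch of a suite built on this harness; the suite name and data path are hypothetical, and Angel's parameter servers must start successfully in beforeAll() for the test to run:

class AngelReaderSuite extends AngelTestUtils {
  test("read libsvm-formatted training data") {
    // Path is a placeholder for a libsvm-formatted file available to the test.
    val train = libsvm.load("data/a9a/a9a_train.libsvm")
    assert(train.columns.nonEmpty)
  }
}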
Example 13
Source File: package.scala From spark-athena with Apache License 2.0

package io.github.tmheo.spark

import java.util.Properties

import com.amazonaws.athena.jdbc.shaded.com.amazonaws.regions.Regions
import org.apache.spark.sql.{DataFrame, DataFrameReader}

import scala.collection.JavaConverters._

package object athena {

  implicit class AthenaDataFrameReader(reader: DataFrameReader) {

    def athena(table: String): DataFrame = {
      reader.format("io.github.tmheo.spark.athena")
        .option(JDBCOptions.JDBC_TABLE_NAME, table)
        .load
    }

    def athena(table: String, region: String, s3StatingDir: String): DataFrame = {
      reader.format("io.github.tmheo.spark.athena")
        .option(JDBCOptions.JDBC_TABLE_NAME, table)
        .option("region", region)
        .option("s3_staging_dir", s3StatingDir)
        .load
    }

    def athena(table: String, s3StatingDir: String): DataFrame = {
      athena(table, Regions.getCurrentRegion.getName, s3StatingDir)
    }

    def athena(table: String, properties: Properties): DataFrame = {
      val options = properties.asScala
      options += (JDBCOptions.JDBC_TABLE_NAME -> table)
      reader.format("io.github.tmheo.spark.athena").options(options).load
    }
  }
}
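A minimal usage sketch, assuming a SparkSession named spark; the table, region, and S3 staging directory are placeholders:

import io.github.tmheo.spark.athena._

// Table, region, and staging directory are placeholders.
val orders = spark.read.athena("sales.orders", "us-east-1", "s3://my-bucket/athena-staging/")
orders.show()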
Example 14
Source File: CsvOptions.scala From seahorse-workflow-executor with Apache License 2.0

package io.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv

import org.apache.spark.sql.{DataFrameReader, Row, DataFrameWriter}

import io.deepsense.deeplang.doperations.inout.CsvParameters
import io.deepsense.deeplang.doperations.inout.CsvParameters.ColumnSeparatorChoice

object CsvOptions {

  def map(
      namesIncluded: Boolean,
      columnSeparator: ColumnSeparatorChoice): Map[String, String] = {
    val headerFlag = if (namesIncluded) "true" else "false"
    Map(
      "header" -> headerFlag,
      "delimiter" -> CsvParameters.determineColumnSeparatorOf(columnSeparator).toString,
      "inferSchema" -> "false"
    )
  }

  // Unfortunately, making analogous RichDataFrameWriter is awkward, if not impossible.
  // This is because between Spark 1.6 and 2.0 DataFrameWriter became parametrized
  implicit class RichDataFrameReader(self: DataFrameReader) {
    def setCsvOptions(
        namesIncluded: Boolean,
        columnSeparator: ColumnSeparatorChoice): DataFrameReader = {
      val paramMap = map(namesIncluded, columnSeparator)
      paramMap.foldLeft(self) { case (reader, (key, value)) =>
        reader.option(key, value)
      }
    }
  }
}