org.apache.spark.sql.SQLImplicits Scala Examples
The following examples show how to use org.apache.spark.sql.SQLImplicits.
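SQLImplicits is the abstract class behind spark.implicits._: it supplies the Encoder instances and the toDF/toDS conversions, and only requires a concrete _sqlContext. The pattern that recurs throughout the examples below is to subclass it inside a test trait, so that the implicits can be imported in a suite's constructor even though the SparkSession is only created later in beforeAll(). A minimal sketch of that pattern (trait and field names are illustrative, not from any one example):

import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}

trait SharedSparkSession {
  @transient var spark: SparkSession = _   // created later, e.g. in beforeAll()

  // _sqlContext is a def, so the session is only looked up when an implicit is actually used.
  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = spark.sqlContext
  }
}

A suite mixing in such a trait can then write import testImplicits._ and use Seq(...).toDF(...), rdd.toDS(), and the $"col" interpolator exactly as it would with spark.implicits._.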
Example 1
Source File: MLlibTestSparkContext.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.util

import java.io.File

import org.scalatest.Suite

import org.apache.spark.SparkContext
import org.apache.spark.ml.util.TempDirectory
import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}
import org.apache.spark.util.Utils

trait MLlibTestSparkContext extends TempDirectory { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var checkpointDir: String = _

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder
      .master("local[2]")
      .appName("MLlibUnitTest")
      .getOrCreate()
    sc = spark.sparkContext

    checkpointDir = Utils.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString
    sc.setCheckpointDir(checkpointDir)
  }

  override def afterAll() {
    try {
      Utils.deleteRecursively(new File(checkpointDir))
      SparkSession.clearActiveSession()
      if (spark != null) {
        spark.stop()
      }
      spark = null
    } finally {
      super.afterAll()
    }
  }

  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = self.spark.sqlContext
  }
}
Example 2
Source File: MovieRecommendation.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML.MovieRecommendation

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SQLImplicits
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import scala.Tuple2
import org.apache.spark.rdd.RDD

object MovieRecommendation {
  // Compute the RMSE to evaluate the model. The lower the RMSE, the better the model and its predictions.
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean): Double = {
    val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product)))
    val predictionsAndRatings = predictions.map { x => ((x.user, x.product), x.rating) }
      .join(data.map(x => ((x.user, x.product), x.rating))).values
    if (implicitPrefs) {
      println("(Prediction, Rating)")
      println(predictionsAndRatings.take(5).mkString("\n"))
    }
    math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean())
  }

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("JavaLDAExample")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .getOrCreate()

    val ratingsFile = "data/ratings.csv"
    val df1 = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratingsFile)
    val ratingsDF = df1.select(df1.col("userId"), df1.col("movieId"), df1.col("rating"), df1.col("timestamp"))
    ratingsDF.show(false)

    val moviesFile = "data/movies.csv"
    val df2 = spark.read.format("com.databricks.spark.csv").option("header", "true").load(moviesFile)
    val moviesDF = df2.select(df2.col("movieId"), df2.col("title"), df2.col("genres"))
    moviesDF.show(false)

    ratingsDF.createOrReplaceTempView("ratings")
    moviesDF.createOrReplaceTempView("movies")

    // Note: this excerpt omits the step that builds `model` and `testRDD`
    // (ALS training on the ratings data); a sketch of that step follows the example.
    var rmseTest = computeRmse(model, testRDD, true)
    println("Test RMSE: = " + rmseTest) // lower is better

    // Movie recommendation for a specific user: get the top 6 movie predictions for user 668.
    println("Recommendations: (MovieId => Rating)")
    println("----------------------------------")
    val recommendationsUser = model.recommendProducts(668, 6)
    recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println)
    println("----------------------------------")

    spark.stop()
  }
}
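The listing above references model and testRDD without defining them; the ALS training step was truncated from this excerpt. The following is a minimal sketch of what that step could look like, assuming a random train/test split and illustrative hyperparameters (rank = 20, numIterations = 15, lambda = 0.10, which are not taken from the original source):

// Convert the ratings DataFrame into an RDD[Rating] for MLlib's ALS.
val ratingsRDD: RDD[Rating] = ratingsDF.rdd.map { row =>
  Rating(row.getString(0).toInt, row.getString(1).toInt, row.getString(2).toDouble)
}

// Split into training and test sets.
val Array(trainingRDD, testRDD) = ratingsRDD.randomSplit(Array(0.75, 0.25), seed = 12345L)

// Train the matrix-factorization model with illustrative hyperparameters.
val rank = 20
val numIterations = 15
val lambda = 0.10
val model: MatrixFactorizationModel = ALS.train(trainingRDD, rank, numIterations, lambda)

With model and testRDD in scope, the computeRmse call and the recommendProducts call in the example run as written.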
Example 3
Source File: GraphFrameTestSparkContext.scala From graphframes with Apache License 2.0
package org.graphframes

import java.io.File
import java.nio.file.Files

import org.apache.commons.io.FileUtils
import org.scalatest.{BeforeAndAfterAll, Suite}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}

trait GraphFrameTestSparkContext extends BeforeAndAfterAll { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var sqlContext: SQLContext = _

  @transient var sparkMajorVersion: Int = _
  @transient var sparkMinorVersion: Int = _

  def isLaterVersion(minVersion: String): Boolean = {
    val (minMajorVersion, minMinorVersion) = TestUtils.majorMinorVersion(minVersion)
    if (sparkMajorVersion != minMajorVersion) {
      sparkMajorVersion > minMajorVersion
    } else {
      sparkMinorVersion >= minMinorVersion
    }
  }

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder()
      .master("local[2]")
      .appName("GraphFramesUnitTest")
      .config("spark.sql.shuffle.partitions", 4)
      .getOrCreate()
    val checkpointDir = Files.createTempDirectory(this.getClass.getName).toString
    spark.sparkContext.setCheckpointDir(checkpointDir)
    sc = spark.sparkContext
    sqlContext = spark.sqlContext

    val (verMajor, verMinor) = TestUtils.majorMinorVersion(sc.version)
    sparkMajorVersion = verMajor
    sparkMinorVersion = verMinor
  }

  override def afterAll() {
    val checkpointDir = sc.getCheckpointDir
    if (spark != null) {
      spark.stop()
    }
    spark = null
    sqlContext = null
    sc = null

    checkpointDir.foreach { dir =>
      FileUtils.deleteQuietly(new File(dir))
    }
    super.afterAll()
  }
}
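This trait wires up the SparkSession and SQLContext but leaves the implicits to the concrete suite. A hypothetical sketch of a suite that mixes it in, defines a testImplicits object in the same style as the other examples, and uses toDF to build the vertex and edge DataFrames for a GraphFrame (the suite name, ScalaTest FunSuite base class, and data are illustrative, not part of the graphframes source):

import org.apache.spark.sql.{SQLContext, SQLImplicits}
import org.graphframes.GraphFrame

class FriendGraphSuite extends org.scalatest.FunSuite with GraphFrameTestSparkContext {

  // Same pattern as the other examples, backed by the trait's sqlContext field.
  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = sqlContext
  }

  test("construct a small GraphFrame") {
    import testImplicits._
    val vertices = Seq(("a", "Alice"), ("b", "Bob")).toDF("id", "name")
    val edges = Seq(("a", "b", "friend")).toDF("src", "dst", "relationship")
    val g = GraphFrame(vertices, edges)
    assert(g.edges.count() == 1)
  }
}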
Example 4
Source File: MLlibTestSparkContext.scala From sparkoscope with Apache License 2.0
package org.apache.spark.mllib.util

import java.io.File

import org.scalatest.Suite

import org.apache.spark.SparkContext
import org.apache.spark.ml.util.TempDirectory
import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}
import org.apache.spark.util.Utils

trait MLlibTestSparkContext extends TempDirectory { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var checkpointDir: String = _

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder
      .master("local[2]")
      .appName("MLlibUnitTest")
      .getOrCreate()
    sc = spark.sparkContext

    checkpointDir = Utils.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString
    sc.setCheckpointDir(checkpointDir)
  }

  override def afterAll() {
    try {
      Utils.deleteRecursively(new File(checkpointDir))
      SparkSession.clearActiveSession()
      if (spark != null) {
        spark.stop()
      }
      spark = null
    } finally {
      super.afterAll()
    }
  }

  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = self.spark.sqlContext
  }
}
Example 5
Source File: CloudantSparkSQLSuite.scala From bahir with Apache License 2.0
package org.apache.bahir.cloudant

import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}

class CloudantSparkSQLSuite extends ClientSparkFunSuite {

  // import spark implicits
  private object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = spark.sqlContext
  }

  val endpoint = "_all_docs"

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder().config(conf)
      .config("cloudant.protocol", TestUtils.getProtocol)
      .config("cloudant.host", TestUtils.getHost)
      .config("cloudant.username", TestUtils.getUsername)
      .config("cloudant.password", TestUtils.getPassword)
      .config("cloudant.endpoint", endpoint)
      .getOrCreate()
  }

  testIf("verify results from temp view of database n_airportcodemapping",
    () => TestUtils.shouldRunTest()) {
    // create a temp table from Cloudant db and query it using sql syntax
    val sparkSql = spark.sql(
      s"""
         |CREATE OR REPLACE TEMPORARY VIEW airportTable
         |USING org.apache.bahir.cloudant
         |OPTIONS ( database 'n_airportcodemapping')
      """.stripMargin)

    // create a dataframe
    val airportData = spark.sql(
      s"""
         |SELECT _id, airportName
         |FROM airportTable
         |WHERE _id >= 'CAA' AND _id <= 'GAA' ORDER BY _id
      """.stripMargin)
    assert(airportData.count() == 4)

    // create filtered dataframe to compare with SQL temp. view
    val df2 = spark.read.format("org.apache.bahir.cloudant")
      .load("n_airportcodemapping")
    val df2count = df2.filter(df2("_id") >= "CAA" && df2("_id") <= "GAA")
      .select("_id", "airportName")
      .orderBy(df2("_id")).count()

    assert(df2count == airportData.count())
  }

  testIf("verify results from temp view of index in n_flight",
    () => TestUtils.shouldRunTest()) {
    // create a temp table from Cloudant index and query it using sql syntax
    val sparkSql = spark.sql(
      s"""
         |CREATE TEMPORARY VIEW flightTable
         |USING org.apache.bahir.cloudant
         |OPTIONS (database 'n_flight', index '_design/view/_search/n_flights')
      """.stripMargin)

    val flightData = spark.sql(
      s"""
         |SELECT flightSegmentId, scheduledDepartureTime
         |FROM flightTable
         |WHERE flightSegmentId > 'AA9' AND flightSegmentId < 'AA95'
      """.stripMargin)
    assert(flightData.count() == 1)

    // create filtered dataframe to compare with SQL temp. view
    val df2 = spark.read.format("org.apache.bahir.cloudant")
      .load("n_flight")
    val df2count = df2.filter(df2("flightSegmentId") > "AA9" && df2("flightSegmentId") < "AA95")
      .select("flightSegmentId", "scheduledDepartureTime")
      .orderBy(df2("_id")).count()
    assert(df2count == flightData.count())
  }
}
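The private testImplicits object above is what would make encoder-backed operations such as .as[T] or Seq(...).toDF available inside this suite, even though the tests shown only use untyped DataFrames. A hypothetical sketch of such a usage (the Airport case class and the snippet are illustrative, not part of the Bahir source; the case class should live at the top level of the file so Spark can derive an Encoder for it):

case class Airport(_id: String, airportName: String)

// ... inside a test body of CloudantSparkSQLSuite:
import testImplicits._
val airports = spark.read.format("org.apache.bahir.cloudant")
  .load("n_airportcodemapping")
  .select("_id", "airportName")
  .as[Airport]   // uses the implicit product Encoder supplied by SQLImplicits
assert(airports.filter(_.airportName != null).count() > 0)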
Example 6
Source File: MLlibTestSparkContext.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.mllib.util

import java.io.File

import org.scalatest.Suite

import org.apache.spark.SparkContext
import org.apache.spark.ml.util.TempDirectory
import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}
import org.apache.spark.util.Utils

trait MLlibTestSparkContext extends TempDirectory { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var checkpointDir: String = _

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder
      .master("local[2]")
      .appName("MLlibUnitTest")
      .getOrCreate()
    sc = spark.sparkContext

    checkpointDir = Utils.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString
    sc.setCheckpointDir(checkpointDir)
  }

  override def afterAll() {
    try {
      Utils.deleteRecursively(new File(checkpointDir))
      SparkSession.clearActiveSession()
      if (spark != null) {
        spark.stop()
      }
      spark = null
    } finally {
      super.afterAll()
    }
  }

  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = self.spark.sqlContext
  }
}
Example 7
Source File: MLlibTestSparkContext.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.mllib.util

import java.io.File

import org.scalatest.Suite

import org.apache.spark.SparkContext
import org.apache.spark.ml.util.TempDirectory
import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}
import org.apache.spark.util.Utils

trait MLlibTestSparkContext extends TempDirectory { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var checkpointDir: String = _

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder
      .master("local[2]")
      .appName("MLlibUnitTest")
      .getOrCreate()
    sc = spark.sparkContext

    checkpointDir = Utils.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString
    sc.setCheckpointDir(checkpointDir)
  }

  override def afterAll() {
    try {
      Utils.deleteRecursively(new File(checkpointDir))
      SparkSession.clearActiveSession()
      if (spark != null) {
        spark.stop()
      }
      spark = null
    } finally {
      super.afterAll()
    }
  }

  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = self.spark.sqlContext
  }
}
Example 8
Source File: HiveTestTrait.scala From cloud-integration with Apache License 2.0
package org.apache.spark.sql.sources

import java.io.File

import com.cloudera.spark.cloud.ObjectStoreConfigurations
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}
import org.apache.spark.sql.hive.test.TestHiveContext
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.Utils

trait HiveTestTrait extends SparkFunSuite with BeforeAndAfterAll {
  // override protected val enableAutoThreadAudit = false
  protected var hiveContext: HiveInstanceForTests = _
  protected var spark: SparkSession = _

  protected override def beforeAll(): Unit = {
    super.beforeAll()
    // set up spark and hive context
    hiveContext = new HiveInstanceForTests()
    spark = hiveContext.sparkSession
  }

  protected override def afterAll(): Unit = {
    try {
      SparkSession.clearActiveSession()

      if (hiveContext != null) {
        hiveContext.reset()
        hiveContext = null
      }
      if (spark != null) {
        spark.close()
        spark = null
      }
    } finally {
      super.afterAll()
    }
  }
}

class HiveInstanceForTests
  extends TestHiveContext(
    new SparkContext(
      System.getProperty("spark.sql.test.master", "local[1]"),
      "TestSQLContext",
      new SparkConf()
        .setAll(ObjectStoreConfigurations.RW_TEST_OPTIONS)
        .set("spark.sql.warehouse.dir", TestSetup.makeWarehouseDir().toURI.getPath)
    )
  ) {
}

object TestSetup {

  def makeWarehouseDir(): File = {
    val warehouseDir = Utils.createTempDir(namePrefix = "warehouse")
    warehouseDir.delete()
    warehouseDir
  }
}
Example 9
Source File: MLlibTestSparkContext.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.util

import java.io.File

import org.apache.spark.SparkContext
import org.apache.spark.sql.types.UDTRegistration
import org.apache.spark.sql.{SQLContext, SQLImplicits, SparkSession}
import org.apache.spark.util.{SparkUtil, Utils}
import org.scalatest.Suite

trait MLlibTestSparkContext extends TempDirectory { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var checkpointDir: String = _

  override def beforeAll() {
    super.beforeAll()
    SparkUtil.UDTRegister("org.apache.spark.linalg.Vector", "org.apache.spark.linalg.VectorUDT")
    SparkUtil.UDTRegister("org.apache.spark.linalg.DenseVector", "org.apache.spark.linalg.VectorUDT")
    SparkUtil.UDTRegister("org.apache.spark.linalg.SparseVector", "org.apache.spark.linalg.VectorUDT")
    SparkUtil.UDTRegister("org.apache.spark.linalg.Matrix", "org.apache.spark.linalg.MatrixUDT")
    SparkUtil.UDTRegister("org.apache.spark.linalg.DenseMatrix", "org.apache.spark.linalg.MatrixUDT")
    SparkUtil.UDTRegister("org.apache.spark.linalg.SparseMatrix", "org.apache.spark.linalg.MatrixUDT")
    spark = SparkSession.builder
      .master("local[2]")
      .appName("MLlibUnitTest")
      .getOrCreate()
    sc = spark.sparkContext

    checkpointDir = SparkUtil.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString
    sc.setCheckpointDir(checkpointDir)
  }

  override def afterAll() {
    try {
      SparkUtil.deleteRecursively(new File(checkpointDir))
      SparkSession.clearActiveSession()
      if (spark != null) {
        spark.stop()
      }
      spark = null
    } finally {
      super.afterAll()
    }
  }

  /**
   * A helper object for importing SQL implicits.
   *
   * Note that the alternative of importing `spark.implicits._` is not possible here.
   * This is because we create the `SQLContext` immediately before the first test is run,
   * but the implicits import is needed in the constructor.
   */
  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = self.spark.sqlContext
  }
}
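The Scaladoc on testImplicits captures the core reason this pattern exists: import spark.implicits._ needs a stable, already-initialized reference, whereas SQLImplicits defers the SQLContext lookup until an implicit is actually used inside a test body. A hypothetical illustration of the difference (the suite name, ScalaTest FunSuite base class, and test body are illustrative):

class MyEstimatorSuite extends org.scalatest.FunSuite with MLlibTestSparkContext {
  // Not possible here: `spark` is a var (not a stable identifier) and is still
  // null while the constructor runs.
  // import spark.implicits._

  // Works: _sqlContext is only evaluated when a conversion is first needed,
  // i.e. inside a test body, after beforeAll() has created the session.
  import testImplicits._

  test("build a small training DataFrame") {
    val df = Seq((0.0, 1.0), (1.0, 2.0)).toDF("label", "feature")
    assert(df.count() == 2)
  }
}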