org.apache.spark.sql.SQLImplicits Scala Examples
The following examples show how to use org.apache.spark.sql.SQLImplicits.
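SQLImplicits is the abstract class behind spark.implicits._: it supplies the Encoder instances and the toDF/toDS conversions, and only requires a concrete _sqlContext. The pattern that recurs throughout the examples below is to subclass it inside a test trait, so that the implicits can be imported in a suite's constructor even though the SparkSession is only created later in beforeAll(). A minimal sketch of that pattern (trait and field names are illustrative, not from any one example):

import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}

trait SharedSparkSession {
  @transient var spark: SparkSession = _   // created later, e.g. in beforeAll()

  // _sqlContext is a def, so the session is only looked up when an implicit is actually used.
  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = spark.sqlContext
  }
}

A suite mixing in such a trait can then write import testImplicits._ and use Seq(...).toDF(...), rdd.toDS(), and the $"col" interpolator exactly as it would with spark.implicits._.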
Example 1
Source File: MLlibTestSparkContext.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.util

import java.io.File

import org.scalatest.Suite

import org.apache.spark.SparkContext
import org.apache.spark.ml.util.TempDirectory
import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}
import org.apache.spark.util.Utils

trait MLlibTestSparkContext extends TempDirectory { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var checkpointDir: String = _

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder
      .master("local[2]")
      .appName("MLlibUnitTest")
      .getOrCreate()
    sc = spark.sparkContext

    checkpointDir = Utils.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString
    sc.setCheckpointDir(checkpointDir)
  }

  override def afterAll() {
    try {
      Utils.deleteRecursively(new File(checkpointDir))
      SparkSession.clearActiveSession()
      if (spark != null) {
        spark.stop()
      }
      spark = null
    } finally {
      super.afterAll()
    }
  }

  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = self.spark.sqlContext
  }
}
Example 2
Source File: MovieRecommendation.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML.MovieRecommendation

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SQLImplicits
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import scala.Tuple2
import org.apache.spark.rdd.RDD

object MovieRecommendation {
  // Compute the RMSE to evaluate the model. The lower the RMSE, the better the model and its predictions.
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean): Double = {
    val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product)))
    val predictionsAndRatings = predictions.map { x => ((x.user, x.product), x.rating) }
      .join(data.map(x => ((x.user, x.product), x.rating))).values
    if (implicitPrefs) {
      println("(Prediction, Rating)")
      println(predictionsAndRatings.take(5).mkString("\n"))
    }
    math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean())
  }

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("JavaLDAExample")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .getOrCreate()

    val ratingsFile = "data/ratings.csv"
    val df1 = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratingsFile)
    val ratingsDF = df1.select(df1.col("userId"), df1.col("movieId"), df1.col("rating"), df1.col("timestamp"))
    ratingsDF.show(false)

    val moviesFile = "data/movies.csv"
    val df2 = spark.read.format("com.databricks.spark.csv").option("header", "true").load(moviesFile)
    val moviesDF = df2.select(df2.col("movieId"), df2.col("title"), df2.col("genres"))
    moviesDF.show(false)

    ratingsDF.createOrReplaceTempView("ratings")
    moviesDF.createOrReplaceTempView("movies")

    // Note: this excerpt omits the step that builds `model` and `testRDD`
    // (ALS training on the ratings data); a sketch of that step follows the example.
    var rmseTest = computeRmse(model, testRDD, true)
    println("Test RMSE: = " + rmseTest) // lower is better

    // Movie recommendation for a specific user: get the top 6 movie predictions for user 668.
    println("Recommendations: (MovieId => Rating)")
    println("----------------------------------")
    val recommendationsUser = model.recommendProducts(668, 6)
    recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println)
    println("----------------------------------")

    spark.stop()
  }
}
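The listing above references model and testRDD without defining them; the ALS training step was truncated from this excerpt. The following is a minimal sketch of what that step could look like, assuming a random train/test split and illustrative hyperparameters (rank = 20, numIterations = 15, lambda = 0.10, which are not taken from the original source):

// Convert the ratings DataFrame into an RDD[Rating] for MLlib's ALS.
val ratingsRDD: RDD[Rating] = ratingsDF.rdd.map { row =>
  Rating(row.getString(0).toInt, row.getString(1).toInt, row.getString(2).toDouble)
}

// Split into training and test sets.
val Array(trainingRDD, testRDD) = ratingsRDD.randomSplit(Array(0.75, 0.25), seed = 12345L)

// Train the matrix-factorization model with illustrative hyperparameters.
val rank = 20
val numIterations = 15
val lambda = 0.10
val model: MatrixFactorizationModel = ALS.train(trainingRDD, rank, numIterations, lambda)

With model and testRDD in scope, the computeRmse call and the recommendProducts call in the example run as written.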
Example 3
Source File: GraphFrameTestSparkContext.scala From graphframes with Apache License 2.0
package org.graphframes

import java.io.File
import java.nio.file.Files

import org.apache.commons.io.FileUtils
import org.scalatest.{BeforeAndAfterAll, Suite}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}

trait GraphFrameTestSparkContext extends BeforeAndAfterAll { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var sqlContext: SQLContext = _

  @transient var sparkMajorVersion: Int = _
  @transient var sparkMinorVersion: Int = _

  def isLaterVersion(minVersion: String): Boolean = {
    val (minMajorVersion, minMinorVersion) = TestUtils.majorMinorVersion(minVersion)
    if (sparkMajorVersion != minMajorVersion) {
      sparkMajorVersion > minMajorVersion
    } else {
      sparkMinorVersion >= minMinorVersion
    }
  }

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder()
      .master("local[2]")
      .appName("GraphFramesUnitTest")
      .config("spark.sql.shuffle.partitions", 4)
      .getOrCreate()
    val checkpointDir = Files.createTempDirectory(this.getClass.getName).toString
    spark.sparkContext.setCheckpointDir(checkpointDir)
    sc = spark.sparkContext
    sqlContext = spark.sqlContext

    val (verMajor, verMinor) = TestUtils.majorMinorVersion(sc.version)
    sparkMajorVersion = verMajor
    sparkMinorVersion = verMinor
  }

  override def afterAll() {
    val checkpointDir = sc.getCheckpointDir
    if (spark != null) {
      spark.stop()
    }
    spark = null
    sqlContext = null
    sc = null

    checkpointDir.foreach { dir =>
      FileUtils.deleteQuietly(new File(dir))
    }
    super.afterAll()
  }
}
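This trait wires up the SparkSession and SQLContext but leaves the implicits to the concrete suite. A hypothetical sketch of a suite that mixes it in, defines a testImplicits object in the same style as the other examples, and uses toDF to build the vertex and edge DataFrames for a GraphFrame (the suite name, ScalaTest FunSuite base class, and data are illustrative, not part of the graphframes source):

import org.apache.spark.sql.{SQLContext, SQLImplicits}
import org.graphframes.GraphFrame

class FriendGraphSuite extends org.scalatest.FunSuite with GraphFrameTestSparkContext {

  // Same pattern as the other examples, backed by the trait's sqlContext field.
  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = sqlContext
  }

  test("construct a small GraphFrame") {
    import testImplicits._
    val vertices = Seq(("a", "Alice"), ("b", "Bob")).toDF("id", "name")
    val edges = Seq(("a", "b", "friend")).toDF("src", "dst", "relationship")
    val g = GraphFrame(vertices, edges)
    assert(g.edges.count() == 1)
  }
}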
Example 4
Source File: MLlibTestSparkContext.scala From sparkoscope with Apache License 2.0
package org.apache.spark.mllib.util

import java.io.File

import org.scalatest.Suite

import org.apache.spark.SparkContext
import org.apache.spark.ml.util.TempDirectory
import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}
import org.apache.spark.util.Utils

trait MLlibTestSparkContext extends TempDirectory { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var checkpointDir: String = _

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder
      .master("local[2]")
      .appName("MLlibUnitTest")
      .getOrCreate()
    sc = spark.sparkContext

    checkpointDir = Utils.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString
    sc.setCheckpointDir(checkpointDir)
  }

  override def afterAll() {
    try {
      Utils.deleteRecursively(new File(checkpointDir))
      SparkSession.clearActiveSession()
      if (spark != null) {
        spark.stop()
      }
      spark = null
    } finally {
      super.afterAll()
    }
  }

  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = self.spark.sqlContext
  }
}
Example 5
Source File: CloudantSparkSQLSuite.scala From bahir with Apache License 2.0
package org.apache.bahir.cloudant

import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}

class CloudantSparkSQLSuite extends ClientSparkFunSuite {

  // import spark implicits
  private object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = spark.sqlContext
  }

  val endpoint = "_all_docs"

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder().config(conf)
      .config("cloudant.protocol", TestUtils.getProtocol)
      .config("cloudant.host", TestUtils.getHost)
      .config("cloudant.username", TestUtils.getUsername)
      .config("cloudant.password", TestUtils.getPassword)
      .config("cloudant.endpoint", endpoint)
      .getOrCreate()
  }

  testIf("verify results from temp view of database n_airportcodemapping",
    () => TestUtils.shouldRunTest()) {
    // create a temp table from Cloudant db and query it using sql syntax
    val sparkSql = spark.sql(
      s"""
         |CREATE OR REPLACE TEMPORARY VIEW airportTable
         |USING org.apache.bahir.cloudant
         |OPTIONS ( database 'n_airportcodemapping')
      """.stripMargin)

    // create a dataframe
    val airportData = spark.sql(
      s"""
         |SELECT _id, airportName
         |FROM airportTable
         |WHERE _id >= 'CAA' AND _id <= 'GAA' ORDER BY _id
      """.stripMargin)
    assert(airportData.count() == 4)

    // create filtered dataframe to compare with SQL temp. view
    val df2 = spark.read.format("org.apache.bahir.cloudant")
      .load("n_airportcodemapping")
    val df2count = df2.filter(df2("_id") >= "CAA" && df2("_id") <= "GAA")
      .select("_id", "airportName")
      .orderBy(df2("_id")).count()

    assert(df2count == airportData.count())
  }

  testIf("verify results from temp view of index in n_flight",
    () => TestUtils.shouldRunTest()) {
    // create a temp table from Cloudant index and query it using sql syntax
    val sparkSql = spark.sql(
      s"""
         |CREATE TEMPORARY VIEW flightTable
         |USING org.apache.bahir.cloudant
         |OPTIONS (database 'n_flight', index '_design/view/_search/n_flights')
      """.stripMargin)

    val flightData = spark.sql(
      s"""
         |SELECT flightSegmentId, scheduledDepartureTime
         |FROM flightTable
         |WHERE flightSegmentId > 'AA9' AND flightSegmentId < 'AA95'
      """.stripMargin)
    assert(flightData.count() == 1)

    // create filtered dataframe to compare with SQL temp. view
    val df2 = spark.read.format("org.apache.bahir.cloudant")
      .load("n_flight")
    val df2count = df2.filter(df2("flightSegmentId") > "AA9" && df2("flightSegmentId") < "AA95")
      .select("flightSegmentId", "scheduledDepartureTime")
      .orderBy(df2("_id")).count()
    assert(df2count == flightData.count())
  }
}
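The private testImplicits object above is what would make encoder-backed operations such as .as[T] or Seq(...).toDF available inside this suite, even though the tests shown only use untyped DataFrames. A hypothetical sketch of such a usage (the Airport case class and the snippet are illustrative, not part of the Bahir source; the case class should live at the top level of the file so Spark can derive an Encoder for it):

case class Airport(_id: String, airportName: String)

// ... inside a test body of CloudantSparkSQLSuite:
import testImplicits._
val airports = spark.read.format("org.apache.bahir.cloudant")
  .load("n_airportcodemapping")
  .select("_id", "airportName")
  .as[Airport]   // uses the implicit product Encoder supplied by SQLImplicits
assert(airports.filter(_.airportName != null).count() > 0)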
Example 6
Source File: MLlibTestSparkContext.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.mllib.util

import java.io.File

import org.scalatest.Suite

import org.apache.spark.SparkContext
import org.apache.spark.ml.util.TempDirectory
import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}
import org.apache.spark.util.Utils

trait MLlibTestSparkContext extends TempDirectory { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var checkpointDir: String = _

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder
      .master("local[2]")
      .appName("MLlibUnitTest")
      .getOrCreate()
    sc = spark.sparkContext

    checkpointDir = Utils.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString
    sc.setCheckpointDir(checkpointDir)
  }

  override def afterAll() {
    try {
      Utils.deleteRecursively(new File(checkpointDir))
      SparkSession.clearActiveSession()
      if (spark != null) {
        spark.stop()
      }
      spark = null
    } finally {
      super.afterAll()
    }
  }

  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = self.spark.sqlContext
  }
}
Example 7
Source File: MLlibTestSparkContext.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.mllib.util

import java.io.File

import org.scalatest.Suite

import org.apache.spark.SparkContext
import org.apache.spark.ml.util.TempDirectory
import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}
import org.apache.spark.util.Utils

trait MLlibTestSparkContext extends TempDirectory { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var checkpointDir: String = _

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder
      .master("local[2]")
      .appName("MLlibUnitTest")
      .getOrCreate()
    sc = spark.sparkContext

    checkpointDir = Utils.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString
    sc.setCheckpointDir(checkpointDir)
  }

  override def afterAll() {
    try {
      Utils.deleteRecursively(new File(checkpointDir))
      SparkSession.clearActiveSession()
      if (spark != null) {
        spark.stop()
      }
      spark = null
    } finally {
      super.afterAll()
    }
  }

  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = self.spark.sqlContext
  }
}
Example 8
Source File: HiveTestTrait.scala From cloud-integration with Apache License 2.0
package org.apache.spark.sql.sources

import java.io.File

import com.cloudera.spark.cloud.ObjectStoreConfigurations
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}
import org.apache.spark.sql.hive.test.TestHiveContext
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.Utils

trait HiveTestTrait extends SparkFunSuite with BeforeAndAfterAll {
  // override protected val enableAutoThreadAudit = false
  protected var hiveContext: HiveInstanceForTests = _
  protected var spark: SparkSession = _

  protected override def beforeAll(): Unit = {
    super.beforeAll()
    // set up spark and hive context
    hiveContext = new HiveInstanceForTests()
    spark = hiveContext.sparkSession
  }

  protected override def afterAll(): Unit = {
    try {
      SparkSession.clearActiveSession()

      if (hiveContext != null) {
        hiveContext.reset()
        hiveContext = null
      }
      if (spark != null) {
        spark.close()
        spark = null
      }
    } finally {
      super.afterAll()
    }
  }
}

class HiveInstanceForTests
  extends TestHiveContext(
    new SparkContext(
      System.getProperty("spark.sql.test.master", "local[1]"),
      "TestSQLContext",
      new SparkConf()
        .setAll(ObjectStoreConfigurations.RW_TEST_OPTIONS)
        .set("spark.sql.warehouse.dir", TestSetup.makeWarehouseDir().toURI.getPath)
    )
  ) {
}

object TestSetup {

  def makeWarehouseDir(): File = {
    val warehouseDir = Utils.createTempDir(namePrefix = "warehouse")
    warehouseDir.delete()
    warehouseDir
  }
}
Example 9
Source File: MLlibTestSparkContext.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.util

import java.io.File

import org.apache.spark.SparkContext
import org.apache.spark.sql.types.UDTRegistration
import org.apache.spark.sql.{SQLContext, SQLImplicits, SparkSession}
import org.apache.spark.util.{SparkUtil, Utils}
import org.scalatest.Suite

trait MLlibTestSparkContext extends TempDirectory { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var checkpointDir: String = _

  override def beforeAll() {
    super.beforeAll()
    SparkUtil.UDTRegister("org.apache.spark.linalg.Vector", "org.apache.spark.linalg.VectorUDT")
    SparkUtil.UDTRegister("org.apache.spark.linalg.DenseVector", "org.apache.spark.linalg.VectorUDT")
    SparkUtil.UDTRegister("org.apache.spark.linalg.SparseVector", "org.apache.spark.linalg.VectorUDT")
    SparkUtil.UDTRegister("org.apache.spark.linalg.Matrix", "org.apache.spark.linalg.MatrixUDT")
    SparkUtil.UDTRegister("org.apache.spark.linalg.DenseMatrix", "org.apache.spark.linalg.MatrixUDT")
    SparkUtil.UDTRegister("org.apache.spark.linalg.SparseMatrix", "org.apache.spark.linalg.MatrixUDT")
    spark = SparkSession.builder
      .master("local[2]")
      .appName("MLlibUnitTest")
      .getOrCreate()
    sc = spark.sparkContext

    checkpointDir = SparkUtil.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString
    sc.setCheckpointDir(checkpointDir)
  }

  override def afterAll() {
    try {
      SparkUtil.deleteRecursively(new File(checkpointDir))
      SparkSession.clearActiveSession()
      if (spark != null) {
        spark.stop()
      }
      spark = null
    } finally {
      super.afterAll()
    }
  }

  /**
   * A helper object for importing SQL implicits.
   *
   * Note that the alternative of importing `spark.implicits._` is not possible here.
   * This is because we create the `SQLContext` immediately before the first test is run,
   * but the implicits import is needed in the constructor.
   */
  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = self.spark.sqlContext
  }
}
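The Scaladoc on testImplicits captures the core reason this pattern exists: import spark.implicits._ needs a stable, already-initialized reference, whereas SQLImplicits defers the SQLContext lookup until an implicit is actually used inside a test body. A hypothetical illustration of the difference (the suite name, ScalaTest FunSuite base class, and test body are illustrative):

class MyEstimatorSuite extends org.scalatest.FunSuite with MLlibTestSparkContext {
  // Not possible here: `spark` is a var (not a stable identifier) and is still
  // null while the constructor runs.
  // import spark.implicits._

  // Works: _sqlContext is only evaluated when a conversion is first needed,
  // i.e. inside a test body, after beforeAll() has created the session.
  import testImplicits._

  test("build a small training DataFrame") {
    val df = Seq((0.0, 1.0), (1.0, 2.0)).toDF("label", "feature")
    assert(df.count() == 2)
  }
}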