com.datastax.spark.connector.cql.CassandraConnector Scala Examples

The following examples show how to use com.datastax.spark.connector.cql.CassandraConnector. Each example is an excerpt from an open-source project; the source file, project, and license are noted above each listing.
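
Before the project examples, here is a minimal sketch of the pattern most of them share: build a CassandraConnector from the Spark configuration and run CQL inside withSessionDo, which borrows a session from the connector's pool and releases it when the block finishes. The application name, keyspace, and statement below are illustrative.

import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.spark.sql.SparkSession

object ConnectorSketch extends App {
  val spark = SparkSession.builder
    .appName("connector-sketch")
    .getOrCreate()

  // Build a connector from the SparkConf (reads the spark.cassandra.connection.* settings).
  val connector = CassandraConnector(spark.sparkContext.getConf)

  // withSessionDo borrows a pooled session, runs the block, and releases the session afterwards.
  connector.withSessionDo { session =>
    session.execute(
      "CREATE KEYSPACE IF NOT EXISTS demo WITH replication = " +
        "{'class': 'SimpleStrategy', 'replication_factor': 1}")
  }

  spark.stop()
}
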
Example 1
Source File: WriteRead.scala    From SparkBuildExamples   with Apache License 2.0
package com.datastax.spark.example

import com.datastax.spark.connector._
import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.cassandra._

// When running on DSE it is not necessary to set connection parameters or spark.master
// (they are configured automatically).
object WriteRead extends App {

  val spark = SparkSession.builder
    .appName("Datastax Scala example")
    .enableHiveSupport()
    .getOrCreate()

  import spark.implicits._

  // Create keyspace and table
  CassandraConnector(spark.sparkContext).withSessionDo { session =>
    session.execute(
      """CREATE KEYSPACE IF NOT EXISTS ks WITH
        | replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }""".stripMargin)
    session.execute("""CREATE TABLE IF NOT EXISTS ks.kv (k int, v int, PRIMARY KEY (k))""")
  }

  // Write some data
  spark.range(1, 10)
    .map(x => (x, x))
    .rdd
    .saveToCassandra("ks", "kv")

  // Read data as RDD
  val rdd = spark.sparkContext
    .cassandraTable(keyspace = "ks", table = "kv")

  // Read data as DataSet (DataFrame)
  val dataset = spark.read
    .cassandraFormat(keyspace = "ks", table = "kv")
    .load()

  println("Data read as RDD")
  rdd.collect()
    .foreach(println)

  println("Data read as DataSet (DataFrame)")
  dataset.collect()
    .foreach(println)

  spark.stop()
  sys.exit(0)
} 
Example 2
Source File: CassandraDriver.scala    From Spark-Structured-Streaming-Examples   with Apache License 2.0
package cassandra

import org.apache.spark.sql._
import com.datastax.spark.connector._
import com.datastax.spark.connector.cql.CassandraConnector
import kafka.KafkaService
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper
import foreachSink._
import log.LazyLogger

object CassandraDriver extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()
  import spark.implicits._

  val connector = CassandraConnector(SparkHelper.getSparkSession().sparkContext.getConf)

  val namespace = "structuredstreaming"
  val foreachTableSink = "radio"
  val StreamProviderTableSink = "radioothersink"
  val kafkaMetadata = "kafkametadata"
  def getTestInfo() = {
    val rdd = spark.sparkContext.cassandraTable(namespace, kafkaMetadata)

    if( !rdd.isEmpty ) {
      log.warn(rdd.count)
      log.warn(rdd.first)
    } else {
      log.warn(s"$namespace, $kafkaMetadata is empty in cassandra")
    }
  }

  def transformKafkaMetadataArrayToJson(array: Array[CassandraRow]) : String = {
      s"""{"${KafkaService.topicName}":
          {
           "${array(0).getLong("partition")}": ${array(0).getLong("offset")}
          }
         }
      """.replaceAll("\n", "").replaceAll(" ", "")
  }

  def debug() = {
   val output = spark.sparkContext.cassandraTable(namespace, foreachTableSink)

    log.warn(output.count)
  }
} 
Example 3
Source File: CassandraOutputTest.scala    From sparta   with Apache License 2.0
package com.stratio.sparta.plugin.output.cassandra

import java.io.{Serializable => JSerializable}

import com.datastax.spark.connector.cql.CassandraConnector
import com.stratio.sparta.sdk._
import com.stratio.sparta.sdk.properties.JsoneyString
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.mock.MockitoSugar
import org.scalatest.{FlatSpec, Matchers}

@RunWith(classOf[JUnitRunner])
class CassandraOutputTest extends FlatSpec with Matchers with MockitoSugar with AnswerSugar {

  val s = "sum"
  val properties = Map(("connectionHost", "127.0.0.1"), ("connectionPort", "9042"))

  "getSparkConfiguration" should "return a Seq with the configuration" in {
    val configuration = Map(("connectionHost", "127.0.0.1"), ("connectionPort", "9042"))
    val cass = CassandraOutput.getSparkConfiguration(configuration)

    cass should be(List(("spark.cassandra.connection.host", "127.0.0.1"), ("spark.cassandra.connection.port", "9042")))
  }

  "getSparkConfiguration" should "return all cassandra-spark config" in {
    val config: Map[String, JSerializable] = Map(
      ("sparkProperties" -> JsoneyString(
        "[{\"sparkPropertyKey\":\"spark.cassandra.input.fetch.size_in_rows\",\"sparkPropertyValue\":\"2000\"}," +
          "{\"sparkPropertyKey\":\"spark.cassandra.input.split.size_in_mb\",\"sparkPropertyValue\":\"64\"}]")),
      ("anotherProperty" -> "true")
    )

    val sparkConfig = CassandraOutput.getSparkConfiguration(config)

    sparkConfig.exists(_ == ("spark.cassandra.input.fetch.size_in_rows" -> "2000")) should be(true)
    sparkConfig.exists(_ == ("spark.cassandra.input.split.size_in_mb" -> "64")) should be(true)
    sparkConfig.exists(_ == ("anotherProperty" -> "true")) should be(false)
  }

  "getSparkConfiguration" should "not return cassandra-spark config" in {
    val config: Map[String, JSerializable] = Map(
      ("hadoopProperties" -> JsoneyString(
        "[{\"sparkPropertyKey\":\"spark.cassandra.input.fetch.size_in_rows\",\"sparkPropertyValue\":\"2000\"}," +
          "{\"sparkPropertyKey\":\"spark.cassandra.input.split.size_in_mb\",\"sparkPropertyValue\":\"64\"}]")),
      ("anotherProperty" -> "true")
    )

    val sparkConfig = CassandraOutput.getSparkConfiguration(config)

    sparkConfig.exists(_ == ("spark.cassandra.input.fetch.size_in_rows" -> "2000")) should be(false)
    sparkConfig.exists(_ == ("spark.cassandra.input.split.size_in_mb" -> "64")) should be(false)
    sparkConfig.exists(_ == ("anotherProperty" -> "true")) should be(false)
  }
} 
Example 4
Source File: L6-20CassandraConnector.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

import com.datastax.spark.connector.SomeColumns
import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.streaming.toDStreamFunctions
import com.datastax.spark.connector.toNamedColumnRef

object CassandraConnectorSinkApp {

  def main(args: Array[String]) {
    if (args.length != 6) {
      System.err.println(
        "Usage: CassandraConnectorSinkApp <appname> <cassandraHost> <cassandraPort> <keyspace> <tableName> <columnName>")
      System.exit(1)
    }

    val Seq(appName, cassandraHost, cassandraPort, keyspace, tableName, columnName) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      .set("spark.cassandra.connection.host", cassandraHost)
      .set("spark.cassandra.connection.port", cassandraPort)

    val batchInterval = 10
    val windowSize = 20
    val slideInterval = 10

    val ssc = new StreamingContext(conf, Seconds(batchInterval))

    CassandraConnector(conf).withSessionDo { session =>
      session.execute(s"CREATE KEYSPACE IF NOT EXISTS $keyspace WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }")
      session.execute(s"CREATE TABLE IF NOT EXISTS $keyspace.$tableName (key TEXT PRIMARY KEY, $columnName FLOAT)")
    }

    HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env",
      interval = batchInterval)
      .flatMap(rec => {
        implicit val formats = DefaultFormats
        val query = parse(rec) \ "query"
        ((query \ "results" \ "quote").children)
          .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat))
      })
      .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval))
      .map(stock => (stock._1, stock._2 / (windowSize / batchInterval)))
      .saveToCassandra(keyspace, tableName)

    ssc.start()
    ssc.awaitTermination()
  }
} 
Example 5
Source File: SparkStressImplicits.scala    From spark-cassandra-stress   with Apache License 2.0
package com.datastax.sparkstress

import org.apache.spark.rdd.RDD
import com.datastax.bdp.spark.writer.BulkTableWriter._
import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.writer.RowWriterFactory

import scala.collection.JavaConverters._

object SparkStressImplicits {

  def bulkSaveToCassandra[T: RowWriterFactory](rdd: RDD[T], keyspace: String, table: String): Unit = {
    rdd.bulkSaveToCassandra(keyspace, table)
  }

  def clusterSize(connector: CassandraConnector): Int = {
    connector.withClusterDo(_.getMetadata.getAllHosts.size)
  }

  def getLocalDC(cc: CassandraConnector): String = {
      val hostsInProvidedDC = cc.hosts
      cc.withClusterDo(cluster =>
        cluster
          .getMetadata
          .getAllHosts.asScala
          .find( node => hostsInProvidedDC.contains(node.getAddress))
          .map(_.getDatacenter)
          .getOrElse("Analytics")
      )
  }
} 
Example 6
Source File: SparkStressImplicits.scala    From spark-cassandra-stress   with Apache License 2.0
package com.datastax.sparkstress

import java.net.InetSocketAddress

import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.writer.RowWriterFactory
import org.apache.spark.rdd.RDD

import scala.collection.JavaConverters._

object SparkStressImplicits {

  def bulkSaveToCassandra[T: RowWriterFactory](rdd: RDD[T], keyspace: String, table: String): Unit = {
    // bulk save was removed in 6.9
    throw new UnsupportedOperationException
  }

  def clusterSize(connector: CassandraConnector): Int = {
    connector.withSessionDo(_.getMetadata.getNodes.size())
  }

  def getLocalDC(connector: CassandraConnector): String = {
    val hostsInProvidedDC = connector.hosts
    connector.withSessionDo(
      _.getMetadata
        .getNodes
        .values()
        .asScala
        .find(node => hostsInProvidedDC.contains(node.getEndPoint.resolve().asInstanceOf[InetSocketAddress]))
        .map(_.getDatacenter)
        .getOrElse("Analytics")
    )
  }
} 
Example 7
Source File: StreamingTask.scala    From spark-cassandra-stress   with Apache License 2.0
package com.datastax.sparkstress

import java.util.concurrent.TimeUnit

import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.streaming._
import com.datastax.sparkstress.RowGenerator.PerfRowGenerator
import com.datastax.sparkstress.RowTypes._
import com.datastax.sparkstress.SparkStressImplicits._
import com.datastax.sparkstress.StressTask._
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{StreamingContext, _}

import scala.reflect.ClassTag

abstract class StreamingTask[rowType](
  val config: Config,
  val ss: SparkSession)
(implicit ct:ClassTag[rowType]) extends StressTask {

  val ssc = new StreamingContext(ss.sparkContext, Seconds(config.streamingBatchIntervalSeconds))
  val opsPerBatch = (config.numReceivers * config.receiverThroughputPerBatch)
  val estimatedReqRuntime: Long = ((config.totalOps / opsPerBatch) * config.streamingBatchIntervalSeconds) + 10
  val terminationTime: Long = {
    if (config.terminationTimeMinutes == 0) {
      estimatedReqRuntime
    } else {
      val newTerminationTime: Long = TimeUnit.MINUTES.toSeconds(config.terminationTimeMinutes)
      if (estimatedReqRuntime <= newTerminationTime) {
        println(s"Using the estimated runtime (${estimatedReqRuntime} secs) required to stream ${config.totalOps} ops since it is <= the requested runtime (${newTerminationTime} secs).")
        estimatedReqRuntime
      } else {
        println(s"Converting requested runtime of ${config.terminationTimeMinutes} min to ${newTerminationTime} secs.")
        newTerminationTime
      }
    }
  }

  def setupCQL() = {
    val cc = CassandraConnector(ss.sparkContext.getConf)
    cc.withSessionDo { session =>
      if (config.deleteKeyspace) {
        println(s"Destroying Keyspace")
        session.execute(s"DROP KEYSPACE IF EXISTS ${config.keyspace}")
      }
      val kscql = getKeyspaceCql(config.keyspace, getLocalDC(cc), config.replicationFactor)
      val tbcql = getTableCql(config.table)
      println( s"""Running the following create statements\n$kscql\n${tbcql.mkString("\n")}""")
      session.execute(kscql)
      session.execute(s"USE ${config.keyspace}")
      for (cql <- tbcql)
        session.execute(cql)
    }
    printf("Done Setting up CQL Keyspace/Table\n")
  }

  def getTableCql(tbName: String): Seq[String]

  // `generator` is defined elsewhere in the original class; its definition is omitted from this excerpt.
  override def getGenerator: RowGenerator[PerfRowClass] = generator

  override def dstreamOps(dstream: DStream[PerfRowClass]): Unit = dstream.saveToCassandra(config.keyspace, config.table)
} 
Example 8
Source File: WriteReadSpec.scala    From SparkBuildExamples   with Apache License 2.0
package com.datastax.spark.example


import com.datastax.spark.connector._
import com.datastax.spark.connector.embedded._
import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.embedded.{EmbeddedCassandra, SparkTemplate}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{FlatSpec, Matchers}

case class KVRow (k: Int, v: Int)


@RunWith(classOf[JUnitRunner])
class WriteReadSpec extends FlatSpec with EmbeddedCassandra with SparkTemplate with Matchers{
  override def clearCache(): Unit = CassandraConnector.evictCache()

  //Sets up CassandraConfig and SparkContext
  useCassandraConfig(Seq(YamlTransformations.Default))
  useSparkConf(defaultConf)

  val connector = CassandraConnector(defaultConf)
  val ksName = "test"
  val tableName = "kv"

  "We" should "be able to access our Embedded Cassandra Node" in {
    connector
      .withSessionDo(session => session.execute("SELECT * FROM system_schema.tables"))
      .all() should not be empty
  }

  it should "be able to do some work with Spark" in {
    sc.parallelize(1 to 10).count should be (10)
  }

  it should "be able to do some work with Cassandra and Spark" in {
    val testData = (1 to 1000).map(value => KVRow(value, value))
    connector.withSessionDo{ session =>
      session.execute(s"CREATE KEYSPACE IF NOT EXISTS $ksName WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }")
      session.execute(s"CREATE TABLE IF NOT EXISTS $ksName.$tableName (k int, v int, PRIMARY KEY (k))")
      sc.parallelize(testData).saveToCassandra(ksName, tableName)
      val results = sc.cassandraTable[KVRow](ksName, tableName).collect()
      results should contain theSameElementsAs testData
    }
  }

} 
Example 9
Source File: SparkCassSSTableLoaderClientManager.scala    From Spark2Cassandra   with Apache License 2.0
package com.github.jparkie.spark.cassandra.client

import java.net.InetAddress

import com.datastax.spark.connector.cql.{ AuthConf, CassandraConnector }
import com.github.jparkie.spark.cassandra.conf.SparkCassServerConf
import org.apache.spark.Logging

import scala.collection.mutable

private[cassandra] trait SparkCassSSTableLoaderClientManager extends Serializable with Logging {
  case class SessionKey(
    hosts:               Set[InetAddress],
    port:                Int,
    authConf:            AuthConf,
    sparkCassServerConf: SparkCassServerConf
  ) extends Serializable

  @transient
  private[client] val internalClients = mutable.HashMap.empty[SessionKey, SparkCassSSTableLoaderClient]

  private[client] def buildSessionKey(
    cassandraConnector:  CassandraConnector,
    sparkCassServerConf: SparkCassServerConf
  ): SessionKey = {
    SessionKey(cassandraConnector.hosts, cassandraConnector.port, cassandraConnector.authConf, sparkCassServerConf)
  }

  private[client] def buildClient(
    cassandraConnector:  CassandraConnector,
    sparkCassServerConf: SparkCassServerConf
  ): SparkCassSSTableLoaderClient = {
    val newSession = cassandraConnector.openSession()

    logInfo(s"Created SSTableLoaderClient to the following Cassandra nodes: ${cassandraConnector.hosts}")

    val sparkCassSSTableLoaderClient = new SparkCassSSTableLoaderClient(newSession, sparkCassServerConf)

    sys.addShutdownHook {
      logInfo("Closed Cassandra Session for SSTableLoaderClient.")

      sparkCassSSTableLoaderClient.stop()
    }

    sparkCassSSTableLoaderClient
  }

  private[cassandra] def evictAll(): Unit = synchronized {
    internalClients.values.foreach(_.stop())
    internalClients.clear()
  }
}

object SparkCassSSTableLoaderClientManager extends SparkCassSSTableLoaderClientManager 
Example 10
Source File: StreamingDemo.scala    From spark-streaming-demo   with Apache License 2.0
package com.datastax.examples.meetup

import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

// NOTE: this excerpt omits the enclosing object and the definitions of `conf`,
// `CassandraKeyspace` and `CassandraTable` that the method below relies on; the
// object name and the values given here are hypothetical placeholders so the
// snippet is self-contained.
object StreamingDemo {
  val conf = new SparkConf().setAppName("StreamingDemo")
  val CassandraKeyspace = "meetup"      // hypothetical keyspace name
  val CassandraTable = "event_counts"   // hypothetical table name

  def createSchema(): Unit = {
    CassandraConnector(conf).withSessionDo { session =>
      session.execute(s"DROP KEYSPACE IF EXISTS $CassandraKeyspace")
      session.execute(s"CREATE KEYSPACE IF NOT EXISTS $CassandraKeyspace WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }")
      session.execute(s"""
             CREATE TABLE IF NOT EXISTS $CassandraKeyspace.$CassandraTable (
                event text,
                interval text,
                dimension text,
                subtotal counter,
                PRIMARY KEY((event, interval), dimension)
            ) WITH CLUSTERING ORDER BY (dimension ASC)
           """)
    }
  }
} 
Example 11
Source File: CassandraServerSpecLike.scala    From Spark2Cassandra   with Apache License 2.0
package com.github.jparkie.spark.cassandra

import java.net.{ InetAddress, InetSocketAddress }

import com.datastax.driver.core.Session
import com.datastax.spark.connector.cql.CassandraConnector
import org.cassandraunit.utils.EmbeddedCassandraServerHelper
import org.scalatest.{ BeforeAndAfterAll, Suite }

trait CassandraServerSpecLike extends BeforeAndAfterAll { this: Suite =>
  // Remove protected modifier because of SharedSparkContext.
  override def beforeAll(): Unit = {
    super.beforeAll()

    EmbeddedCassandraServerHelper.startEmbeddedCassandra()
  }

  // Remove protected modifier because of SharedSparkContext.
  override def afterAll(): Unit = {
    EmbeddedCassandraServerHelper.cleanEmbeddedCassandra()

    super.afterAll()
  }

  def getClusterName: String = {
    EmbeddedCassandraServerHelper.getClusterName
  }

  def getHosts: Set[InetAddress] = {
    val temporaryAddress =
      new InetSocketAddress(EmbeddedCassandraServerHelper.getHost, EmbeddedCassandraServerHelper.getNativeTransportPort)
        .getAddress

    Set(temporaryAddress)
  }

  def getNativeTransportPort: Int = {
    EmbeddedCassandraServerHelper.getNativeTransportPort
  }

  def getRpcPort: Int = {
    EmbeddedCassandraServerHelper.getRpcPort
  }

  def getCassandraConnector: CassandraConnector = {
    CassandraConnector(hosts = getHosts, port = getNativeTransportPort)
  }

  def createKeyspace(session: Session, keyspace: String): Unit = {
    session.execute(
      s"""CREATE KEYSPACE "$keyspace"
          |WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };
       """.stripMargin
    )
  }
} 
Example 12
Source File: SparkCassRDDFunctions.scala    From Spark2Cassandra   with Apache License 2.0
package com.github.jparkie.spark.cassandra.rdd

import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.mapper.ColumnMapper
import com.datastax.spark.connector.writer.{ DefaultRowWriter, RowWriterFactory }
import com.datastax.spark.connector.{ AllColumns, ColumnSelector }
import com.github.jparkie.spark.cassandra.SparkCassBulkWriter
import com.github.jparkie.spark.cassandra.conf.{ SparkCassServerConf, SparkCassWriteConf }
import org.apache.spark.rdd.RDD

import scala.reflect.runtime.universe._

// NOTE: the original file defines this method inside a wrapper class that
// provides `rdd` and `internalSparkContext`; that declaration is omitted from
// this excerpt. The wrapper below is a plausible, hedged reconstruction.
class SparkCassRDDFunctions[T: ColumnMapper: TypeTag](val rdd: RDD[T]) extends Serializable {

  private[rdd] val internalSparkContext = rdd.sparkContext

  def bulkLoadToCass(
    keyspaceName:        String,
    tableName:           String,
    columns:             ColumnSelector      = AllColumns,
    sparkCassWriteConf:  SparkCassWriteConf  = SparkCassWriteConf.fromSparkConf(internalSparkContext.getConf),
    sparkCassServerConf: SparkCassServerConf = SparkCassServerConf.fromSparkConf(internalSparkContext.getConf)
  )(implicit
    connector: CassandraConnector = CassandraConnector(internalSparkContext.getConf),
    rwf: RowWriterFactory[T] = DefaultRowWriter.factory[T]): Unit = {
    val sparkCassBulkWriter = SparkCassBulkWriter(
      connector,
      keyspaceName,
      tableName,
      columns,
      sparkCassWriteConf,
      sparkCassServerConf
    )

    internalSparkContext.runJob(rdd, sparkCassBulkWriter.write _)
  }
} 
Example 13
Source File: SparkCassDataFrameFunctions.scala    From Spark2Cassandra   with Apache License 2.0
package com.github.jparkie.spark.cassandra.sql

import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.writer.{ RowWriterFactory, SqlRowWriter }
import com.datastax.spark.connector.{ AllColumns, ColumnSelector }
import com.github.jparkie.spark.cassandra.SparkCassBulkWriter
import com.github.jparkie.spark.cassandra.conf.{ SparkCassServerConf, SparkCassWriteConf }
import org.apache.spark.sql.{ DataFrame, Row }

// NOTE: the original file defines this method inside a wrapper class that
// provides `dataFrame` and `internalSparkContext`; that declaration is omitted
// from this excerpt. The wrapper below is a plausible, hedged reconstruction.
class SparkCassDataFrameFunctions(val dataFrame: DataFrame) extends Serializable {

  private[sql] val internalSparkContext = dataFrame.sqlContext.sparkContext

  def bulkLoadToCass(
    keyspaceName:        String,
    tableName:           String,
    columns:             ColumnSelector      = AllColumns,
    sparkCassWriteConf:  SparkCassWriteConf  = SparkCassWriteConf.fromSparkConf(internalSparkContext.getConf),
    sparkCassServerConf: SparkCassServerConf = SparkCassServerConf.fromSparkConf(internalSparkContext.getConf)
  )(implicit
    connector: CassandraConnector = CassandraConnector(internalSparkContext.getConf),
    rwf: RowWriterFactory[Row] = SqlRowWriter.Factory): Unit = {
    val sparkCassBulkWriter = SparkCassBulkWriter(
      connector,
      keyspaceName,
      tableName,
      columns,
      sparkCassWriteConf,
      sparkCassServerConf
    )

    internalSparkContext.runJob(dataFrame.rdd, sparkCassBulkWriter.write _)
  }
}