com.datastax.spark.connector.cql.CassandraConnector Scala Examples
The following examples show how to use com.datastax.spark.connector.cql.CassandraConnector.
Each example is taken from an open-source project; the original source file and project are noted above it.
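All of the examples below share the same basic pattern: build a CassandraConnector from the Spark configuration, then run CQL statements against a pooled session via withSessionDo. As a minimal sketch of that pattern (the object name, keyspace, table, and connection settings here are placeholders, not taken from any of the projects below):

import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.spark.SparkConf

object ConnectorSketch extends App {
  // Placeholder connection settings; adjust to your cluster.
  val conf = new SparkConf()
    .setAppName("connector-sketch")
    .set("spark.cassandra.connection.host", "127.0.0.1")

  // CassandraConnector manages session pooling; withSessionDo borrows a
  // session, runs the block, and returns the block's result.
  val rowCount: Int = CassandraConnector(conf).withSessionDo { session =>
    session.execute(
      "CREATE KEYSPACE IF NOT EXISTS demo WITH replication = " +
        "{'class': 'SimpleStrategy', 'replication_factor': 1}")
    session.execute("CREATE TABLE IF NOT EXISTS demo.kv (k int PRIMARY KEY, v int)")
    session.execute("SELECT * FROM demo.kv").all().size
  }

  println(s"demo.kv currently holds $rowCount rows")
}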
Example 1
Source File: WriteRead.scala from SparkBuildExamples with Apache License 2.0
package com.datastax.spark.example

import com.datastax.spark.connector._
import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.cassandra._

// For DSE it is not necessary to set connection parameters for spark.master (since it will be done
// automatically)
object WriteRead extends App {

  val spark = SparkSession.builder
    .appName("Datastax Scala example")
    .enableHiveSupport()
    .getOrCreate()

  import spark.implicits._

  // Create keyspace and table
  CassandraConnector(spark.sparkContext).withSessionDo { session =>
    session.execute(
      """CREATE KEYSPACE IF NOT EXISTS ks WITH
        | replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }""".stripMargin)
    session.execute("""CREATE TABLE IF NOT EXISTS ks.kv (k int, v int, PRIMARY KEY (k))""")
  }

  // Write some data
  spark.range(1, 10)
    .map(x => (x, x))
    .rdd
    .saveToCassandra("ks", "kv")

  // Read data as RDD
  val rdd = spark.sparkContext
    .cassandraTable(keyspace = "ks", table = "kv")

  // Read data as DataSet (DataFrame)
  val dataset = spark.read
    .cassandraFormat(keyspace = "ks", table = "kv")
    .load()

  println("Data read as RDD")
  rdd.collect()
    .foreach(println)

  println("Data read as DataSet (DataFrame)")
  dataset.collect()
    .foreach(println)

  spark.stop()
  sys.exit(0)
}
Example 2
Source File: CassandraDriver.scala from Spark-Structured-Streaming-Examples with Apache License 2.0
package cassandra

import org.apache.spark.sql._
import com.datastax.spark.connector._
import com.datastax.spark.connector.cql.CassandraConnector
import kafka.KafkaService
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper
import foreachSink._
import log.LazyLogger

object CassandraDriver extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()
  import spark.implicits._

  val connector = CassandraConnector(SparkHelper.getSparkSession().sparkContext.getConf)

  val namespace = "structuredstreaming"
  val foreachTableSink = "radio"
  val StreamProviderTableSink = "radioothersink"
  val kafkaMetadata = "kafkametadata"

  def getTestInfo() = {
    val rdd = spark.sparkContext.cassandraTable(namespace, kafkaMetadata)

    if ( !rdd.isEmpty ) {
      log.warn(rdd.count)
      log.warn(rdd.first)
    } else {
      log.warn(s"$namespace, $kafkaMetadata is empty in cassandra")
    }
  }

  def transformKafkaMetadataArrayToJson(array: Array[CassandraRow]): String = {
    s"""{"${KafkaService.topicName}":
          {
            "${array(0).getLong("partition")}": ${array(0).getLong("offset")}
          }
        }
     """.replaceAll("\n", "").replaceAll(" ", "")
  }

  def debug() = {
    val output = spark.sparkContext.cassandraTable(namespace, foreachTableSink)

    log.warn(output.count)
  }
}
Example 3
Source File: CassandraOutputTest.scala from sparta with Apache License 2.0
package com.stratio.sparta.plugin.output.cassandra

import java.io.{Serializable => JSerializable}

import com.datastax.spark.connector.cql.CassandraConnector
import com.stratio.sparta.sdk._
import com.stratio.sparta.sdk.properties.JsoneyString
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.mock.MockitoSugar
import org.scalatest.{FlatSpec, Matchers}

@RunWith(classOf[JUnitRunner])
class CassandraOutputTest extends FlatSpec with Matchers with MockitoSugar with AnswerSugar {

  val s = "sum"
  val properties = Map(("connectionHost", "127.0.0.1"), ("connectionPort", "9042"))

  "getSparkConfiguration" should "return a Seq with the configuration" in {
    val configuration = Map(("connectionHost", "127.0.0.1"), ("connectionPort", "9042"))
    val cass = CassandraOutput.getSparkConfiguration(configuration)

    cass should be(List(("spark.cassandra.connection.host", "127.0.0.1"),
      ("spark.cassandra.connection.port", "9042")))
  }

  "getSparkConfiguration" should "return all cassandra-spark config" in {
    val config: Map[String, JSerializable] = Map(
      ("sparkProperties" -> JsoneyString(
        "[{\"sparkPropertyKey\":\"spark.cassandra.input.fetch.size_in_rows\",\"sparkPropertyValue\":\"2000\"}," +
          "{\"sparkPropertyKey\":\"spark.cassandra.input.split.size_in_mb\",\"sparkPropertyValue\":\"64\"}]")),
      ("anotherProperty" -> "true")
    )

    val sparkConfig = CassandraOutput.getSparkConfiguration(config)

    sparkConfig.exists(_ == ("spark.cassandra.input.fetch.size_in_rows" -> "2000")) should be(true)
    sparkConfig.exists(_ == ("spark.cassandra.input.split.size_in_mb" -> "64")) should be(true)
    sparkConfig.exists(_ == ("anotherProperty" -> "true")) should be(false)
  }

  "getSparkConfiguration" should "not return cassandra-spark config" in {
    val config: Map[String, JSerializable] = Map(
      ("hadoopProperties" -> JsoneyString(
        "[{\"sparkPropertyKey\":\"spark.cassandra.input.fetch.size_in_rows\",\"sparkPropertyValue\":\"2000\"}," +
          "{\"sparkPropertyKey\":\"spark.cassandra.input.split.size_in_mb\",\"sparkPropertyValue\":\"64\"}]")),
      ("anotherProperty" -> "true")
    )

    val sparkConfig = CassandraOutput.getSparkConfiguration(config)

    sparkConfig.exists(_ == ("spark.cassandra.input.fetch.size_in_rows" -> "2000")) should be(false)
    sparkConfig.exists(_ == ("spark.cassandra.input.split.size_in_mb" -> "64")) should be(false)
    sparkConfig.exists(_ == ("anotherProperty" -> "true")) should be(false)
  }
}
Example 4
Source File: L6-20CassandraConnector.scala from prosparkstreaming with Apache License 2.0
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

import com.datastax.spark.connector.SomeColumns
import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.streaming.toDStreamFunctions
import com.datastax.spark.connector.toNamedColumnRef

object CassandraConnectorSinkApp {

  def main(args: Array[String]) {
    if (args.length != 6) {
      System.err.println(
        "Usage: CassandraConnectorSinkApp <appname> <cassandraHost> <cassandraPort> <keyspace> <tableName> <columnName>")
      System.exit(1)
    }

    val Seq(appName, cassandraHost, cassandraPort, keyspace, tableName, columnName) = args.toSeq
    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      .set("spark.cassandra.connection.host", cassandraHost)
      .set("spark.cassandra.connection.port", cassandraPort)
    val batchInterval = 10
    val windowSize = 20
    val slideInterval = 10

    val ssc = new StreamingContext(conf, Seconds(batchInterval))

    CassandraConnector(conf).withSessionDo { session =>
      session.execute(s"CREATE KEYSPACE IF NOT EXISTS %s WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }".format(keyspace))
      session.execute(s"CREATE TABLE IF NOT EXISTS %s.%s (key TEXT PRIMARY KEY, %s FLOAT)".format(keyspace, tableName, columnName))
    }

    HttpUtils.createStream(ssc,
      url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env",
      interval = batchInterval)
      .flatMap(rec => {
        implicit val formats = DefaultFormats
        val query = parse(rec) \ "query"
        ((query \ "results" \ "quote").children)
          .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat))
      })
      .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval))
      .map(stock => (stock._1, stock._2 / (windowSize / batchInterval)))
      .saveToCassandra(keyspace, tableName)

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 5
Source File: SparkStressImplicits.scala from spark-cassandra-stress with Apache License 2.0
package com.datastax.sparkstress

import org.apache.spark.rdd.RDD
import com.datastax.bdp.spark.writer.BulkTableWriter._
import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.writer.RowWriterFactory

import scala.collection.JavaConverters._

object SparkStressImplicits {

  def bulkSaveToCassandra[T: RowWriterFactory](rdd: RDD[T], keyspace: String, table: String): Unit = {
    rdd.bulkSaveToCassandra(keyspace, table)
  }

  def clusterSize(connector: CassandraConnector): Int = {
    connector.withClusterDo(_.getMetadata.getAllHosts.size)
  }

  def getLocalDC(cc: CassandraConnector): String = {
    val hostsInProvidedDC = cc.hosts
    cc.withClusterDo(cluster =>
      cluster
        .getMetadata
        .getAllHosts.asScala
        .find(node => hostsInProvidedDC.contains(node.getAddress))
        .map(_.getDatacenter)
        .getOrElse("Analytics")
    )
  }
}
Example 6
Source File: SparkStressImplicits.scala from spark-cassandra-stress with Apache License 2.0
package com.datastax.sparkstress

import java.net.InetSocketAddress

import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.writer.RowWriterFactory
import org.apache.spark.rdd.RDD

import scala.collection.JavaConverters._

object SparkStressImplicits {

  def bulkSaveToCassandra[T: RowWriterFactory](rdd: RDD[T], keyspace: String, table: String): Unit = {
    // bulk save was removed in 6.9
    throw new UnsupportedOperationException
  }

  def clusterSize(connector: CassandraConnector): Int = {
    connector.withSessionDo(_.getMetadata.getNodes.size())
  }

  def getLocalDC(connector: CassandraConnector): String = {
    val hostsInProvidedDC = connector.hosts
    connector.withSessionDo(
      _.getMetadata
        .getNodes
        .values()
        .asScala
        .find(node => hostsInProvidedDC.contains(node.getEndPoint.resolve().asInstanceOf[InetSocketAddress]))
        .map(_.getDatacenter)
        .getOrElse("Analytics")
    )
  }
}
Example 7
Source File: StreamingTask.scala from spark-cassandra-stress with Apache License 2.0
package com.datastax.sparkstress

import java.util.concurrent.TimeUnit

import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.streaming._
import com.datastax.sparkstress.RowGenerator.PerfRowGenerator
import com.datastax.sparkstress.RowTypes._
import com.datastax.sparkstress.SparkStressImplicits._
import com.datastax.sparkstress.StressTask._
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{StreamingContext, _}

import scala.reflect.ClassTag

abstract class StreamingTask[rowType](
  val config: Config,
  val ss: SparkSession)
  (implicit ct: ClassTag[rowType]) extends StressTask {

  val ssc = new StreamingContext(ss.sparkContext, Seconds(config.streamingBatchIntervalSeconds))
  val opsPerBatch = (config.numReceivers * config.receiverThroughputPerBatch)
  val estimatedReqRuntime: Long = ((config.totalOps / opsPerBatch) * config.streamingBatchIntervalSeconds) + 10
  val terminationTime: Long = {
    if (config.terminationTimeMinutes == 0) {
      estimatedReqRuntime
    } else {
      val newTerminationTime: Long = TimeUnit.MINUTES.toSeconds(config.terminationTimeMinutes)
      if (estimatedReqRuntime <= newTerminationTime) {
        println(s"Using the estimated runtime (${estimatedReqRuntime} secs) required to stream ${config.totalOps} since it is <= the requested runtime (${newTerminationTime} secs).")
        estimatedReqRuntime
      } else {
        println(s"Converting requested runtime of ${config.terminationTimeMinutes} min to ${newTerminationTime} secs.")
        newTerminationTime
      }
    }
  }

  def setupCQL() = {
    val cc = CassandraConnector(ss.sparkContext.getConf)
    cc.withSessionDo { session =>
      if (config.deleteKeyspace) {
        println(s"Destroying Keyspace")
        session.execute(s"DROP KEYSPACE IF EXISTS ${config.keyspace}")
      }
      val kscql = getKeyspaceCql(config.keyspace, getLocalDC(cc), config.replicationFactor)
      val tbcql = getTableCql(config.table)
      println(s"""Running the following create statements\n$kscql\n${tbcql.mkString("\n")}""")
      session.execute(kscql)
      session.execute(s"USE ${config.keyspace}")
      for (cql <- tbcql) session.execute(cql)
    }
    printf("Done Setting up CQL Keyspace/Table\n")
  }

  def getTableCql(tbName: String): Seq[String]

  override def getGenerator: RowGenerator[PerfRowClass] = generator

  override def dstreamOps(dstream: DStream[PerfRowClass]): Unit = dstream.saveToCassandra(config.keyspace, config.table)
}
Example 8
Source File: WriteReadSpec.scala from SparkBuildExamples with Apache License 2.0
package com.datastax.spark.example

import com.datastax.spark.connector._
import com.datastax.spark.connector.embedded._
import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.embedded.{EmbeddedCassandra, SparkTemplate}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{FlatSpec, Matchers}

case class KVRow(k: Int, v: Int)

@RunWith(classOf[JUnitRunner])
class WriteReadSpec extends FlatSpec with EmbeddedCassandra with SparkTemplate with Matchers {

  override def clearCache(): Unit = CassandraConnector.evictCache()

  // Sets up CassandraConfig and SparkContext
  useCassandraConfig(Seq(YamlTransformations.Default))
  useSparkConf(defaultConf)

  val connector = CassandraConnector(defaultConf)

  val ksName = "test"
  val tableName = "kv"

  "We" should "be able to access our Embedded Cassandra Node" in {
    connector
      .withSessionDo(session => session.execute("SELECT * FROM system_schema.tables"))
      .all() should not be empty
  }

  it should "be able to do some work with Spark" in {
    sc.parallelize(1 to 10).count should be (10)
  }

  it should "be able to do some work with Cassandra and Spark" in {
    val testData = (1 to 1000).map(value => KVRow(value, value))
    connector.withSessionDo { session =>
      session.execute(s"CREATE KEYSPACE IF NOT EXISTS $ksName WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }")
      session.execute(s"CREATE TABLE IF NOT EXISTS $ksName.$tableName (k int, v int, PRIMARY KEY (k))")
      sc.parallelize(testData).saveToCassandra(ksName, tableName)
      val results = sc.cassandraTable[KVRow](ksName, tableName).collect()
      results should contain theSameElementsAs testData
    }
  }
}
Example 9
Source File: SparkCassSSTableLoaderClientManager.scala from Spark2Cassandra with Apache License 2.0
package com.github.jparkie.spark.cassandra.client

import java.net.InetAddress

import com.datastax.spark.connector.cql.{ AuthConf, CassandraConnector }
import com.github.jparkie.spark.cassandra.conf.SparkCassServerConf
import org.apache.spark.Logging

import scala.collection.mutable

private[cassandra] trait SparkCassSSTableLoaderClientManager extends Serializable with Logging {
  case class SessionKey(
    hosts: Set[InetAddress],
    port: Int,
    authConf: AuthConf,
    sparkCassServerConf: SparkCassServerConf
  ) extends Serializable

  @transient
  private[client] val internalClients = mutable.HashMap.empty[SessionKey, SparkCassSSTableLoaderClient]

  private[client] def buildSessionKey(
    cassandraConnector: CassandraConnector,
    sparkCassServerConf: SparkCassServerConf
  ): SessionKey = {
    SessionKey(cassandraConnector.hosts, cassandraConnector.port, cassandraConnector.authConf, sparkCassServerConf)
  }

  private[client] def buildClient(
    cassandraConnector: CassandraConnector,
    sparkCassServerConf: SparkCassServerConf
  ): SparkCassSSTableLoaderClient = {
    val newSession = cassandraConnector.openSession()

    logInfo(s"Created SSTableLoaderClient to the following Cassandra nodes: ${cassandraConnector.hosts}")

    val sparkCassSSTableLoaderClient = new SparkCassSSTableLoaderClient(newSession, sparkCassServerConf)

    sys.addShutdownHook {
      logInfo("Closed Cassandra Session for SSTableLoaderClient.")

      sparkCassSSTableLoaderClient.stop()
    }

    sparkCassSSTableLoaderClient
  }

  private[cassandra] def evictAll(): Unit = synchronized {
    internalClients.values.foreach(_.stop())
    internalClients.clear()
  }
}

object SparkCassSSTableLoaderClientManager extends SparkCassSSTableLoaderClientManager
Example 10
Source File: StreamingDemo.scala from spark-streaming-demo with Apache License 2.0
package com.datastax.examples.meetup

import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

  // Excerpt: `conf`, `CassandraKeyspace`, and `CassandraTable` are members of the
  // enclosing object, which is elided in this listing.
  def createSchema(): Unit = {
    CassandraConnector(conf).withSessionDo { session =>
      session.execute(s"DROP KEYSPACE IF EXISTS $CassandraKeyspace")
      session.execute(s"CREATE KEYSPACE IF NOT EXISTS $CassandraKeyspace WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }")
      session.execute(s"""
           CREATE TABLE IF NOT EXISTS $CassandraKeyspace.$CassandraTable (
             event text,
             interval text,
             dimension text,
             subtotal counter,
             PRIMARY KEY((event, interval), dimension)
           ) WITH CLUSTERING ORDER BY (dimension ASC)
         """)
    }
  }
} // closes the elided enclosing object
Example 11
Source File: CassandraServerSpecLike.scala from Spark2Cassandra with Apache License 2.0
package com.github.jparkie.spark.cassandra

import java.net.{ InetAddress, InetSocketAddress }

import com.datastax.driver.core.Session
import com.datastax.spark.connector.cql.CassandraConnector
import org.cassandraunit.utils.EmbeddedCassandraServerHelper
import org.scalatest.{ BeforeAndAfterAll, Suite }

trait CassandraServerSpecLike extends BeforeAndAfterAll { this: Suite =>
  // Remove protected modifier because of SharedSparkContext.
  override def beforeAll(): Unit = {
    super.beforeAll()

    EmbeddedCassandraServerHelper.startEmbeddedCassandra()
  }

  // Remove protected modifier because of SharedSparkContext.
  override def afterAll(): Unit = {
    EmbeddedCassandraServerHelper.cleanEmbeddedCassandra()

    super.afterAll()
  }

  def getClusterName: String = {
    EmbeddedCassandraServerHelper.getClusterName
  }

  def getHosts: Set[InetAddress] = {
    val temporaryAddress =
      new InetSocketAddress(EmbeddedCassandraServerHelper.getHost, EmbeddedCassandraServerHelper.getNativeTransportPort)
        .getAddress

    Set(temporaryAddress)
  }

  def getNativeTransportPort: Int = {
    EmbeddedCassandraServerHelper.getNativeTransportPort
  }

  def getRpcPort: Int = {
    EmbeddedCassandraServerHelper.getRpcPort
  }

  def getCassandraConnector: CassandraConnector = {
    CassandraConnector(hosts = getHosts, port = getNativeTransportPort)
  }

  def createKeyspace(session: Session, keyspace: String): Unit = {
    session.execute(
      s"""CREATE KEYSPACE "$keyspace"
         |WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };
       """.stripMargin
    )
  }
}
Example 12
Source File: SparkCassRDDFunctions.scala from Spark2Cassandra with Apache License 2.0
package com.github.jparkie.spark.cassandra.rdd

import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.mapper.ColumnMapper
import com.datastax.spark.connector.writer.{ DefaultRowWriter, RowWriterFactory }
import com.datastax.spark.connector.{ AllColumns, ColumnSelector }
import com.github.jparkie.spark.cassandra.SparkCassBulkWriter
import com.github.jparkie.spark.cassandra.conf.{ SparkCassServerConf, SparkCassWriteConf }
import org.apache.spark.rdd.RDD

import scala.reflect.runtime.universe._

  // Excerpt: the enclosing wrapper class (which provides the type parameter `T`,
  // the wrapped `rdd`, and `internalSparkContext`) is elided in this listing.
  def bulkLoadToCass(
    keyspaceName: String,
    tableName: String,
    columns: ColumnSelector = AllColumns,
    sparkCassWriteConf: SparkCassWriteConf = SparkCassWriteConf.fromSparkConf(internalSparkContext.getConf),
    sparkCassServerConf: SparkCassServerConf = SparkCassServerConf.fromSparkConf(internalSparkContext.getConf)
  )(implicit
    connector: CassandraConnector = CassandraConnector(internalSparkContext.getConf),
    rwf: RowWriterFactory[T] = DefaultRowWriter.factory[T]): Unit = {
    val sparkCassBulkWriter = SparkCassBulkWriter(
      connector,
      keyspaceName,
      tableName,
      columns,
      sparkCassWriteConf,
      sparkCassServerConf
    )

    internalSparkContext.runJob(rdd, sparkCassBulkWriter.write _)
  }
} // closes the elided enclosing class
Example 13
Source File: SparkCassDataFrameFunctions.scala from Spark2Cassandra with Apache License 2.0
package com.github.jparkie.spark.cassandra.sql

import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.writer.{ RowWriterFactory, SqlRowWriter }
import com.datastax.spark.connector.{ AllColumns, ColumnSelector }
import com.github.jparkie.spark.cassandra.SparkCassBulkWriter
import com.github.jparkie.spark.cassandra.conf.{ SparkCassServerConf, SparkCassWriteConf }
import org.apache.spark.sql.{ DataFrame, Row }

  // Excerpt: the enclosing wrapper class (which provides the wrapped `dataFrame`
  // and `internalSparkContext`) is elided in this listing.
  def bulkLoadToCass(
    keyspaceName: String,
    tableName: String,
    columns: ColumnSelector = AllColumns,
    sparkCassWriteConf: SparkCassWriteConf = SparkCassWriteConf.fromSparkConf(internalSparkContext.getConf),
    sparkCassServerConf: SparkCassServerConf = SparkCassServerConf.fromSparkConf(internalSparkContext.getConf)
  )(implicit
    connector: CassandraConnector = CassandraConnector(internalSparkContext.getConf),
    rwf: RowWriterFactory[Row] = SqlRowWriter.Factory): Unit = {
    val sparkCassBulkWriter = SparkCassBulkWriter(
      connector,
      keyspaceName,
      tableName,
      columns,
      sparkCassWriteConf,
      sparkCassServerConf
    )

    internalSparkContext.runJob(dataFrame.rdd, sparkCassBulkWriter.write _)
  }
} // closes the elided enclosing class
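The two bulkLoadToCass excerpts above are method bodies lifted out of their enclosing wrapper classes, so in application code they are invoked as extension methods on an RDD or a DataFrame. The following is a hedged sketch of such a call site, assuming the packages shown above expose the wrappers as implicits (the import paths mirror those package names); the case class, keyspace, and table names are placeholders:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

// Assumed to bring the implicit wrapper classes containing bulkLoadToCass into scope.
import com.github.jparkie.spark.cassandra.rdd._
import com.github.jparkie.spark.cassandra.sql._

object BulkLoadSketch {
  final case class KV(k: Int, v: Int)

  // Placeholder keyspace and table names; both calls hand the data to
  // SparkCassBulkWriter, which streams SSTables instead of issuing normal writes.
  def load(rdd: RDD[KV], df: DataFrame): Unit = {
    rdd.bulkLoadToCass(keyspaceName = "ks", tableName = "kv")
    df.bulkLoadToCass(keyspaceName = "ks", tableName = "kv")
  }
}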