org.apache.spark.sql.execution.streaming.Source Scala Examples
The following examples show how to use org.apache.spark.sql.execution.streaming.Source.
The original project, source file, and license are noted above each example.
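As background for the examples below: a streaming Source is instantiated by a StreamSourceProvider and implements schema, getOffset, getBatch, and stop. The following minimal sketch is not taken from any of the projects below; the package, class, and column names are illustrative only, and it shows only the shape of the Spark 2.x internal API that the real examples build on.

package com.example.streaming // hypothetical package, for illustration only

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Source}
import org.apache.spark.sql.sources.StreamSourceProvider
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class ExampleSourceProvider extends StreamSourceProvider {

  private val defaultSchema = StructType(StructField("value", StringType) :: Nil)

  // Spark calls this first to resolve the schema of the stream.
  override def sourceSchema(
      sqlContext: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) =
    ("example", schema.getOrElse(defaultSchema))

  // Spark then calls this once per query to instantiate the Source.
  override def createSource(
      sqlContext: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    val resolvedSchema = schema.getOrElse(defaultSchema)
    new Source {
      override def schema: StructType = resolvedSchema

      // Highest offset currently available.
      override def getOffset: Option[Offset] = Some(LongOffset(0))

      // Data between the exclusive start and inclusive end offsets, as a DataFrame.
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import sqlContext.implicits._
        Seq.empty[String].toDF("value")
      }

      override def stop(): Unit = ()
    }
  }
}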
Example 1
Source File: RedisStreamProvider.scala From spark-redis with BSD 3-Clause "New" or "Revised" License
package org.apache.spark.sql.redis.stream

import com.redislabs.provider.redis.util.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class RedisStreamProvider extends DataSourceRegister with StreamSourceProvider with Logging {

  override def shortName(): String = "redis"

  override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType],
                            providerName: String,
                            parameters: Map[String, String]): (String, StructType) = {
    providerName -> schema.getOrElse {
      StructType(Seq(StructField("_id", StringType)))
    }
  }

  override def createSource(sqlContext: SQLContext, metadataPath: String,
                            schema: Option[StructType], providerName: String,
                            parameters: Map[String, String]): Source = {
    val (_, ss) = sourceSchema(sqlContext, schema, providerName, parameters)
    val source = new RedisSource(sqlContext, metadataPath, Some(ss), parameters)
    source.start()
    source
  }
}
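Because the provider registers the short name "redis" via DataSourceRegister, a streaming query can refer to it by that name; when no schema is supplied, sourceSchema falls back to a single _id column. A hedged usage sketch follows; the "stream.keys" option key and the stream name are assumptions about spark-redis configuration, not taken from the listing above.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("redis-stream-example")
  .master("local[*]")
  .getOrCreate()

// "redis" is the short name registered by RedisStreamProvider above.
val events = spark.readStream
  .format("redis")
  .option("stream.keys", "my-stream") // assumed option key
  .load()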
Example 2
Source File: BlockingSource.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
}
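BlockingSource is a test helper: createSource blocks on the shared CountDownLatch, so a test can hold a streaming query at source-creation time and release it deliberately. The sketch below shows one way a test might drive it, assuming a SparkSession named spark is in scope; the checkpoint path is an illustrative value.

import java.util.concurrent.CountDownLatch

// Hold source creation until the test is ready.
BlockingSource.latch = new CountDownLatch(1)

val query = spark.readStream
  .format(classOf[BlockingSource].getName) // fully qualified class name as the format
  .load()
  .writeStream
  .format(classOf[BlockingSource].getName)
  .option("checkpointLocation", "/tmp/blocking-source-checkpoint") // illustrative path
  .start()

// ... assert on the query state while createSource is still blocked ...

BlockingSource.latch.countDown() // let createSource proceed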
Example 3
Source File: MockSourceProvider.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.streaming.util

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.StreamSourceProvider
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class MockSourceProvider extends StreamSourceProvider {
  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", MockSourceProvider.fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    MockSourceProvider.sourceProviderFunction()
  }
}

object MockSourceProvider {
  // Function to generate sources. May provide multiple sources if the user implements such a
  // function.
  private var sourceProviderFunction: () => Source = _

  final val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  def withMockSources(source: Source, otherSources: Source*)(f: => Unit): Unit = {
    var i = 0
    val sources = source +: otherSources
    sourceProviderFunction = () => {
      val source = sources(i % sources.length)
      i += 1
      source
    }
    try {
      f
    } finally {
      sourceProviderFunction = null
    }
  }
}
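The companion object's withMockSources installs the Source instances that createSource hands back, cycling through them, and clears the function when the block finishes. A hedged sketch of how a test might use it, assuming a SparkSession named spark and a pre-built hypothetical Source named mySource:

// Install a pre-built Source for the duration of the block.
MockSourceProvider.withMockSources(mySource) {
  val df = spark.readStream
    .format(classOf[MockSourceProvider].getName)
    .load()
  // run assertions against df here
}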
Example 4
Source File: BlockingSource.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
}
Example 5
Source File: DefaultSource.scala From spark-bigquery with Apache License 2.0
package com.samelamin.spark.bigquery

import com.google.cloud.hadoop.io.bigquery.BigQueryStrings
import com.samelamin.spark.bigquery.converters.SchemaConverters
import com.samelamin.spark.bigquery.streaming.{BigQuerySink, BigQuerySource}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.{Sink, Source}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.sources.RelationProvider

class DefaultSource extends StreamSinkProvider with StreamSourceProvider with RelationProvider {

  override def createSink(sqlContext: SQLContext, parameters: Map[String, String],
                          partitionColumns: Seq[String], outputMode: OutputMode): Sink = {
    val path = parameters.get("transaction_log").getOrElse("transaction_log")
    new BigQuerySink(sqlContext.sparkSession, path, parameters)
  }

  def getConvertedSchema(sqlContext: SQLContext, options: Map[String, String]): StructType = {
    val bigqueryClient = BigQueryClient.getInstance(sqlContext)
    val tableReference = BigQueryStrings.parseTableReference(options.get("tableReferenceSource").get)
    SchemaConverters.BQToSQLSchema(bigqueryClient.getTableSchema(tableReference))
  }

  override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType],
                            providerName: String,
                            options: Map[String, String]): (String, StructType) = {
    val convertedSchema = getConvertedSchema(sqlContext, options)
    ("bigquery", schema.getOrElse(convertedSchema))
  }

  override def createSource(sqlContext: SQLContext, metadataPath: String,
                            schema: Option[StructType], providerName: String,
                            parameters: Map[String, String]): Source = {
    new BigQuerySource(sqlContext, schema, parameters)
  }

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BigQueryRelation = {
    val tableName = parameters.get("tableReferenceSource").get
    new BigQueryRelation(tableName)(sqlContext)
  }
}
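For the streaming read path, createSource builds a BigQuerySource from the options map, and both sourceSchema and createRelation read the table reference from the "tableReferenceSource" option. A hedged usage sketch follows, assuming a SparkSession named spark; the fully qualified format name (the package containing this DefaultSource) and the table reference value are assumptions.

val df = spark.readStream
  .format("com.samelamin.spark.bigquery") // assumed: package of the DefaultSource above
  .option("tableReferenceSource", "my-project:my_dataset.my_table") // illustrative value
  .load()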
Example 6
Source File: SqsSourceProvider.scala From bahir with Apache License 2.0
package org.apache.spark.sql.streaming.sqs

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
import org.apache.spark.sql.types.StructType

class SqsSourceProvider extends DataSourceRegister
  with StreamSourceProvider
  with Logging {

  override def shortName(): String = "s3-sqs"

  override def sourceSchema(sqlContext: SQLContext,
                            schema: Option[StructType],
                            providerName: String,
                            parameters: Map[String, String]): (String, StructType) = {
    require(schema.isDefined, "Sqs source doesn't support empty schema")
    (shortName(), schema.get)
  }

  override def createSource(sqlContext: SQLContext,
                            metadataPath: String,
                            schema: Option[StructType],
                            providerName: String,
                            parameters: Map[String, String]): Source = {
    new SqsSource(
      sqlContext.sparkSession,
      metadataPath,
      parameters,
      schema.get)
  }
}
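Note the require call: unlike the built-in file sources, this source insists on a user-supplied schema. The sketch below shows the general shape of a query against it, assuming a SparkSession named spark; the option keys ("sqsUrl", "fileFormat") and their values are assumptions about the connector's configuration, not taken from the listing above.

import org.apache.spark.sql.types.{StringType, StructField, StructType}

val fileSchema = StructType(StructField("value", StringType) :: Nil) // illustrative schema

val df = spark.readStream
  .format("s3-sqs")       // short name registered above
  .schema(fileSchema)     // mandatory, per the require(...) check
  .option("sqsUrl", "https://sqs.us-east-1.amazonaws.com/123456789012/my-queue") // assumed key
  .option("fileFormat", "json")                                                  // assumed key
  .load()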
Example 7
Source File: HDFSMQTTSourceProvider.scala From bahir with Apache License 2.0
package org.apache.spark.sql.mqtt

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
import org.apache.spark.sql.types.StructType

import org.apache.bahir.sql.streaming.mqtt.{MQTTStreamConstants, MQTTUtils}

class HDFSMQTTSourceProvider extends StreamSourceProvider with DataSourceRegister with Logging {

  override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType],
      providerName: String, parameters: Map[String, String]): (String, StructType) = {
    ("hdfs-mqtt", MQTTStreamConstants.SCHEMA_DEFAULT)
  }

  override def createSource(sqlContext: SQLContext, metadataPath: String,
      schema: Option[StructType], providerName: String,
      parameters: Map[String, String]): Source = {
    val parsedResult = MQTTUtils.parseConfigParams(parameters)
    new HdfsBasedMQTTStreamSource(
      sqlContext,
      metadataPath,
      parsedResult._1, // brokerUrl
      parsedResult._2, // clientId
      parsedResult._3, // topic
      parsedResult._5, // mqttConnectionOptions
      parsedResult._6, // qos
      parsedResult._7, // maxBatchMessageNum
      parsedResult._8, // maxBatchMessageSize
      parsedResult._9  // maxRetryNum
    )
  }

  override def shortName(): String = "hdfs-mqtt"
}

object HDFSMQTTSourceProvider {
  val SEP = "##"
}
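MQTTUtils.parseConfigParams turns the user-supplied options map into the positional constructor arguments of HdfsBasedMQTTStreamSource, while the schema is fixed by MQTTStreamConstants.SCHEMA_DEFAULT. A hedged usage sketch follows, assuming a SparkSession named spark; the "brokerUrl" and "topic" option keys are assumptions about what parseConfigParams expects.

val df = spark.readStream
  .format("hdfs-mqtt")                         // short name registered above
  .option("brokerUrl", "tcp://localhost:1883") // assumed option key
  .option("topic", "sensors/temperature")      // assumed option key
  .load()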
Example 8
Source File: BlockingSource.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
}
Example 9
Source File: BlockingSource.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
}
Example 10
Source File: MockSourceProvider.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.streaming.util

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.StreamSourceProvider
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class MockSourceProvider extends StreamSourceProvider {
  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", MockSourceProvider.fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    MockSourceProvider.sourceProviderFunction()
  }
}

object MockSourceProvider {
  // Function to generate sources. May provide multiple sources if the user implements such a
  // function.
  private var sourceProviderFunction: () => Source = _

  final val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  def withMockSources(source: Source, otherSources: Source*)(f: => Unit): Unit = {
    var i = 0
    val sources = source +: otherSources
    sourceProviderFunction = () => {
      val source = sources(i % sources.length)
      i += 1
      source
    }
    try {
      f
    } finally {
      sourceProviderFunction = null
    }
  }
}
Example 11
Source File: CurrentEventsByPersistenceIdQuerySourceProvider.scala From apache-spark-test with Apache License 2.0
package akka.persistence.jdbc.spark.sql.execution.streaming

import org.apache.spark.sql.execution.streaming.{ LongOffset, Offset, Source }
import org.apache.spark.sql.sources.{ DataSourceRegister, StreamSourceProvider }
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{ SQLContext, _ }

object CurrentEventsByPersistenceIdQuerySourceProvider {
  val name = "current-events-by-persistence-id"
}

class CurrentEventsByPersistenceIdQuerySourceProvider extends StreamSourceProvider with DataSourceRegister with Serializable {
  override def sourceSchema(
    sqlContext: SQLContext,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): (String, StructType) = {
    println(s"[CurrentEventsByPersistenceIdQuerySourceProvider.sourceSchema]: schema: $schema, providerName: $providerName, parameters: $parameters")
    CurrentEventsByPersistenceIdQuerySourceProvider.name -> schema.get
  }

  override def createSource(
    sqlContext: SQLContext,
    metadataPath: String,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): Source = {
    val eventMapperFQCN: String = parameters.get("event-mapper") match {
      case Some(_eventMapper) => _eventMapper
      case _                  => throw new RuntimeException("No event mapper FQCN")
    }

    val pid = (parameters.get("pid"), parameters.get("persistence-id")) match {
      case (Some(pid), _) => pid
      case (_, Some(pid)) => pid
      case _              => throw new RuntimeException("No persistence_id")
    }

    new CurrentEventsByPersistenceIdQuerySourceImpl(sqlContext, parameters("path"), eventMapperFQCN, pid, schema.get)
  }

  override def shortName(): String = CurrentEventsByPersistenceIdQuerySourceProvider.name
}

class CurrentEventsByPersistenceIdQuerySourceImpl(val sqlContext: SQLContext, val readJournalPluginId: String, eventMapperFQCN: String, persistenceId: String, override val schema: StructType)
    extends Source with ReadJournalSource {

  override def getOffset: Option[Offset] = {
    val offset = maxEventsByPersistenceId(persistenceId)
    println("[CurrentEventsByPersistenceIdQuery]: Returning maximum offset: " + offset)
    Some(LongOffset(offset))
  }

  override def getBatch(_start: Option[Offset], _end: Offset): DataFrame = {
    val (start, end) = getStartEnd(_start, _end)
    val df: DataFrame = eventsByPersistenceId(persistenceId, start, end, eventMapperFQCN)
    println(s"[CurrentEventsByPersistenceIdQuery]: Getting currentPersistenceIds from start: $start, end: $end, DataFrame.count: ${df.count}")
    df
  }
}
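The provider resolves the persistence id from either the "pid" or "persistence-id" option, the mapper class from "event-mapper", and passes "path" through as the read-journal plugin id; note that sourceSchema calls schema.get, so a user-supplied schema is required. A hedged usage sketch follows, assuming a SparkSession named spark; the schema, plugin id, and mapper class name are illustrative values.

import org.apache.spark.sql.types.{StringType, StructField, StructType}

val eventSchema = StructType(StructField("event", StringType) :: Nil) // illustrative schema

val events = spark.readStream
  .format("current-events-by-persistence-id")
  .schema(eventSchema)                                  // required by schema.get above
  .option("path", "jdbc-read-journal")                  // read journal plugin id (illustrative)
  .option("pid", "account-1")                           // or "persistence-id"
  .option("event-mapper", "com.example.MyEventMapper")  // hypothetical EventMapper FQCN
  .load()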
Example 12
Source File: ReadJournalSource.scala From apache-spark-test with Apache License 2.0
package akka.persistence.jdbc.spark.sql.execution.streaming

import akka.actor.{ ActorSystem, ExtendedActorSystem }
import akka.persistence.query.PersistenceQuery
import akka.persistence.query.scaladsl.{ CurrentEventsByPersistenceIdQuery, CurrentEventsByTagQuery, CurrentPersistenceIdsQuery, ReadJournal }
import akka.stream.scaladsl.Sink
import akka.stream.scaladsl.extension.{ Sink => Snk }
import akka.stream.{ ActorMaterializer, Materializer }
import org.apache.spark.sql._
import org.apache.spark.sql.execution.streaming.{ LongOffset, Offset, Source }
import org.apache.spark.sql.types.StructType

import scala.collection.immutable._
import scala.concurrent.duration.{ FiniteDuration, _ }
import scala.concurrent.{ Await, ExecutionContext, Future }

trait ReadJournalSource { _: Source =>
  def readJournalPluginId: String
  def sqlContext: SQLContext

  // some machinery
  implicit val system: ActorSystem = ActorSystem()
  implicit val mat: Materializer = ActorMaterializer()
  implicit val ec: ExecutionContext = system.dispatcher

  // read journal, only interested in the Current queries, as Spark isn't asynchronous
  lazy val readJournal = PersistenceQuery(system).readJournalFor(readJournalPluginId)
    .asInstanceOf[ReadJournal with CurrentPersistenceIdsQuery with CurrentEventsByPersistenceIdQuery with CurrentEventsByTagQuery]

  implicit class FutureOps[A](f: Future[A])(implicit ec: ExecutionContext, timeout: FiniteDuration = null) {
    def futureValue: A = Await.result(f, Option(timeout).getOrElse(10.seconds))
  }

  def maxPersistenceIds: Long =
    readJournal.currentPersistenceIds().runWith(Snk.count).futureValue

  def persistenceIds(start: Long, end: Long) =
    readJournal.currentPersistenceIds().drop(start).take(end).runWith(Sink.seq).futureValue

  def maxEventsByPersistenceId(pid: String): Long =
    readJournal.currentEventsByPersistenceId(pid, 0, Long.MaxValue).runWith(Snk.count).futureValue

  def eventsByPersistenceId(pid: String, start: Long, end: Long, eventMapperFQCN: String): Seq[Row] = {
    readJournal.currentEventsByPersistenceId(pid, start, end)
      .map(env => getMapper(eventMapperFQCN).get.row(env, sqlContext)).runWith(Sink.seq).futureValue
  }

  implicit def mapToDataFrame(rows: Seq[Row]): DataFrame = {
    import scala.collection.JavaConversions._
    sqlContext.createDataFrame(rows, schema)
  }

  def getStartEnd(_start: Option[Offset], _end: Offset): (Long, Long) = (_start, _end) match {
    case (Some(LongOffset(start)), LongOffset(end)) => (start, end)
    case (None, LongOffset(end))                    => (0L, end)
  }

  def getMapper(eventMapperFQCN: String): Option[EventMapper] =
    system.asInstanceOf[ExtendedActorSystem].dynamicAccess.createInstanceFor[EventMapper](eventMapperFQCN, List.empty)
      .recover { case cause => cause.printStackTrace(); null }.toOption

  override def stop(): Unit = {
    println("Stopping jdbc read journal")
    system.terminate()
  }
}
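The trait looks an EventMapper up by fully qualified class name and calls row(env, sqlContext) on every event envelope; the EventMapper trait itself is not part of this listing. The following is only a hedged sketch of what an implementation might look like, inferred from that call site (the real trait definition in the project may declare a different signature).

import akka.persistence.query.EventEnvelope
import org.apache.spark.sql.{Row, SQLContext}

// Hypothetical mapper, shaped after the getMapper(...).get.row(env, sqlContext) call site above.
class MyEventMapper extends EventMapper {
  def row(envelope: EventEnvelope, sqlContext: SQLContext): Row =
    Row(envelope.persistenceId, envelope.sequenceNr, envelope.event.toString)
}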
Example 13
Source File: CurrentPersistenceIdsQuerySourceProvider.scala From apache-spark-test with Apache License 2.0
package akka.persistence.jdbc.spark.sql.execution.streaming

import org.apache.spark.sql.execution.streaming.{ LongOffset, Offset, Source }
import org.apache.spark.sql.sources.{ DataSourceRegister, StreamSourceProvider }
import org.apache.spark.sql.types.{ StringType, StructField, StructType }
import org.apache.spark.sql.{ SQLContext, _ }

object CurrentPersistenceIdsQuerySourceProvider {
  val name = "current-persistence-id"
  val schema: StructType = StructType(Array(
    StructField("persistence_id", StringType, nullable = false)
  ))
}

class CurrentPersistenceIdsQuerySourceProvider extends StreamSourceProvider with DataSourceRegister with Serializable {
  override def sourceSchema(
    sqlContext: SQLContext,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): (String, StructType) = {
    CurrentPersistenceIdsQuerySourceProvider.name -> CurrentPersistenceIdsQuerySourceProvider.schema
  }

  override def createSource(
    sqlContext: SQLContext,
    metadataPath: String,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): Source = {
    new CurrentPersistenceIdsQuerySourceImpl(sqlContext, parameters("path"))
  }

  override def shortName(): String = CurrentPersistenceIdsQuerySourceProvider.name
}

class CurrentPersistenceIdsQuerySourceImpl(val sqlContext: SQLContext, val readJournalPluginId: String)
    extends Source with ReadJournalSource {

  override def schema: StructType = CurrentPersistenceIdsQuerySourceProvider.schema

  override def getOffset: Option[Offset] = {
    val offset = maxPersistenceIds
    println("[CurrentPersistenceIdsQuery]: Returning maximum offset: " + offset)
    Some(LongOffset(offset))
  }

  override def getBatch(_start: Option[Offset], _end: Offset): DataFrame = {
    val (start, end) = getStartEnd(_start, _end)
    println(s"[CurrentPersistenceIdsQuery]: Getting currentPersistenceIds from start: $start, end: $end")
    import sqlContext.implicits._
    persistenceIds(start, end).toDF()
  }
}
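This provider fixes the schema to a single persistence_id column, so no user schema is needed; only the "path" option, used as the read-journal plugin id, has to be supplied. A hedged usage sketch, assuming a SparkSession named spark; the plugin id value is illustrative.

val persistenceIds = spark.readStream
  .format("current-persistence-id")    // short name registered above
  .option("path", "jdbc-read-journal") // read journal plugin id (illustrative value)
  .load()                              // yields a single persistence_id column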