org.apache.spark.sql.sources.StreamSourceProvider Scala Examples
The following examples show how to use org.apache.spark.sql.sources.StreamSourceProvider.
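For orientation before the examples: a custom StreamSourceProvider is addressed from a streaming query either through the short name it registers via DataSourceRegister or through its fully qualified class name. The sketch below is a minimal, hypothetical illustration of that wiring; the format name and option key are placeholders and are not taken from any of the projects listed here.

// Minimal sketch (hypothetical format name and option key) of how Structured
// Streaming picks up a custom StreamSourceProvider: Spark resolves the value
// passed to format() via DataSourceRegister.shortName() or a fully qualified
// class name, calls sourceSchema() while resolving load(), and calls
// createSource() when the query starts.
import org.apache.spark.sql.SparkSession

object CustomSourceUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("custom-source-demo").getOrCreate()

    val df = spark.readStream
      .format("my-source")                  // shortName() or fully qualified provider class name
      .option("some.option", "some-value")  // forwarded to createSource() as `parameters`
      .load()

    df.writeStream
      .format("console")
      .start()
      .awaitTermination()
  }
}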
Example 1
Source File: RedisStreamProvider.scala (from spark-redis, BSD 3-Clause "New" or "Revised" License)
package org.apache.spark.sql.redis.stream

import com.redislabs.provider.redis.util.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class RedisStreamProvider extends DataSourceRegister with StreamSourceProvider with Logging {

  override def shortName(): String = "redis"

  override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType],
                            providerName: String,
                            parameters: Map[String, String]): (String, StructType) = {
    providerName -> schema.getOrElse {
      StructType(Seq(StructField("_id", StringType)))
    }
  }

  override def createSource(sqlContext: SQLContext, metadataPath: String,
                            schema: Option[StructType], providerName: String,
                            parameters: Map[String, String]): Source = {
    val (_, ss) = sourceSchema(sqlContext, schema, providerName, parameters)
    val source = new RedisSource(sqlContext, metadataPath, Some(ss), parameters)
    source.start()
    source
  }
}
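Since the provider registers the short name "redis", a streaming read against it could look roughly like the spark-shell style sketch below, with spark being the active SparkSession. The stream.keys option name is an assumption modelled on typical spark-redis usage, not something defined in the file above.

// Sketch only: the "stream.keys" option name is an assumption; it is not part of
// RedisStreamProvider itself.
val sensors = spark.readStream
  .format("redis")                  // resolved to RedisStreamProvider via shortName()
  .option("stream.keys", "sensors") // assumed option: Redis stream key(s) to consume
  .load()                           // with no .schema(...), sourceSchema() falls back to a single "_id" column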
Example 2
Source File: BlockingSource.scala (from XSQL, Apache License 2.0)
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
}
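In Spark's test suites this class is driven by its fully qualified class name, and a query blocks inside createSource until the latch is released. The spark-shell style sketch below illustrates that pattern; the checkpoint path is a placeholder and spark is assumed to be the active SparkSession.

// Sketch of driving BlockingSource in a test: createSource() blocks until the
// latch is counted down, so the query only makes progress after countDown().
import java.util.concurrent.CountDownLatch
import org.apache.spark.sql.streaming.util.BlockingSource

BlockingSource.latch = new CountDownLatch(1)

val query = spark.readStream
  .format("org.apache.spark.sql.streaming.util.BlockingSource")
  .load()
  .writeStream
  .format("org.apache.spark.sql.streaming.util.BlockingSource")
  .option("checkpointLocation", "/tmp/blocking-source-checkpoint") // placeholder path
  .start()

BlockingSource.latch.countDown() // unblock createSource() so the query can run
query.stop()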
Example 3
Source File: MockSourceProvider.scala (from XSQL, Apache License 2.0)
package org.apache.spark.sql.streaming.util

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.StreamSourceProvider
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class MockSourceProvider extends StreamSourceProvider {

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", MockSourceProvider.fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    MockSourceProvider.sourceProviderFunction()
  }
}

object MockSourceProvider {
  // Function to generate sources. May provide multiple sources if the user implements such a
  // function.
  private var sourceProviderFunction: () => Source = _

  final val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  def withMockSources(source: Source, otherSources: Source*)(f: => Unit): Unit = {
    var i = 0
    val sources = source +: otherSources
    sourceProviderFunction = () => {
      val source = sources(i % sources.length)
      i += 1
      source
    }
    try {
      f
    } finally {
      sourceProviderFunction = null
    }
  }
}
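A hedged sketch of how withMockSources is intended to be used: the caller passes pre-built Source instances, runs a streaming query against the provider's fully qualified class name inside the closure, and every createSource() call hands out the next mock source round-robin. The helper below is illustrative; mockSource stands in for any hand-built Source.

// Sketch: `mockSource` is any hand-built Source (for example an anonymous Source
// like the one inside BlockingSource above). Inside the closure a query would
// typically be started with
// spark.readStream.format("org.apache.spark.sql.streaming.util.MockSourceProvider").load().
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.streaming.util.MockSourceProvider

def runWithMock(mockSource: Source): Unit = {
  MockSourceProvider.withMockSources(mockSource) {
    // start and exercise a streaming query here; each createSource() call
    // returns the next source from the list, cycling round-robin
  }
}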
Example 4
Source File: BlockingSource.scala (from sparkoscope, Apache License 2.0)
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
}
Example 5
Source File: SqsSourceProvider.scala (from bahir, Apache License 2.0)
package org.apache.spark.sql.streaming.sqs

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
import org.apache.spark.sql.types.StructType

class SqsSourceProvider extends DataSourceRegister
  with StreamSourceProvider
  with Logging {

  override def shortName(): String = "s3-sqs"

  override def sourceSchema(sqlContext: SQLContext,
                            schema: Option[StructType],
                            providerName: String,
                            parameters: Map[String, String]): (String, StructType) = {
    require(schema.isDefined, "Sqs source doesn't support empty schema")
    (shortName(), schema.get)
  }

  override def createSource(sqlContext: SQLContext,
                            metadataPath: String,
                            schema: Option[StructType],
                            providerName: String,
                            parameters: Map[String, String]): Source = {
    new SqsSource(
      sqlContext.sparkSession,
      metadataPath,
      parameters,
      schema.get)
  }
}
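A usage sketch for the s3-sqs short name: because sourceSchema requires a user-supplied schema, .schema(...) is mandatory. The option keys shown (sqsUrl, region, fileFormat) are assumptions modelled on the Bahir SQS connector's documentation rather than anything defined in the file above; spark is the active SparkSession.

// Spark-shell style sketch; option keys are assumptions modelled on Bahir's
// s3-sqs connector and are not defined in SqsSourceProvider itself.
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val inputSchema = StructType(StructField("value", StringType) :: Nil) // required: the provider rejects an empty schema

val events = spark.readStream
  .format("s3-sqs")
  .schema(inputSchema)
  .option("sqsUrl", "https://sqs.us-east-1.amazonaws.com/123456789012/my-queue") // assumed option key
  .option("region", "us-east-1")                                                 // assumed option key
  .option("fileFormat", "json")                                                  // assumed option key
  .load()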
Example 6
Source File: HDFSMQTTSourceProvider.scala (from bahir, Apache License 2.0)
package org.apache.spark.sql.mqtt

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
import org.apache.spark.sql.types.StructType

import org.apache.bahir.sql.streaming.mqtt.{MQTTStreamConstants, MQTTUtils}

class HDFSMQTTSourceProvider extends StreamSourceProvider with DataSourceRegister with Logging {

  override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType],
      providerName: String, parameters: Map[String, String]): (String, StructType) = {
    ("hdfs-mqtt", MQTTStreamConstants.SCHEMA_DEFAULT)
  }

  override def createSource(sqlContext: SQLContext, metadataPath: String,
      schema: Option[StructType], providerName: String,
      parameters: Map[String, String]): Source = {
    val parsedResult = MQTTUtils.parseConfigParams(parameters)
    new HdfsBasedMQTTStreamSource(
      sqlContext,
      metadataPath,
      parsedResult._1, // brokerUrl
      parsedResult._2, // clientId
      parsedResult._3, // topic
      parsedResult._5, // mqttConnectionOptions
      parsedResult._6, // qos
      parsedResult._7, // maxBatchMessageNum
      parsedResult._8, // maxBatchMessageSize
      parsedResult._9  // maxRetryNum
    )
  }

  override def shortName(): String = "hdfs-mqtt"
}

object HDFSMQTTSourceProvider {
  val SEP = "##"
}
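A hedged sketch against the hdfs-mqtt short name. The exact keys accepted by MQTTUtils.parseConfigParams are not visible above, so the brokerUrl and topic options below are assumptions modelled on Bahir's MQTT connector; the schema is fixed by the provider to MQTTStreamConstants.SCHEMA_DEFAULT.

// Spark-shell style sketch; the "brokerUrl" and "topic" option keys are assumptions
// about what MQTTUtils.parseConfigParams expects.
val messages = spark.readStream
  .format("hdfs-mqtt")                         // resolved to HDFSMQTTSourceProvider
  .option("brokerUrl", "tcp://localhost:1883") // assumed option key
  .option("topic", "sensors/temperature")      // assumed option key
  .load()                                      // schema comes from MQTTStreamConstants.SCHEMA_DEFAULT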
Example 7
Source File: BlockingSource.scala (from multi-tenancy-spark, Apache License 2.0)
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
}
Example 8
Source File: BlockingSource.scala (from Spark-2.3.1, Apache License 2.0)
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
}
Example 9
Source File: MockSourceProvider.scala (from Spark-2.3.1, Apache License 2.0)
package org.apache.spark.sql.streaming.util

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.StreamSourceProvider
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class MockSourceProvider extends StreamSourceProvider {

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", MockSourceProvider.fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    MockSourceProvider.sourceProviderFunction()
  }
}

object MockSourceProvider {
  // Function to generate sources. May provide multiple sources if the user implements such a
  // function.
  private var sourceProviderFunction: () => Source = _

  final val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  def withMockSources(source: Source, otherSources: Source*)(f: => Unit): Unit = {
    var i = 0
    val sources = source +: otherSources
    sourceProviderFunction = () => {
      val source = sources(i % sources.length)
      i += 1
      source
    }
    try {
      f
    } finally {
      sourceProviderFunction = null
    }
  }
}
Example 10
Source File: CurrentEventsByPersistenceIdQuerySourceProvider.scala (from apache-spark-test, Apache License 2.0)
package akka.persistence.jdbc.spark.sql.execution.streaming

import org.apache.spark.sql.execution.streaming.{ LongOffset, Offset, Source }
import org.apache.spark.sql.sources.{ DataSourceRegister, StreamSourceProvider }
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{ SQLContext, _ }

object CurrentEventsByPersistenceIdQuerySourceProvider {
  val name = "current-events-by-persistence-id"
}

class CurrentEventsByPersistenceIdQuerySourceProvider extends StreamSourceProvider with DataSourceRegister with Serializable {

  override def sourceSchema(
    sqlContext: SQLContext,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): (String, StructType) = {
    println(s"[CurrentEventsByPersistenceIdQuerySourceProvider.sourceSchema]: schema: $schema, providerName: $providerName, parameters: $parameters")
    CurrentEventsByPersistenceIdQuerySourceProvider.name -> schema.get
  }

  override def createSource(
    sqlContext: SQLContext,
    metadataPath: String,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): Source = {
    val eventMapperFQCN: String = parameters.get("event-mapper") match {
      case Some(_eventMapper) => _eventMapper
      case _                  => throw new RuntimeException("No event mapper FQCN")
    }

    val pid = (parameters.get("pid"), parameters.get("persistence-id")) match {
      case (Some(pid), _) => pid
      case (_, Some(pid)) => pid
      case _              => throw new RuntimeException("No persistence_id")
    }

    new CurrentEventsByPersistenceIdQuerySourceImpl(sqlContext, parameters("path"), eventMapperFQCN, pid, schema.get)
  }

  override def shortName(): String = CurrentEventsByPersistenceIdQuerySourceProvider.name
}

class CurrentEventsByPersistenceIdQuerySourceImpl(val sqlContext: SQLContext, val readJournalPluginId: String, eventMapperFQCN: String, persistenceId: String, override val schema: StructType)
    extends Source with ReadJournalSource {

  override def getOffset: Option[Offset] = {
    val offset = maxEventsByPersistenceId(persistenceId)
    println("[CurrentEventsByPersistenceIdQuery]: Returning maximum offset: " + offset)
    Some(LongOffset(offset))
  }

  override def getBatch(_start: Option[Offset], _end: Offset): DataFrame = {
    val (start, end) = getStartEnd(_start, _end)
    val df: DataFrame = eventsByPersistenceId(persistenceId, start, end, eventMapperFQCN)
    println(s"[CurrentEventsByPersistenceIdQuery]: Getting currentPersistenceIds from start: $start, end: $end, DataFrame.count: ${df.count}")
    df
  }
}
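A hedged sketch of reading from this provider: it demands a user-supplied schema, an event-mapper FQCN, either a pid or persistence-id option, and a path parameter holding the read-journal plugin id. Every concrete value below is a placeholder, and the schema is only a guess at what a mapped event row might look like; spark is the active SparkSession.

// Spark-shell style sketch; schema, event mapper class and journal plugin id are placeholders.
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

val eventSchema = StructType(
  StructField("persistence_id", StringType) ::
  StructField("sequence_number", LongType) ::
  StructField("event", StringType) :: Nil)

val events = spark.readStream
  .format("akka.persistence.jdbc.spark.sql.execution.streaming.CurrentEventsByPersistenceIdQuerySourceProvider")
  .schema(eventSchema)                                 // required: sourceSchema() calls schema.get
  .option("event-mapper", "com.example.MyEventMapper") // hypothetical event mapper FQCN
  .option("pid", "my-persistence-id")                  // "persistence-id" is accepted as an alternative key
  .option("path", "jdbc-read-journal")                 // read-journal plugin id, placeholder value
  .load()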
Example 11
Source File: CurrentPersistenceIdsQuerySourceProvider.scala (from apache-spark-test, Apache License 2.0)
package akka.persistence.jdbc.spark.sql.execution.streaming

import org.apache.spark.sql.execution.streaming.{ LongOffset, Offset, Source }
import org.apache.spark.sql.sources.{ DataSourceRegister, StreamSourceProvider }
import org.apache.spark.sql.types.{ StringType, StructField, StructType }
import org.apache.spark.sql.{ SQLContext, _ }

object CurrentPersistenceIdsQuerySourceProvider {
  val name = "current-persistence-id"
  val schema: StructType = StructType(Array(
    StructField("persistence_id", StringType, nullable = false)
  ))
}

class CurrentPersistenceIdsQuerySourceProvider extends StreamSourceProvider with DataSourceRegister with Serializable {

  override def sourceSchema(
    sqlContext: SQLContext,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): (String, StructType) = {
    CurrentPersistenceIdsQuerySourceProvider.name -> CurrentPersistenceIdsQuerySourceProvider.schema
  }

  override def createSource(
    sqlContext: SQLContext,
    metadataPath: String,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): Source = {
    new CurrentPersistenceIdsQuerySourceImpl(sqlContext, parameters("path"))
  }

  override def shortName(): String = CurrentPersistenceIdsQuerySourceProvider.name
}

class CurrentPersistenceIdsQuerySourceImpl(val sqlContext: SQLContext, val readJournalPluginId: String)
    extends Source with ReadJournalSource {

  override def schema: StructType = CurrentPersistenceIdsQuerySourceProvider.schema

  override def getOffset: Option[Offset] = {
    val offset = maxPersistenceIds
    println("[CurrentPersistenceIdsQuery]: Returning maximum offset: " + offset)
    Some(LongOffset(offset))
  }

  override def getBatch(_start: Option[Offset], _end: Offset): DataFrame = {
    val (start, end) = getStartEnd(_start, _end)
    println(s"[CurrentPersistenceIdsQuery]: Getting currentPersistenceIds from start: $start, end: $end")
    import sqlContext.implicits._
    persistenceIds(start, end).toDF()
  }
}
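A matching hedged sketch for the persistence-ids provider: only the path parameter (the read-journal plugin id) is consumed, and the schema is fixed to a single non-nullable persistence_id column. The plugin id below is a placeholder and spark is the active SparkSession.

// Spark-shell style sketch; the journal plugin id is a placeholder.
val currentIds = spark.readStream
  .format("akka.persistence.jdbc.spark.sql.execution.streaming.CurrentPersistenceIdsQuerySourceProvider")
  .option("path", "jdbc-read-journal") // read-journal plugin id, placeholder value
  .load()                              // fixed schema: one non-nullable persistence_id column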