org.apache.spark.sql.execution.streaming.Offset Scala Examples

The following examples show how to use org.apache.spark.sql.execution.streaming.Offset, Spark's internal representation of a streaming source's position in the micro-batch engine. Each example notes the source file, the open-source project it was taken from, and that project's license.
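Before the project examples, here is a minimal, illustrative sketch of what an Offset implementation looks like. It is not taken from any of the projects below; the names (CounterOffset, the com.example package) are made up, and it assumes Spark 2.x, where this class belongs to the internal micro-batch streaming API. The only contract is a stable JSON representation of the position, plus, in practice, a way to convert back from the SerializedOffset that Spark hands to a source after recovery.

// Illustrative sketch only -- CounterOffset and com.example.streaming are invented names.
package com.example.streaming

import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset}

// The only contract an Offset must satisfy is a stable JSON representation of the position.
case class CounterOffset(position: Long) extends Offset {
  override val json: String = position.toString
}

object CounterOffset {
  // After a restart Spark hands offsets back as SerializedOffset (the persisted JSON),
  // so sources typically convert defensively, as several of the examples below also do.
  def fromOffset(offset: Offset): CounterOffset = offset match {
    case c: CounterOffset    => c
    case s: SerializedOffset => CounterOffset(s.json.trim.toLong)
    case other =>
      throw new IllegalArgumentException(s"Unexpected offset type: ${other.getClass}")
  }
}
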
Example 1
Source File: OffsetSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, SerializedOffset}

trait OffsetSuite extends SparkFunSuite {
  
  def compare(one: Offset, two: Offset): Unit = {
    test(s"comparison $one <=> $two") {
      assert(one == one)
      assert(two == two)
      assert(one != two)
      assert(two != one)
    }
  }
}

class LongOffsetSuite extends OffsetSuite {
  val one = LongOffset(1)
  val two = LongOffset(2)
  val three = LongOffset(3)
  compare(one, two)

  compare(LongOffset(SerializedOffset(one.json)),
          LongOffset(SerializedOffset(three.json)))
} 
Example 2
Source File: CurrentPersistenceIdsQuerySourceProvider.scala    From apache-spark-test   with Apache License 2.0
package akka.persistence.jdbc.spark.sql.execution.streaming

import org.apache.spark.sql.execution.streaming.{ LongOffset, Offset, Source }
import org.apache.spark.sql.sources.{ DataSourceRegister, StreamSourceProvider }
import org.apache.spark.sql.types.{ StringType, StructField, StructType }
import org.apache.spark.sql.{ SQLContext, _ }

object CurrentPersistenceIdsQuerySourceProvider {
  val name = "current-persistence-id"
  val schema: StructType = StructType(Array(
    StructField("persistence_id", StringType, nullable = false)
  ))
}

class CurrentPersistenceIdsQuerySourceProvider extends StreamSourceProvider with DataSourceRegister with Serializable {
  override def sourceSchema(
    sqlContext: SQLContext,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): (String, StructType) = {
    CurrentPersistenceIdsQuerySourceProvider.name -> CurrentPersistenceIdsQuerySourceProvider.schema
  }

  override def createSource(
    sqlContext: SQLContext,
    metadataPath: String,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): Source = {
    new CurrentPersistenceIdsQuerySourceImpl(sqlContext, parameters("path"))
  }
  override def shortName(): String = CurrentPersistenceIdsQuerySourceProvider.name
}

class CurrentPersistenceIdsQuerySourceImpl(val sqlContext: SQLContext, val readJournalPluginId: String) extends Source with ReadJournalSource {
  override def schema: StructType = CurrentPersistenceIdsQuerySourceProvider.schema

  override def getOffset: Option[Offset] = {
    val offset = maxPersistenceIds
    println("[CurrentPersistenceIdsQuery]: Returning maximum offset: " + offset)
    Some(LongOffset(offset))
  }

  override def getBatch(_start: Option[Offset], _end: Offset): DataFrame = {
    val (start, end) = getStartEnd(_start, _end)
    println(s"[CurrentPersistenceIdsQuery]: Getting currentPersistenceIds from start: $start, end: $end")
    import sqlContext.implicits._
    persistenceIds(start, end).toDF()
  }
} 
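Example 2's provider resolves its read journal from the "path" option, so a query would typically address it by the provider's fully qualified class name. The usage sketch below is not from the apache-spark-test project; the SparkSession (spark) and the "jdbc-read-journal" plugin id are assumptions.

// Hedged usage sketch, assuming a SparkSession named `spark` and an Akka JDBC read journal
// registered under the plugin id "jdbc-read-journal" (both assumptions).
val persistenceIds = spark.readStream
  .format("akka.persistence.jdbc.spark.sql.execution.streaming.CurrentPersistenceIdsQuerySourceProvider")
  .option("path", "jdbc-read-journal") // forwarded to the source as the read-journal plugin id
  .load()
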
Example 3
Source File: ReadJournalSource.scala    From apache-spark-test   with Apache License 2.0
package akka.persistence.jdbc.spark.sql.execution.streaming

import akka.actor.{ ActorSystem, ExtendedActorSystem }
import akka.persistence.query.PersistenceQuery
import akka.persistence.query.scaladsl.{ CurrentEventsByPersistenceIdQuery, CurrentEventsByTagQuery, CurrentPersistenceIdsQuery, ReadJournal }
import akka.stream.scaladsl.Sink
import akka.stream.scaladsl.extension.{ Sink => Snk }
import akka.stream.{ ActorMaterializer, Materializer }
import org.apache.spark.sql._
import org.apache.spark.sql.execution.streaming.{ LongOffset, Offset, Source }
import org.apache.spark.sql.types.StructType

import scala.collection.immutable._
import scala.concurrent.duration.{ FiniteDuration, _ }
import scala.concurrent.{ Await, ExecutionContext, Future }

trait ReadJournalSource {
  _: Source =>
  def readJournalPluginId: String
  def sqlContext: SQLContext

  // some machinery
  implicit val system: ActorSystem = ActorSystem()
  implicit val mat: Materializer = ActorMaterializer()
  implicit val ec: ExecutionContext = system.dispatcher

  // read journal, only interested in the Current queries, as Spark isn't asynchronous
  lazy val readJournal = PersistenceQuery(system).readJournalFor(readJournalPluginId)
    .asInstanceOf[ReadJournal with CurrentPersistenceIdsQuery with CurrentEventsByPersistenceIdQuery with CurrentEventsByTagQuery]

  implicit class FutureOps[A](f: Future[A])(implicit ec: ExecutionContext, timeout: FiniteDuration = null) {
    def futureValue: A = Await.result(f, Option(timeout).getOrElse(10.seconds))
  }

  def maxPersistenceIds: Long =
    readJournal.currentPersistenceIds().runWith(Snk.count).futureValue

  def persistenceIds(start: Long, end: Long) =
    readJournal.currentPersistenceIds().drop(start).take(end).runWith(Sink.seq).futureValue

  def maxEventsByPersistenceId(pid: String): Long =
    readJournal.currentEventsByPersistenceId(pid, 0, Long.MaxValue).runWith(Snk.count).futureValue

  def eventsByPersistenceId(pid: String, start: Long, end: Long, eventMapperFQCN: String): Seq[Row] = {
    readJournal.currentEventsByPersistenceId(pid, start, end)
      .map(env => getMapper(eventMapperFQCN).get.row(env, sqlContext)).runWith(Sink.seq).futureValue
  }

  implicit def mapToDataFrame(rows: Seq[Row]): DataFrame = {
    import scala.collection.JavaConversions._
    sqlContext.createDataFrame(rows, schema)
  }

  def getStartEnd(_start: Option[Offset], _end: Offset): (Long, Long) = (_start, _end) match {
    case (Some(LongOffset(start)), LongOffset(end)) => (start, end)
    case (None, LongOffset(end))                    => (0L, end)
  }

  def getMapper(eventMapperFQCN: String): Option[EventMapper] =
    system.asInstanceOf[ExtendedActorSystem].dynamicAccess.createInstanceFor[EventMapper](eventMapperFQCN, List.empty)
      .recover { case cause => cause.printStackTrace(); null }.toOption

  override def stop(): Unit = {
    println("Stopping jdbc read journal")
    system.terminate()
  }
} 
Example 4
Source File: CurrentEventsByPersistenceIdQuerySourceProvider.scala    From apache-spark-test   with Apache License 2.0
package akka.persistence.jdbc.spark.sql.execution.streaming

import org.apache.spark.sql.execution.streaming.{ LongOffset, Offset, Source }
import org.apache.spark.sql.sources.{ DataSourceRegister, StreamSourceProvider }
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{ SQLContext, _ }

object CurrentEventsByPersistenceIdQuerySourceProvider {
  val name = "current-events-by-persistence-id"
}

class CurrentEventsByPersistenceIdQuerySourceProvider extends StreamSourceProvider with DataSourceRegister with Serializable {
  override def sourceSchema(
    sqlContext: SQLContext,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): (String, StructType) = {
    println(s"[CurrentEventsByPersistenceIdQuerySourceProvider.sourceSchema]: schema: $schema, providerName: $providerName, parameters: $parameters")
    CurrentEventsByPersistenceIdQuerySourceProvider.name -> schema.get
  }

  override def createSource(
    sqlContext: SQLContext,
    metadataPath: String,
    schema: Option[StructType],
    providerName: String,
    parameters: Map[String, String]
  ): Source = {

    val eventMapperFQCN: String = parameters.get("event-mapper") match {
      case Some(_eventMapper) => _eventMapper
      case _                  => throw new RuntimeException("No event mapper FQCN")
    }

    val pid = (parameters.get("pid"), parameters.get("persistence-id")) match {
      case (Some(pid), _) => pid
      case (_, Some(pid)) => pid
      case _              => throw new RuntimeException("No persistence_id")
    }

    new CurrentEventsByPersistenceIdQuerySourceImpl(sqlContext, parameters("path"), eventMapperFQCN, pid, schema.get)
  }
  override def shortName(): String = CurrentEventsByPersistenceIdQuerySourceProvider.name
}

class CurrentEventsByPersistenceIdQuerySourceImpl(val sqlContext: SQLContext, val readJournalPluginId: String, eventMapperFQCN: String, persistenceId: String, override val schema: StructType) extends Source with ReadJournalSource {
  override def getOffset: Option[Offset] = {
    val offset = maxEventsByPersistenceId(persistenceId)
    println("[CurrentEventsByPersistenceIdQuery]: Returning maximum offset: " + offset)
    Some(LongOffset(offset))
  }

  override def getBatch(_start: Option[Offset], _end: Offset): DataFrame = {
    val (start, end) = getStartEnd(_start, _end)
    val df: DataFrame = eventsByPersistenceId(persistenceId, start, end, eventMapperFQCN)
    println(s"[CurrentEventsByPersistenceIdQuery]: Getting currentPersistenceIds from start: $start, end: $end, DataFrame.count: ${df.count}")
    df
  }
} 
Example 5
Source File: PulsarOffset.scala    From pulsar-spark   with Apache License 2.0
package org.apache.spark.sql.pulsar

import org.apache.pulsar.client.api.MessageId
import org.apache.pulsar.client.impl.MessageIdImpl

import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset}
import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2, PartitionOffset}

private[pulsar] sealed trait PulsarOffset

private[pulsar] case object EarliestOffset extends PulsarOffset

private[pulsar] case object LatestOffset extends PulsarOffset

private[pulsar] case class TimeOffset(ts: Long) extends PulsarOffset

private[pulsar] sealed trait PerTopicOffset extends PulsarOffset

private[pulsar] case class SpecificPulsarOffset(topicOffsets: Map[String, MessageId])
    extends OffsetV2
    with PerTopicOffset {

  override val json = JsonUtils.topicOffsets(topicOffsets)
}

private[pulsar] case class SpecificPulsarStartingTime(topicTimes: Map[String, Long])
    extends OffsetV2
    with PerTopicOffset {

  override def json(): String = JsonUtils.topicTimes(topicTimes)
}

private[pulsar] case class PulsarPartitionOffset(topic: String, messageId: MessageId)
    extends PartitionOffset

private[pulsar] object SpecificPulsarOffset {

  def getTopicOffsets(offset: Offset): Map[String, MessageId] = {
    offset match {
      case o: SpecificPulsarOffset => o.topicOffsets
      case so: SerializedOffset => SpecificPulsarOffset(so).topicOffsets
      case _ =>
        throw new IllegalArgumentException(
          s"Invalid conversion from offset of ${offset.getClass} to PulsarSourceOffset")
    }
  }

  def apply(offset: SerializedOffset): SpecificPulsarOffset =
    SpecificPulsarOffset(JsonUtils.topicOffsets(offset.json))

  def apply(offsetTuples: (String, MessageId)*): SpecificPulsarOffset = {
    SpecificPulsarOffset(offsetTuples.toMap)
  }
}

private[pulsar] case class UserProvidedMessageId(mid: MessageId)
    extends MessageIdImpl(
      mid.asInstanceOf[MessageIdImpl].getLedgerId,
      mid.asInstanceOf[MessageIdImpl].getEntryId,
      mid.asInstanceOf[MessageIdImpl].getPartitionIndex) 
Example 6
Source File: KinesisSourceOffset.scala    From kinesis-sql   with Apache License 2.0
package org.apache.spark.sql.kinesis

import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization
import scala.collection.mutable.HashMap
import scala.util.control.NonFatal

import org.apache.spark.sql.execution.streaming.Offset
import org.apache.spark.sql.execution.streaming.SerializedOffset
import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2, PartitionOffset}


 
// The KinesisSourceOffset case class (which wraps a ShardOffsets value) was elided by this
// snippet; the enclosing companion object is restored here so the remaining helpers and the
// final closing brace line up.
object KinesisSourceOffset {

  def apply(json: String): KinesisSourceOffset = {
    try {
      val readObj = Serialization.read[ Map[ String, Map[ String, String ] ] ](json)
      val metadata = readObj.get("metadata")
      val shardInfoMap: Map[String, ShardInfo ] = readObj.filter(_._1 != "metadata").map {
        case (shardId, value) => shardId.toString -> new ShardInfo(shardId.toString,
          value.get("iteratorType").get,
          value.get("iteratorPosition").get)
      }.toMap
      KinesisSourceOffset(
        new ShardOffsets(
          metadata.get("batchId").toLong,
          metadata.get("streamName"),
          shardInfoMap))
    } catch {
      case NonFatal(x) => throw new IllegalArgumentException(x)
    }
  }

  def getMap(shardInfos: Array[ShardInfo]): Map[String, ShardInfo] = {
    shardInfos.map {
      s: ShardInfo => (s.shardId -> s)
    }.toMap
  }

} 
Example 7
Source File: BlockingSource.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}


class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
} 
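BlockingSource is a test utility: createSource blocks on the shared CountDownLatch, which lets a test hold a streaming query at source-creation time and assert on its state before releasing it. A hedged sketch of driving it follows; the SparkSession (spark) and the checkpointDir value are assumptions, not part of the example above.

// Hedged sketch of driving BlockingSource in a test; `spark` (a SparkSession) and `checkpointDir`
// are assumed to exist.
BlockingSource.latch = new CountDownLatch(1)
val query = spark.readStream
  .format(classOf[BlockingSource].getName)
  .load()
  .writeStream
  .format(classOf[BlockingSource].getName)
  .option("checkpointLocation", checkpointDir)
  .start()
// createSource blocks on the latch, so the query cannot make progress until it is released.
BlockingSource.latch.countDown()
query.awaitTermination(2000)
query.stop()
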
Example 8
Source File: OffsetSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, SerializedOffset}

trait OffsetSuite extends SparkFunSuite {
  
  def compare(one: Offset, two: Offset): Unit = {
    test(s"comparison $one <=> $two") {
      assert(one == one)
      assert(two == two)
      assert(one != two)
      assert(two != one)
    }
  }
}

class LongOffsetSuite extends OffsetSuite {
  val one = LongOffset(1)
  val two = LongOffset(2)
  val three = LongOffset(3)
  compare(one, two)

  compare(LongOffset(SerializedOffset(one.json)),
          LongOffset(SerializedOffset(three.json)))
} 
Example 9
Source File: BlockingSource.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}


class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
} 
Example 10
Source File: StreamingQueryException.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.annotation.Experimental
import org.apache.spark.sql.execution.streaming.{Offset, StreamExecution}


// The class declaration (and its doc comment) was elided by this snippet. The header below is a
// reconstruction inferred from the members used in toString (query, cause); the exact constructor
// in drizzle-spark may differ.
@Experimental
class StreamingQueryException private[sql](
    @transient val query: StreamingQuery,
    val message: String,
    val cause: Throwable,
    val startOffset: Option[Offset] = None,
    val endOffset: Option[Offset] = None)
  extends Exception(message, cause) {

  // Time when the exception occurred.
  val time: Long = System.currentTimeMillis

  override def toString(): String = {
    val causeStr =
      s"${cause.getMessage} ${cause.getStackTrace.take(10).mkString("", "\n|\t", "\n")}"
    s"""
       |$causeStr
       |
       |${query.asInstanceOf[StreamExecution].toDebugString}
       """.stripMargin
  }
} 
Example 11
Source File: KafkaSourceOffset.scala    From spark-kafka-0-8-sql   with Apache License 2.0
package com.hortonworks.spark.sql.kafka08

import kafka.common.TopicAndPartition
import org.apache.spark.sql.execution.streaming.Offset
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset


// The KafkaSourceOffset case class (holding the partitionToOffsets map and extending Offset)
// was elided by this snippet; only the companion object is shown.
object KafkaSourceOffset {

  def getPartitionOffsets(offset: Offset): Map[TopicAndPartition, LeaderOffset] = {
    offset match {
      case o: KafkaSourceOffset => o.partitionToOffsets
      case _ =>
        throw new IllegalArgumentException(
          s"Invalid conversion from offset of ${offset.getClass} to KafkaSourceOffset")
    }
  }
} 
Example 12
Source File: BigQuerySource.scala    From spark-bigquery   with Apache License 2.0
package com.samelamin.spark.bigquery.streaming

import java.math.BigInteger
import com.google.cloud.hadoop.io.bigquery.BigQueryStrings
import com.samelamin.spark.bigquery.BigQueryClient
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.execution.streaming.{Offset, _}
import org.apache.spark.sql.types.{BinaryType, StringType, StructField, StructType}
import com.samelamin.spark.bigquery._
import com.samelamin.spark.bigquery.converters.SchemaConverters
import org.joda.time.DateTime
import org.slf4j.LoggerFactory


  // The BigQuerySource class declaration and several members used below (logger,
  // fullyQualifiedOutputTableId, timestampColumn, schema, getOffset) were elided by this snippet;
  // only getBatch, stop and the schema helper are shown, which is why the final brace below closes
  // a class whose header is missing here.
  override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
    val startIndex = start.getOrElse(LongOffset(0L)).asInstanceOf[LongOffset].offset.toLong
    val endIndex = end.asInstanceOf[LongOffset].offset.toLong
    val startPartitionTime = new DateTime(startIndex).toLocalDate
    val endPartitionTime = new DateTime(endIndex).toLocalDate.toString
    logger.info(s"Fetching data between $startIndex and $endIndex")
    val query =
      s"""
         |SELECT
         |  *
         |FROM
         |  `${fullyQualifiedOutputTableId.replace(':','.')}`
         |WHERE
         |  $timestampColumn BETWEEN TIMESTAMP_MILLIS($startIndex) AND TIMESTAMP_MILLIS($endIndex)
         |  AND _PARTITIONTIME BETWEEN TIMESTAMP('$startPartitionTime') AND TIMESTAMP('$endPartitionTime')
         |  """.stripMargin
    val bigQuerySQLContext = new BigQuerySQLContext(sqlContext)
    val df = bigQuerySQLContext.bigQuerySelect(query)
    df
  }

  override def stop(): Unit = {}
  def getConvertedSchema(sqlContext: SQLContext): StructType = {
    val bigqueryClient = BigQueryClient.getInstance(sqlContext)
    val tableReference = BigQueryStrings.parseTableReference(fullyQualifiedOutputTableId)
    SchemaConverters.BQToSQLSchema(bigqueryClient.getTableSchema(tableReference))
  }
}

object BigQuerySource {
  val DEFAULT_SCHEMA = StructType(
    StructField("Sample Column", StringType) ::
      StructField("value", BinaryType) :: Nil
  )
} 
Example 13
Source File: BlockingSource.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}


class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
} 
Example 14
Source File: OffsetSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, SerializedOffset}

trait OffsetSuite extends SparkFunSuite {
  
  def compare(one: Offset, two: Offset): Unit = {
    test(s"comparison $one <=> $two") {
      assert(one == one)
      assert(two == two)
      assert(one != two)
      assert(two != one)
    }
  }
}

class LongOffsetSuite extends OffsetSuite {
  val one = LongOffset(1)
  val two = LongOffset(2)
  val three = LongOffset(3)
  compare(one, two)

  compare(LongOffset(SerializedOffset(one.json)),
          LongOffset(SerializedOffset(three.json)))
} 
Example 15
Source File: BlockingSource.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}


class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
} 
Example 16
Source File: OffsetSuite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, SerializedOffset}

trait OffsetSuite extends SparkFunSuite {
  
  def compare(one: Offset, two: Offset): Unit = {
    test(s"comparison $one <=> $two") {
      assert(one == one)
      assert(two == two)
      assert(one != two)
      assert(two != one)
    }
  }
}

class LongOffsetSuite extends OffsetSuite {
  val one = LongOffset(1)
  val two = LongOffset(2)
  val three = LongOffset(3)
  compare(one, two)

  compare(LongOffset(SerializedOffset(one.json)),
          LongOffset(SerializedOffset(three.json)))
} 
Example 17
Source File: RedisSourceOffset.scala    From spark-redis   with BSD 3-Clause "New" or "Revised" License
package org.apache.spark.sql.redis.stream

import com.redislabs.provider.redis.util.JsonUtils
import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset}
import org.json4s.jackson.Serialization
import org.json4s.{Formats, NoTypeHints}


case class RedisSourceOffset(offsets: Map[String, RedisConsumerOffset]) extends Offset {

  override def json(): String = JsonUtils.toJson(this)
}

object RedisSourceOffset {

  private implicit val formats: Formats = Serialization.formats(NoTypeHints)

  def fromOffset(offset: Offset): RedisSourceOffset = {
    offset match {
      case o: RedisSourceOffset => o
      case so: SerializedOffset => fromJson(so.json)
      case _ =>
        throw new IllegalArgumentException(
          s"Invalid conversion from offset of ${offset.getClass} to RedisSourceOffset")
    }
  }

  def fromJson(json: String): RedisSourceOffset = {
    try {
      Serialization.read[RedisSourceOffset](json)
    } catch {
      case e: Throwable =>
        val example = RedisSourceOffset(Map("my-stream" -> RedisConsumerOffset("redis-source", "1543674099961-0")))
        val jsonExample = Serialization.write(example)
        throw new RuntimeException(s"Unable to parse offset json. Example of valid json: $jsonExample", e)
    }
  }
}

case class RedisConsumerOffset(groupName: String, offset: String)

case class RedisSourceOffsetRange(start: Option[String], end: String, config: RedisConsumerConfig) 
Example 18
Source File: OffsetSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.{CompositeOffset, LongOffset, Offset}

trait OffsetSuite extends SparkFunSuite {
  
  def compare(one: Offset, two: Offset): Unit = {
    test(s"comparison $one <=> $two") {
      assert(one == one)
      assert(two == two)
      assert(one != two)
      assert(two != one)
    }
  }
}

class LongOffsetSuite extends OffsetSuite {
  val one = LongOffset(1)
  val two = LongOffset(2)
  compare(one, two)
}

class CompositeOffsetSuite extends OffsetSuite {
  compare(
    one = CompositeOffset(Some(LongOffset(1)) :: Nil),
    two = CompositeOffset(Some(LongOffset(2)) :: Nil))

  compare(
    one = CompositeOffset(None :: Nil),
    two = CompositeOffset(Some(LongOffset(2)) :: Nil))

  compare(
    one = CompositeOffset.fill(LongOffset(0), LongOffset(1)),
    two = CompositeOffset.fill(LongOffset(1), LongOffset(2)))

  compare(
    one = CompositeOffset.fill(LongOffset(1), LongOffset(1)),
    two = CompositeOffset.fill(LongOffset(1), LongOffset(2)))

}