org.apache.spark.sql.execution.streaming.SerializedOffset Scala Examples
The following examples show how to use org.apache.spark.sql.execution.streaming.SerializedOffset.
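Most of the examples follow the same pattern: a source-specific Offset implementation serializes its position to JSON via json(), and a companion conversion accepts the SerializedOffset that Spark restores from the checkpointed offset log and parses that JSON back into the typed offset. A minimal sketch of the pattern, assuming a hypothetical MySourceOffset with a single position field (the names and the hand-rolled JSON handling are illustrative only, not taken from any of the projects below):

import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset}

// Hypothetical source-specific offset; the field name `position` is illustrative.
case class MySourceOffset(position: Long) extends Offset {
  override def json(): String = s"""{"position":$position}"""
}

object MySourceOffset {
  // Convert whatever offset Spark hands back (possibly a SerializedOffset read
  // from the checkpoint offset log) into the typed offset.
  def fromOffset(offset: Offset): MySourceOffset = offset match {
    case o: MySourceOffset    => o
    case so: SerializedOffset => fromJson(so.json)
    case _ =>
      throw new IllegalArgumentException(
        s"Cannot convert ${offset.getClass} to MySourceOffset")
  }

  // Naive parsing to keep the sketch dependency-free; real sources use json4s or Jackson.
  def fromJson(json: String): MySourceOffset =
    MySourceOffset("""\d+""".r.findFirstIn(json).get.toLong)
}

The examples below show how real connectors (spark-redis, kinesis-sql, pulsar-spark) and Spark's own test suites apply this pattern.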
Example 1
Source File: RedisSourceOffset.scala From spark-redis with BSD 3-Clause "New" or "Revised" License
package org.apache.spark.sql.redis.stream

import com.redislabs.provider.redis.util.JsonUtils
import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset}
import org.json4s.jackson.Serialization
import org.json4s.{Formats, NoTypeHints}

case class RedisSourceOffset(offsets: Map[String, RedisConsumerOffset]) extends Offset {
  override def json(): String = JsonUtils.toJson(this)
}

object RedisSourceOffset {

  private implicit val formats: Formats = Serialization.formats(NoTypeHints)

  def fromOffset(offset: Offset): RedisSourceOffset = {
    offset match {
      case o: RedisSourceOffset => o
      case so: SerializedOffset => fromJson(so.json)
      case _ =>
        throw new IllegalArgumentException(
          s"Invalid conversion from offset of ${offset.getClass} to RedisSourceOffset")
    }

    fromJson(offset.json())
  }

  def fromJson(json: String): RedisSourceOffset = {
    try {
      Serialization.read[RedisSourceOffset](json)
    } catch {
      case e: Throwable =>
        val example = RedisSourceOffset(Map("my-stream" ->
          RedisConsumerOffset("redis-source", "1543674099961-0")))
        val jsonExample = Serialization.write(example)
        throw new RuntimeException(
          s"Unable to parse offset json. Example of valid json: $jsonExample", e)
    }
  }
}

case class RedisConsumerOffset(groupName: String, offset: String)

case class RedisSourceOffsetRange(start: Option[String], end: String, config: RedisConsumerConfig)
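Note that in fromOffset above the pattern match effectively only validates the offset type: every non-throwing branch is discarded, and the value actually returned comes from the trailing fromJson(offset.json()) call. A hedged usage sketch (the variable names and the checkpoint JSON literal are illustrative, not taken from a real run):

import org.apache.spark.sql.execution.streaming.SerializedOffset
import org.apache.spark.sql.redis.stream.{RedisConsumerOffset, RedisSourceOffset}

// JSON as it might appear in a checkpointed offset log (illustrative value,
// shaped like the example embedded in fromJson's error message).
val restored = SerializedOffset(
  """{"offsets":{"my-stream":{"groupName":"redis-source","offset":"1543674099961-0"}}}""")

// fromOffset accepts either an already-typed RedisSourceOffset or a SerializedOffset.
val typed: RedisSourceOffset = RedisSourceOffset.fromOffset(restored)
assert(typed.offsets("my-stream") == RedisConsumerOffset("redis-source", "1543674099961-0"))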
Example 2
Source File: OffsetSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, SerializedOffset}

trait OffsetSuite extends SparkFunSuite {
  def compare(one: Offset, two: Offset): Unit = {
    test(s"comparison $one <=> $two") {
      assert(one == one)
      assert(two == two)
      assert(one != two)
      assert(two != one)
    }
  }
}

class LongOffsetSuite extends OffsetSuite {
  val one = LongOffset(1)
  val two = LongOffset(2)
  val three = LongOffset(3)
  compare(one, two)
  compare(LongOffset(SerializedOffset(one.json)), LongOffset(SerializedOffset(three.json)))
}
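The second compare call round-trips a LongOffset through SerializedOffset: LongOffset's companion apply rebuilds the typed offset from the JSON string stored in the offset log, so the rebuilt offset compares equal to the original. A minimal sketch of that round trip outside the test harness:

import org.apache.spark.sql.execution.streaming.{LongOffset, SerializedOffset}

val original = LongOffset(1)

// Serialize to the JSON form that would be written to the offset log,
// then rebuild the typed offset from the SerializedOffset wrapper.
val roundTripped = LongOffset(SerializedOffset(original.json))

assert(roundTripped == original)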
Example 3
Source File: OffsetSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, SerializedOffset}

trait OffsetSuite extends SparkFunSuite {
  def compare(one: Offset, two: Offset): Unit = {
    test(s"comparison $one <=> $two") {
      assert(one == one)
      assert(two == two)
      assert(one != two)
      assert(two != one)
    }
  }
}

class LongOffsetSuite extends OffsetSuite {
  val one = LongOffset(1)
  val two = LongOffset(2)
  val three = LongOffset(3)
  compare(one, two)
  compare(LongOffset(SerializedOffset(one.json)), LongOffset(SerializedOffset(three.json)))
}
Example 4
Source File: OffsetSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, SerializedOffset}

trait OffsetSuite extends SparkFunSuite {
  def compare(one: Offset, two: Offset): Unit = {
    test(s"comparison $one <=> $two") {
      assert(one == one)
      assert(two == two)
      assert(one != two)
      assert(two != one)
    }
  }
}

class LongOffsetSuite extends OffsetSuite {
  val one = LongOffset(1)
  val two = LongOffset(2)
  val three = LongOffset(3)
  compare(one, two)
  compare(LongOffset(SerializedOffset(one.json)), LongOffset(SerializedOffset(three.json)))
}
Example 5
Source File: OffsetSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, SerializedOffset}

trait OffsetSuite extends SparkFunSuite {
  def compare(one: Offset, two: Offset): Unit = {
    test(s"comparison $one <=> $two") {
      assert(one == one)
      assert(two == two)
      assert(one != two)
      assert(two != one)
    }
  }
}

class LongOffsetSuite extends OffsetSuite {
  val one = LongOffset(1)
  val two = LongOffset(2)
  val three = LongOffset(3)
  compare(one, two)
  compare(LongOffset(SerializedOffset(one.json)), LongOffset(SerializedOffset(three.json)))
}
Example 6
Source File: KinesisSourceOffset.scala From kinesis-sql with Apache License 2.0
package org.apache.spark.sql.kinesis

import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import scala.collection.mutable.HashMap
import scala.util.control.NonFatal

import org.apache.spark.sql.execution.streaming.Offset
import org.apache.spark.sql.execution.streaming.SerializedOffset
import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2, PartitionOffset}

// Companion object; the KinesisSourceOffset case class and the remaining
// companion members are omitted from this excerpt.
object KinesisSourceOffset {

  // json4s formats assumed here (required by Serialization.read below).
  implicit val format = Serialization.formats(NoTypeHints)

  def apply(json: String): KinesisSourceOffset = {
    try {
      val readObj = Serialization.read[Map[String, Map[String, String]]](json)
      val metadata = readObj.get("metadata")
      val shardInfoMap: Map[String, ShardInfo] = readObj.filter(_._1 != "metadata").map {
        case (shardId, value) =>
          shardId.toString -> new ShardInfo(
            shardId.toString,
            value.get("iteratorType").get,
            value.get("iteratorPosition").get)
      }.toMap
      KinesisSourceOffset(
        new ShardOffsets(
          metadata.get("batchId").toLong,
          metadata.get("streamName"),
          shardInfoMap))
    } catch {
      case NonFatal(x) => throw new IllegalArgumentException(x)
    }
  }

  def getMap(shardInfos: Array[ShardInfo]): Map[String, ShardInfo] = {
    shardInfos.map { s: ShardInfo => (s.shardId -> s) }.toMap
  }
}
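From the way apply(json) deserializes the map, the offset JSON is expected to hold a "metadata" entry (carrying batchId and streamName) plus one entry per shard with its iteratorType and iteratorPosition. A purely illustrative example of a string in that shape (the shard id, iterator type, and sequence number below are made up):

// Illustrative only: JSON shaped the way KinesisSourceOffset.apply(json) expects it.
val sampleOffsetJson =
  """{
    |  "metadata": {"batchId": "7", "streamName": "my-stream"},
    |  "shardId-000000000000": {
    |    "iteratorType": "AFTER_SEQUENCE_NUMBER",
    |    "iteratorPosition": "49605240428222485261091539921087775290635951311212575618"
    |  }
    |}""".stripMargin

// val offset = KinesisSourceOffset(sampleOffsetJson)  // would yield ShardOffsets with one ShardInfo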
Example 7
Source File: PulsarOffset.scala From pulsar-spark with Apache License 2.0
package org.apache.spark.sql.pulsar

import org.apache.pulsar.client.api.MessageId
import org.apache.pulsar.client.impl.MessageIdImpl

import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset}
import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2, PartitionOffset}

private[pulsar] sealed trait PulsarOffset

private[pulsar] case object EarliestOffset extends PulsarOffset

private[pulsar] case object LatestOffset extends PulsarOffset

private[pulsar] case class TimeOffset(ts: Long) extends PulsarOffset

private[pulsar] sealed trait PerTopicOffset extends PulsarOffset

private[pulsar] case class SpecificPulsarOffset(topicOffsets: Map[String, MessageId])
    extends OffsetV2
    with PerTopicOffset {

  override val json = JsonUtils.topicOffsets(topicOffsets)
}

private[pulsar] case class SpecificPulsarStartingTime(topicTimes: Map[String, Long])
    extends OffsetV2
    with PerTopicOffset {

  override def json(): String = JsonUtils.topicTimes(topicTimes)
}

private[pulsar] case class PulsarPartitionOffset(topic: String, messageId: MessageId)
    extends PartitionOffset

private[pulsar] object SpecificPulsarOffset {

  def getTopicOffsets(offset: Offset): Map[String, MessageId] = {
    offset match {
      case o: SpecificPulsarOffset => o.topicOffsets
      case so: SerializedOffset => SpecificPulsarOffset(so).topicOffsets
      case _ =>
        throw new IllegalArgumentException(
          s"Invalid conversion from offset of ${offset.getClass} to PulsarSourceOffset")
    }
  }

  def apply(offset: SerializedOffset): SpecificPulsarOffset =
    SpecificPulsarOffset(JsonUtils.topicOffsets(offset.json))

  def apply(offsetTuples: (String, MessageId)*): SpecificPulsarOffset = {
    SpecificPulsarOffset(offsetTuples.toMap)
  }
}

private[pulsar] case class UserProvidedMessageId(mid: MessageId)
    extends MessageIdImpl(
      mid.asInstanceOf[MessageIdImpl].getLedgerId,
      mid.asInstanceOf[MessageIdImpl].getEntryId,
      mid.asInstanceOf[MessageIdImpl].getPartitionIndex)
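getTopicOffsets accepts either an already-typed SpecificPulsarOffset or the SerializedOffset Spark reads back from the offset log, and both routes yield the same per-topic MessageId map, as the suite in Example 8 below also verifies. A small sketch of the round trip (the topic name and MessageIdImpl coordinates are illustrative):

import org.apache.pulsar.client.impl.MessageIdImpl
import org.apache.spark.sql.execution.streaming.SerializedOffset

// Illustrative MessageId (ledgerId = 1, entryId = 1, partitionIndex = -1).
val typed = SpecificPulsarOffset(("my-topic", new MessageIdImpl(1, 1, -1)))

// Round trip through the JSON form that would be stored in the offset log.
val fromLog = SpecificPulsarOffset.getTopicOffsets(SerializedOffset(typed.json))

assert(fromLog == SpecificPulsarOffset.getTopicOffsets(typed))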
Example 8
Source File: PulsarSourceOffsetSuite.scala From pulsar-spark with Apache License 2.0
package org.apache.spark.sql.pulsar

import java.io.File

import org.apache.pulsar.client.impl.MessageIdImpl

import org.apache.spark.sql.execution.streaming.{LongOffset, OffsetSeq, OffsetSeqLog, SerializedOffset}
import org.apache.spark.sql.streaming.OffsetSuite
import org.apache.spark.sql.test.SharedSQLContext

class PulsarSourceOffsetSuite extends OffsetSuite with SharedSQLContext {

  compare(
    one = SpecificPulsarOffset(("t", new MessageIdImpl(1, 1, -1))),
    two = SpecificPulsarOffset(("t", new MessageIdImpl(1, 2, -1))))

  compare(
    one = SpecificPulsarOffset(
      ("t", new MessageIdImpl(1, 1, -1)),
      ("t1", new MessageIdImpl(1, 1, -1))),
    two = SpecificPulsarOffset(
      ("t", new MessageIdImpl(1, 2, -1)),
      ("t1", new MessageIdImpl(1, 2, -1)))
  )

  compare(
    one = SpecificPulsarOffset(("t", new MessageIdImpl(1, 1, -1))),
    two = SpecificPulsarOffset(
      ("t", new MessageIdImpl(1, 2, -1)),
      ("t1", new MessageIdImpl(1, 1, -1))))

  val kso1 = SpecificPulsarOffset(("t", new MessageIdImpl(1, 1, -1)))
  val kso2 = SpecificPulsarOffset(
    ("t", new MessageIdImpl(1, 2, -1)),
    ("t1", new MessageIdImpl(1, 3, -1)))
  val kso3 = SpecificPulsarOffset(
    ("t", new MessageIdImpl(1, 2, -1)),
    ("t1", new MessageIdImpl(1, 3, -1)),
    ("t2", new MessageIdImpl(1, 4, -1)))

  compare(
    SpecificPulsarOffset(SerializedOffset(kso1.json)),
    SpecificPulsarOffset(SerializedOffset(kso2.json)))

  test("basic serialization - deserialization") {
    assert(
      SpecificPulsarOffset.getTopicOffsets(kso1) ==
        SpecificPulsarOffset.getTopicOffsets(SerializedOffset(kso1.json)))
  }

  test("OffsetSeqLog serialization - deserialization") {
    withTempDir { temp =>
      // use a non-existent directory to test whether the log creates the dir
      val dir = new File(temp, "dir")
      val metadataLog = new OffsetSeqLog(spark, dir.getAbsolutePath)
      val batch0 = OffsetSeq.fill(kso1)
      val batch1 = OffsetSeq.fill(kso2, kso3)

      val batch0Serialized =
        OffsetSeq.fill(batch0.offsets.flatMap(_.map(o => SerializedOffset(o.json))): _*)
      val batch1Serialized =
        OffsetSeq.fill(batch1.offsets.flatMap(_.map(o => SerializedOffset(o.json))): _*)

      assert(metadataLog.add(0, batch0))
      assert(metadataLog.getLatest() === Some(0 -> batch0Serialized))
      assert(metadataLog.get(0) === Some(batch0Serialized))

      assert(metadataLog.add(1, batch1))
      assert(metadataLog.get(0) === Some(batch0Serialized))
      assert(metadataLog.get(1) === Some(batch1Serialized))
      assert(metadataLog.getLatest() === Some(1 -> batch1Serialized))
      assert(
        metadataLog.get(None, Some(1)) ===
          Array(0 -> batch0Serialized, 1 -> batch1Serialized))

      // Adding the same batch does nothing
      metadataLog.add(1, OffsetSeq.fill(LongOffset(3)))
      assert(metadataLog.get(0) === Some(batch0Serialized))
      assert(metadataLog.get(1) === Some(batch1Serialized))
      assert(metadataLog.getLatest() === Some(1 -> batch1Serialized))
      assert(
        metadataLog.get(None, Some(1)) ===
          Array(0 -> batch0Serialized, 1 -> batch1Serialized))
    }
  }
}