org.apache.kafka.connect.data.SchemaBuilder Scala Examples

The following examples show how to use org.apache.kafka.connect.data.SchemaBuilder from Scala. Each example is taken from an open-source project; the project and source file are noted above each listing.
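
A minimal, self-contained sketch (not taken from any of the projects below) of the basic SchemaBuilder workflow: assemble a struct schema from primitive type builders, build it, then populate a Struct against it. The schema name, field names and values are illustrative only.

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object SchemaBuilderQuickStart extends App {
  // Assemble a struct schema; primitive builders support optional(), defaultValue() and doc()
  val schema: Schema = SchemaBuilder.struct()
    .name("com.example.User")                                // illustrative schema name
    .field("name", Schema.STRING_SCHEMA)                     // required string
    .field("age", SchemaBuilder.int32().optional().build())  // optional int32
    .field("score", SchemaBuilder.float64().defaultValue(0.0).doc("example score").build())
    .build()

  // Populate a Struct conforming to the schema; optional/defaulted fields may be left unset
  val user = new Struct(schema)
    .put("name", "alice")
    .put("age", 30)

  user.validate() // throws DataException if a required field without a default is missing
  println(user)
}
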
Example 1
Source File: ProjectionMapper.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.source.mapper

import cats.data.NonEmptyList
import com.landoop.streamreactor.connect.hive.StructMapper
import com.landoop.streamreactor.connect.hive.source.config.ProjectionField
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}

class ProjectionMapper(projection: NonEmptyList[ProjectionField]) extends StructMapper {

  override def map(input: Struct): Struct = {
    val builder = projection.foldLeft(SchemaBuilder.struct) { (builder, projectionField) =>
      Option(input.schema.field(projectionField.name))
        .fold(sys.error(s"Projection field ${projectionField.name} cannot be found in input")) { field =>
          builder.field(projectionField.alias, field.schema)
        }
    }
    val schema = builder.build()
    projection.foldLeft(new Struct(schema)) { (struct, field) =>
      struct.put(field.alias, input.get(field.name))
    }
  }
} 
Example 2
Source File: ReThinkSourceReadersFactory.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.rethink.source

import java.util
import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.atomic.AtomicBoolean

import com.datamountaineer.streamreactor.connect.rethink.ReThinkConnection
import com.datamountaineer.streamreactor.connect.rethink.config.{ReThinkSourceConfig, ReThinkSourceSetting, ReThinkSourceSettings}
import com.rethinkdb.RethinkDB
import com.rethinkdb.net.{Connection, Cursor}
import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.SchemaBuilder
import org.apache.kafka.connect.source.SourceRecord

import scala.collection.JavaConverters._
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.Future

object ReThinkSourceReadersFactory {

  def apply(config: ReThinkSourceConfig, r: RethinkDB): Set[ReThinkSourceReader] = {
    val conn = Some(ReThinkConnection(r, config))
    val settings = ReThinkSourceSettings(config)
    settings.map(s => new ReThinkSourceReader(r, conn.get, s))
  }
}

class ReThinkSourceReader(rethink: RethinkDB, conn: Connection, setting: ReThinkSourceSetting)
  extends StrictLogging {

  logger.info(s"Initialising ReThink Reader for ${setting.source}")
  private val keySchema = SchemaBuilder.string().optional().build()
  private val valueSchema = ChangeFeedStructBuilder.schema
  private val sourcePartition = Map.empty[String, String]
  private val offset = Map.empty[String, String]
  private val stopFeed = new AtomicBoolean(false)
  private val handlingFeed = new AtomicBoolean(false)
  private var feed : Cursor[util.HashMap[String, String]] = _
  val queue = new LinkedBlockingQueue[SourceRecord]()
  val batchSize = setting.batchSize

  def start() = {
    feed = getChangeFeed()
    startFeed(feed)
  }

  def stop() = {
    logger.info(s"Closing change feed for ${setting.source}")
    stopFeed.set(true)
    while (handlingFeed.get()) {
      logger.debug("Waiting for feed to shutdown...")
      Thread.sleep(1000)
    }
    feed.close()
    logger.info(s"Change feed closed for ${setting.source}")
  }

  
  private def handleFeed(feed: Cursor[util.HashMap[String, String]]) = {
    handlingFeed.set(true)

    //feed.next is blocking
    while(!stopFeed.get()) {
      logger.debug(s"Waiting for next change feed event for ${setting.source}")
      val cdc = convert(feed.next().asScala.toMap)
      queue.put(cdc)
    }
    handlingFeed.set(false)
  }

  private def getChangeFeed(): Cursor[util.HashMap[String, String]] = {
    logger.info(s"Initialising change feed for ${setting.source}")
    rethink
      .db(setting.db)
      .table(setting.source)
      .changes()
      .optArg("include_states", true)
      .optArg("include_initial", setting.initialise)
      .optArg("include_types", true)
      .run(conn)
  }

  private def convert(feed: Map[String, String]) = {
    new SourceRecord(sourcePartition.asJava, offset.asJava, setting.target, keySchema, setting.source, valueSchema,
      ChangeFeedStructBuilder(feed))
  }
} 
Example 3
Source File: ChangeFeedStructBuilder.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.rethink.source

import com.fasterxml.jackson.databind.ObjectMapper
import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}



object ChangeFeedStructBuilder extends StrictLogging {

  val mapper = new ObjectMapper()
  val oldVal = "old_val"
  val newVal = "new_val"
  val state = "state"
  val `type` = "type"

  val schema: Schema = SchemaBuilder.struct.name("ReThinkChangeFeed")
    .version(1)
    .field(state, Schema.OPTIONAL_STRING_SCHEMA)
    .field(oldVal, Schema.OPTIONAL_STRING_SCHEMA)
    .field(newVal, Schema.OPTIONAL_STRING_SCHEMA)
    .field(`type`, Schema.OPTIONAL_STRING_SCHEMA)
    .build

  def apply(hm: Map[String, Object]): Struct = {
    val struct = new Struct(schema)
    hm.foreach({ case (k, v) => if (v != null) struct.put(k, v.toString) })
    struct
  }
} 
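
A quick usage sketch (not part of the file above): applying the builder to the map carried by a RethinkDB change-feed event copies the matching keys into the Struct, while null or absent values are skipped. The event values are made up for illustration.

// Hypothetical change-feed event, values for illustration only
val event: Map[String, Object] = Map(
  "state"   -> "ready",
  "new_val" -> """{"id":1,"name":"alice"}""",
  "type"    -> "add"
)
val struct: Struct = ChangeFeedStructBuilder(event) // old_val stays null, its field is optional
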
Example 4
Source File: PulsarWriterTest.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.pulsar.sink

import com.datamountaineer.streamreactor.connect.pulsar.ProducerConfigFactory
import com.datamountaineer.streamreactor.connect.pulsar.config.{PulsarConfigConstants, PulsarSinkConfig, PulsarSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.pulsar.client.api.{Message, MessageId, Producer, PulsarClient}
import org.mockito.ArgumentMatchers.any
import org.mockito.MockitoSugar
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._


class PulsarWriterTest extends AnyWordSpec with MockitoSugar with Matchers {
  val pulsarTopic = "persistent://landoop/standalone/connect/kafka-topic"

  def getSchema: Schema = {
    SchemaBuilder.struct
      .field("int8", SchemaBuilder.int8().defaultValue(2.toByte).doc("int8 field").build())
      .field("int16", Schema.INT16_SCHEMA)
      .field("int32", Schema.INT32_SCHEMA)
      .field("int64", Schema.INT64_SCHEMA)
      .field("float32", Schema.FLOAT32_SCHEMA)
      .field("float64", Schema.FLOAT64_SCHEMA)
      .field("boolean", Schema.BOOLEAN_SCHEMA)
      .field("string", Schema.STRING_SCHEMA)
      .build()
  }


  def getStruct(schema: Schema): Struct = {
    new Struct(schema)
      .put("int8", 12.toByte)
      .put("int16", 12.toShort)
      .put("int32", 12)
      .put("int64", 12L)
      .put("float32", 12.2f)
      .put("float64", 12.2)
      .put("boolean", true)
      .put("string", "foo")
  }


  "should write messages" in {

    val config = PulsarSinkConfig(Map(
      PulsarConfigConstants.HOSTS_CONFIG -> "pulsar://localhost:6650",
      PulsarConfigConstants.KCQL_CONFIG -> s"INSERT INTO $pulsarTopic SELECT * FROM kafka_topic BATCH = 10 WITHPARTITIONER = SinglePartition WITHCOMPRESSION = ZLIB WITHDELAY = 1000"
    ).asJava)

    val schema = getSchema
    val struct = getStruct(schema)
    val record1 = new SinkRecord("kafka_topic", 0, null, null, schema, struct, 1)

    val settings = PulsarSinkSettings(config)
    val producerConfig = ProducerConfigFactory("test", settings.kcql)

    val client = mock[PulsarClient]
    val producer = mock[Producer]
    val messageId = mock[MessageId]

    when(client.createProducer(pulsarTopic, producerConfig(pulsarTopic))).thenReturn(producer)
    when(producer.send(any[Message])).thenReturn(messageId)

    val writer = PulsarWriter(client, "test", settings)
    writer.write(List(record1))
  }
} 
Example 5
Source File: RedisInsertSortedSetTest.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.Jedis
import redis.embedded.RedisServer

import scala.collection.JavaConverters._

class RedisInsertSortedSetTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {

  val redisServer = new RedisServer(6379)

  override def beforeAll() = redisServer.start()

  override def afterAll() = redisServer.stop()

  "Redis INSERT into Sorted Set (SS) writer" should {

    "write Kafka records to a Redis Sorted Set" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"INSERT INTO cpu_stats SELECT * from $TOPIC STOREAS SortedSet(score=ts)"
      println("Testing KCQL : " + KCQL)
      val props = Map(
        RedisConfigConstants.REDIS_HOST->"localhost",
        RedisConfigConstants.REDIS_PORT->"6379",
        RedisConfigConstants.KCQL_CONFIG->KCQL
      ).asJava

      val config = RedisConfig(props)
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisInsertSortedSet(settings)
      writer.createClient(settings)

      val schema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA).build()

      val struct1 = new Struct(schema).put("type", "Xeon").put("temperature", 60.4).put("voltage", 90.1).put("ts", 1482180657010L)
      val struct2 = new Struct(schema).put("type", "i7").put("temperature", 62.1).put("voltage", 103.3).put("ts", 1482180657020L)
      val struct3 = new Struct(schema).put("type", "i7-i").put("temperature", 64.5).put("voltage", 101.1).put("ts", 1482180657030L)

      val sinkRecord1 = new SinkRecord(TOPIC, 0, null, null, schema, struct1, 1)
      val sinkRecord2 = new SinkRecord(TOPIC, 0, null, null, schema, struct2, 2)
      val sinkRecord3 = new SinkRecord(TOPIC, 0, null, null, schema, struct3, 3)

      val jedis = new Jedis(connectionInfo.host, connectionInfo.port)
      // Clean up in-memory jedis
      jedis.flushAll()

      writer.write(Seq(sinkRecord1))
      writer.write(Seq(sinkRecord2, sinkRecord3))

      // Redis cardinality should now be 3
      jedis.zcard("cpu_stats") shouldBe 3

      val allSSrecords = jedis.zrange("cpu_stats", 0, 999999999999L)
      val results = allSSrecords.asScala.toList
      results.head shouldBe """{"type":"Xeon","temperature":60.4,"voltage":90.1,"ts":1482180657010}"""
      results(1) shouldBe """{"type":"i7","temperature":62.1,"voltage":103.3,"ts":1482180657020}"""
      results(2) shouldBe """{"type":"i7-i","temperature":64.5,"voltage":101.1,"ts":1482180657030}"""

    }

  }

} 
Example 6
Source File: RedisPubSubTest.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.{Jedis, JedisPubSub}
import redis.embedded.RedisServer

import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer

class RedisPubSubTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {

  val redisServer = new RedisServer(6379)

  override def beforeAll() = redisServer.start()

  override def afterAll() = redisServer.stop()

  "Redis PUBSUB writer" should {

    "write Kafka records to a Redis PubSub" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"SELECT * from $TOPIC STOREAS PubSub (channel=type)"
      println("Testing KCQL : " + KCQL)
      val props = Map(
        RedisConfigConstants.REDIS_HOST->"localhost",
        RedisConfigConstants.REDIS_PORT->"6379",
        RedisConfigConstants.KCQL_CONFIG->KCQL
      ).asJava

      val config = RedisConfig(props)
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisPubSub(settings)
      writer.createClient(settings)

      val schema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA).build()

      val struct1 = new Struct(schema).put("type", "Xeon").put("temperature", 60.4).put("voltage", 90.1).put("ts", 1482180657010L)
      val struct2 = new Struct(schema).put("type", "i7").put("temperature", 62.1).put("voltage", 103.3).put("ts", 1482180657020L)
      val struct3 = new Struct(schema).put("type", "i7-i").put("temperature", 64.5).put("voltage", 101.1).put("ts", 1482180657030L)

      val sinkRecord1 = new SinkRecord(TOPIC, 0, null, null, schema, struct1, 1)
      val sinkRecord2 = new SinkRecord(TOPIC, 0, null, null, schema, struct2, 2)
      val sinkRecord3 = new SinkRecord(TOPIC, 0, null, null, schema, struct3, 3)

      val jedis = new Jedis(connectionInfo.host, connectionInfo.port)
      // Clean up in-memory jedis
      jedis.flushAll()

      val messagesMap = collection.mutable.Map[String, ListBuffer[String]]()

      val t = new Thread {
        private val pubsub = new JedisPubSub {
          override def onMessage(channel: String, message: String): Unit = {
            messagesMap.get(channel) match {
              case Some(msgs) => messagesMap.put(channel, msgs += message)
              case None => messagesMap.put(channel, ListBuffer(message))
            }
          }
        }

        override def run(): Unit = {
          jedis.subscribe(pubsub, "Xeon", "i7", "i7-i")
        }

        override def interrupt(): Unit = {
          pubsub.punsubscribe("*")
          super.interrupt()
        }
      }
      t.start()
      t.join(5000)
      if (t.isAlive) t.interrupt()

      writer.write(Seq(sinkRecord1))
      writer.write(Seq(sinkRecord2, sinkRecord3))

      messagesMap.size shouldBe 3

      messagesMap("Xeon").head shouldBe """{"type":"Xeon","temperature":60.4,"voltage":90.1,"ts":1482180657010}"""
      messagesMap("i7").head shouldBe """{"type":"i7","temperature":62.1,"voltage":103.3,"ts":1482180657020}"""
      messagesMap("i7-i").head shouldBe """{"type":"i7-i","temperature":64.5,"voltage":101.1,"ts":1482180657030}"""
    }
  }
} 
Example 7
Source File: RedisStreamTest.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.redis.sink.writer

/*
 * Copyright 2017 Datamountaineer.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util

import com.datamountaineer.streamreactor.connect.redis.sink.RedisSinkTask
import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.{Jedis, StreamEntryID}

import scala.collection.JavaConverters._

class RedisStreamTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {
//
//  val redisServer = new RedisServer(6379)
//
//  override def beforeAll() = redisServer.start()
//
//  override def afterAll() = redisServer.stop()

  "Redis Stream writer" should {

    "write Kafka records to a Redis Stream" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"INSERT INTO stream1 SELECT * from $TOPIC STOREAS STREAM"
      println("Testing KCQL : " + KCQL)
      val props = Map(
        RedisConfigConstants.REDIS_HOST->"localhost",
        RedisConfigConstants.REDIS_PORT->"6379",
        RedisConfigConstants.KCQL_CONFIG->KCQL,
        RedisConfigConstants.REDIS_PASSWORD -> ""
      ).asJava

      val config = RedisConfig(props)
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisStreams(settings)

      val schema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA).build()

      val struct1 = new Struct(schema).put("type", "Xeon").put("temperature", 60.4).put("voltage", 90.1).put("ts", 1482180657010L)

      val sinkRecord1 = new SinkRecord(TOPIC, 0, null, null, schema, struct1, 1)

      val jedis = mock[Jedis]
      writer.jedis = jedis

      val map = new util.HashMap[String, String]()
      map.put("type", "Xeon")
      map.put("temperature", "60.4")
      map.put("voltage", "90.1")
      map.put("ts", 1482180657010L.toString)

      when(jedis.auth("")).isLenient()
      when(jedis.xadd("stream1", null, map)).thenReturn(mock[StreamEntryID])
      writer.initialize(1, settings.errorPolicy)
      writer.write(Seq(sinkRecord1))
    }
  }
} 
Example 8
Source File: OrcSchemas.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.orc

import com.landoop.streamreactor.connect.hive.UnsupportedSchemaType
import org.apache.kafka.connect.data.{Decimal, Schema, SchemaBuilder}
import org.apache.orc.TypeDescription
import org.apache.orc.TypeDescription.Category

import scala.collection.JavaConverters._

object OrcSchemas {

  def toKafka(schema: TypeDescription): Schema = schema.getCategory match {
    case Category.BOOLEAN => Schema.OPTIONAL_BOOLEAN_SCHEMA
    case Category.BYTE => Schema.OPTIONAL_INT8_SCHEMA
    case Category.DOUBLE => Schema.OPTIONAL_FLOAT64_SCHEMA
    case Category.INT => Schema.OPTIONAL_INT32_SCHEMA
    case Category.FLOAT => Schema.OPTIONAL_FLOAT32_SCHEMA
    case Category.LONG => Schema.OPTIONAL_INT64_SCHEMA
    case Category.SHORT => Schema.OPTIONAL_INT16_SCHEMA
    case Category.STRING => Schema.OPTIONAL_STRING_SCHEMA
    case Category.VARCHAR => Schema.OPTIONAL_STRING_SCHEMA
    case Category.CHAR => Schema.OPTIONAL_STRING_SCHEMA
    case Category.DATE => Schema.OPTIONAL_STRING_SCHEMA
    case Category.TIMESTAMP => Schema.OPTIONAL_STRING_SCHEMA
    case Category.BINARY => Schema.OPTIONAL_BYTES_SCHEMA
    case Category.STRUCT => toKafkaStruct(schema)
  }

  def toKafkaStruct(schema: TypeDescription): Schema = {
    import scala.collection.JavaConverters._
    val builder = SchemaBuilder.struct().name("from_orc")
    schema.getFieldNames.asScala.zipWithIndex.foreach { case (field, k) =>
      builder.field(field, toKafka(schema.getChildren.get(k)))
    }
    builder.build()
  }

  def toOrc(schema: Schema): TypeDescription = {
    schema.`type`() match {
      case Schema.Type.STRING if schema.name() == Decimal.LOGICAL_NAME => TypeDescription.createDecimal()
      case Schema.Type.STRING => TypeDescription.createString()
      case Schema.Type.BOOLEAN => TypeDescription.createBoolean()
      case Schema.Type.FLOAT32 => TypeDescription.createFloat()
      case Schema.Type.FLOAT64 => TypeDescription.createDouble()
      case Schema.Type.INT8 => TypeDescription.createByte()
      case Schema.Type.INT16 => TypeDescription.createShort()
      case Schema.Type.INT32 => TypeDescription.createInt()
      case Schema.Type.INT64 => TypeDescription.createLong()
      case Schema.Type.BYTES if schema.name() == Decimal.LOGICAL_NAME => TypeDescription.createDecimal()
      case Schema.Type.BYTES => TypeDescription.createBinary()
      case Schema.Type.ARRAY => TypeDescription.createList(toOrc(schema.valueSchema()))
      case Schema.Type.MAP => TypeDescription.createMap(toOrc(schema.keySchema()), toOrc(schema.valueSchema()))
      case Schema.Type.STRUCT =>
        schema.fields().asScala.foldLeft(TypeDescription.createStruct) { case (struct, field) =>
          struct.addField(field.name, toOrc(field.schema))
        }
      case unsupportedDataType => throw UnsupportedSchemaType(unsupportedDataType.toString)
    }
  }
} 
Example 9
Source File: ValueConverter.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.sink

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

object ValueConverter {
  def apply(record: SinkRecord): Struct = record.value match {
    case struct: Struct => StructValueConverter.convert(struct)
    case map: Map[_, _] => MapValueConverter.convert(map)
    case map: java.util.Map[_, _] => MapValueConverter.convert(map.asScala.toMap)
    case string: String => StringValueConverter.convert(string)
    case other => sys.error(s"Unsupported record $other:${other.getClass.getCanonicalName}")
  }
}

trait ValueConverter[T] {
  def convert(value: T): Struct
}

object StructValueConverter extends ValueConverter[Struct] {
  override def convert(struct: Struct): Struct = struct
}

object MapValueConverter extends ValueConverter[Map[_, _]] {
  def convertValue(value: Any, key: String, builder: SchemaBuilder): Any = {
    value match {
      case s: String =>
        builder.field(key, Schema.OPTIONAL_STRING_SCHEMA)
        s
      case l: Long =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        l
      case i: Int =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        i.toLong
      case b: Boolean =>
        builder.field(key, Schema.OPTIONAL_BOOLEAN_SCHEMA)
        b
      case f: Float =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        f.toDouble
      case d: Double =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        d
      case innerMap: java.util.Map[_, _] =>
        val innerStruct = convert(innerMap.asScala.toMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct

      case innerMap: Map[_, _] =>
        val innerStruct = convert(innerMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct
    }
  }

  def convert(map: Map[_, _], optional: Boolean) = {
    val builder = SchemaBuilder.struct()
    val values = map.map { case (k, v) =>
      val key = k.toString
      val value = convertValue(v, key, builder)
      key -> value
    }.toList
    if (optional) builder.optional()
    val schema = builder.build
    val struct = new Struct(schema)
    values.foreach { case (key, value) =>
      struct.put(key.toString, value)
    }
    struct
  }
  override def convert(map: Map[_, _]): Struct = convert(map, false)
}

object StringValueConverter extends ValueConverter[String] {
  override def convert(string: String): Struct = {
    val schema = SchemaBuilder.struct().field("a", Schema.OPTIONAL_STRING_SCHEMA).name("struct").build()
    new Struct(schema).put("a", string)
  }
} 
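
A rough usage sketch (not part of the file above): MapValueConverter infers the schema from the runtime type of each value, widening Int to INT64 and Float to FLOAT64 and recursing into nested maps. The keys and values below are illustrative only.

// Hypothetical input map, for illustration only
val struct = MapValueConverter.convert(Map(
  "name"   -> "alice",          // becomes an optional string field
  "age"    -> 30,               // Int is widened and stored as an INT64 field
  "scores" -> Map("math" -> 90) // a nested map becomes an optional nested struct field
))
// struct.schema() now describes three fields: name, age and scores
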
Example 10
Source File: DropPartitionValuesMapper.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.sink.mapper

import com.landoop.streamreactor.connect.hive.{PartitionPlan, StructMapper}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}


class DropPartitionValuesMapper(plan: PartitionPlan) extends StructMapper {

  import scala.collection.JavaConverters._

  override def map(input: Struct): Struct = {
    val partitionKeys = plan.keys.map(_.value).toList
    val dataFields = input.schema.fields().asScala.filterNot(field => partitionKeys.contains(field.name))
    val builder = dataFields.foldLeft(SchemaBuilder.struct) { (builder, field) =>
      builder.field(field.name, field.schema)
    }
    val schema = builder.build()
    dataFields.foldLeft(new Struct(schema)) { (struct, field) =>
      struct.put(field.name, input.get(field.name))
    }
  }
} 
Example 11
Source File: ProjectionMapper.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.sink.mapper

import cats.data.NonEmptyList
import com.datamountaineer.kcql.Field
import com.landoop.streamreactor.connect.hive.StructMapper
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}


class ProjectionMapper(projection: NonEmptyList[Field]) extends StructMapper {

  override def map(input: Struct): Struct = {
    // the compatible output schema built from projected fields with aliases applied
    val builder = projection.foldLeft(SchemaBuilder.struct) { (builder, kcqlField) =>
      Option(input.schema.field(kcqlField.getName)).fold(sys.error(s"Missing field $kcqlField")) { field =>
        builder.field(kcqlField.getAlias, field.schema)
      }
    }
    val schema = builder.build()
    projection.foldLeft(new Struct(schema)) { (struct, field) =>
      struct.put(field.getAlias, input.get(field.getName))
    }
  }
} 
Example 12
Source File: PartitionValueMapper.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.source.mapper

import com.landoop.streamreactor.connect.hive.{Partition, StructMapper}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

import scala.collection.JavaConverters._

class PartitionValueMapper(partition: Partition) extends StructMapper {
  override def map(input: Struct): Struct = {

    val builder = SchemaBuilder.struct()
    input.schema.fields.asScala.foreach { field =>
      builder.field(field.name, field.schema)
    }
    partition.entries.toList.foreach { entry =>
      builder.field(entry._1.value, Schema.STRING_SCHEMA)
    }
    val schema = builder.build()

    val struct = new Struct(schema)
    input.schema.fields.asScala.foreach { field =>
      struct.put(field.name, input.get(field.name))
    }
    partition.entries.toList.foreach { entry =>
      struct.put(entry._1.value, entry._2)
    }
    struct
  }
} 
Example 13
Source File: ConnectSchema.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.bloomberg

import org.apache.kafka.connect.data.{Schema, SchemaBuilder}

import scala.collection.JavaConverters._


class ConnectSchema(namespace: String) { // class header reconstructed; the companion object below passes a namespace

  def createSchema(name: String, value: Any): Schema = {
    value match {
      case _: Boolean => Schema.BOOLEAN_SCHEMA
      case _: Int => Schema.INT32_SCHEMA
      case _: Long => Schema.INT64_SCHEMA
      case _: Double => Schema.FLOAT64_SCHEMA
      case _: Char => Schema.STRING_SCHEMA
      case _: String => Schema.STRING_SCHEMA
      case _: Float => Schema.FLOAT32_SCHEMA
      case list: java.util.List[_] =>
        val firstItemSchema = if (list.isEmpty) Schema.OPTIONAL_STRING_SCHEMA else createSchema(name, list.get(0))
        SchemaBuilder.array(firstItemSchema).build()

      case map: java.util.LinkedHashMap[String @unchecked, _] =>
        val recordBuilder = SchemaBuilder.struct()
        recordBuilder.name(name)
        map.entrySet().asScala.foreach(kvp =>
          recordBuilder.field(kvp.getKey, createSchema(kvp.getKey, kvp.getValue)))
        recordBuilder.build()
      case v => sys.error(s"${v.getClass} is not handled.")
    }
  }
}

object ConnectSchema {
  val namespace = "com.datamountaineer.streamreactor.connect.bloomberg"

  val connectSchema = new ConnectSchema(namespace)

  implicit class BloombergDataToConnectSchema(val data: BloombergData) {
    def getConnectSchema  : Schema = {
      connectSchema.createSchema("BloombergData", data.data)
    }
  }
} 
Example 14
Source File: OrcTest.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.orc

import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, OrcSourceConfig, StructUtils, orc}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class OrcTest extends AnyFlatSpec with Matchers {

  implicit val conf = new Configuration()
  implicit val fs = FileSystem.getLocal(conf)

  "Orc" should "read and write orc files" in {

    val schema = SchemaBuilder.struct()
      .field("name", SchemaBuilder.string().optional().build())
      .field("age", SchemaBuilder.int32().optional().build())
      .field("salary", SchemaBuilder.float64().optional().build())
      .name("from_orc")
      .build()

    val users = Seq(
      new Struct(schema).put("name", "sammy").put("age", 38).put("salary", 54.67),
      new Struct(schema).put("name", "laura").put("age", 37).put("salary", 91.84)
    )

    val path = new Path("orctest.orc")
    val sink = orc.sink(path, schema, OrcSinkConfig(overwrite = true))
    users.foreach(sink.write)
    sink.close()

    val source = orc.source(path, OrcSourceConfig())
    val actual = source.iterator.toList
    actual.head.schema shouldBe schema
    actual.map(StructUtils.extractValues) shouldBe
      List(Vector("sammy", 38, 54.67), Vector("laura", 37, 91.84))

    fs.delete(path, false)
  }
} 
Example 15
Source File: DefaultCommitPolicyTest.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.sink.staging

import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartitionOffset}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.concurrent.duration._

class DefaultCommitPolicyTest extends AnyWordSpec with Matchers {

  val schema: Schema = SchemaBuilder.struct()
    .field("name", SchemaBuilder.string().required().build())
    .build()

  val struct = new Struct(schema)

  implicit val conf: Configuration = new Configuration()
  implicit val fs: LocalFileSystem = FileSystem.getLocal(conf)
  val tpo = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100))

  private def shouldFlush(policy: CommitPolicy, path: Path, count: Long) = {
    val status = fs.getFileStatus(path)
    policy.shouldFlush(CommitContext(tpo, path, count, status.getLen, status.getModificationTime))
  }

  "DefaultCommitPolicy" should {
    "roll over after interval" in {

      val policy = DefaultCommitPolicy(None, Option(2.seconds), None)
      val path = new Path("foo")
      fs.create(path)

      shouldFlush(policy, path, 10) shouldBe false
      Thread.sleep(2000)
      shouldFlush(policy, path, 10) shouldBe true

      fs.delete(path, false)
    }
    "roll over after file count" in {
      val policy = DefaultCommitPolicy(None, None, Some(9))
      val path = new Path("foo")
      fs.create(path)

      shouldFlush(policy, path, 7) shouldBe false
      shouldFlush(policy, path, 8) shouldBe false
      shouldFlush(policy, path, 9) shouldBe true
      shouldFlush(policy, path, 10) shouldBe true

      fs.delete(path, false)
    }
    "roll over after file size" in {
      val policy = DefaultCommitPolicy(Some(10), None, None)
      val path = new Path("foo")
      val out = fs.create(path)
      shouldFlush(policy, path, 7) shouldBe false
      out.writeBytes("wibble wobble wabble wubble")
      out.close()
      shouldFlush(policy, path, 9) shouldBe true
      fs.delete(path, false)
    }
  }
} 
Example 16
Source File: DropPartitionValuesMapperTest.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.sink.mapper

import cats.data.NonEmptyList
import com.landoop.streamreactor.connect.hive.{PartitionKey, PartitionPlan, TableName}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import scala.collection.JavaConverters._

class DropPartitionValuesMapperTest extends AnyFunSuite with Matchers {

  test("strip partition values") {

    val schema = SchemaBuilder.struct()
      .field("a", SchemaBuilder.string().required().build())
      .field("p", SchemaBuilder.string().required().build())
      .field("q", SchemaBuilder.string().required().build())
      .field("z", SchemaBuilder.string().required().build())
      .build()

    val plan = PartitionPlan(TableName("foo"), NonEmptyList.of(PartitionKey("p"), PartitionKey("q")))
    val struct = new Struct(schema).put("a", "a").put("p", "p").put("q", "q").put("z", "z")
    val output = new DropPartitionValuesMapper(plan).map(struct)
    output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "z")
  }

  test("handle partition field is missing in input") {

    val schema = SchemaBuilder.struct()
      .field("a", SchemaBuilder.string().required().build())
      .field("q", SchemaBuilder.string().required().build())
      .field("z", SchemaBuilder.string().required().build())
      .build()


    val plan = PartitionPlan(TableName("foo"), NonEmptyList.of(PartitionKey("p"), PartitionKey("q")))
    val struct = new Struct(schema).put("a", "a").put("q", "q").put("z", "z")
    val output = new DropPartitionValuesMapper(plan).map(struct)
    output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "z")
  }
} 
Example 17
Source File: MetastoreSchemaAlignMapperTest.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.sink.mapper

import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import scala.collection.JavaConverters._

class MetastoreSchemaAlignMapperTest extends AnyFunSuite with Matchers {

  test("pad optional missing fields with null") {

    val recordSchema = SchemaBuilder.struct()
      .field("a", SchemaBuilder.string().required().build())
      .field("b", SchemaBuilder.string().required().build())
      .field("c", SchemaBuilder.string().required().build())
      .build()

    val struct = new Struct(recordSchema).put("a", "a").put("b", "b").put("c", "c")

    val metastoreSchema = SchemaBuilder.struct()
      .field("a", SchemaBuilder.string().required().build())
      .field("b", SchemaBuilder.string().required().build())
      .field("c", SchemaBuilder.string().required().build())
      .field("z", SchemaBuilder.string().optional().build())
      .build()

    val output = new MetastoreSchemaAlignMapper(metastoreSchema).map(struct)
    output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "b", "c", "z")
  }

  test("drop fields not specified in metastore") {

    val recordSchema = SchemaBuilder.struct()
      .field("a", SchemaBuilder.string().required().build())
      .field("b", SchemaBuilder.string().required().build())
      .field("c", SchemaBuilder.string().required().build())
      .build()

    val struct = new Struct(recordSchema).put("a", "a").put("b", "b").put("c", "c")

    val metastoreSchema = SchemaBuilder.struct()
      .field("a", SchemaBuilder.string().required().build())
      .field("b", SchemaBuilder.string().required().build())
      .build()

    val output = new MetastoreSchemaAlignMapper(metastoreSchema).map(struct)
    output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "b")
  }
} 
Example 18
Source File: ParquetWriterTest.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.parquet

import com.landoop.streamreactor.connect.hive.StructUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class ParquetWriterTest extends AnyWordSpec with Matchers {

  implicit val conf = new Configuration()
  implicit val fs = FileSystem.getLocal(conf)

  "ParquetWriter" should {
    "write parquet files" in {

      val schema = SchemaBuilder.struct()
        .field("name", SchemaBuilder.string().required().build())
        .field("title", SchemaBuilder.string().optional().build())
        .field("salary", SchemaBuilder.float64().optional().build())
        .build()

      val users = List(
        new Struct(schema).put("name", "sam").put("title", "mr").put("salary", 100.43),
        new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06)
      )

      val path = new Path("sinktest.parquet")

      val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
      users.foreach(writer.write)
      writer.close()

      val reader = parquetReader(path)
      val actual = Iterator.continually(reader.read).takeWhile(_ != null).toList
      reader.close()

      actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues)

      fs.delete(path, false)
    }
    "support writing nulls" in {

      val schema = SchemaBuilder.struct()
        .field("name", SchemaBuilder.string().required().build())
        .field("title", SchemaBuilder.string().optional().build())
        .field("salary", SchemaBuilder.float64().optional().build())
        .build()

      val users = List(
        new Struct(schema).put("name", "sam").put("title", null).put("salary", 100.43),
        new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06)
      )

      val path = new Path("sinktest.parquet")

      val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
      users.foreach(writer.write)
      writer.close()

      val reader = parquetReader(path)
      val actual = Iterator.continually(reader.read).takeWhile(_ != null).toList
      reader.close()

      actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues)

      fs.delete(path, false)
    }
  }
} 
Example 19
Source File: StructFieldsExtractorTest.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.voltdb

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec


class StructFieldsExtractorTest extends AnyWordSpec with Matchers {
  "StructFieldsExtractor" should {
    "return all the fields and their bytes value" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val min = System.currentTimeMillis()
      val record = StructFieldsExtractor("table", true, Map.empty).get(struct)
      val map = record
      map("firstName") shouldBe "Alex"
      map("lastName") shouldBe "Smith"
      map("age") shouldBe 30
    }

    "return all fields and apply the mapping" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = StructFieldsExtractor("table", includeAllFields = true, Map("lastName" -> "Name", "age" -> "a")).get(struct)
      map("firstName") shouldBe "Alex"
      map("Name") shouldBe "Smith"
      map("a") shouldBe 30

    }

    "return only the specified fields" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = StructFieldsExtractor("table", includeAllFields = false, Map("lastName" -> "Name", "age" -> "age")).get(struct)
      map("Name") shouldBe "Smith"
      map("age") shouldBe 30
      map.size shouldBe 2
    }
  }
} 
Example 20
Source File: SourceRecordProducers.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.ftp.source

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.source.SourceRecord


object SourceRecordProducers {
  type SourceRecordProducer = (ConnectFileMetaDataStore, String, FileMetaData, FileBody) => SourceRecord

  val fileInfoSchema = SchemaBuilder.struct()
    .field("name", Schema.STRING_SCHEMA)
    .field("offset", Schema.INT64_SCHEMA)
    .build()

  def stringKeyRecord(store: ConnectFileMetaDataStore, topic: String, meta: FileMetaData, body: FileBody): SourceRecord =
    new SourceRecord(
      store.fileMetasToConnectPartition(meta), // source part
      store.fileMetasToConnectOffset(meta), // source off
      topic, //topic
      Schema.STRING_SCHEMA, // key sch
      meta.attribs.path, // key
      Schema.BYTES_SCHEMA, // val sch
      body.bytes // val
    )

  def structKeyRecord(store: ConnectFileMetaDataStore, topic: String, meta: FileMetaData, body: FileBody): SourceRecord = {
    new SourceRecord(
      store.fileMetasToConnectPartition(meta), // source part
      store.fileMetasToConnectOffset(meta), // source off
      topic, //topic
      fileInfoSchema, // key sch
      new Struct(fileInfoSchema)
        .put("name",meta.attribs.path)
        .put("offset",body.offset),
      Schema.BYTES_SCHEMA, // val sch
      body.bytes // val
    )
  }
} 
Example 21
Source File: StringStructFieldsStringKeyBuilderTest.scala    From kafka-connect-common   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.sink

import com.datamountaineer.streamreactor.connect.rowkeys.StringStructFieldsStringKeyBuilder
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec


class StringStructFieldsStringKeyBuilderTest extends AnyWordSpec with Matchers {
  "StructFieldsStringKeyBuilder" should {
    "raise an exception if the field is not present in the struct" in {
      intercept[IllegalArgumentException] {
        val schema = SchemaBuilder.struct().name("com.example.Person")
          .field("firstName", Schema.STRING_SCHEMA)
          .field("age", Schema.INT32_SCHEMA)
          .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

        val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

        val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
        StringStructFieldsStringKeyBuilder(Seq("threshold")).build(sinkRecord)
      }
    }

    "create the row key based on one single field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StringStructFieldsStringKeyBuilder(Seq("firstName")).build(sinkRecord) shouldBe "Alex"
    }

    "create the row key based on one single field with doc in the struct" in {
      val firstNameSchema = SchemaBuilder.`type`(Schema.Type.STRING).doc("first name")
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", firstNameSchema)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StringStructFieldsStringKeyBuilder(Seq("firstName")).build(sinkRecord) shouldBe "Alex"
    }

    "create the row key based on more thant one field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StringStructFieldsStringKeyBuilder(Seq("firstName", "age")).build(sinkRecord) shouldBe "Alex.30"
    }
  }
} 
Example 22
Source File: TestUtilsBase.scala    From kafka-connect-common   with Apache License 2.0
package com.datamountaineer.streamreactor.connect

import java.util
import java.util.Collections

import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.kafka.connect.source.SourceTaskContext
import org.apache.kafka.connect.storage.OffsetStorageReader
import org.mockito.Mockito._
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfter
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._



trait TestUtilsBase extends AnyWordSpec with Matchers with MockitoSugar with BeforeAndAfter {

  // Trait and method signature reconstructed from the body below; the names are assumed
  def getSourceTaskContext(lookupPartitionKey: String, offsetValue: String, offsetColumn: String, table: String): SourceTaskContext = {

    //set up partition
    val partition: util.Map[String, String] = Collections.singletonMap(lookupPartitionKey, table)
    //as a list to search for
    val partitionList: util.List[util.Map[String, String]] = List(partition).asJava
    //set up the offset
    val offset: util.Map[String, Object] = (Collections.singletonMap(offsetColumn,offsetValue ))
    //create offsets to initialize from
    val offsets :util.Map[util.Map[String, String],util.Map[String, Object]] = Map(partition -> offset).asJava

    //mock out reader and task context
    val taskContext = mock[SourceTaskContext]
    val reader = mock[OffsetStorageReader]
    when(reader.offsets(partitionList)).thenReturn(offsets)
    when(taskContext.offsetStorageReader()).thenReturn(reader)

    taskContext
  }
} 
Example 23
Source File: StructFieldExtractorTest.scala    From kafka-connect-common   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.schemas

import org.apache.kafka.connect.data.{Date, Schema, SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class StructFieldExtractorTest extends AnyWordSpec with Matchers {
  "StructFieldExtractor" should {
    "return all the fields and their bytes value" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = new StructFieldsExtractor(true, Map.empty).get(struct).toMap

      map.get("firstName").get shouldBe "Alex"
      map.get("lastName").get shouldBe "Smith"
      map.get("age").get shouldBe 30
    }

    "return all fields and apply the mapping" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = new StructFieldsExtractor(true, Map("lastName" -> "Name", "age" -> "a")).get(struct).toMap

      map.get("firstName").get shouldBe "Alex"
      map.get("Name").get shouldBe "Smith"
      map.get("a").get shouldBe 30

    }

    "return only the specified fields" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = new StructFieldsExtractor(false, Map("lastName" -> "Name", "age" -> "age")).get(struct).toMap

      map.get("Name").get shouldBe "Smith"
      map.get("age").get shouldBe 30

      map.size shouldBe 2
    }
  }

  "handle Date fieldds" in {
    val dateSchema = Date.builder().build()
    val schema = SchemaBuilder.struct().name("com.example.Person")
      .field("firstName", Schema.STRING_SCHEMA)
      .field("lastName", Schema.STRING_SCHEMA)
      .field("age", Schema.INT32_SCHEMA)
      .field("date", dateSchema).build()

    val date =  java.sql.Date.valueOf("2017-04-25")
    val struct = new Struct(schema)
      .put("firstName", "Alex")
      .put("lastName", "Smith")
      .put("age", 30)
      .put("date", date)

    val map1 = new StructFieldsExtractor(false, Map("date" -> "date")).get(struct).toMap
    map1.get("date").get shouldBe date
    map1.size shouldBe 1

    val d = Date.toLogical(dateSchema, 10000)
    struct.put("date", d)

    val map2 = new StructFieldsExtractor(false, Map("date" -> "date")).get(struct).toMap
    map2.get("date").get shouldBe d
    map2.size shouldBe 1

  }

} 
Example 24
Source File: Output.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.cassandra.sink

import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

case class Output(addr_tag_link: Option[String],
                  addr_tag: Option[String],
                  spent: Boolean,
                  tx_index: Long,
                  `type`: Int,
                  addr: Option[String],
                  value: Long,
                  n: Int,
                  script: String) {

  def toHashMap: util.HashMap[String, Any] = {
    val map = new util.HashMap[String, Any]()
    addr_tag_link.foreach(map.put("addr_tag_link", _))
    addr_tag_link.foreach(map.put("addr_tag", _))
    map.put("spent", spent)
    map.put("tx_index", tx_index)
    map.put("type", `type`)
    addr.foreach(map.put("addr", _))
    map.put("value", value)
    map.put("n", n)
    map.put("script", script)
    map
  }

}

object Output {

  val ConnectSchema: Schema = SchemaBuilder.struct
    .name("datamountaineer.blockchain.output")
    .doc("The output instance part of a transaction.")
    .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA)
    .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA)
    .field("spent", Schema.BOOLEAN_SCHEMA)
    .field("tx_index", Schema.INT64_SCHEMA)
    .field("type", Schema.OPTIONAL_INT32_SCHEMA)
    .field("addr", Schema.OPTIONAL_STRING_SCHEMA)
    .field("value", Schema.INT64_SCHEMA)
    .field("n", Schema.INT32_SCHEMA)
    .field("script", Schema.STRING_SCHEMA)
    .build()

  implicit class OutputToStructConverter(val output: Output) extends AnyVal {
    def toStruct() = {
      val struct = new Struct(ConnectSchema)
        .put("spent", output.spent)
        .put("tx_index", output.tx_index)
        .put("type", output.`type`)
        .put("value", output.value)
        .put("n", output.n)
        .put("script", output.script)
      output.addr.foreach(struct.put("addr", _))
      output.addr_tag.foreach(struct.put("addr_tag", _))
      output.addr_tag_link.foreach(struct.put("addr_tag_link", _))
      struct
    }
  }

} 
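
A brief usage sketch (not part of the file above): with the implicit OutputToStructConverter in scope, an Output instance maps directly onto ConnectSchema; optional fields that are None are simply left unset. The values below are made up for illustration.

import Output.OutputToStructConverter // bring the implicit class into scope

val out = Output(addr_tag_link = None, addr_tag = None, spent = false,
  tx_index = 12345L, `type` = 0, addr = Some("1BoatSLRHtKNngkdXEeobR76b53LETtpyT"),
  value = 5000L, n = 0, script = "76a914abcdef88ac")

val struct: Struct = out.toStruct() // each put is validated against Output.ConnectSchema
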
Example 25
Source File: SchemaSpec.scala    From kafka-connect-cassandra   with Apache License 2.0
package com.tuplejump.kafka.connect.cassandra

import com.datastax.driver.core.{ DataType, TestUtil}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord

class SchemaSpec extends AbstractFlatSpec {

  it should "convert a struct schema with single field" in {
    val topic = "topicx"

    val sc = sinkConfig(topic, "keyspacex", "tablex", List("id"))
    sc.options.consistency should be (TaskConfig.DefaultSinkConsistency)
    sc.schema.columnNames should === (List("id"))
    sc.query.cql should be ("INSERT INTO keyspacex.tablex(id) VALUES(?)")

    val schema = SchemaBuilder.struct.name("record").version(1).field("id", Schema.INT32_SCHEMA).build
    val value = new Struct(schema).put("id", 1)
    val record = new SinkRecord(topic, 1, SchemaBuilder.struct.build, "key", schema, value, 0)

    sc.schema.route.topic should be (record.topic)
    sc.schema.route.keyspace should be ("keyspacex")
    sc.schema.route.table should be ("tablex")

    sc.schema is record should be (true)
    val query = record.as(sc.schema.namespace)
    query.cql should be("INSERT INTO keyspacex.tablex(id) VALUES(1)")
  }

  it should "convert a struct schema with multiple fields" in {
    val topic = "test_kfk"
    val sc = sinkConfig(topic, "keyspacex", "tablex", List("available", "name", "age"))

    val schema = SchemaBuilder.struct.name("record").version(1)
      .field("available", Schema.BOOLEAN_SCHEMA)
      .field("name", Schema.STRING_SCHEMA)
      .field("age", Schema.INT32_SCHEMA).build
    val value = new Struct(schema).put("name", "user").put("available", false).put("age", 15)
    val record = new SinkRecord("test_kfk", 1, SchemaBuilder.struct.build, "key", schema, value, 0)

    schema.asColumnNames should be (sc.schema.columnNames)

    sc.schema.route.topic should be (record.topic)
    sc.schema is record should be (true)

    sc.query.cql should be ("INSERT INTO keyspacex.tablex(available,name,age) VALUES(?,?,?)")
    val query = record.as(sc.schema.namespace)
    query.cql should be("INSERT INTO keyspacex.tablex(available,name,age) VALUES(false,'user',15)")
  }

  it should "convert cassandra column defs to a source schema" in {
    val colDef = Map(
      "id" -> DataType.cint(),
      "name" -> DataType.varchar())

    val columns = TestUtil.getColumnDef(colDef)
    val expectedSchema = SchemaBuilder.struct()
      .field("id", Schema.INT32_SCHEMA)
      .field("name", Schema.STRING_SCHEMA).build()

    columns.asSchema should be(expectedSchema)
  }

  it should "convert kafka schema and struct to cassandra columns and schema mapping" in {
    import scala.collection.JavaConverters._
    val topic = "a"
    val route = InternalConfig.Route(TaskConfig.SinkRoute + topic, "ks1.t1").get
    val schemaMap = new InternalConfig.Schema(route, Nil, Nil, Nil, List("available","name","age"), "")

    val schema = SchemaBuilder.struct.name("record").version(1)
      .field("available", Schema.BOOLEAN_SCHEMA)
      .field("name", Schema.STRING_SCHEMA)
      .field("age", Schema.INT32_SCHEMA).build
    val struct = new Struct(schema).put("name", "user").put("available", false).put("age", 15)
    val record = new SinkRecord(topic, 1, SchemaBuilder.struct.build, "key", schema, value, 0)

    schema.asColumnNames should ===(schemaMap.columnNames)
    schemaMap.columnNames should ===(schema.fields.asScala.map(_.name).toList)
    schemaMap is record should be (true)
  }
} 
Example 26
Source File: IotMessageConverter.scala    From toketi-kafka-connect-iothub   with MIT License 5 votes vote down vote up
// Copyright (c) Microsoft. All rights reserved.

package com.microsoft.azure.iot.kafka.connect.source

import java.time.Instant
import java.util.Date

import com.microsoft.azure.eventhubs.impl.AmqpConstants
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

object IotMessageConverter {

  val offsetKey = "offset"

  private val schemaName          = "iothub.kafka.connect"
  private val schemaVersion       = 1
  private val deviceIdKey         = "deviceId"
  private val contentTypeKey      = "contentType"
  private val sequenceNumberKey   = "sequenceNumber"
  private val enqueuedTimeKey     = "enqueuedTime"
  private val contentKey          = "content"
  private val systemPropertiesKey = "systemProperties"
  private val propertiesKey       = "properties"
  private val deviceIdIotHubKey   = "iothub-connection-device-id"

  // Public for testing purposes
  lazy val schema: Schema = SchemaBuilder.struct()
    .name(schemaName)
    .version(schemaVersion)
    .field(deviceIdKey, Schema.STRING_SCHEMA)
    .field(offsetKey, Schema.STRING_SCHEMA)
    .field(contentTypeKey, Schema.OPTIONAL_STRING_SCHEMA)
    .field(enqueuedTimeKey, Schema.STRING_SCHEMA)
    .field(sequenceNumberKey, Schema.INT64_SCHEMA)
    .field(contentKey, Schema.STRING_SCHEMA)
    .field(systemPropertiesKey, propertiesMapSchema)
    .field(propertiesKey, propertiesMapSchema)

  private lazy val propertiesMapSchema: Schema = SchemaBuilder.map(Schema.STRING_SCHEMA, Schema.STRING_SCHEMA)

  def getIotMessageStruct(iotMessage: IotMessage): Struct = {

    val systemProperties = iotMessage.systemProperties
    val deviceId: String = getOrDefaultAndRemove(systemProperties, deviceIdIotHubKey, "")
    val offset: String = getOrDefaultAndRemove(systemProperties, AmqpConstants.OFFSET_ANNOTATION_NAME, "")
    val sequenceNumber: Long = getOrDefaultAndRemove(systemProperties, AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME, 0)
    val enqueuedTime: Option[Instant] = getEnqueuedTime(systemProperties)
    val enqueuedTimeStr = if(enqueuedTime.isDefined) enqueuedTime.get.toString else ""

    val properties = iotMessage.properties
    val contentType: String = getOrDefaultAndRemove(properties, contentTypeKey, "")

    val systemPropertiesMap = systemProperties.map(i => (i._1, i._2.toString))

    new Struct(schema)
      .put(deviceIdKey, deviceId)
      .put(offsetKey, offset)
      .put(contentTypeKey, contentType)
      .put(enqueuedTimeKey, enqueuedTimeStr)
      .put(sequenceNumberKey, sequenceNumber)
      .put(contentKey, iotMessage.content)
      .put(systemPropertiesKey, systemPropertiesMap.asJava)
      .put(propertiesKey, properties.asJava)
  }

  private def getEnqueuedTime(map: scala.collection.mutable.Map[String, Object]): Option[Instant] = {
    val enqueuedTimeValue: Date = getOrDefaultAndRemove(map, AmqpConstants.ENQUEUED_TIME_UTC_ANNOTATION_NAME, null)
    if (enqueuedTimeValue != null) Some(enqueuedTimeValue.toInstant) else None
  }

  private def getOrDefaultAndRemove[T: ClassTag, S: ClassTag](map: scala.collection.mutable.Map[String, S],
      key: String, defaultVal: T): T = {

    if (map.contains(key)) {
      val retVal: T = map(key).asInstanceOf[T]
      map.remove(key)
      retVal
    } else {
      defaultVal
    }
  }
} 
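The converter above leans on two SchemaBuilder features: optional fields (the content type) and map-valued fields (the two property maps). Below is a minimal, self-contained sketch of that pattern using only the standard Connect API; it is independent of the IotMessage type and the names are illustrative only.

import java.util.Collections

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object MapFieldSketch extends App {
  // Map-valued field schema, mirroring propertiesMapSchema above
  val propsSchema: Schema = SchemaBuilder.map(Schema.STRING_SCHEMA, Schema.STRING_SCHEMA).build()

  val schema: Schema = SchemaBuilder.struct()
    .name("sketch.telemetry")
    .field("deviceId", Schema.STRING_SCHEMA)
    .field("contentType", Schema.OPTIONAL_STRING_SCHEMA) // may be left unset
    .field("properties", propsSchema)
    .build()

  val struct = new Struct(schema)
    .put("deviceId", "device-1")
    .put("properties", Collections.singletonMap("firmware", "1.0.2"))

  struct.validate() // required fields are set, the optional one defaults to null
  println(struct)
}
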
Example 27
Source File: ConnectMongoConverterSpec.scala    From kafka-connect-mongodb   with Apache License 2.0 5 votes vote down vote up
package com.startapp.data

import java.lang.Boolean
import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.scalatest.{FlatSpec, Matchers}

class ConnectMongoConverterSpec extends FlatSpec with Matchers {
  private val FIELD1_NAME = "fieldInt"
  private val FIELD1_VALUE = new Integer(5)
  private val FIELD2_NAME = "fieldString"
  private val FIELD2_VALUE = "str"
  private val FIELD3_NAME = "fieldBoolean"
  private val FIELD3_VALUE = new Boolean(true)

  val schema = SchemaBuilder.struct().name("test schema")
    .field(FIELD1_NAME, Schema.INT32_SCHEMA)
    .field(FIELD2_NAME, Schema.STRING_SCHEMA)
    .field(FIELD3_NAME, Schema.BOOLEAN_SCHEMA)
    .build()

  "No Schema Connect Mongo Converter Bad Data" should "throw an exception" in {
    var exceptionThrown = false

    val badData = new Struct(schema)

    try{
      checkJsonMap(NoSchemaConnectMongoConverter, badData)
    }
    catch {
      case _ : java.lang.ClassCastException => exceptionThrown = true
    }

    exceptionThrown should be(true)
  }

  "No Schema Connect Mongo Converter Good Data" should "return the same map" in {
    val jsonMap = new util.HashMap[String, Object]()
    jsonMap.put(FIELD1_NAME, FIELD1_VALUE)
    jsonMap.put(FIELD2_NAME, FIELD2_VALUE)
    jsonMap.put(FIELD3_NAME, FIELD3_VALUE)

    checkJsonMap(NoSchemaConnectMongoConverter, jsonMap)
  }

  "Schema Connect Mongo Converter Bad Data" should "throw an exception" in {
    var exceptionThrown = false

    val badData = new util.HashMap[String, Object]()
    badData.put(FIELD1_NAME, FIELD1_VALUE)

    try {
      checkJsonMap(SchemaConnectMongoConverter, badData)
    }
    catch {
      case _ : java.lang.ClassCastException => exceptionThrown = true
    }

    exceptionThrown should be(true)
  }

  "Schema Connect Mongo Converter Good Data" should "convert data to json map" in {
    val data = new Struct(schema)
      .put(FIELD1_NAME, FIELD1_VALUE)
      .put(FIELD2_NAME, FIELD2_VALUE)
      .put(FIELD3_NAME, FIELD3_VALUE)

    checkJsonMap(SchemaConnectMongoConverter, data)
  }

  private def checkJsonMap(converter : ConnectMongoConverter, value: Object): Unit ={
    val newJsonMap = converter.toJsonMap(value).toMap

    newJsonMap(FIELD1_NAME) should be(FIELD1_VALUE)
    newJsonMap(FIELD2_NAME) should be(FIELD2_VALUE)
    newJsonMap(FIELD3_NAME) should be(FIELD3_VALUE)
  }

} 
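A hedged usage sketch for the converters exercised above. It assumes, as the spec implies, that SchemaConnectMongoConverter.toJsonMap accepts a Struct and returns key/value pairs convertible with .toMap; the exact return type is not shown in this listing.

import com.startapp.data.SchemaConnectMongoConverter
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object ConnectMongoConverterSketch extends App {
  val schema = SchemaBuilder.struct()
    .field("fieldInt", Schema.INT32_SCHEMA)
    .field("fieldString", Schema.STRING_SCHEMA)
    .build()

  val struct = new Struct(schema)
    .put("fieldInt", 7)
    .put("fieldString", "hello")

  // As in the "Good Data" case above: a fully populated Struct converts cleanly
  val jsonMap = SchemaConnectMongoConverter.toJsonMap(struct).toMap
  println(jsonMap("fieldInt"))    // 7
  println(jsonMap("fieldString")) // hello
}
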
Example 28
Source File: Input.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.blockchain.data

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

case class Input(sequence: Long, prev_out: Option[Output], script: String)

object Input {
  val ConnectSchema: Schema = SchemaBuilder.struct
    .name("datamountaineer.blockchain.input")
    .doc("The input instance part of a transaction.")
    .field("sequence", Schema.INT64_SCHEMA)
    .field("prev_out", Output.ConnectSchema)
    .field("script", Schema.STRING_SCHEMA)
    .build()

  implicit class InputToStructConverter(val input: Input) extends AnyVal {
    def toStruct(): Struct = {
      val struct = new Struct(ConnectSchema)
        .put("sequence", input.sequence)
        .put("script", input.script)

      input.prev_out.foreach(po=>struct.put("prev_out", po.toStruct()))
      struct
    }
  }

} 
Example 29
Source File: Output.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.blockchain.data

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

case class Output(addr_tag_link: Option[String],
                  addr_tag: Option[String],
                  spent: Boolean,
                  tx_index: Long,
                  `type`: Int,
                  addr: Option[String],
                  value: Long,
                  n: Int,
                  script: String)

object Output {

  val ConnectSchema: Schema = SchemaBuilder.struct
    .name("datamountaineer.blockchain.output")
    .doc("The output instance part of a transaction.")
    .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA)
    .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA)
    .field("spent", Schema.BOOLEAN_SCHEMA)
    .field("tx_index", Schema.INT64_SCHEMA)
    .field("type", Schema.INT32_SCHEMA)
    .field("addr", Schema.OPTIONAL_STRING_SCHEMA)
    .field("value", Schema.INT64_SCHEMA)
    .field("n", Schema.INT32_SCHEMA)
    .field("script", Schema.STRING_SCHEMA)
    .build()

  implicit class OutputToStructConverter(val output: Output) extends AnyVal {
    def toStruct(): Struct = {
      val struct = new Struct(ConnectSchema)
        .put("spent", output.spent)
        .put("tx_index", output.tx_index)
        .put("type", output.`type`)
        .put("value", output.value)
        .put("n", output.n)
        .put("script", output.script)
      output.addr.foreach(struct.put("addr", _))
      output.addr_tag.foreach(struct.put("addr_tag", _))
      output.addr_tag_link.foreach(struct.put("addr_tag_link", _))
      struct
    }
  }

} 
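A short usage sketch for the implicit OutputToStructConverter above; the values are illustrative only.

import com.datamountaineer.streamreactor.connect.blockchain.data.Output
import com.datamountaineer.streamreactor.connect.blockchain.data.Output._

object OutputToStructSketch extends App {
  // Illustrative values only
  val output = Output(
    addr_tag_link = None,
    addr_tag      = None,
    spent         = false,
    tx_index      = 12345L,
    `type`        = 0,
    addr          = Some("1BoatSLRHtKNngkdXEeobR76b53LETtpyT"),
    value         = 50000L,
    n             = 0,
    script        = "76a914...88ac")

  val struct = output.toStruct()
  println(struct.getString("addr")) // populated because the Option was Some
  println(struct.get("addr_tag"))   // null: optional field left unset for None
}
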
Example 30
Source File: Transaction.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.blockchain.data

import java.util

import com.datamountaineer.streamreactor.connect.blockchain.data.Input._
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.source.SourceRecord

case class Transaction(lock_time: Long,
                       ver: Int,
                       size: Long,
                       inputs: Seq[Input],
                       rbf: Option[Boolean],
                       time: Long,
                       tx_index: Long,
                       vin_sz: Int,
                       hash: String,
                       vout_sz: Int,
                       relayed_by: String,
                       out: Seq[Output])


object Transaction {
  val ConnectSchema: Schema = SchemaBuilder.struct
    .name("datamountaineer.blockchain.transaction")
    .field("lock_time", Schema.INT64_SCHEMA)
    .field("ver", Schema.INT32_SCHEMA)
    .field("size", Schema.INT64_SCHEMA)
    .field("inputs", SchemaBuilder.array(Input.ConnectSchema).optional().build())
    .field("rbf", Schema.OPTIONAL_BOOLEAN_SCHEMA)
    .field("time", Schema.INT64_SCHEMA)
    .field("tx_index", Schema.INT64_SCHEMA)
    .field("vin_sz", Schema.INT32_SCHEMA)
    .field("hash", Schema.STRING_SCHEMA)
    .field("vout_sz", Schema.INT32_SCHEMA)
    .field("relayed_by", Schema.STRING_SCHEMA)
    .field("out", SchemaBuilder.array(Output.ConnectSchema).optional().build())
    .build()

  implicit class TransactionToSourceRecordConverter(val tx: Transaction) extends AnyVal {
    def toSourceRecord(topic: String, partition: Int, key: Option[String]): SourceRecord = {
      new SourceRecord(
        null,
        null,
        topic,
        partition,
        key.map(_ => Schema.STRING_SCHEMA).orNull,
        key.orNull,
        ConnectSchema,
        tx.toStruct()
      )
    }

    //private def getOffset() = Collections.singletonMap("position", System.currentTimeMillis())

    def toStruct(): Struct = {
      val struct = new Struct(ConnectSchema)
        .put("lock_time", tx.lock_time)
        .put("ver", tx.ver)
        .put("size", tx.size)
        .put("time", tx.time)
        .put("tx_index", tx.tx_index)
        .put("vin_sz", tx.vin_sz)
        .put("hash", tx.hash)
        .put("vout_sz", tx.vout_sz)
        .put("relayed_by", tx.relayed_by)

      tx.out.headOption.foreach { _ =>
        import scala.collection.JavaConverters._
        struct.put("out", tx.out.map(_.toStruct()).asJava)
      }
      tx.rbf.foreach(struct.put("rbf", _))
      tx.inputs.headOption.foreach { _ =>
        val inputs = new util.ArrayList[Struct]
        tx.inputs.foreach(i => inputs.add(i.toStruct()))
        struct.put("inputs", inputs)
      }

      struct
    }
  }

} 
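A hedged sketch of driving the converter above end to end: build a Transaction (values are illustrative, with empty inputs/outputs so the optional array fields stay unset) and turn it into a Connect SourceRecord.

import com.datamountaineer.streamreactor.connect.blockchain.data.Transaction
import com.datamountaineer.streamreactor.connect.blockchain.data.Transaction._

object TransactionSourceRecordSketch extends App {
  // Illustrative values only
  val tx = Transaction(
    lock_time = 0L, ver = 1, size = 225L,
    inputs = Nil, rbf = None, time = 1500000000L,
    tx_index = 98765L, vin_sz = 1, hash = "abc123",
    vout_sz = 1, relayed_by = "0.0.0.0", out = Nil)

  val record = tx.toSourceRecord(topic = "blockchain", partition = 0, key = Some("abc123"))
  println(record.topic())       // blockchain
  println(record.valueSchema()) // Transaction.ConnectSchema
  println(record.key())         // abc123, with a STRING key schema because a key was supplied
}
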
Example 31
Source File: Input.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink

import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

case class Input(sequence: Long, prev_out: Option[Output], script: String) {
  def toHashMap: util.HashMap[String, Any] = {
    val map = new util.HashMap[String, Any]()
    map.put("sequence", sequence)
    prev_out.foreach(p => map.put("prev_out", p.toHashMap))
    map.put("script", script)
    map
  }
}

object Input {
  val ConnectSchema = SchemaBuilder.struct
    .name("input")
    .doc("The input instance part of a transaction.")
    .field("sequence", Schema.INT64_SCHEMA)
    .field("prev_out", Output.ConnectSchema)
    .field("script", Schema.STRING_SCHEMA)
    .build()

  implicit class InputToStructConverter(val input: Input) extends AnyVal {
    def toStruct() = {
      val struct = new Struct(ConnectSchema)
        .put("sequence", input.sequence)
        .put("script", input.script)

      input.prev_out.foreach(po => struct.put("prev_out", po.toStruct()))
      struct
    }
  }

} 
Example 32
Source File: Output.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink

import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

case class Output(addr_tag_link: Option[String],
                  addr_tag: Option[String],
                  spent: Boolean,
                  tx_index: Long,
                  `type`: Int,
                  addr: Option[String],
                  value: Long,
                  n: Int,
                  script: String) {

  def toHashMap: util.HashMap[String, Any] = {
    val map = new util.HashMap[String, Any]()
    addr_tag_link.foreach(map.put("addr_tag_link", _))
    addr_tag_link.foreach(map.put("addr_tag", _))
    map.put("spent", spent)
    map.put("tx_index", tx_index)
    map.put("type", `type`)
    addr.foreach(map.put("addr", _))
    map.put("value", value)
    map.put("n", n)
    map.put("script", script)
    map
  }

}

object Output {

  val ConnectSchema: Schema = SchemaBuilder.struct
    .name("output")
    .doc("The output instance part of a transaction.")
    .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA)
    .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA)
    .field("spent", Schema.BOOLEAN_SCHEMA)
    .field("tx_index", Schema.INT64_SCHEMA)
    .field("type", Schema.OPTIONAL_INT32_SCHEMA)
    .field("addr", Schema.OPTIONAL_STRING_SCHEMA)
    .field("value", Schema.INT64_SCHEMA)
    .field("n", Schema.INT32_SCHEMA)
    .field("script", Schema.STRING_SCHEMA)
    .build()

  implicit class OutputToStructConverter(val output: Output) extends AnyVal {
    def toStruct() = {
      val struct = new Struct(ConnectSchema)
        .put("spent", output.spent)
        .put("tx_index", output.tx_index)
        .put("type", output.`type`)
        .put("value", output.value)
        .put("n", output.n)
        .put("script", output.script)
      output.addr.foreach(struct.put("addr", _))
      output.addr_tag.foreach(struct.put("addr_tag", _))
      output.addr_tag_link.foreach(struct.put("addr_tag_link", _))
      struct
    }
  }

} 
Example 33
Source File: Input.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.mongodb

import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

case class Input(sequence: Long, prev_out: Option[Output], script: String) {
  def toHashMap: util.HashMap[String, Any] = {
    val map = new util.HashMap[String, Any]()
    map.put("sequence", sequence)
    prev_out.foreach(p => map.put("prev_out", p.toHashMap))
    map.put("script", script)
    map
  }
}

object Input {
  val ConnectSchema: Schema = SchemaBuilder.struct
    .name("datamountaineer.blockchain.input")
    .doc("The input instance part of a transaction.")
    .field("sequence", Schema.INT64_SCHEMA)
    .field("prev_out", Output.ConnectSchema)
    .field("script", Schema.STRING_SCHEMA)
    .build()

  implicit class InputToStructConverter(val input: Input) extends AnyVal {
    def toStruct(): Struct = {
      val struct = new Struct(ConnectSchema)
        .put("sequence", input.sequence)
        .put("script", input.script)

      input.prev_out.foreach(po => struct.put("prev_out", po.toStruct()))
      struct
    }
  }

} 
Example 34
Source File: Output.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.mongodb

import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

case class Output(addr_tag_link: Option[String],
                  addr_tag: Option[String],
                  spent: Boolean,
                  tx_index: Long,
                  `type`: Int,
                  addr: Option[String],
                  value: Long,
                  n: Int,
                  script: String) {

  def toHashMap: util.HashMap[String, Any] = {
    val map = new util.HashMap[String, Any]()
    addr_tag_link.foreach(map.put("addr_tag_link", _))
    addr_tag_link.foreach(map.put("addr_tag", _))
    map.put("spent", spent)
    map.put("tx_index", tx_index)
    map.put("type", `type`)
    addr.foreach(map.put("addr", _))
    map.put("value", value)
    map.put("n", n)
    map.put("script", script)
    map
  }

}

object Output {

  val ConnectSchema: Schema = SchemaBuilder.struct
    .name("datamountaineer.blockchain.output")
    .doc("The output instance part of a transaction.")
    .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA)
    .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA)
    .field("spent", Schema.BOOLEAN_SCHEMA)
    .field("tx_index", Schema.INT64_SCHEMA)
    .field("type", Schema.OPTIONAL_INT32_SCHEMA)
    .field("addr", Schema.OPTIONAL_STRING_SCHEMA)
    .field("value", Schema.INT64_SCHEMA)
    .field("n", Schema.INT32_SCHEMA)
    .field("script", Schema.STRING_SCHEMA)
    .build()

  implicit class OutputToStructConverter(val output: Output) extends AnyVal {
    def toStruct() = {
      val struct = new Struct(ConnectSchema)
        .put("spent", output.spent)
        .put("tx_index", output.tx_index)
        .put("type", output.`type`)
        .put("value", output.value)
        .put("n", output.n)
        .put("script", output.script)
      output.addr.foreach(struct.put("addr", _))
      output.addr_tag.foreach(struct.put("addr_tag", _))
      output.addr_tag_link.foreach(struct.put("addr_tag_link", _))
      struct
    }
  }

} 
Example 35
Source File: Input.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.cassandra.sink

import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

case class Input(sequence: Long, prev_out: Option[Output], script: String) {
  def toHashMap: util.HashMap[String, Any] = {
    val map = new util.HashMap[String, Any]()
    map.put("sequence", sequence)
    prev_out.foreach(p => map.put("prev_out", p.toHashMap))
    map.put("script", script)
    map
  }
}

object Input {
  val ConnectSchema = SchemaBuilder.struct
    .name("datamountaineer.blockchain.input")
    .doc("The input instance part of a transaction.")
    .field("sequence", Schema.INT64_SCHEMA)
    .field("prev_out", Output.ConnectSchema)
    .field("script", Schema.STRING_SCHEMA)
    .build()

  implicit class InputToStructConverter(val input: Input) extends AnyVal {
    def toStruct() = {
      val struct = new Struct(ConnectSchema)
        .put("sequence", input.sequence)
        .put("script", input.script)

      input.prev_out.foreach(po => struct.put("prev_out", po.toStruct()))
      struct
    }
  }

} 
Example 36
Source File: CassandraSinkTaskSpec.scala    From kafka-connect-cassandra   with Apache License 2.0 5 votes vote down vote up
package com.tuplejump.kafka.connect.cassandra

import scala.collection.JavaConverters._
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.{SinkRecord, SinkTaskContext}

class CassandraSinkTaskSpec extends AbstractFlatSpec {

  val topicName = "test_kv_topic"
  val tableName = "test.kv"
  val config = sinkProperties(Map(topicName -> tableName))

  it should "start sink task" in {
    val sinkTask = new CassandraSinkTask()
    val mockContext = mock[SinkTaskContext]

    sinkTask.initialize(mockContext)
    sinkTask.start(config.asJava)
    sinkTask.stop()
  }

  it should "save records in cassandra" in {
    val sinkTask = new CassandraSinkTask()
    val mockContext = mock[SinkTaskContext]

    sinkTask.initialize(mockContext)
    sinkTask.start(config.asJava)

    val valueSchema = SchemaBuilder.struct.name("record").version(1)
      .field("key", Schema.STRING_SCHEMA)
      .field("value", Schema.INT32_SCHEMA).build
    val value1 = new Struct(valueSchema).put("key", "pqr").put("value", 15)
    val value2 = new Struct(valueSchema).put("key", "abc").put("value", 17)

    val record1 = new SinkRecord(topicName, 1, SchemaBuilder.struct.build, "key", valueSchema, value1, 0)
    val record2 = new SinkRecord(topicName, 1, SchemaBuilder.struct.build, "key", valueSchema, value2, 0)

    sinkTask.put(List(record1, record2).asJavaCollection)

    sinkTask.stop()

    val cc = CassandraCluster.local
    val session = cc.session
    val result = session.execute(s"select count(1) from $tableName").one()
    val rowCount = result.getLong(0)
    rowCount should be(2)
    cc.shutdown()
  }
} 
Example 37
Source File: StructFieldsRowKeyBuilderTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.hbase

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class StructFieldsRowKeyBuilderTest extends AnyWordSpec with Matchers {
  "StructFieldsRowKeyBuilder" should {
    "raise an exception if the field is not present in the struct" in {
      intercept[IllegalArgumentException] {
        val schema = SchemaBuilder.struct().name("com.example.Person")
          .field("firstName", Schema.STRING_SCHEMA)
          .field("age", Schema.INT32_SCHEMA)
          .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

        val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

        val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
        //val field = Field("threshold", "threshold", false)

        StructFieldsRowKeyBuilderBytes(List("threshold")).build(sinkRecord, null)
      }
    }

    "create the row key based on one single field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      //val field = Field("firstName", "firstName", true)
      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StructFieldsRowKeyBuilderBytes(List("firstName")).build(sinkRecord, null) shouldBe "Alex".fromString
    }

    "create the row key based on more thant one field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      //val field = Field("firstName", "firstName", true)
      //val field2 = Field("age", "age", true)
      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StructFieldsRowKeyBuilderBytes(List("firstName", "age")).build(sinkRecord, null) shouldBe
        Bytes.add("Alex".fromString(), "\n".fromString(), 30.fromInt())
    }
  }
} 
Example 38
Source File: ValueConverter.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

object ValueConverter {
  def apply(record: SinkRecord): Struct = record.value match {
    case struct: Struct => StructValueConverter.convert(struct)
    case map: Map[_, _] => MapValueConverter.convert(map)
    case map: java.util.Map[_, _] => MapValueConverter.convert(map.asScala.toMap)
    case string: String => StringValueConverter.convert(string)
    case other => sys.error(s"Unsupported record $other:${other.getClass.getCanonicalName}")
  }
}

trait ValueConverter[T] {
  def convert(value: T): Struct
}

object StructValueConverter extends ValueConverter[Struct] {
  override def convert(struct: Struct): Struct = struct
}

object MapValueConverter extends ValueConverter[Map[_, _]] {
  def convertValue(value: Any, key: String, builder: SchemaBuilder): Any = {
    value match {
      case s: String =>
        builder.field(key, Schema.OPTIONAL_STRING_SCHEMA)
        s
      case l: Long =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        l
      case i: Int =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        i.toLong
      case b: Boolean =>
        builder.field(key, Schema.OPTIONAL_BOOLEAN_SCHEMA)
        b
      case f: Float =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        f.toDouble
      case d: Double =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        d
      case innerMap: java.util.Map[_, _] =>
        val innerStruct = convert(innerMap.asScala.toMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct

      case innerMap: Map[_, _] =>
        val innerStruct = convert(innerMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct
    }
  }

  def convert(map: Map[_, _], optional: Boolean) = {
    val builder = SchemaBuilder.struct()
    val values = map.map { case (k, v) =>
      val key = k.toString
      val value = convertValue(v, key, builder)
      key -> value
    }.toList
    if (optional) builder.optional()
    val schema = builder.build
    val struct = new Struct(schema)
    values.foreach { case (key, value) =>
      struct.put(key.toString, value)
    }
    struct
  }
  override def convert(map: Map[_, _]): Struct = convert(map, false)
}

object StringValueConverter extends ValueConverter[String] {
  override def convert(string: String): Struct = {
    val schema = SchemaBuilder.struct().field("a", Schema.OPTIONAL_STRING_SCHEMA).name("struct").build()
    new Struct(schema).put("a", string)
  }
} 
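A minimal sketch of the schemaless path above: a SinkRecord whose value is a java.util.Map goes through MapValueConverter, which infers an optional schema per entry (note that Int values are widened to INT64).

import java.util

import com.landoop.streamreactor.connect.hive.sink.ValueConverter
import org.apache.kafka.connect.data.Struct
import org.apache.kafka.connect.sink.SinkRecord

object ValueConverterSketch extends App {
  val payload = new util.HashMap[String, Any]()
  payload.put("name", "sam")
  payload.put("age", 42)

  // No key/value schemas: the converter builds one from the map entries
  val record = new SinkRecord("mytopic", 0, null, null, null, payload, 0L)

  val struct: Struct = ValueConverter(record)
  println(struct.schema().fields()) // name: optional string, age: optional int64
  println(struct.get("age"))        // 42 (stored as a Long)
}
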
Example 39
Source File: DropPartitionValuesMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import com.landoop.streamreactor.connect.hive.{PartitionPlan, StructMapper}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}


class DropPartitionValuesMapper(plan: PartitionPlan) extends StructMapper {

  import scala.collection.JavaConverters._

  override def map(input: Struct): Struct = {
    val partitionKeys = plan.keys.map(_.value).toList
    val dataFields = input.schema.fields().asScala.filterNot(field => partitionKeys.contains(field.name))
    val builder = dataFields.foldLeft(SchemaBuilder.struct) { (builder, field) =>
      builder.field(field.name, field.schema)
    }
    val schema = builder.build()
    dataFields.foldLeft(new Struct(schema)) { (struct, field) =>
      struct.put(field.name, input.get(field.name))
    }
  }
} 
Example 40
Source File: ProjectionMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import cats.data.NonEmptyList
import com.datamountaineer.kcql.Field
import com.landoop.streamreactor.connect.hive.StructMapper
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}


class ProjectionMapper(projection: NonEmptyList[Field]) extends StructMapper {

  override def map(input: Struct): Struct = {
    // the compatible output schema built from projected fields with aliases applied
    val builder = projection.foldLeft(SchemaBuilder.struct) { (builder, kcqlField) =>
      Option(input.schema.field(kcqlField.getName)).fold(sys.error(s"Missing field $kcqlField")) { field =>
        builder.field(kcqlField.getAlias, field.schema)
      }
    }
    val schema = builder.build()
    projection.foldLeft(new Struct(schema)) { (struct, field) =>
      struct.put(field.getAlias, input.get(field.getName))
    }
  }
} 
Example 41
Source File: PartitionValueMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.source.mapper

import com.landoop.streamreactor.connect.hive.{Partition, StructMapper}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

import scala.collection.JavaConverters._

class PartitionValueMapper(partition: Partition) extends StructMapper {
  override def map(input: Struct): Struct = {

    val builder = SchemaBuilder.struct()
    input.schema.fields.asScala.foreach { field =>
      builder.field(field.name, field.schema)
    }
    partition.entries.toList.foreach { entry =>
      builder.field(entry._1.value, Schema.STRING_SCHEMA)
    }
    val schema = builder.build()

    val struct = new Struct(schema)
    input.schema.fields.asScala.foreach { field =>
      struct.put(field.name, input.get(field.name))
    }
    partition.entries.toList.foreach { entry =>
      struct.put(entry._1.value, entry._2)
    }
    struct
  }
} 
Example 42
Source File: ProjectionMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.source.mapper

import cats.data.NonEmptyList
import com.landoop.streamreactor.connect.hive.StructMapper
import com.landoop.streamreactor.connect.hive.source.config.ProjectionField
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}

class ProjectionMapper(projection: NonEmptyList[ProjectionField]) extends StructMapper {

  override def map(input: Struct): Struct = {
    val builder = projection.foldLeft(SchemaBuilder.struct) { (builder, projectionField) =>
      Option(input.schema.field(projectionField.name))
        .fold(sys.error(s"Projection field ${projectionField.name} cannot be found in input")) { field =>
          builder.field(projectionField.alias, field.schema)
        }
    }
    val schema = builder.build()
    projection.foldLeft(new Struct(schema)) { (struct, field) =>
      struct.put(field.alias, input.get(field.name))
    }
  }
} 
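A hedged usage sketch for the source-side ProjectionMapper above. ProjectionField(name, alias) is assumed from the name/alias accessors used in the mapper; check the actual constructor in the source config package.

import cats.data.NonEmptyList
import com.landoop.streamreactor.connect.hive.source.config.ProjectionField
import com.landoop.streamreactor.connect.hive.source.mapper.ProjectionMapper
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object ProjectionMapperSketch extends App {
  val inputSchema = SchemaBuilder.struct()
    .field("id", Schema.INT32_SCHEMA)
    .field("full_name", Schema.STRING_SCHEMA)
    .build()

  val input = new Struct(inputSchema).put("id", 1).put("full_name", "sam")

  // Keep only full_name, renamed to "name"; an unknown field would trigger the sys.error above
  val projection = NonEmptyList.of(ProjectionField("full_name", "name"))
  val projected  = new ProjectionMapper(projection).map(input)

  println(projected.schema().fields()) // single field: name (string)
  println(projected.getString("name")) // sam
}
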
Example 43
Source File: DefaultCommitPolicyTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.staging

import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartitionOffset}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.concurrent.duration._

class DefaultCommitPolicyTest extends AnyWordSpec with Matchers {

  val schema: Schema = SchemaBuilder.struct()
    .field("name", SchemaBuilder.string().required().build())
    .build()

  val struct = new Struct(schema)

  implicit val conf: Configuration = new Configuration()
  implicit val fs: LocalFileSystem = FileSystem.getLocal(conf)
  val tpo = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100))

  private def shouldFlush(policy: CommitPolicy, path: Path, count: Long) = {
    val status = fs.getFileStatus(path)
    policy.shouldFlush(CommitContext(tpo, path, count, status.getLen, status.getModificationTime))
  }

  "DefaultCommitPolicy" should {
    "roll over after interval" in {

      val policy = DefaultCommitPolicy(None, Option(2.seconds), None)
      val path = new Path("foo")
      fs.create(path)

      shouldFlush(policy, path, 10) shouldBe false
      Thread.sleep(2000)
      shouldFlush(policy, path, 10) shouldBe true

      fs.delete(path, false)
    }
    "roll over after file count" in {
      val policy = DefaultCommitPolicy(None, None, Some(9))
      val path = new Path("foo")
      fs.create(path)

      shouldFlush(policy, path, 7) shouldBe false
      shouldFlush(policy, path, 8) shouldBe false
      shouldFlush(policy, path, 9) shouldBe true
      shouldFlush(policy, path, 10) shouldBe true

      fs.delete(path, false)
    }
    "roll over after file size" in {
      val policy = DefaultCommitPolicy(Some(10), None, None)
      val path = new Path("foo")
      val out = fs.create(path)
      shouldFlush(policy, path, 7) shouldBe false
      out.writeBytes("wibble wobble wabble wubble")
      out.close()
      shouldFlush(policy, path, 9) shouldBe true
      fs.delete(path, false)
    }
  }
} 
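A hedged configuration sketch for DefaultCommitPolicy; the argument order (file size, interval, record count) is read off the three test cases above and should be verified against the actual constructor.

import scala.concurrent.duration._

import com.landoop.streamreactor.connect.hive.sink.staging.DefaultCommitPolicy

object CommitPolicySketch extends App {
  val policy = DefaultCommitPolicy(
    Some(128L * 1024 * 1024), // flush once the staging file reaches ~128 MiB...
    Some(30.minutes),         // ...or has been open for 30 minutes...
    Some(100000L)             // ...or holds 100,000 records, whichever comes first
  )
  println(policy)
}
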
Example 44
Source File: DropPartitionValuesMapperTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import cats.data.NonEmptyList
import com.landoop.streamreactor.connect.hive.{PartitionKey, PartitionPlan, TableName}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import scala.collection.JavaConverters._

class DropPartitionValuesMapperTest extends AnyFunSuite with Matchers {

  test("strip partition values") {

    val schema = SchemaBuilder.struct()
      .field("a", SchemaBuilder.string().required().build())
      .field("p", SchemaBuilder.string().required().build())
      .field("q", SchemaBuilder.string().required().build())
      .field("z", SchemaBuilder.string().required().build())
      .build()

    val plan = PartitionPlan(TableName("foo"), NonEmptyList.of(PartitionKey("p"), PartitionKey("q")))
    val struct = new Struct(schema).put("a", "a").put("p", "p").put("q", "q").put("z", "z")
    val output = new DropPartitionValuesMapper(plan).map(struct)
    output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "z")
  }

  test("handle partition field is missing in input") {

    val schema = SchemaBuilder.struct()
      .field("a", SchemaBuilder.string().required().build())
      .field("q", SchemaBuilder.string().required().build())
      .field("z", SchemaBuilder.string().required().build())
      .build()


    val plan = PartitionPlan(TableName("foo"), NonEmptyList.of(PartitionKey("p"), PartitionKey("q")))
    val struct = new Struct(schema).put("a", "a").put("q", "q").put("z", "z")
    val output = new DropPartitionValuesMapper(plan).map(struct)
    output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "z")
  }
} 
Example 45
Source File: MetastoreSchemaAlignMapperTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import scala.collection.JavaConverters._

class MetastoreSchemaAlignMapperTest extends AnyFunSuite with Matchers {

  test("pad optional missing fields with null") {

    val recordSchema = SchemaBuilder.struct()
      .field("a", SchemaBuilder.string().required().build())
      .field("b", SchemaBuilder.string().required().build())
      .field("c", SchemaBuilder.string().required().build())
      .build()

    val struct = new Struct(recordSchema).put("a", "a").put("b", "b").put("c", "c")

    val metastoreSchema = SchemaBuilder.struct()
      .field("a", SchemaBuilder.string().required().build())
      .field("b", SchemaBuilder.string().required().build())
      .field("c", SchemaBuilder.string().required().build())
      .field("z", SchemaBuilder.string().optional().build())
      .build()

    val output = new MetastoreSchemaAlignMapper(metastoreSchema).map(struct)
    output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "b", "c", "z")
  }

  test("drop fields not specified in metastore") {

    val recordSchema = SchemaBuilder.struct()
      .field("a", SchemaBuilder.string().required().build())
      .field("b", SchemaBuilder.string().required().build())
      .field("c", SchemaBuilder.string().required().build())
      .build()

    val struct = new Struct(recordSchema).put("a", "a").put("b", "b").put("c", "c")

    val metastoreSchema = SchemaBuilder.struct()
      .field("a", SchemaBuilder.string().required().build())
      .field("b", SchemaBuilder.string().required().build())
      .build()

    val output = new MetastoreSchemaAlignMapper(metastoreSchema).map(struct)
    output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "b")
  }
} 
Example 46
Source File: ParquetWriterTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.parquet

import com.landoop.streamreactor.connect.hive.StructUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class ParquetWriterTest extends AnyWordSpec with Matchers {

  implicit val conf = new Configuration()
  implicit val fs = FileSystem.getLocal(conf)

  "ParquetWriter" should {
    "write parquet files" in {

      val schema = SchemaBuilder.struct()
        .field("name", SchemaBuilder.string().required().build())
        .field("title", SchemaBuilder.string().optional().build())
        .field("salary", SchemaBuilder.float64().optional().build())
        .build()

      val users = List(
        new Struct(schema).put("name", "sam").put("title", "mr").put("salary", 100.43),
        new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06)
      )

      val path = new Path("sinktest.parquet")

      val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
      users.foreach(writer.write)
      writer.close()

      val reader = parquetReader(path)
      val actual = Iterator.continually(reader.read).takeWhile(_ != null).toList
      reader.close()

      actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues)

      fs.delete(path, false)
    }
    "support writing nulls" in {

      val schema = SchemaBuilder.struct()
        .field("name", SchemaBuilder.string().required().build())
        .field("title", SchemaBuilder.string().optional().build())
        .field("salary", SchemaBuilder.float64().optional().build())
        .build()

      val users = List(
        new Struct(schema).put("name", "sam").put("title", null).put("salary", 100.43),
        new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06)
      )

      val path = new Path("sinktest.parquet")

      val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
      users.foreach(writer.write)
      writer.close()

      val reader = parquetReader(path)
      val actual = Iterator.continually(reader.read).takeWhile(_ != null).toList
      reader.close()

      actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues)

      fs.delete(path, false)
    }
  }
}