org.apache.kafka.connect.data.Schema Scala Examples
The following examples show how to use org.apache.kafka.connect.data.Schema.
Each example is taken from an open-source project; the source file, project, and license are noted above each listing.
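For orientation before the project examples, here is a minimal, self-contained sketch of the core API they all rely on: a Schema is usually assembled with SchemaBuilder and then used to populate and validate a Struct. The schema name "com.example.User" and its fields below are illustrative only and are not taken from any of the projects.

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object SchemaUsageSketch extends App {
  // Build a STRUCT schema with one required and one optional field.
  val userSchema: Schema = SchemaBuilder.struct()
    .name("com.example.User")
    .field("name", Schema.STRING_SCHEMA)
    .field("age", Schema.OPTIONAL_INT32_SCHEMA)
    .build()

  // Populate a Struct conforming to the schema; validate() throws if a required field is missing.
  val user = new Struct(userSchema)
    .put("name", "Alice")
    .put("age", 42)
  user.validate()

  // Schemas can be inspected at runtime, which is what most of the examples below do.
  println(userSchema.`type`())                          // STRUCT
  println(userSchema.field("age").schema().isOptional)  // true
}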
Example 1
Source File: OrcSink.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.orc

import com.landoop.streamreactor.connect.hive.orc.vectors.{OrcVectorWriter, StructVectorWriter}
import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, StructUtils}
import com.typesafe.scalalogging.StrictLogging
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.collection.JavaConverters._

class OrcSink(path: Path,
              schema: Schema,
              config: OrcSinkConfig)(implicit fs: FileSystem) extends StrictLogging {

  private val typeDescription = OrcSchemas.toOrc(schema)
  private val structWriter = new StructVectorWriter(typeDescription.getChildren.asScala.map(OrcVectorWriter.fromSchema))
  private val batch = typeDescription.createRowBatch(config.batchSize)
  private val vector = new StructColumnVector(batch.numCols, batch.cols: _*)
  private val orcWriter = createOrcWriter(path, typeDescription, config)
  private var n = 0

  def flush(): Unit = {
    logger.debug(s"Writing orc batch [size=$n, path=$path]")
    batch.size = n
    orcWriter.addRowBatch(batch)
    orcWriter.writeIntermediateFooter
    batch.reset()
    n = 0
  }

  def write(struct: Struct): Unit = {
    structWriter.write(vector, n, Some(StructUtils.extractValues(struct)))
    n = n + 1
    if (n == config.batchSize) flush()
  }

  def close(): Unit = {
    if (n > 0) flush()
    orcWriter.close()
  }
}
Example 2
Source File: RootGroupConverter.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.parquet

import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.io.api.{Converter, GroupConverter}

import scala.collection.JavaConverters._

class RootGroupConverter(schema: Schema) extends GroupConverter with StrictLogging {
  require(schema.`type`() == Schema.Type.STRUCT)

  var struct: Struct = _
  private val builder = scala.collection.mutable.Map.empty[String, Any]
  private val converters = schema.fields.asScala.map(Converters.get(_, builder)).toIndexedSeq

  override def getConverter(k: Int): Converter = converters(k)
  override def start(): Unit = builder.clear()
  override def end(): Unit = struct = {
    val struct = new Struct(schema)
    schema.fields.asScala.map { field =>
      val value = builder.getOrElse(field.name, null)
      try {
        struct.put(field, value)
      } catch {
        case t: Exception => throw t
      }
    }
    struct
  }
}
Example 3
Source File: PartitionValueMapper.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.source.mapper

import com.landoop.streamreactor.connect.hive.{Partition, StructMapper}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

import scala.collection.JavaConverters._

class PartitionValueMapper(partition: Partition) extends StructMapper {
  override def map(input: Struct): Struct = {

    val builder = SchemaBuilder.struct()
    input.schema.fields.asScala.foreach { field =>
      builder.field(field.name, field.schema)
    }
    partition.entries.toList.foreach { entry =>
      builder.field(entry._1.value, Schema.STRING_SCHEMA)
    }
    val schema = builder.build()

    val struct = new Struct(schema)
    input.schema.fields.asScala.foreach { field =>
      struct.put(field.name, input.get(field.name))
    }
    partition.entries.toList.foreach { entry =>
      struct.put(entry._1.value, entry._2)
    }
    struct
  }
}
Example 4
Source File: MetastoreSchemaAlignMapper.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink.mapper

import com.landoop.streamreactor.connect.hive.StructMapper
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.util.Try

class MetastoreSchemaAlignMapper(schema: Schema) extends StructMapper {

  import scala.collection.JavaConverters._

  override def map(input: Struct): Struct = {
    // hive converts everything to lowercase
    val inputFieldsMapping = input.schema().fields().asScala.map { f =>
      f.name().toLowerCase() -> f.name()
    }.toMap

    val struct = schema.fields.asScala.foldLeft(new Struct(schema)) { (struct, field) =>
      Try(input.get(inputFieldsMapping(field.name))).toOption match {
        case Some(value) => struct.put(field.name, value)
        case None if field.schema.isOptional => struct.put(field.name, null)
        case None => sys.error(s"Cannot map struct to required schema; ${field.name} is missing, no default value has been supplied and null is not permitted")
      }
    }
    struct
  }
}
Example 5
Source File: HiveWriterManager.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink

import com.landoop.streamreactor.connect.hive.{Offset, TopicPartition, TopicPartitionOffset}
import com.landoop.streamreactor.connect.hive.formats.{HiveFormat, HiveWriter}
import com.landoop.streamreactor.connect.hive.sink.staging.StageManager
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.Schema

  def flush(offsets: Map[TopicPartition, Offset]): Unit = {
    logger.info(s"Flushing offsets $offsets")
    // we may not have an offset for a given topic/partition if no data was written to that TP
    writers.foreach { case (key, writer) =>
      writer.close()
      offsets.get(key.tp).foreach { offset =>
        stageManager.commit(writer.file, key.tp.withOffset(offset))
      }
      writers.remove(key)
    }
  }

  def getWriters: Seq[OpenWriter] = writers.map { case (key, writer) => OpenWriter(key.tp, key.dir, writer) }.toList
}
Example 6
Source File: ValueConverter.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

object ValueConverter {
  def apply(record: SinkRecord): Struct = record.value match {
    case struct: Struct => StructValueConverter.convert(struct)
    case map: Map[_, _] => MapValueConverter.convert(map)
    case map: java.util.Map[_, _] => MapValueConverter.convert(map.asScala.toMap)
    case string: String => StringValueConverter.convert(string)
    case other => sys.error(s"Unsupported record $other:${other.getClass.getCanonicalName}")
  }
}

trait ValueConverter[T] {
  def convert(value: T): Struct
}

object StructValueConverter extends ValueConverter[Struct] {
  override def convert(struct: Struct): Struct = struct
}

object MapValueConverter extends ValueConverter[Map[_, _]] {
  def convertValue(value: Any, key: String, builder: SchemaBuilder): Any = {
    value match {
      case s: String =>
        builder.field(key, Schema.OPTIONAL_STRING_SCHEMA)
        s
      case l: Long =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        l
      case i: Int =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        i.toLong
      case b: Boolean =>
        builder.field(key, Schema.OPTIONAL_BOOLEAN_SCHEMA)
        b
      case f: Float =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        f.toDouble
      case d: Double =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        d
      case innerMap: java.util.Map[_, _] =>
        val innerStruct = convert(innerMap.asScala.toMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct
      case innerMap: Map[_, _] =>
        val innerStruct = convert(innerMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct
    }
  }

  def convert(map: Map[_, _], optional: Boolean) = {
    val builder = SchemaBuilder.struct()
    val values = map.map { case (k, v) =>
      val key = k.toString
      val value = convertValue(v, key, builder)
      key -> value
    }.toList
    if (optional) builder.optional()
    val schema = builder.build
    val struct = new Struct(schema)
    values.foreach { case (key, value) =>
      struct.put(key.toString, value)
    }
    struct
  }

  override def convert(map: Map[_, _]): Struct = convert(map, false)
}

object StringValueConverter extends ValueConverter[String] {
  override def convert(string: String): Struct = {
    val schema = SchemaBuilder.struct().field("a", Schema.OPTIONAL_STRING_SCHEMA).name("struct").build()
    new Struct(schema).put("a", string)
  }
}
Example 7
Source File: AddEvolutionPolicy.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink.evolution

import com.landoop.streamreactor.connect.hive.{DatabaseName, HiveSchemas, TableName}
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.kafka.connect.data.Schema

import scala.collection.JavaConverters._
import scala.util.Try

object AddEvolutionPolicy extends EvolutionPolicy {

  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def evolve(dbName: DatabaseName,
                      tableName: TableName,
                      metastoreSchema: Schema,
                      inputSchema: Schema)
                     (implicit client: IMetaStoreClient): Try[Schema] = Try {

    val missing = inputSchema.fields.asScala
      .filter(f => metastoreSchema.field(f.name) == null)
      .map(HiveSchemas.toFieldSchema)

    if (missing.nonEmpty) {
      logger.info(s"Evolving hive metastore to add: ${missing.mkString(",")}")

      val table = client.getTable(dbName.value, tableName.value)
      val cols = table.getSd.getCols
      missing.foreach(field => cols.add(field))
      table.getSd.setCols(cols)
      client.alter_table(dbName.value, tableName.value, table)

      HiveSchemas.toKafka(client.getTable(dbName.value, tableName.value))
    } else {
      metastoreSchema
    }
  }
}
Example 8
Source File: IgnoreEvolutionPolicy.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink.evolution

import com.landoop.streamreactor.connect.hive.{DatabaseName, HiveSchemas, TableName}
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.kafka.connect.data.Schema

import scala.collection.JavaConverters._
import scala.util.Try

object IgnoreEvolutionPolicy extends EvolutionPolicy {

  override def evolve(dbName: DatabaseName,
                      tableName: TableName,
                      metastoreSchema: Schema,
                      inputSchema: Schema)
                     (implicit client: IMetaStoreClient): Try[Schema] = Try {
    HiveSchemas.toKafka(client.getTable(dbName.value, tableName.value))
  }.map { schema =>
    val compatible = schema.fields().asScala.forall { field =>
      inputSchema.field(field.name) != null ||
        field.schema().isOptional ||
        field.schema().defaultValue() != null
    }
    if (compatible) schema else sys.error("Input Schema is not compatible with the metastore")
  }
}
Example 9
Source File: StrictEvolutionPolicy.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink.evolution

import com.landoop.streamreactor.connect.hive.{DatabaseName, HiveSchemas, TableName}
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.kafka.connect.data.Schema

import scala.collection.JavaConverters._
import scala.util.Try

object StrictEvolutionPolicy extends EvolutionPolicy {

  override def evolve(dbName: DatabaseName,
                      tableName: TableName,
                      metastoreSchema: Schema,
                      inputSchema: Schema)
                     (implicit client: IMetaStoreClient): Try[Schema] = Try {
    val schema = HiveSchemas.toKafka(client.getTable(dbName.value, tableName.value))
    schema
  }.map { schema =>
    // Hive keeps the fields in lowercase
    val inputFields = inputSchema.fields().asScala.map { f =>
      f.name().toLowerCase()
    }.toSet

    schema.fields().asScala.foreach { field =>
      val exists = inputFields.contains(field.name)
      val optional = field.schema().isOptional
      val default = field.schema().defaultValue()
      val compatible = exists || optional || default != null
      if (!compatible) {
        sys.error(s"Input Schema is not compatible with the metastore for field [${field.name()}]")
      }
    }
    schema
  }
}
Example 10
Source File: HiveSinkState.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink

import com.landoop.streamreactor.connect.hive
import com.landoop.streamreactor.connect.hive._
import com.landoop.streamreactor.connect.hive.sink.config.TableOptions
import com.landoop.streamreactor.connect.hive.sink.mapper.{DropPartitionValuesMapper, MetastoreSchemaAlignMapper, ProjectionMapper}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.hadoop.hive.metastore.api.Table
import org.apache.kafka.connect.data.{Schema, Struct}

case class HiveSinkState(offsets: Map[TopicPartition, Offset],
                         committedOffsets: Map[TopicPartition, Offset],
                         table: Table,
                         tableLocation: Path,
                         plan: Option[PartitionPlan],
                         metastoreSchema: Schema,
                         mapper: Struct => Struct,
                         lastSchema: Schema) {

  def withTopicPartitionOffset(tpo: TopicPartitionOffset): HiveSinkState = {
    copy(offsets = offsets + (tpo.toTopicPartition -> tpo.offset))
  }

  def withTopicPartitionOffset(tp: TopicPartition, offset: Offset): HiveSinkState = {
    copy(offsets = offsets + (tp -> offset))
  }

  def withCommittedOffset(offsets: Map[TopicPartition, Offset]): HiveSinkState = {
    copy(committedOffsets = committedOffsets ++ offsets)
  }

  def withCommittedOffset(tp: TopicPartition, offset: Offset): HiveSinkState = {
    copy(committedOffsets = committedOffsets + (tp -> offset))
  }

  def withLastSchema(schema: Schema): HiveSinkState = copy(lastSchema = schema)
}

object HiveSinkState {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  def from(schema: Schema,
           table: TableOptions,
           dbName: DatabaseName)(implicit client: IMetaStoreClient, fs: FileSystem) = {
    logger.info(s"Init sink for schema $schema")

    val hiveTable = getOrCreateTable(table, dbName, schema)
    val tableLocation = new Path(hiveTable.getSd.getLocation)
    val plan = hive.partitionPlan(hiveTable)
    val metastoreSchema = table.evolutionPolicy
      .evolve(dbName, table.tableName, HiveSchemas.toKafka(hiveTable), schema)
      .getOrElse(sys.error(s"Unable to retrieve or evolve schema for $schema"))

    val mapperFns: Seq[Struct => Struct] = Seq(
      table.projection.map(new ProjectionMapper(_)),
      Some(new MetastoreSchemaAlignMapper(metastoreSchema)),
      plan.map(new DropPartitionValuesMapper(_))
    ).flatten.map(mapper => mapper.map _)

    val mapper = Function.chain(mapperFns)

    HiveSinkState(Map.empty, Map.empty, hiveTable, tableLocation, plan, metastoreSchema, mapper, schema)
  }

  def getOrCreateTable(table: TableOptions, dbName: DatabaseName, schema: Schema)
                      (implicit client: IMetaStoreClient, fs: FileSystem): Table = {

    def create: Table = {
      val partstring = if (table.partitions.isEmpty) "<no-partitions>" else table.partitions.mkString(",")
      logger.info(s"Creating table in hive [${dbName.value}.${table.tableName.value}, partitions=$partstring]")
      hive.createTable(dbName, table.tableName, schema, table.partitions, table.location, table.format)
    }

    logger.debug(s"Fetching or creating table ${dbName.value}.${table.tableName.value}")

    client.tableExists(dbName.value, table.tableName.value) match {
      case true if table.overwriteTable =>
        hive.dropTable(dbName, table.tableName, true)
        create
      case true => client.getTable(dbName.value, table.tableName.value)
      case false if table.createTable => create
      case false => throw new RuntimeException(s"Table ${dbName.value}.${table.tableName.value} does not exist")
    }
  }
}
Example 11
Source File: domain.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive

import cats.Show
import cats.data.NonEmptyList
import org.apache.hadoop.fs.Path
import org.apache.kafka.common.{TopicPartition => KafkaTopicPartition}
import org.apache.kafka.connect.data.Schema

case class Topic(value: String) {
  require(value != null && value.trim.nonEmpty)
}

case class Offset(value: Long) {
  require(value >= 0)
}

case class TopicPartition(topic: Topic, partition: Int) {
  def withOffset(offset: Offset): TopicPartitionOffset = TopicPartitionOffset(topic, partition, offset)
  def toKafka = new KafkaTopicPartition(topic.value, partition)
}

case class TopicPartitionOffset(topic: Topic, partition: Int, offset: Offset) {
  def toTopicPartition = TopicPartition(topic, partition)
}

case class DatabaseName(value: String) {
  require(value != null && value.trim.nonEmpty)
}

case class TableName(value: String) {
  require(value != null && value.trim.nonEmpty)
}

// contains all the partition keys for a particular table
case class PartitionPlan(tableName: TableName, keys: NonEmptyList[PartitionKey])

// contains a partition key, which you can think of as like a partition column name
case class PartitionKey(value: String)

// defines a partition key field
case class PartitionField(name: String, schema: Schema = Schema.STRING_SCHEMA, comment: Option[String] = None) {
  require(name != null && name.trim.nonEmpty)
}

// contains a single partition in a table, that is one set of unique values, one per partition key
case class Partition(entries: NonEmptyList[(PartitionKey, String)], location: Option[Path])

case class Serde(serializationLib: String, inputFormat: String, outputFormat: String, params: Map[String, String])

// generates the default hive metastore location string for a partition
object DefaultPartitionLocation extends Show[Partition] {
  override def show(t: Partition): String = {
    t.entries.map { case (key, value) => key.value + "=" + value }.toList.mkString("/")
  }
}
Example 12
Source File: package.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.Schema
import org.apache.orc.OrcFile.EncodingStrategy
import org.apache.orc._

package object orc {

  def createOrcWriter(path: Path, schema: TypeDescription, config: OrcSinkConfig)
                     (implicit fs: FileSystem): Writer = {

    val options = OrcFile.writerOptions(null, fs.getConf).setSchema(schema)

    options.compress(config.compressionKind)
    options.encodingStrategy(config.encodingStrategy)
    options.blockPadding(config.blockPadding)
    options.version(OrcFile.Version.V_0_12)

    config.bloomFilterColumns.map(_.mkString(",")).foreach(options.bloomFilterColumns)
    config.rowIndexStride.foreach(options.rowIndexStride)
    config.blockSize.foreach(options.blockSize)
    config.stripeSize.foreach(options.stripeSize)

    if (config.overwrite && fs.exists(path)) fs.delete(path, false)

    OrcFile.createWriter(path, options)
  }

  def source(path: Path, config: OrcSourceConfig)(implicit fs: FileSystem) = new OrcSource(path, config)

  def sink(path: Path, schema: Schema, config: OrcSinkConfig)(implicit fs: FileSystem) = new OrcSink(path, schema, config)
}

case class OrcSourceConfig()

case class OrcSinkConfig(overwrite: Boolean = false,
                         batchSize: Int = 1024, // orc default is 1024
                         encodingStrategy: EncodingStrategy = EncodingStrategy.COMPRESSION,
                         compressionKind: CompressionKind = CompressionKind.SNAPPY,
                         blockPadding: Boolean = true,
                         blockSize: Option[Long] = None,
                         stripeSize: Option[Long] = None,
                         bloomFilterColumns: Seq[String] = Nil,
                         rowIndexStride: Option[Int] = None)
Example 13
Source File: NestedGroupConverter.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.parquet

import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.{Field, Schema}
import org.apache.parquet.io.api.{Converter, GroupConverter}

import scala.collection.JavaConverters._

class NestedGroupConverter(schema: Schema,
                           field: Field,
                           parentBuilder: scala.collection.mutable.Map[String, Any])
  extends GroupConverter with StrictLogging {

  private[parquet] val builder = scala.collection.mutable.Map.empty[String, Any]
  private val converters = schema.fields.asScala.map(Converters.get(_, builder)).toIndexedSeq

  override def getConverter(k: Int): Converter = converters(k)
  override def start(): Unit = builder.clear()
  override def end(): Unit = parentBuilder.put(field.name, builder.result)
}
Example 14
Source File: OrcSchemas.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.orc

import com.landoop.streamreactor.connect.hive.UnsupportedSchemaType
import org.apache.kafka.connect.data.{Decimal, Schema, SchemaBuilder}
import org.apache.orc.TypeDescription
import org.apache.orc.TypeDescription.Category

import scala.collection.JavaConverters._

object OrcSchemas {

  def toKafka(schema: TypeDescription): Schema = schema.getCategory match {
    case Category.BOOLEAN => Schema.OPTIONAL_BOOLEAN_SCHEMA
    case Category.BYTE => Schema.OPTIONAL_INT8_SCHEMA
    case Category.DOUBLE => Schema.OPTIONAL_FLOAT64_SCHEMA
    case Category.INT => Schema.OPTIONAL_INT32_SCHEMA
    case Category.FLOAT => Schema.OPTIONAL_FLOAT32_SCHEMA
    case Category.LONG => Schema.OPTIONAL_INT64_SCHEMA
    case Category.SHORT => Schema.OPTIONAL_INT16_SCHEMA
    case Category.STRING => Schema.OPTIONAL_STRING_SCHEMA
    case Category.VARCHAR => Schema.OPTIONAL_STRING_SCHEMA
    case Category.CHAR => Schema.OPTIONAL_STRING_SCHEMA
    case Category.DATE => Schema.OPTIONAL_STRING_SCHEMA
    case Category.TIMESTAMP => Schema.OPTIONAL_STRING_SCHEMA
    // ORC binary maps to Connect BYTES; the listing showed an unreachable duplicate Category.BYTE case here
    case Category.BINARY => Schema.OPTIONAL_BYTES_SCHEMA
    case Category.STRUCT => toKafkaStruct(schema)
  }

  def toKafkaStruct(schema: TypeDescription): Schema = {
    import scala.collection.JavaConverters._
    val builder = SchemaBuilder.struct().name("from_orc")
    schema.getFieldNames.asScala.zipWithIndex.foreach { case (field, k) =>
      builder.field(field, toKafka(schema.getChildren.get(k)))
    }
    builder.build()
  }

  def toOrc(schema: Schema): TypeDescription = {
    schema.`type`() match {
      case Schema.Type.STRING if schema.name() == Decimal.LOGICAL_NAME => TypeDescription.createDecimal()
      case Schema.Type.STRING => TypeDescription.createString()
      case Schema.Type.BOOLEAN => TypeDescription.createBoolean()
      case Schema.Type.FLOAT32 => TypeDescription.createFloat()
      case Schema.Type.FLOAT64 => TypeDescription.createDouble()
      case Schema.Type.INT8 => TypeDescription.createByte()
      case Schema.Type.INT16 => TypeDescription.createShort()
      case Schema.Type.INT32 => TypeDescription.createInt()
      case Schema.Type.INT64 => TypeDescription.createLong()
      case Schema.Type.BYTES if schema.name() == Decimal.LOGICAL_NAME => TypeDescription.createDecimal()
      case Schema.Type.BYTES => TypeDescription.createBinary()
      case Schema.Type.ARRAY => TypeDescription.createList(toOrc(schema.valueSchema()))
      case Schema.Type.MAP => TypeDescription.createMap(toOrc(schema.keySchema()), toOrc(schema.valueSchema()))
      case Schema.Type.STRUCT =>
        schema.fields().asScala.foldLeft(TypeDescription.createStruct) { case (struct, field) =>
          struct.addField(field.name, toOrc(field.schema))
        }
      case unsupportedDataType => throw UnsupportedSchemaType(unsupportedDataType.toString)
    }
  }
}
Example 15
Source File: OrcHiveFormat.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, OrcSourceConfig, Serde}
import com.landoop.streamreactor.connect.hive.orc.OrcSink
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.util.Try

object OrcHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.orc.OrcSerde",
    "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
    "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat",
    Map("org.apache.hadoop.hive.ql.io.orc.OrcSerde" -> "1")
  )

  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {

    logger.debug(s"Creating orc writer at $path")

    val sink: OrcSink = com.landoop.streamreactor.connect.hive.orc.sink(path, schema, OrcSinkConfig(overwrite = true))
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTimestamp: Long = System.currentTimeMillis()
    var lastKnownFileSize: Long = fs.getFileStatus(path).getLen
    var readFileSize = false
    var count = 0

    override def write(struct: Struct): Long = {
      sink.write(struct)
      count = count + 1
      readFileSize = true
      count
    }

    override def close(): Unit = {
      logger.debug(s"Closing orc writer at path $path")
      sink.close()
    }

    override def file: Path = path
    override def currentCount: Long = count
    override def createdTime: Long = createdTimestamp

    override def fileSize: Long = {
      if (readFileSize) {
        lastKnownFileSize = fs.getFileStatus(path).getLen
        readFileSize = false
      }
      lastKnownFileSize
    }
  }

  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {

    logger.debug(s"Creating orc reader for $path with offset $startAt")

    val reader = com.landoop.streamreactor.connect.hive.orc.source(path, OrcSourceConfig())
    var offset = startAt

    override def iterator: Iterator[Record] = reader.iterator.map { struct =>
      val record = Record(struct, path, offset)
      offset = offset + 1
      record
    }

    override def close(): Unit = reader.close()
  }
}
Example 16
Source File: ParquetHiveFormat.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.Serde
import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.ParquetWriter

import scala.util.Try

object ParquetHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
    Map("serialization.format" -> "1")
  )

  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {

    logger.debug(s"Creating parquet writer at $path")

    val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTimestamp: Long = System.currentTimeMillis()
    var lastKnownFileSize: Long = fs.getFileStatus(path).getLen
    var readFileSize = false
    var count = 0

    override def write(struct: Struct): Long = {
      writer.write(struct)
      count = count + 1
      readFileSize = true
      count
    }

    override def close(): Unit = {
      logger.debug(s"Closing writer at path $path")
      writer.close()
    }

    override def currentCount: Long = count
    override def file: Path = path
    override def createdTime: Long = createdTimestamp

    override def fileSize: Long = {
      if (readFileSize) {
        lastKnownFileSize = fs.getFileStatus(path).getLen
        readFileSize = false
      }
      lastKnownFileSize
    }
  }

  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {

    logger.debug(s"Creating parquet reader for $path with offset $startAt")

    val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path)
    var offset = startAt

    override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct =>
      val record = Record(struct, path, offset)
      offset = offset + 1
      record
    }

    override def close(): Unit = reader.close()
  }
}
Example 17
Source File: RedisStreamTest.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.redis.sink.writer

/*
 * Copyright 2017 Datamountaineer.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util

import com.datamountaineer.streamreactor.connect.redis.sink.RedisSinkTask
import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.{Jedis, StreamEntryID}

import scala.collection.JavaConverters._

class RedisStreamTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {

  // val redisServer = new RedisServer(6379)
  //
  // override def beforeAll() = redisServer.start()
  //
  // override def afterAll() = redisServer.stop()

  "Redis Stream writer" should {
    "write Kafka records to a Redis Stream" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"INSERT INTO stream1 SELECT * from $TOPIC STOREAS STREAM"
      println("Testing KCQL : " + KCQL)
      val props = Map(
        RedisConfigConstants.REDIS_HOST -> "localhost",
        RedisConfigConstants.REDIS_PORT -> "6379",
        RedisConfigConstants.KCQL_CONFIG -> KCQL,
        RedisConfigConstants.REDIS_PASSWORD -> ""
      ).asJava

      val config = RedisConfig(props)
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisStreams(settings)

      val schema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA).build()

      val struct1 = new Struct(schema).put("type", "Xeon").put("temperature", 60.4).put("voltage", 90.1).put("ts", 1482180657010L)

      val sinkRecord1 = new SinkRecord(TOPIC, 0, null, null, schema, struct1, 1)

      val jedis = mock[Jedis]
      writer.jedis = jedis

      val map = new util.HashMap[String, String]()
      map.put("type", "Xeon")
      map.put("temperature", "60.4")
      map.put("voltage", "90.1")
      map.put("ts", 1482180657010L.toString)

      when(jedis.auth("")).isLenient()
      when(jedis.xadd("stream1", null, map)).thenReturn(mock[StreamEntryID])
      writer.initialize(1, settings.errorPolicy)

      writer.write(Seq(sinkRecord1))
    }
  }
}
Example 18
Source File: RedisPubSubTest.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.{Jedis, JedisPubSub}
import redis.embedded.RedisServer

import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer

class RedisPubSubTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {

  val redisServer = new RedisServer(6379)

  override def beforeAll() = redisServer.start()

  override def afterAll() = redisServer.stop()

  "Redis PUBSUB writer" should {
    "write Kafka records to a Redis PubSub" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"SELECT * from $TOPIC STOREAS PubSub (channel=type)"
      println("Testing KCQL : " + KCQL)
      val props = Map(
        RedisConfigConstants.REDIS_HOST -> "localhost",
        RedisConfigConstants.REDIS_PORT -> "6379",
        RedisConfigConstants.KCQL_CONFIG -> KCQL
      ).asJava

      val config = RedisConfig(props)
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisPubSub(settings)
      writer.createClient(settings)

      val schema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA).build()

      val struct1 = new Struct(schema).put("type", "Xeon").put("temperature", 60.4).put("voltage", 90.1).put("ts", 1482180657010L)
      val struct2 = new Struct(schema).put("type", "i7").put("temperature", 62.1).put("voltage", 103.3).put("ts", 1482180657020L)
      val struct3 = new Struct(schema).put("type", "i7-i").put("temperature", 64.5).put("voltage", 101.1).put("ts", 1482180657030L)

      val sinkRecord1 = new SinkRecord(TOPIC, 0, null, null, schema, struct1, 1)
      val sinkRecord2 = new SinkRecord(TOPIC, 0, null, null, schema, struct2, 2)
      val sinkRecord3 = new SinkRecord(TOPIC, 0, null, null, schema, struct3, 3)

      val jedis = new Jedis(connectionInfo.host, connectionInfo.port)
      // Clean up in-memory jedis
      jedis.flushAll()

      val messagesMap = collection.mutable.Map[String, ListBuffer[String]]()

      val t = new Thread {
        private val pubsub = new JedisPubSub {
          override def onMessage(channel: String, message: String): Unit = {
            messagesMap.get(channel) match {
              case Some(msgs) => messagesMap.put(channel, msgs += message)
              case None => messagesMap.put(channel, ListBuffer(message))
            }
          }
        }

        override def run(): Unit = {
          jedis.subscribe(pubsub, "Xeon", "i7", "i7-i")
        }

        override def interrupt(): Unit = {
          pubsub.punsubscribe("*")
          super.interrupt()
        }
      }
      t.start()
      t.join(5000)
      if (t.isAlive) t.interrupt()

      writer.write(Seq(sinkRecord1))
      writer.write(Seq(sinkRecord2, sinkRecord3))

      messagesMap.size shouldBe 3
      messagesMap("Xeon").head shouldBe """{"type":"Xeon","temperature":60.4,"voltage":90.1,"ts":1482180657010}"""
      messagesMap("i7").head shouldBe """{"type":"i7","temperature":62.1,"voltage":103.3,"ts":1482180657020}"""
      messagesMap("i7-i").head shouldBe """{"type":"i7-i","temperature":64.5,"voltage":101.1,"ts":1482180657030}"""
    }
  }
}
Example 19
Source File: RedisInsertSortedSetTest.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.Jedis
import redis.embedded.RedisServer

import scala.collection.JavaConverters._

class RedisInsertSortedSetTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {

  val redisServer = new RedisServer(6379)

  override def beforeAll() = redisServer.start()

  override def afterAll() = redisServer.stop()

  "Redis INSERT into Sorted Set (SS) writer" should {
    "write Kafka records to a Redis Sorted Set" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"INSERT INTO cpu_stats SELECT * from $TOPIC STOREAS SortedSet(score=ts)"
      println("Testing KCQL : " + KCQL)
      val props = Map(
        RedisConfigConstants.REDIS_HOST -> "localhost",
        RedisConfigConstants.REDIS_PORT -> "6379",
        RedisConfigConstants.KCQL_CONFIG -> KCQL
      ).asJava

      val config = RedisConfig(props)
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisInsertSortedSet(settings)
      writer.createClient(settings)

      val schema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA).build()

      val struct1 = new Struct(schema).put("type", "Xeon").put("temperature", 60.4).put("voltage", 90.1).put("ts", 1482180657010L)
      val struct2 = new Struct(schema).put("type", "i7").put("temperature", 62.1).put("voltage", 103.3).put("ts", 1482180657020L)
      val struct3 = new Struct(schema).put("type", "i7-i").put("temperature", 64.5).put("voltage", 101.1).put("ts", 1482180657030L)

      val sinkRecord1 = new SinkRecord(TOPIC, 0, null, null, schema, struct1, 1)
      val sinkRecord2 = new SinkRecord(TOPIC, 0, null, null, schema, struct2, 2)
      val sinkRecord3 = new SinkRecord(TOPIC, 0, null, null, schema, struct3, 3)

      val jedis = new Jedis(connectionInfo.host, connectionInfo.port)
      // Clean up in-memory jedis
      jedis.flushAll()

      writer.write(Seq(sinkRecord1))
      writer.write(Seq(sinkRecord2, sinkRecord3))

      // Redis cardinality should now be 3
      jedis.zcard("cpu_stats") shouldBe 3

      val allSSrecords = jedis.zrange("cpu_stats", 0, 999999999999L)
      val results = allSSrecords.asScala.toList
      results.head shouldBe """{"type":"Xeon","temperature":60.4,"voltage":90.1,"ts":1482180657010}"""
      results(1) shouldBe """{"type":"i7","temperature":62.1,"voltage":103.3,"ts":1482180657020}"""
      results(2) shouldBe """{"type":"i7-i","temperature":64.5,"voltage":101.1,"ts":1482180657030}"""
    }
  }
}
Example 20
Source File: RedisFieldsKeyBuilder.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.rowkeys.StringKeyBuilder
import org.apache.kafka.connect.data.{Field, Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.annotation.tailrec
import scala.collection.JavaConverters._

  override def build(record: SinkRecord): String = {
    val struct: Struct = record.value.asInstanceOf[Struct]
    val schema: Schema = struct.schema

    def extractAvailableFieldNames(schema: Schema): Seq[String] = {
      if (schema.`type` == Schema.Type.STRUCT) {
        val fields = schema.fields
        fields.asScala.map(_.name) ++ fields.asScala.flatMap { f =>
          extractAvailableFieldNames(f.schema).map(name => f.name + "." + name)
        }
      } else Seq.empty
    }

    val availableFields = extractAvailableFieldNames(schema)
    val missingKeys = keys.filterNot(availableFields.contains)
    require(
      missingKeys.isEmpty,
      s"${missingKeys.mkString(",")} keys are not present in the SinkRecord payload: ${availableFields.mkString(", ")}"
    )

    def getValue(key: String): AnyRef = {
      @tailrec
      def findValue(keyParts: List[String], obj: AnyRef): Option[AnyRef] = (obj, keyParts) match {
        case (f: Field, k :: tail) => findValue(tail, f.schema.field(k))
        case (s: Struct, k :: tail) => findValue(tail, s.get(k))
        case (v, _) => Option(v)
      }

      findValue(key.split('.').toList, struct).getOrElse {
        throw new IllegalArgumentException(
          s"$key field value is null. Non null value is required for the fields creating the row key"
        )
      }
    }

    keys.map(getValue).mkString(pkDelimiter)
  }
}
Example 21
Source File: PulsarWriterTest.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.pulsar.sink

import com.datamountaineer.streamreactor.connect.pulsar.ProducerConfigFactory
import com.datamountaineer.streamreactor.connect.pulsar.config.{PulsarConfigConstants, PulsarSinkConfig, PulsarSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.pulsar.client.api.{Message, MessageId, Producer, PulsarClient}
import org.mockito.ArgumentMatchers.any
import org.mockito.MockitoSugar
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._

class PulsarWriterTest extends AnyWordSpec with MockitoSugar with Matchers {

  val pulsarTopic = "persistent://landoop/standalone/connect/kafka-topic"

  def getSchema: Schema = {
    SchemaBuilder.struct
      .field("int8", SchemaBuilder.int8().defaultValue(2.toByte).doc("int8 field").build())
      .field("int16", Schema.INT16_SCHEMA)
      .field("int32", Schema.INT32_SCHEMA)
      .field("int64", Schema.INT64_SCHEMA)
      .field("float32", Schema.FLOAT32_SCHEMA)
      .field("float64", Schema.FLOAT64_SCHEMA)
      .field("boolean", Schema.BOOLEAN_SCHEMA)
      .field("string", Schema.STRING_SCHEMA)
      .build()
  }

  def getStruct(schema: Schema): Struct = {
    new Struct(schema)
      .put("int8", 12.toByte)
      .put("int16", 12.toShort)
      .put("int32", 12)
      .put("int64", 12L)
      .put("float32", 12.2f)
      .put("float64", 12.2)
      .put("boolean", true)
      .put("string", "foo")
  }

  "should write messages" in {

    val config = PulsarSinkConfig(Map(
      PulsarConfigConstants.HOSTS_CONFIG -> "pulsar://localhost:6650",
      PulsarConfigConstants.KCQL_CONFIG -> s"INSERT INTO $pulsarTopic SELECT * FROM kafka_topic BATCH = 10 WITHPARTITIONER = SinglePartition WITHCOMPRESSION = ZLIB WITHDELAY = 1000"
    ).asJava)

    val schema = getSchema
    val struct = getStruct(schema)
    val record1 = new SinkRecord("kafka_topic", 0, null, null, schema, struct, 1)

    val settings = PulsarSinkSettings(config)
    val producerConfig = ProducerConfigFactory("test", settings.kcql)

    val client = mock[PulsarClient]
    val producer = mock[Producer]
    val messageId = mock[MessageId]

    when(client.createProducer(pulsarTopic, producerConfig(pulsarTopic))).thenReturn(producer)
    when(producer.send(any[Message])).thenReturn(messageId)

    val writer = PulsarWriter(client, "test", settings)
    writer.write(List(record1))
  }
}
Example 22
Source File: ChangeFeedStructBuilder.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.rethink.source

import com.fasterxml.jackson.databind.ObjectMapper
import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object ChangeFeedStructBuilder extends StrictLogging {

  val mapper = new ObjectMapper()
  val oldVal = "old_val"
  val newVal = "new_val"
  val state = "state"
  val `type` = "type"

  val schema: Schema = SchemaBuilder.struct.name("ReThinkChangeFeed")
    .version(1)
    .field(state, Schema.OPTIONAL_STRING_SCHEMA)
    .field(oldVal, Schema.OPTIONAL_STRING_SCHEMA)
    .field(newVal, Schema.OPTIONAL_STRING_SCHEMA)
    .field(`type`, Schema.OPTIONAL_STRING_SCHEMA)
    .build

  def apply(hm: Map[String, Object]): Struct = {
    val struct = new Struct(schema)
    hm.foreach({ case (k, v) => if (v != null) struct.put(k, v.toString) })
    struct
  }
}
Example 23
Source File: ConnectSchema.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.bloomberg

import org.apache.kafka.connect.data.{Schema, SchemaBuilder}

import scala.collection.JavaConverters._

  def createSchema(name: String, value: Any): Schema = {
    value match {
      case _: Boolean => Schema.BOOLEAN_SCHEMA
      case _: Int => Schema.INT32_SCHEMA
      case _: Long => Schema.INT64_SCHEMA
      case _: Double => Schema.FLOAT64_SCHEMA
      case _: Char => Schema.STRING_SCHEMA
      case _: String => Schema.STRING_SCHEMA
      case _: Float => Schema.FLOAT32_SCHEMA
      case list: java.util.List[_] =>
        val firstItemSchema = if (list.isEmpty) Schema.OPTIONAL_STRING_SCHEMA else createSchema(name, list.get(0))
        SchemaBuilder.array(firstItemSchema).build()
      case map: java.util.LinkedHashMap[String @unchecked, _] =>
        val recordBuilder = SchemaBuilder.struct()
        recordBuilder.name(name)
        map.entrySet().asScala.foreach(kvp => recordBuilder.field(kvp.getKey, createSchema(kvp.getKey, kvp.getValue)))
        recordBuilder.build()
      case v => sys.error(s"${v.getClass} is not handled.")
    }
  }
}

object ConnectSchema {
  val namespace = "com.datamountaineer.streamreactor.connect.bloomberg"

  val connectSchema = new ConnectSchema(namespace)

  implicit class BloombergDataToConnectSchema(val data: BloombergData) {
    def getConnectSchema: Schema = {
      connectSchema.createSchema("BloombergData", data.data)
    }
  }
}
Example 24
Source File: HiveSchemaTest.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.hive.it

import java.util.concurrent.TimeUnit

import com.landoop.streamreactor.connect.hive.{DatabaseName, TableName}
import org.apache.kafka.connect.data.Schema
import org.scalatest.concurrent.Eventually
import org.scalatest.matchers.should.Matchers
import org.scalatest.time.{Millis, Span}
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._
import scala.io.Source
import scala.util.Random

class HiveSchemaTest extends AnyWordSpec with Matchers with PersonTestData with Eventually with HiveTests {

  private implicit val patience: PatienceConfig = PatienceConfig(Span(60000, Millis), Span(5000, Millis))

  case class Foo(s: String, l: Long, b: Boolean, d: Double)
  def foo = Foo("string", Random.nextLong, Random.nextBoolean, Random.nextDouble)

  "Hive" should {
    "create correct schema for table" in {

      val topic = createTopic()
      val taskDef = Source.fromInputStream(getClass.getResourceAsStream("/hive_sink_task_no_partitions.json")).getLines().mkString("\n")
        .replace("{{TOPIC}}", topic)
        .replace("{{TABLE}}", topic)
        .replace("{{NAME}}", topic)
      postTask(taskDef)

      val producer = stringStringProducer()
      writeRecords(producer, topic, JacksonSupport.mapper.writeValueAsString(foo), 2000)
      producer.close(30, TimeUnit.SECONDS)

      // wait for some data to have been flushed
      eventually {
        withConn { conn =>
          val stmt = conn.createStatement
          val rs = stmt.executeQuery(s"select count(*) FROM $topic")
          rs.next()
          rs.getLong(1) should be > 0L
        }
      }

      // check that the schema is correct
      val schema = com.landoop.streamreactor.connect.hive.schema(DatabaseName("default"), TableName(topic))
      schema.fields().asScala.map(_.name).toSet shouldBe Set("s", "b", "l", "d")
      schema.field("s").schema().`type`() shouldBe Schema.Type.STRING
      schema.field("l").schema().`type`() shouldBe Schema.Type.INT64
      schema.field("d").schema().`type`() shouldBe Schema.Type.FLOAT64
      schema.field("b").schema().`type`() shouldBe Schema.Type.BOOLEAN

      stopTask(topic)
    }
  }
}
Example 25
Source File: SinkRecordToJson.scala, from kafka-connect-common (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.converters.source

import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import com.fasterxml.jackson.databind.ObjectMapper
import com.landoop.json.sql.JacksonJson
import org.apache.kafka.connect.data.Schema
import org.apache.kafka.connect.sink.SinkRecord
import org.json4s.jackson.JsonMethods._

import scala.util.Try

object SinkRecordToJson extends ConverterUtil {

  private val mapper = new ObjectMapper()

  def apply(record: SinkRecord,
            fields: Map[String, Map[String, String]],
            ignoreFields: Map[String, Set[String]]): String = {

    val schema = record.valueSchema()
    val value = record.value()

    if (schema == null) {
      if (value == null) {
        throw new IllegalArgumentException(
          s"The sink record value is null.(topic=${record.topic()} partition=${record.kafkaPartition()} offset=${record.kafkaOffset()})".stripMargin)
      }
      // try to take it as string
      value match {
        case map: java.util.Map[_, _] =>
          val extracted = convertSchemalessJson(record,
            fields.getOrElse(record.topic(), Map.empty),
            ignoreFields.getOrElse(record.topic(), Set.empty))
            .asInstanceOf[java.util.Map[String, Any]]
          // not ideal; but the implementation is hashmap anyway
          mapper.writeValueAsString(extracted)

        case other => sys.error(
          s"""
             |For schemaless record only String and Map types are supported. Class =${Option(other).map(_.getClass.getCanonicalName).getOrElse("unknown(null value)}")}
             |Record info:
             |topic=${record.topic()} partition=${record.kafkaPartition()} offset=${record.kafkaOffset()}
             |${Try(JacksonJson.toJson(value)).getOrElse("")}""".stripMargin)
      }
    } else {
      schema.`type`() match {
        case Schema.Type.STRING =>
          val extracted = convertStringSchemaAndJson(record,
            fields.getOrElse(record.topic(), Map.empty),
            ignoreFields.getOrElse(record.topic(), Set.empty))
          compact(render(extracted))

        case Schema.Type.STRUCT =>
          val extracted = convert(record,
            fields.getOrElse(record.topic(), Map.empty),
            ignoreFields.getOrElse(record.topic(), Set.empty))
          simpleJsonConverter.fromConnectData(extracted.valueSchema(), extracted.value()).toString

        case other => sys.error(s"$other schema is not supported")
      }
    }
  }
}
Example 26
Source File: StructFieldExtractorTest.scala, from kafka-connect-common (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.schemas

import org.apache.kafka.connect.data.{Date, Schema, SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class StructFieldExtractorTest extends AnyWordSpec with Matchers {

  "StructFieldExtractor" should {
    "return all the fields and their bytes value" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = new StructFieldsExtractor(true, Map.empty).get(struct).toMap

      map.get("firstName").get shouldBe "Alex"
      map.get("lastName").get shouldBe "Smith"
      map.get("age").get shouldBe 30
    }

    "return all fields and apply the mapping" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = new StructFieldsExtractor(true, Map("lastName" -> "Name", "age" -> "a")).get(struct).toMap

      map.get("firstName").get shouldBe "Alex"
      map.get("Name").get shouldBe "Smith"
      map.get("a").get shouldBe 30
    }

    "return only the specified fields" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = new StructFieldsExtractor(false, Map("lastName" -> "Name", "age" -> "age")).get(struct).toMap

      map.get("Name").get shouldBe "Smith"
      map.get("age").get shouldBe 30
      map.size shouldBe 2
    }
  }

  "handle Date fields" in {
    val dateSchema = Date.builder().build()
    val schema = SchemaBuilder.struct().name("com.example.Person")
      .field("firstName", Schema.STRING_SCHEMA)
      .field("lastName", Schema.STRING_SCHEMA)
      .field("age", Schema.INT32_SCHEMA)
      .field("date", dateSchema).build()

    val date = java.sql.Date.valueOf("2017-04-25")
    val struct = new Struct(schema)
      .put("firstName", "Alex")
      .put("lastName", "Smith")
      .put("age", 30)
      .put("date", date)

    val map1 = new StructFieldsExtractor(false, Map("date" -> "date")).get(struct).toMap
    map1.get("date").get shouldBe date
    map1.size shouldBe 1

    val d = Date.toLogical(dateSchema, 10000)
    struct.put("date", d)

    val map2 = new StructFieldsExtractor(false, Map("date" -> "date")).get(struct).toMap
    map2.get("date").get shouldBe d
    map2.size shouldBe 1
  }
}
Example 27
Source File: TestUtilsBase.scala, from kafka-connect-common (Apache License 2.0)

package com.datamountaineer.streamreactor.connect

import java.util
import java.util.Collections

import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.kafka.connect.source.SourceTaskContext
import org.apache.kafka.connect.storage.OffsetStorageReader
import org.mockito.Mockito._
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfter
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._

    //set up partition
    val partition: util.Map[String, String] = Collections.singletonMap(lookupPartitionKey, table)
    //as a list to search for
    val partitionList: util.List[util.Map[String, String]] = List(partition).asJava
    //set up the offset
    val offset: util.Map[String, Object] = (Collections.singletonMap(offsetColumn, offsetValue))
    //create offsets to initialize from
    val offsets: util.Map[util.Map[String, String], util.Map[String, Object]] = Map(partition -> offset).asJava

    //mock out reader and task context
    val taskContext = mock[SourceTaskContext]
    val reader = mock[OffsetStorageReader]
    when(reader.offsets(partitionList)).thenReturn(offsets)
    when(taskContext.offsetStorageReader()).thenReturn(reader)
    taskContext
  }
}
Example 28
Source File: BytesConverterTest.scala, from kafka-connect-common (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.converters.source

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import org.apache.kafka.connect.data.Schema
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class BytesConverterTest extends AnyWordSpec with Matchers {
  private val converter = new BytesConverter()
  private val topic = "topicA"

  "BytesConverter" should {
    "handle null payloads" in {
      val sourceRecord = converter.convert(topic, "somesource", "100", null)

      sourceRecord.keySchema() shouldBe MsgKey.schema
      sourceRecord.key() shouldBe MsgKey.getStruct("somesource", "100")
      sourceRecord.valueSchema() shouldBe Schema.BYTES_SCHEMA
      sourceRecord.value() shouldBe null
    }

    "handle non-null payloads" in {
      val expectedPayload: Array[Byte] = Array(245, 2, 10, 200, 22, 0, 0, 11).map(_.toByte)
      val sourceRecord = converter.convert(topic, "somesource", "1001", expectedPayload)

      sourceRecord.keySchema() shouldBe MsgKey.schema
      sourceRecord.key() shouldBe MsgKey.getStruct("somesource", "1001")
      sourceRecord.valueSchema() shouldBe Schema.BYTES_SCHEMA
      sourceRecord.value() shouldBe expectedPayload
    }
  }
}
Example 29
Source File: BytesConverterTest.scala, from kafka-connect-common (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.converters.sink

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import org.apache.kafka.connect.data.Schema
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class BytesConverterTest extends AnyWordSpec with Matchers {
  private val converter = new BytesConverter()
  private val topic = "topicA"

  "Sink BytesConverter" should {
    "handle null payloads" in {
      val sinkRecord = converter.convert(topic, null)

      sinkRecord.keySchema() shouldBe null
      sinkRecord.key() shouldBe null
      sinkRecord.valueSchema() shouldBe Schema.BYTES_SCHEMA
      sinkRecord.value() shouldBe null
    }

    "handle non-null payloads" in {
      val expectedPayload: Array[Byte] = Array(245, 2, 10, 200, 22, 0, 0, 11).map(_.toByte)
      val data = new SinkRecord(topic, 0, null, "keyA", null, expectedPayload, 0)
      val sinkRecord = converter.convert(topic, data)

      sinkRecord.keySchema() shouldBe MsgKey.schema
      sinkRecord.key() shouldBe MsgKey.getStruct("topicA", "keyA")
      sinkRecord.valueSchema() shouldBe Schema.BYTES_SCHEMA
      sinkRecord.value() shouldBe expectedPayload
    }
  }
}
Example 30
Source File: StringStructFieldsStringKeyBuilderTest.scala, from kafka-connect-common (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.sink

import com.datamountaineer.streamreactor.connect.rowkeys.StringStructFieldsStringKeyBuilder
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class StringStructFieldsStringKeyBuilderTest extends AnyWordSpec with Matchers {
  "StructFieldsStringKeyBuilder" should {
    "raise an exception if the field is not present in the struct" in {
      intercept[IllegalArgumentException] {
        val schema = SchemaBuilder.struct().name("com.example.Person")
          .field("firstName", Schema.STRING_SCHEMA)
          .field("age", Schema.INT32_SCHEMA)
          .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

        val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

        val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
        StringStructFieldsStringKeyBuilder(Seq("threshold")).build(sinkRecord)
      }
    }

    "create the row key based on one single field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StringStructFieldsStringKeyBuilder(Seq("firstName")).build(sinkRecord) shouldBe "Alex"
    }

    "create the row key based on one single field with doc in the struct" in {
      val firstNameSchema = SchemaBuilder.`type`(Schema.Type.STRING).doc("first name")
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", firstNameSchema)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StringStructFieldsStringKeyBuilder(Seq("firstName")).build(sinkRecord) shouldBe "Alex"
    }

    "create the row key based on more than one field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StringStructFieldsStringKeyBuilder(Seq("firstName", "age")).build(sinkRecord) shouldBe "Alex.30"
    }
  }
}
Example 31
Source File: StringGenericRowKeyBuilderTest.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.sink import com.datamountaineer.streamreactor.connect.rowkeys.StringGenericRowKeyBuilder import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.sink.SinkRecord import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class StringGenericRowKeyBuilderTest extends AnyWordSpec with Matchers { "StringGenericRowKeyBuilder" should { "use the topic, partition and offset to make the key" in { val topic = "sometopic" val partition = 2 val offset = 1243L val sinkRecord = new SinkRecord(topic, partition, Schema.INT32_SCHEMA, 345, Schema.STRING_SCHEMA, "", offset) val keyBuilder = new StringGenericRowKeyBuilder() val expected = Seq(topic, partition, offset).mkString("|") keyBuilder.build(sinkRecord) shouldBe expected } } }
Example 32
Source File: StringSinkRecordKeyBuilderTest.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.sink import com.datamountaineer.streamreactor.connect.rowkeys.StringSinkRecordKeyBuilder import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.sink.SinkRecord import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class StringSinkRecordKeyBuilderTest extends AnyWordSpec with Matchers { val keyRowKeyBuilder = new StringSinkRecordKeyBuilder() "SinkRecordKeyStringKeyBuilder" should { "create the right key from the Schema key value - Byte" in { val b = 123.toByte val sinkRecord = new SinkRecord("", 1, Schema.INT8_SCHEMA, b, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord) shouldBe "123" } "create the right key from the Schema key value - String" in { val s = "somekey" val sinkRecord = new SinkRecord("", 1, Schema.STRING_SCHEMA, s, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord) shouldBe s } "create the right key from the Schema key value - Bytes" in { val bArray = Array(23.toByte, 24.toByte, 242.toByte) val sinkRecord = new SinkRecord("", 1, Schema.BYTES_SCHEMA, bArray, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord) shouldBe bArray.toString } "create the right key from the Schema key value - Boolean" in { val bool = true val sinkRecord = new SinkRecord("", 1, Schema.BOOLEAN_SCHEMA, bool, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord) shouldBe "true" } } }
Example 33
Source File: BytesConverter.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.converters.source import java.util.Collections import com.datamountaineer.streamreactor.connect.converters.MsgKey import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.SourceRecord class BytesConverter extends Converter { override def convert(kafkaTopic: String, sourceTopic: String, messageId: String, bytes: Array[Byte], keys: Seq[String] = Seq.empty, keyDelimiter: String = "."): SourceRecord = { new SourceRecord(Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, MsgKey.schema, MsgKey.getStruct(sourceTopic, messageId), Schema.BYTES_SCHEMA, bytes) } }
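A minimal usage sketch of the source-side converter above; the topic names, message id and payload are made up for illustration:

val converter = new BytesConverter()
val record = converter.convert(
  kafkaTopic = "telemetry",          // hypothetical target Kafka topic
  sourceTopic = "mqtt/sensors/1",    // hypothetical source topic
  messageId = "msg-001",
  bytes = Array[Byte](1, 2, 3))
// key: MsgKey struct carrying the source topic and message id; value: the raw bytes with Schema.BYTES_SCHEMA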
Example 34
Source File: JsonResilientConverter.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.converters.source import java.util import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.data.SchemaAndValue import org.apache.kafka.connect.json.JsonConverter class JsonResilientConverter extends JsonConverter { override def configure(configs: util.Map[String, _], isKey: Boolean) { super.configure(configs, isKey) } override def fromConnectData(topic: String, schema: Schema, value: Object): Array[Byte] = { try { super.fromConnectData(topic, schema, value) } catch { case t: Throwable => t.printStackTrace() // Ignore exceptions null } } override def toConnectData(topic: String, value: Array[Byte]): SchemaAndValue = { try { super.toConnectData(topic, value) } catch { case t: Throwable => t.printStackTrace() // Ignore exceptions SchemaAndValue.NULL } } }
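A short sketch of the failure behaviour of the resilient converter above: malformed bytes yield SchemaAndValue.NULL instead of an exception (topic and payloads are illustrative):

import java.util.Collections

val converter = new JsonResilientConverter()
converter.configure(Collections.singletonMap("schemas.enable", "false"), false)
val ok  = converter.toConnectData("some-topic", """{"id": 1}""".getBytes)  // normal JsonConverter result
val bad = converter.toConnectData("some-topic", "not-json".getBytes)       // SchemaAndValue.NULL, no exception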
Example 35
Source File: AvroConverter.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.converters.source import java.io.File import java.util.Collections import com.datamountaineer.streamreactor.connect.converters.MsgKey import io.confluent.connect.avro.AvroData import org.apache.avro.generic.{GenericDatumReader, GenericRecord} import org.apache.avro.io.DecoderFactory import org.apache.avro.{Schema => AvroSchema} import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.source.SourceRecord import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException class AvroConverter extends Converter { private val avroData = new AvroData(8) private var sourceToSchemaMap: Map[String, AvroSchema] = Map.empty private var avroReadersMap: Map[String, GenericDatumReader[GenericRecord]] = Map.empty override def convert(kafkaTopic: String, sourceTopic: String, messageId: String, bytes: Array[Byte], keys: Seq[String] = Seq.empty, keyDelimiter: String = "."): SourceRecord = { Option(bytes) match { case None => new SourceRecord(Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, avroData.toConnectSchema(sourceToSchemaMap(sourceTopic)), null) case Some(_) => val reader = avroReadersMap.getOrElse(sourceTopic.toLowerCase, throw new ConfigException(s"Invalid ${AvroConverter.SCHEMA_CONFIG} is not configured for $sourceTopic")) val decoder = DecoderFactory.get().binaryDecoder(bytes, null) val record = reader.read(null, decoder) val schemaAndValue = avroData.toConnectData(sourceToSchemaMap(sourceTopic.toLowerCase), record) val value = schemaAndValue.value() value match { case s: Struct if keys.nonEmpty => val keysValue = keys.flatMap { key => Option(KeyExtractor.extract(s, key.split('.').toVector)).map(_.toString) }.mkString(keyDelimiter) new SourceRecord( Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, Schema.STRING_SCHEMA, keysValue, schemaAndValue.schema(), schemaAndValue.value()) case _ => new SourceRecord( Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, MsgKey.schema, MsgKey.getStruct(sourceTopic, messageId), schemaAndValue.schema(), schemaAndValue.value()) } } } override def initialize(config: Map[String, String]): Unit = { sourceToSchemaMap = AvroConverter.getSchemas(config) avroReadersMap = sourceToSchemaMap.map { case (key, schema) => key -> new GenericDatumReader[GenericRecord](schema) } } } object AvroConverter { val SCHEMA_CONFIG = "connect.source.converter.avro.schemas" def getSchemas(config: Map[String, String]): Map[String, AvroSchema] = { config.getOrElse(SCHEMA_CONFIG, throw new ConfigException(s"$SCHEMA_CONFIG is not provided")) .toString .split(';') .filter(_.trim.nonEmpty) .map(_.split("=")) .map { case Array(source, path) => val file = new File(path) if (!file.exists()) { throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The file $path doesn't exist!") } val s = source.trim.toLowerCase() if (s.isEmpty) { throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The topic is not valid for entry containing $path") } s -> new AvroSchema.Parser().parse(file) case other => throw new ConfigException(s"$SCHEMA_CONFIG is not properly set. The format is Mqtt_Source->AVRO_FILE") }.toMap } }
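The converter above is driven by a single property mapping each source to an Avro schema file; a hedged wiring sketch where the source name and file path are hypothetical, following the source=path format that getSchemas parses:

val converter = new AvroConverter()
converter.initialize(Map(
  AvroConverter.SCHEMA_CONFIG -> "/sensors=/etc/kafka-connect/schemas/sensor.avsc"  // hypothetical "source=path" entry; the file must exist at initialize() time
))
// afterwards each payload is decoded with the schema registered for its source topic:
// converter.convert("kafka-topic", "/sensors", "message-1", avroBytes)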
Example 36
Source File: BytesConverter.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.converters.sink import com.datamountaineer.streamreactor.connect.converters.MsgKey import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.sink.SinkRecord class BytesConverter extends Converter { override def convert(sinkTopic: String, data: SinkRecord): SinkRecord = { Option(data) match { case None => new SinkRecord( sinkTopic, 0, null, null, Schema.BYTES_SCHEMA, null, 0 ) case Some(_) => new SinkRecord( data.topic(), data.kafkaPartition(), MsgKey.schema, MsgKey.getStruct(sinkTopic, data.key().toString()), Schema.BYTES_SCHEMA, data.value(), 0 ) } } }
Example 37
Source File: DefaultCommitPolicyTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartitionOffset} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec import scala.concurrent.duration._ class DefaultCommitPolicyTest extends AnyWordSpec with Matchers { val schema: Schema = SchemaBuilder.struct() .field("name", SchemaBuilder.string().required().build()) .build() val struct = new Struct(schema) implicit val conf: Configuration = new Configuration() implicit val fs: LocalFileSystem = FileSystem.getLocal(conf) val tpo = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100)) private def shouldFlush(policy: CommitPolicy, path: Path, count: Long) = { val status = fs.getFileStatus(path) policy.shouldFlush(CommitContext(tpo, path, count, status.getLen, status.getModificationTime)) } "DefaultCommitPolicy" should { "roll over after interval" in { val policy = DefaultCommitPolicy(None, Option(2.seconds), None) val path = new Path("foo") fs.create(path) shouldFlush(policy, path, 10) shouldBe false Thread.sleep(2000) shouldFlush(policy, path, 10) shouldBe true fs.delete(path, false) } "roll over after file count" in { val policy = DefaultCommitPolicy(None, None, Some(9)) val path = new Path("foo") fs.create(path) shouldFlush(policy, path, 7) shouldBe false shouldFlush(policy, path, 8) shouldBe false shouldFlush(policy, path, 9) shouldBe true shouldFlush(policy, path, 10) shouldBe true fs.delete(path, false) } "roll over after file size" in { val policy = DefaultCommitPolicy(Some(10), None, None) val path = new Path("foo") val out = fs.create(path) shouldFlush(policy, path, 7) shouldBe false out.writeBytes("wibble wobble wabble wubble") out.close() shouldFlush(policy, path, 9) shouldBe true fs.delete(path, false) } } }
Example 38
Source File: RowKeyBuilderString.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.rowkeys

import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

// The enclosing class declaration was missing from this excerpt; it is reconstructed here from the
// tests above (a builder over the selected struct field names and a key delimiter).
case class StringStructFieldsStringKeyBuilder(keys: Seq[String], keyDelimiter: String = ".") {

  // Assumed set of schema types that may contribute to the row key; the original list is not shown in this excerpt.
  private val availableSchemaTypes = Set(
    Schema.Type.BOOLEAN, Schema.Type.STRING,
    Schema.Type.INT8, Schema.Type.INT16, Schema.Type.INT32, Schema.Type.INT64,
    Schema.Type.FLOAT32, Schema.Type.FLOAT64
  )

  def build(record: SinkRecord): String = {
    val struct = record.value().asInstanceOf[Struct]
    val schema = struct.schema
    val availableFields = schema.fields().asScala.map(_.name).toSet
    val missingKeys = keys.filterNot(availableFields.contains)
    require(missingKeys.isEmpty,
      s"${missingKeys.mkString(",")} keys are not present in the SinkRecord payload: ${availableFields.mkString(",")}")

    keys.flatMap { key =>
      val field = schema.field(key)
      val value = struct.get(field)
      require(value != null, s"$key field value is null. A non-null value is required for the fields creating the HBase row key")
      if (availableSchemaTypes.contains(field.schema().`type`())) Some(value.toString) else None
    }.mkString(keyDelimiter)
  }
}
Example 39
Source File: TwitterStatusReader.scala From kafka-connect-twitter with Apache License 2.0 | 5 votes |
package com.eneco.trading.kafka.connect.twitter import java.util import java.util.concurrent.{TimeUnit, LinkedBlockingQueue, Executors} import com.eneco.trading.kafka.connect.twitter.domain.TwitterStatus import com.twitter.hbc.httpclient.BasicClient import com.twitter.hbc.twitter4j.Twitter4jStatusClient import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.SourceRecord import twitter4j._ import scala.collection.JavaConverters._ import Extensions._ class StatusEnqueuer(queue: LinkedBlockingQueue[Status]) extends StatusListener with Logging { override def onStallWarning(stallWarning: StallWarning) = log.warn("onStallWarning") override def onDeletionNotice(statusDeletionNotice: StatusDeletionNotice) = log.info("onDeletionNotice") override def onScrubGeo(l: Long, l1: Long) = { log.debug(s"onScrubGeo $l $l1") } override def onStatus(status: Status) = { log.debug("onStatus") queue.put(status) } override def onTrackLimitationNotice(i: Int) = log.info(s"onTrackLimitationNotice $i") override def onException(e: Exception)= log.warn("onException " + e.toString) } trait StatusToSourceRecord { def convert(status: Status, topic: String): SourceRecord } object StatusToStringKeyValue extends StatusToSourceRecord { def convert (status: Status, topic: String): SourceRecord = { new SourceRecord( Map("tweetSource" -> status.getSource).asJava, //source partitions? Map("tweetId" -> status.getId).asJava, //source offsets? topic, null, Schema.STRING_SCHEMA, status.getUser.getScreenName, Schema.STRING_SCHEMA, status.getText, status.getCreatedAt.getTime) } } object StatusToTwitterStatusStructure extends StatusToSourceRecord { def convert(status: Status, topic: String): SourceRecord = { //val ts = TwitterStatus.struct(TwitterStatus(status)) new SourceRecord( Map("tweetSource" -> status.getSource).asJava, //source partitions? Map("tweetId" -> status.getId).asJava, //source offsets? topic, null, Schema.STRING_SCHEMA, status.getUser.getScreenName, TwitterStatus.schema, TwitterStatus.struct(status), status.getCreatedAt.getTime) } } def stop() = { log.info("Stop Twitter client") client.stop() } }
Example 40
Source File: TwitterStatusReader.scala From kafka-tweet-producer with Apache License 2.0 | 5 votes |
package com.eneco.trading.kafka.connect.twitter import java.util import java.util.concurrent.{TimeUnit, LinkedBlockingQueue, Executors} import com.eneco.trading.kafka.connect.twitter.domain.TwitterStatus import com.twitter.hbc.httpclient.BasicClient import com.twitter.hbc.twitter4j.Twitter4jStatusClient import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.SourceRecord import twitter4j._ import scala.collection.JavaConverters._ import Extensions._ class StatusEnqueuer(queue: LinkedBlockingQueue[Status]) extends StatusListener with Logging { override def onStallWarning(stallWarning: StallWarning) = log.warn("onStallWarning") override def onDeletionNotice(statusDeletionNotice: StatusDeletionNotice) = log.info("onDeletionNotice") override def onScrubGeo(l: Long, l1: Long) = { log.debug(s"onScrubGeo $l $l1") } override def onStatus(status: Status) = { log.debug("onStatus") queue.put(status) } override def onTrackLimitationNotice(i: Int) = log.info(s"onTrackLimitationNotice $i") override def onException(e: Exception)= log.warn("onException " + e.toString) } trait StatusToSourceRecord { def convert(status: Status, topic: String): SourceRecord } object StatusToStringKeyValue extends StatusToSourceRecord { def convert (status: Status, topic: String): SourceRecord = { new SourceRecord( Map("tweetSource" -> status.getSource).asJava, //source partitions? Map("tweetId" -> status.getId).asJava, //source offsets? topic, null, Schema.STRING_SCHEMA, status.getUser.getScreenName, Schema.STRING_SCHEMA, status.getText) } } object StatusToTwitterStatusStructure extends StatusToSourceRecord { def convert(status: Status, topic: String): SourceRecord = { //val ts = TwitterStatus.struct(TwitterStatus(status)) new SourceRecord( Map("tweetSource" -> status.getSource).asJava, //source partitions? Map("tweetId" -> status.getId).asJava, //source offsets? topic, TwitterStatus.schema, TwitterStatus.struct(status)) } } def stop() = { log.info("Stop Twitter client") client.stop() } }
Example 41
Source File: SourceRecordProducers.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.ftp.source import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.source.SourceRecord object SourceRecordProducers { type SourceRecordProducer = (ConnectFileMetaDataStore, String, FileMetaData, FileBody) => SourceRecord val fileInfoSchema = SchemaBuilder.struct() .field("name", Schema.STRING_SCHEMA) .field("offset", Schema.INT64_SCHEMA) .build() def stringKeyRecord(store: ConnectFileMetaDataStore, topic: String, meta: FileMetaData, body: FileBody): SourceRecord = new SourceRecord( store.fileMetasToConnectPartition(meta), // source part store.fileMetasToConnectOffset(meta), // source off topic, //topic Schema.STRING_SCHEMA, // key sch meta.attribs.path, // key Schema.BYTES_SCHEMA, // val sch body.bytes // val ) def structKeyRecord(store: ConnectFileMetaDataStore, topic: String, meta: FileMetaData, body: FileBody): SourceRecord = { new SourceRecord( store.fileMetasToConnectPartition(meta), // source part store.fileMetasToConnectOffset(meta), // source off topic, //topic fileInfoSchema, // key sch new Struct(fileInfoSchema) .put("name",meta.attribs.path) .put("offset",body.offset), Schema.BYTES_SCHEMA, // val sch body.bytes // val ) } }
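As a small illustration of the key schema declared above, a struct key would be populated roughly like this (the path and offset are made up):

import org.apache.kafka.connect.data.Struct

val key = new Struct(SourceRecordProducers.fileInfoSchema)
  .put("name", "/incoming/data-0001.csv")  // hypothetical file path
  .put("offset", 1024L)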
Example 42
Source File: StructFieldsExtractorTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.voltdb import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class StructFieldsExtractorTest extends AnyWordSpec with Matchers { "StructFieldsExtractor" should { "return all the fields and their bytes value" in { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("lastName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema) .put("firstName", "Alex") .put("lastName", "Smith") .put("age", 30) val min = System.currentTimeMillis() val record = StructFieldsExtractor("table", true, Map.empty).get(struct) val map = record map("firstName") shouldBe "Alex" map("lastName") shouldBe "Smith" map("age") shouldBe 30 } "return all fields and apply the mapping" in { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("lastName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema) .put("firstName", "Alex") .put("lastName", "Smith") .put("age", 30) val map = StructFieldsExtractor("table", includeAllFields = true, Map("lastName" -> "Name", "age" -> "a")).get(struct) map("firstName") shouldBe "Alex" map("Name") shouldBe "Smith" map("a") shouldBe 30 } "return only the specified fields" in { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("lastName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema) .put("firstName", "Alex") .put("lastName", "Smith") .put("age", 30) val map = StructFieldsExtractor("table", includeAllFields = false, Map("lastName" -> "Name", "age" -> "age")).get(struct) map("Name") shouldBe "Smith" map("age") shouldBe 30 map.size shouldBe 2 } } }
Example 43
Source File: HiveSchemaTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.hive.it import java.util.concurrent.TimeUnit import com.landoop.streamreactor.connect.hive.{DatabaseName, TableName} import org.apache.kafka.connect.data.Schema import org.scalatest.concurrent.Eventually import org.scalatest.matchers.should.Matchers import org.scalatest.time.{Millis, Span} import org.scalatest.wordspec.AnyWordSpec import scala.collection.JavaConverters._ import scala.io.Source import scala.util.Random class HiveSchemaTest extends AnyWordSpec with Matchers with PersonTestData with Eventually with HiveTests { private implicit val patience: PatienceConfig = PatienceConfig(Span(60000, Millis), Span(5000, Millis)) case class Foo(s: String, l: Long, b: Boolean, d: Double) def foo = Foo("string", Random.nextLong, Random.nextBoolean, Random.nextDouble) "Hive" should { "create correct schema for table" in { val topic = createTopic() val taskDef = Source.fromInputStream(getClass.getResourceAsStream("/hive_sink_task_no_partitions.json")).getLines().mkString("\n") .replace("{{TOPIC}}", topic) .replace("{{TABLE}}", topic) .replace("{{NAME}}", topic) postTask(taskDef) val producer = stringStringProducer() writeRecords(producer, topic, JacksonSupport.mapper.writeValueAsString(foo), 2000) producer.close(30, TimeUnit.SECONDS) // wait for some data to have been flushed eventually { withConn { conn => val stmt = conn.createStatement val rs = stmt.executeQuery(s"select count(*) FROM $topic") rs.next() rs.getLong(1) should be > 0L } } // check that the schema is correct val schema = com.landoop.streamreactor.connect.hive.schema(DatabaseName("default"), TableName(topic)) schema.fields().asScala.map(_.name).toSet shouldBe Set("s", "b", "l", "d") schema.field("s").schema().`type`() shouldBe Schema.Type.STRING schema.field("l").schema().`type`() shouldBe Schema.Type.INT64 schema.field("d").schema().`type`() shouldBe Schema.Type.FLOAT64 schema.field("b").schema().`type`() shouldBe Schema.Type.BOOLEAN stopTask(topic) } } }
Example 44
Source File: MapValueConverterTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.json.sql.JacksonJson import org.apache.kafka.connect.data.{Schema, Struct} import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers import scala.collection.JavaConverters._ class MapValueConverterTest extends AnyFunSuite with Matchers { test("converts nested payload") { val json = """ |{ | "idType": 3, | "colorDepth": "", | "threshold" : 45.77, | "evars": { | "evars": { | "eVar1": "Tue Aug 27 2019 12:08:10", | "eVar2": 156692207943934897 | } | }, | "exclude": { | "id": 0, | "value": false | } |} |""".stripMargin val map = JacksonJson.toMap[Any](json) val struct = MapValueConverter.convert(map) //Jackson transforming the json to Map the fields order is not retained struct.schema().fields().asScala.map(_.name()).sorted shouldBe List("idType", "colorDepth", "threshold", "evars", "exclude").sorted struct.schema().field("idType").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA struct.schema().field("colorDepth").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA struct.schema().field("threshold").schema() shouldBe Schema.OPTIONAL_FLOAT64_SCHEMA struct.schema().field("exclude").schema().`type`() shouldBe Schema.Type.STRUCT struct.schema().field("exclude").schema().isOptional shouldBe true struct.schema().field("evars").schema().`type`() shouldBe Schema.Type.STRUCT struct.schema().field("evars").schema().isOptional shouldBe true struct.schema().field("evars").schema().fields().asScala.map(_.name()) shouldBe List("evars") val evarsInner = struct.schema().field("evars").schema().field("evars") evarsInner.schema().`type`() shouldBe Schema.Type.STRUCT evarsInner.schema().isOptional shouldBe true evarsInner.schema().fields().asScala.map(_.name()).sorted shouldBe List("eVar1", "eVar2").sorted evarsInner.schema().field("eVar1").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA evarsInner.schema().field("eVar2").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA val exclude = struct.schema().field("exclude").schema() exclude.schema().`type`() shouldBe Schema.Type.STRUCT exclude.schema().isOptional shouldBe true exclude.schema().fields().asScala.map(_.name()).sorted shouldBe List("id", "value").sorted exclude.schema().field("id").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA exclude.schema().field("value").schema() shouldBe Schema.OPTIONAL_BOOLEAN_SCHEMA struct.get("idType") shouldBe 3L struct.get("colorDepth") shouldBe "" struct.get("threshold") shouldBe 45.77D val evarsStruct = struct.get("evars").asInstanceOf[Struct].get("evars").asInstanceOf[Struct] evarsStruct.get("eVar1") shouldBe "Tue Aug 27 2019 12:08:10" evarsStruct.get("eVar2") shouldBe 156692207943934897L val excludeStruct = struct.get("exclude").asInstanceOf[Struct] excludeStruct.get("id") shouldBe 0L excludeStruct.get("value") shouldBe false } }
Example 45
Source File: package.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.column.ParquetProperties import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter} package object parquet { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def listFiles(path: Path)(implicit fs: FileSystem): List[Path] = { if (fs.isDirectory(path)) { logger.debug(s"$path is a directory, reading constituent files") val remote = fs.listFiles(path, false) new Iterator[Path] { override def hasNext: Boolean = remote.hasNext override def next(): Path = remote.next().getPath }.toList } else { logger.debug(s"Reading $path as a single file") List(path) } } def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = { ParquetReader.builder(new StructReadSupport, file) .withConf(fs.getConf) .build() } def parquetWriter(path: Path, schema: Schema, config: ParquetSinkConfig): ParquetWriter[Struct] = { new StructParquetWriterBuilder(path, schema) .withCompressionCodec(config.compressionCodec) .withDictionaryEncoding(config.enableDictionary) .withValidation(config.validation) .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) .withWriteMode(if (config.overwrite) { ParquetFileWriter.Mode.OVERWRITE } else { ParquetFileWriter.Mode.CREATE }).build() } }
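A minimal read-side sketch using the helpers above; the location is hypothetical and the writer-side config (ParquetSinkConfig) is defined elsewhere in the project:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import com.landoop.streamreactor.connect.hive.parquet._

implicit val fs: FileSystem = FileSystem.getLocal(new Configuration())
val files = listFiles(new Path("/tmp/mytable"))  // hypothetical directory of parquet files
files.headOption.foreach { file =>
  val reader = parquetReader(file)
  val first = reader.read()                      // a Struct, or null if the file is empty
  reader.close()
}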
Example 46
Source File: StructWriteSupport.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.landoop.streamreactor.connect.hive._ import org.apache.hadoop.conf.Configuration import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext import org.apache.parquet.io.api.{Binary, RecordConsumer} import org.apache.parquet.schema.MessageType import scala.collection.JavaConverters._ // derived from Apache Spark's parquet write support, archive and license here: // https://github.com/apache/spark/blob/21a7bfd5c324e6c82152229f1394f26afeae771c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala class StructWriteSupport(schema: Schema) extends WriteSupport[Struct] { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) private val schemaName = if (schema.name() == null) "schema" else schema.name() private val parquetSchema: MessageType = ParquetSchemas.toParquetMessage(schema, schemaName) private val metadata = new java.util.HashMap[String, String]() metadata.put("written_by", "streamreactor") // The Parquet `RecordConsumer` to which all structs are written private var consumer: RecordConsumer = _ type ValueWriter = (Any) => Unit override def init(conf: Configuration): WriteSupport.WriteContext = new WriteSupport.WriteContext(parquetSchema, new java.util.HashMap[String, String]) override def finalizeWrite(): WriteSupport.FinalizedWriteContext = new FinalizedWriteContext(metadata) override def prepareForWrite(consumer: RecordConsumer): Unit = this.consumer = consumer override def write(struct: Struct): Unit = { writeMessage { writeStructFields(struct) } } private def writeStructFields(struct: Struct): Unit = { for ((field, index) <- struct.schema.fields.asScala.zipWithIndex) { val value = struct.get(field) if (value != null) { val writer = valueWriter(field.schema()) writeField(field.name, index) { writer(value) } } } } def valueWriter(schema: Schema): ValueWriter = { // todo perhaps introduce something like spark's SpecializedGetters schema.`type`() match { case Schema.Type.BOOLEAN => value => consumer.addBoolean(value.asInstanceOf[Boolean]) case Schema.Type.INT8 | Schema.Type.INT16 | Schema.Type.INT32 => value => consumer.addInteger(value.toString.toInt) case Schema.Type.INT64 => value => consumer.addLong(value.toString.toLong) case Schema.Type.STRING => value => consumer.addBinary(Binary.fromReusedByteArray(value.toString.getBytes)) case Schema.Type.FLOAT32 => value => consumer.addFloat(value.toString.toFloat) case Schema.Type.FLOAT64 => value => consumer.addDouble(value.toString.toDouble) case Schema.Type.STRUCT => value => { logger.debug(s"Writing nested struct") val struct = value.asInstanceOf[Struct] writeGroup { schema.fields.asScala .map { field => field -> struct.get(field) } .zipWithIndex.foreach { case ((field, v), k) => writeField(field.name, k) { valueWriter(field.schema)(v) } } } } case _ => throw UnsupportedSchemaType(schema.`type`.toString) } } private def writeMessage(f: => Unit): Unit = { consumer.startMessage() f consumer.endMessage() } private def writeGroup(f: => Unit): Unit = { consumer.startGroup() // consumer.startMessage() f //consumer.endMessage() consumer.endGroup() } private def writeField(name: String, k: Int)(f: => Unit): Unit = { consumer.startField(name, k) f consumer.endField(name, k) } }
Example 47
Source File: Converters.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.landoop.streamreactor.connect.hive._ import org.apache.kafka.connect.data.{Field, Schema} import org.apache.parquet.io.api.Converter object Converters { def get(field: Field, builder: scala.collection.mutable.Map[String, Any]): Converter = { field.schema().`type`() match { case Schema.Type.STRUCT => new NestedGroupConverter(field.schema(), field, builder) case Schema.Type.INT64 | Schema.Type.INT32 | Schema.Type.INT16 | Schema.Type.INT8 => new AppendingPrimitiveConverter(field, builder) case Schema.Type.FLOAT64 | Schema.Type.FLOAT32 => new AppendingPrimitiveConverter(field, builder) // case Schema.Type.INT64 => new TimestampPrimitiveConverter(field, builder) case Schema.Type.STRING => new DictionaryStringPrimitiveConverter(field, builder) case Schema.Type.ARRAY => ??? case other => throw UnsupportedSchemaType(s"Unsupported data type $other") } } }
Example 48
Source File: Input.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.blockchain.data import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Input(sequence: Long, prev_out: Option[Output], script: String) object Input { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.input") .doc("The input instance part of a transaction.") .field("sequence", Schema.INT64_SCHEMA) .field("prev_out", Output.ConnectSchema) .field("script", Schema.STRING_SCHEMA) .build() implicit class InputToStructConverter(val input: Input) extends AnyVal { def toStruct(): Struct = { val struct = new Struct(ConnectSchema) .put("sequence", input.sequence) .put("script", input.script) input.prev_out.foreach(po=>struct.put("prev_out", po.toStruct())) struct } } }
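A quick sketch of building the struct defined above (the field values are made up):

val input = Input(sequence = 4294967295L, prev_out = None, script = "76a914...")  // hypothetical values
val struct = input.toStruct()
// struct.schema() is Input.ConnectSchema; "prev_out" is left null because the option is empty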
Example 49
Source File: Input.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.cassandra.sink import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Input(sequence: Long, prev_out: Option[Output], script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() map.put("sequence", sequence) prev_out.foreach(p => map.put("prev_out", p.toHashMap)) map.put("script", script) map } } object Input { val ConnectSchema = SchemaBuilder.struct .name("datamountaineer.blockchain.input") .doc("The input instance part of a transaction.") .field("sequence", Schema.INT64_SCHEMA) .field("prev_out", Output.ConnectSchema) .field("script", Schema.STRING_SCHEMA) .build() implicit class InputToStructConverter(val input: Input) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("sequence", input.sequence) .put("script", input.script) input.prev_out.foreach(po => struct.put("prev_out", po.toStruct())) struct } } }
Example 50
Source File: KeyUtils.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.cassandra.utils import com.jayway.jsonpath.{Configuration, JsonPath} import org.apache.kafka.connect.data.{Schema, Struct} object KeyUtils { def keysFromStruct(struct: Struct, schema: Schema, fieldNames: Seq[String]): Seq[Object] = fieldNames.map(getKeyFromStruct(struct, _)) private def getKeyFromStruct(struct: Struct, fieldName: String): Object = { if (fieldName.contains(".")) { val Array(nestedObject, nestedField) = fieldName.split("\\.", 2) getKeyFromStruct(struct.get(nestedObject).asInstanceOf[Struct], nestedField) } else { struct.get(fieldName) } } }
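The helper above resolves dot-separated paths through nested structs; a small self-contained sketch with made-up schema and values:

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

val addressSchema = SchemaBuilder.struct().field("city", Schema.STRING_SCHEMA).build()
val personSchema = SchemaBuilder.struct()
  .field("id", Schema.INT64_SCHEMA)
  .field("address", addressSchema)
  .build()
val person = new Struct(personSchema)
  .put("id", 42L)
  .put("address", new Struct(addressSchema).put("city", "Lisbon"))

KeyUtils.keysFromStruct(person, personSchema, Seq("id", "address.city"))  // Seq(42, "Lisbon")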
Example 51
Source File: Output.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.mongodb import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Output(addr_tag_link: Option[String], addr_tag: Option[String], spent: Boolean, tx_index: Long, `type`: Int, addr: Option[String], value: Long, n: Int, script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() addr_tag_link.foreach(map.put("addr_tag_link", _)) addr_tag.foreach(map.put("addr_tag", _)) map.put("spent", spent) map.put("tx_index", tx_index) map.put("type", `type`) addr.foreach(map.put("addr", _)) map.put("value", value) map.put("n", n) map.put("script", script) map } } object Output { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.output") .doc("The output instance part of a transaction.") .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA) .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA) .field("spent", Schema.BOOLEAN_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("type", Schema.OPTIONAL_INT32_SCHEMA) .field("addr", Schema.OPTIONAL_STRING_SCHEMA) .field("value", Schema.INT64_SCHEMA) .field("n", Schema.INT32_SCHEMA) .field("script", Schema.STRING_SCHEMA) .build() implicit class OutputToStructConverter(val output: Output) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("spent", output.spent) .put("tx_index", output.tx_index) .put("type", output.`type`) .put("value", output.value) .put("n", output.n) .put("script", output.script) output.addr.foreach(struct.put("addr", _)) output.addr_tag.foreach(struct.put("addr_tag", _)) output.addr_tag_link.foreach(struct.put("addr_tag_link", _)) struct } } }
Example 52
Source File: Input.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.mongodb import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Input(sequence: Long, prev_out: Option[Output], script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() map.put("sequence", sequence) prev_out.foreach(p => map.put("prev_out", p.toHashMap)) map.put("script", script) map } } object Input { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.input") .doc("The input instance part of a transaction.") .field("sequence", Schema.INT64_SCHEMA) .field("prev_out", Output.ConnectSchema) .field("script", Schema.STRING_SCHEMA) .build() implicit class InputToStructConverter(val input: Input) extends AnyVal { def toStruct(): Struct = { val struct = new Struct(ConnectSchema) .put("sequence", input.sequence) .put("script", input.script) input.prev_out.foreach(po => struct.put("prev_out", po.toStruct())) struct } } }
Example 53
Source File: SinkRecordToDocument.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.mongodb.sink import com.datamountaineer.streamreactor.connect.mongodb.config.MongoSettings import com.datamountaineer.streamreactor.connect.mongodb.converters.SinkRecordConverter import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.sink.SinkRecord import org.bson.Document object SinkRecordToDocument extends ConverterUtil { def apply(record: SinkRecord, keys: Set[String] = Set.empty)(implicit settings: MongoSettings): (Document, Iterable[(String, Any)]) = { val schema = record.valueSchema() val value = record.value() val fields = settings.fields.getOrElse(record.topic(), Map.empty) val allFields = if (fields.size == 1 && fields.head._1 == "*") true else false if (schema == null) { //try to take it as string value match { case _: java.util.Map[_, _] => val extracted = convertSchemalessJson( record, fields, settings.ignoredField.getOrElse(record.topic(), Set.empty) ) //not ideal; but the compile is hashmap anyway SinkRecordConverter.fromMap(extracted.asInstanceOf[java.util.Map[String, AnyRef]]) -> keys.headOption.map(_ => KeysExtractor.fromMap(extracted, keys)).getOrElse(Iterable.empty) case _ => sys.error("For schemaless record only String and Map types are supported") } } else { schema.`type`() match { case Schema.Type.STRING => val extracted = convertStringSchemaAndJson( record, fields, settings.ignoredField.getOrElse(record.topic(), Set.empty), includeAllFields = allFields) SinkRecordConverter.fromJson(extracted) -> keys.headOption.map(_ => KeysExtractor.fromJson(extracted, keys)).getOrElse(Iterable.empty) case Schema.Type.STRUCT => val extracted = convert( record, fields, settings.ignoredField.getOrElse(record.topic(), Set.empty) ) SinkRecordConverter.fromStruct(extracted) -> keys.headOption.map(_ => KeysExtractor.fromStruct(extracted.value().asInstanceOf[Struct], keys)).getOrElse(Iterable.empty) case other => sys.error(s"$other schema is not supported") } } } }
Example 54
Source File: Output.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Output(addr_tag_link: Option[String], addr_tag: Option[String], spent: Boolean, tx_index: Long, `type`: Int, addr: Option[String], value: Long, n: Int, script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() addr_tag_link.foreach(map.put("addr_tag_link", _)) addr_tag.foreach(map.put("addr_tag", _)) map.put("spent", spent) map.put("tx_index", tx_index) map.put("type", `type`) addr.foreach(map.put("addr", _)) map.put("value", value) map.put("n", n) map.put("script", script) map } } object Output { val ConnectSchema: Schema = SchemaBuilder.struct .name("output") .doc("The output instance part of a transaction.") .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA) .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA) .field("spent", Schema.BOOLEAN_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("type", Schema.OPTIONAL_INT32_SCHEMA) .field("addr", Schema.OPTIONAL_STRING_SCHEMA) .field("value", Schema.INT64_SCHEMA) .field("n", Schema.INT32_SCHEMA) .field("script", Schema.STRING_SCHEMA) .build() implicit class OutputToStructConverter(val output: Output) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("spent", output.spent) .put("tx_index", output.tx_index) .put("type", output.`type`) .put("value", output.value) .put("n", output.n) .put("script", output.script) output.addr.foreach(struct.put("addr", _)) output.addr_tag.foreach(struct.put("addr_tag", _)) output.addr_tag_link.foreach(struct.put("addr_tag_link", _)) struct } } }
Example 55
Source File: Input.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Input(sequence: Long, prev_out: Option[Output], script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() map.put("sequence", sequence) prev_out.foreach(p => map.put("prev_out", p.toHashMap)) map.put("script", script) map } } object Input { val ConnectSchema = SchemaBuilder.struct .name("input") .doc("The input instance part of a transaction.") .field("sequence", Schema.INT64_SCHEMA) .field("prev_out", Output.ConnectSchema) .field("script", Schema.STRING_SCHEMA) .build() implicit class InputToStructConverter(val input: Input) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("sequence", input.sequence) .put("script", input.script) input.prev_out.foreach(po => struct.put("prev_out", po.toStruct())) struct } } }
Example 56
Source File: SinkRecordToDocumentTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink import com.datamountaineer.streamreactor.connect.azure.documentdb.Json import com.datamountaineer.streamreactor.connect.azure.documentdb.config.DocumentDbSinkSettings import com.datamountaineer.streamreactor.connect.errors.NoopErrorPolicy import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil import com.microsoft.azure.documentdb.{ConsistencyLevel, Document} import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.sink.SinkRecord import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class SinkRecordToDocumentTest extends AnyWordSpec with Matchers with ConverterUtil { private val connection = "https://accountName.documents.azure.com:443/" "SinkRecordToDocument" should { "convert Kafka Struct to a Azure Document Db Document" in { for (i <- 1 to 4) { val json = scala.io.Source.fromFile(getClass.getResource(s"/transaction$i.json").toURI.getPath).mkString val tx = Json.fromJson[Transaction](json) val record = new SinkRecord("topic1", 0, null, null, Transaction.ConnectSchema, tx.toStruct(), 0) implicit val settings = DocumentDbSinkSettings( connection, "secret", "database", Seq.empty, Map("topic1" -> Set.empty[String]), Map("topic1" -> Map.empty), Map("topic1" -> Set.empty), NoopErrorPolicy(), ConsistencyLevel.Session, false, None) val (document, _) = SinkRecordToDocument(record) val expected = new Document(json) //comparing string representation; we have more specific types given the schema document.toString shouldBe expected.toString } } "convert String Schema + Json payload to a Azure Document DB Document" in { for (i <- 1 to 4) { val json = scala.io.Source.fromFile(getClass.getResource(s"/transaction$i.json").toURI.getPath).mkString val record = new SinkRecord("topic1", 0, null, null, Schema.STRING_SCHEMA, json, 0) implicit val settings = DocumentDbSinkSettings( connection, "secret", "database", Seq.empty, Map("topic1" -> Set.empty[String]), Map("topic1" -> Map.empty), Map("topic1" -> Set.empty), NoopErrorPolicy(), ConsistencyLevel.Session, false, None) val (document, _) = SinkRecordToDocument(record) val expected = new Document(json) //comparing string representation; we have more specific types given the schema document.toString() shouldBe expected.toString } } "convert Schemaless + Json payload to a Azure Document DB Document" in { for (i <- 1 to 4) { val json = scala.io.Source.fromFile(getClass.getResource(s"/transaction$i.json").toURI.getPath).mkString val record = new SinkRecord("topic1", 0, null, null, Schema.STRING_SCHEMA, json, 0) implicit val settings = DocumentDbSinkSettings( connection, "secret", "database", Seq.empty, Map("topic1" -> Set.empty[String]), Map("topic1" -> Map.empty), Map("topic1" -> Set.empty), NoopErrorPolicy(), ConsistencyLevel.Session, false, None) val (document, _) = SinkRecordToDocument(record) val expected = new Document(json) //comparing string representation; we have more specific types given the schema document.toString() shouldBe expected.toString } } } }
Example 57
Source File: SinkRecordToDocument.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink import com.datamountaineer.streamreactor.connect.azure.documentdb.config.DocumentDbSinkSettings import com.datamountaineer.streamreactor.connect.azure.documentdb.converters.SinkRecordConverter import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil import com.microsoft.azure.documentdb.Document import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.sink.SinkRecord object SinkRecordToDocument extends ConverterUtil { def apply(record: SinkRecord, keys: Set[String] = Set.empty)(implicit settings: DocumentDbSinkSettings): (Document, Iterable[(String, Any)]) = { val schema = record.valueSchema() val value = record.value() if (schema == null) { //try to take it as string value match { case _: java.util.Map[_, _] => val fields = settings.fields(record.topic()) val extracted = convertSchemalessJson(record, settings.fields(record.topic()), settings.ignoredField(record.topic())) //not ideal; but the compile is hashmap anyway SinkRecordConverter.fromMap(extracted.asInstanceOf[java.util.Map[String, AnyRef]]) -> keys.headOption.map(_ => KeysExtractor.fromMap(extracted, keys)).getOrElse(Iterable.empty) case _: String => val extracted = convertStringSchemaAndJson(record, settings.fields(record.topic()), settings.ignoredField(record.topic())) SinkRecordConverter.fromJson(extracted) -> keys.headOption.map(_ => KeysExtractor.fromJson(extracted, keys)).getOrElse(Iterable.empty) case _ => sys.error("For schemaless record only String and Map types are supported") } } else { schema.`type`() match { case Schema.Type.STRING => val extracted = convertStringSchemaAndJson(record, settings.fields(record.topic()), settings.ignoredField(record.topic())) SinkRecordConverter.fromJson(extracted) -> keys.headOption.map(_ => KeysExtractor.fromJson(extracted, keys)).getOrElse(Iterable.empty) case Schema.Type.STRUCT => val extracted = convert(record, settings.fields(record.topic()), settings.ignoredField(record.topic())) SinkRecordConverter.fromStruct(extracted) -> keys.headOption.map(_ => KeysExtractor.fromStruct(extracted.value().asInstanceOf[Struct], keys)).getOrElse(Iterable.empty) case other => sys.error(s"$other schema is not supported") } } } }
Example 58
Source File: Transaction.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.blockchain.data import java.util import com.datamountaineer.streamreactor.connect.blockchain.data.Input._ import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.source.SourceRecord case class Transaction(lock_time: Long, ver: Int, size: Long, inputs: Seq[Input], rbf: Option[Boolean], time: Long, tx_index: Long, vin_sz: Int, hash: String, vout_sz: Int, relayed_by: String, out: Seq[Output]) object Transaction { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.transaction") .field("lock_time", Schema.INT64_SCHEMA) .field("ver", Schema.INT32_SCHEMA) .field("size", Schema.INT64_SCHEMA) .field("inputs", SchemaBuilder.array(Input.ConnectSchema).optional().build()) .field("rbf", Schema.OPTIONAL_BOOLEAN_SCHEMA) .field("time", Schema.INT64_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("vin_sz", Schema.INT32_SCHEMA) .field("hash", Schema.STRING_SCHEMA) .field("vout_sz", Schema.INT32_SCHEMA) .field("relayed_by", Schema.STRING_SCHEMA) .field("out", SchemaBuilder.array(Output.ConnectSchema).optional().build()) .build() implicit class TransactionToSourceRecordConverter(val tx: Transaction) extends AnyVal { def toSourceRecord(topic: String, partition: Int, key: Option[String]): SourceRecord = { new SourceRecord( null, null, topic, partition, key.map(_ => Schema.STRING_SCHEMA).orNull, key.orNull, ConnectSchema, tx.toStruct() ) } //private def getOffset() = Collections.singletonMap("position", System.currentTimeMillis()) def toStruct(): Struct = { val struct = new Struct(ConnectSchema) .put("lock_time", tx.lock_time) .put("ver", tx.ver) .put("size", tx.size) .put("time", tx.time) .put("tx_index", tx.tx_index) .put("vin_sz", tx.vin_sz) .put("hash", tx.hash) .put("vout_sz", tx.vout_sz) .put("relayed_by", tx.relayed_by) tx.out.headOption.foreach { _ => import scala.collection.JavaConverters._ struct.put("out", tx.out.map(_.toStruct()).asJava) } tx.rbf.foreach(struct.put("rbf", _)) tx.inputs.headOption.foreach { _ => val inputs = new util.ArrayList[Struct] tx.inputs.foreach(i => inputs.add(i.toStruct())) struct.put("inputs", inputs) } tx.out.headOption.foreach { _ => val outputs = new util.ArrayList[Struct] tx.out.foreach(output => outputs.add(output.toStruct())) } struct } } }
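A short sketch of turning the case class above into a Connect SourceRecord; topic, partition and field values are made up:

val tx = Transaction(lock_time = 0L, ver = 1, size = 250L, inputs = Seq.empty, rbf = None,
  time = 1566900000L, tx_index = 123L, vin_sz = 1, hash = "abc123", vout_sz = 1,
  relayed_by = "0.0.0.0", out = Seq.empty)  // hypothetical values
val record = tx.toSourceRecord("blockchain.transactions", 0, Some("abc123"))
// record.valueSchema() is Transaction.ConnectSchema; the key uses Schema.STRING_SCHEMA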
Example 59
Source File: Output.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.blockchain.data import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Output(addr_tag_link: Option[String], addr_tag: Option[String], spent: Boolean, tx_index: Long, `type`: Int, addr: Option[String], value: Long, n: Int, script: String) object Output { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.output") .doc("The output instance part of a transaction.") .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA) .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA) .field("spent", Schema.BOOLEAN_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("type", Schema.INT32_SCHEMA) .field("addr", Schema.OPTIONAL_STRING_SCHEMA) .field("value", Schema.INT64_SCHEMA) .field("n", Schema.INT32_SCHEMA) .field("script", Schema.STRING_SCHEMA) .build() implicit class OutputToStructConverter(val output: Output) extends AnyVal { def toStruct(): Struct = { val struct = new Struct(ConnectSchema) .put("spent", output.spent) .put("tx_index", output.tx_index) .put("type", output.`type`) .put("value", output.value) .put("n", output.n) .put("script", output.script) output.addr.foreach(struct.put("addr", _)) output.addr_tag.foreach(struct.put("addr_tag", _)) output.addr_tag_link.foreach(struct.put("addr_tag_link", _)) struct } } }
Example 60
Source File: Output.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.cassandra.sink import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Output(addr_tag_link: Option[String], addr_tag: Option[String], spent: Boolean, tx_index: Long, `type`: Int, addr: Option[String], value: Long, n: Int, script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() addr_tag_link.foreach(map.put("addr_tag_link", _)) addr_tag.foreach(map.put("addr_tag", _)) map.put("spent", spent) map.put("tx_index", tx_index) map.put("type", `type`) addr.foreach(map.put("addr", _)) map.put("value", value) map.put("n", n) map.put("script", script) map } } object Output { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.output") .doc("The output instance part of a transaction.") .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA) .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA) .field("spent", Schema.BOOLEAN_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("type", Schema.OPTIONAL_INT32_SCHEMA) .field("addr", Schema.OPTIONAL_STRING_SCHEMA) .field("value", Schema.INT64_SCHEMA) .field("n", Schema.INT32_SCHEMA) .field("script", Schema.STRING_SCHEMA) .build() implicit class OutputToStructConverter(val output: Output) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("spent", output.spent) .put("tx_index", output.tx_index) .put("type", output.`type`) .put("value", output.value) .put("n", output.n) .put("script", output.script) output.addr.foreach(struct.put("addr", _)) output.addr_tag.foreach(struct.put("addr_tag", _)) output.addr_tag_link.foreach(struct.put("addr_tag_link", _)) struct } } }
Example 61
Source File: ConnectMongoConverterSpec.scala From kafka-connect-mongodb with Apache License 2.0 | 5 votes |
package com.startapp.data import java.lang.Boolean import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.scalatest.{FlatSpec, Matchers} class ConnectMongoConverterSpec extends FlatSpec with Matchers{ private val FIELD1_NAME = "fieldInt" private val FIELD1_VALUE = new Integer(5) private val FIELD2_NAME = "fieldString" private val FIELD2_VALUE = "str" private val FIELD3_NAME = "fieldBoolean" private val FIELD3_VALUE = new Boolean(true) val schema = SchemaBuilder.struct().name("test schema") .field(FIELD1_NAME, Schema.INT32_SCHEMA) .field(FIELD2_NAME, Schema.STRING_SCHEMA) .field(FIELD3_NAME, Schema.BOOLEAN_SCHEMA) .build() "No Schema Connect Mongo Converter Bad Data" should "throw an exception" in { var exceptionThrown = false val badData = new Struct(schema) try{ checkJsonMap(NoSchemaConnectMongoConverter, badData) } catch { case _ : java.lang.ClassCastException => exceptionThrown = true } exceptionThrown should be(true) } "No Schema Connect Mongo Converter Good Data" should "return the same map" in { val jsonMap = new util.HashMap[String, Object]() jsonMap.put(FIELD1_NAME, FIELD1_VALUE) jsonMap.put(FIELD2_NAME, FIELD2_VALUE) jsonMap.put(FIELD3_NAME, FIELD3_VALUE) checkJsonMap(NoSchemaConnectMongoConverter, jsonMap) } "Schema Connect Mongo Converter Bad Data" should "throw an exception" in { var exceptionThrown = false val badData = new util.HashMap[String, Object]() badData.put(FIELD1_NAME, FIELD1_VALUE) try { checkJsonMap(SchemaConnectMongoConverter, badData) } catch { case _ : java.lang.ClassCastException => exceptionThrown = true } exceptionThrown should be(true) } "Schema Connect Mongo Converter Good Data" should "convert data to json map" in { val data = new Struct(schema) .put(FIELD1_NAME, FIELD1_VALUE) .put(FIELD2_NAME, FIELD2_VALUE) .put(FIELD3_NAME, FIELD3_VALUE) checkJsonMap(SchemaConnectMongoConverter, data) } private def checkJsonMap(converter : ConnectMongoConverter, value: Object): Unit ={ val newJsonMap = converter.toJsonMap(value).toMap newJsonMap(FIELD1_NAME) should be(FIELD1_VALUE) newJsonMap(FIELD2_NAME) should be(FIELD2_VALUE) newJsonMap(FIELD3_NAME) should be(FIELD3_VALUE) } }
Example 62
Source File: HANASourceTaskConversionTest.scala From kafka-connect-sap with Apache License 2.0 | 5 votes |
package com.sap.kafka.connect.source import com.sap.kafka.client.MetaSchema import org.apache.kafka.connect.data.Schema.Type import org.apache.kafka.connect.data.{Field, Schema, Struct} import org.apache.kafka.connect.source.SourceRecord import scala.collection.JavaConverters._ class HANASourceTaskConversionTest extends HANASourceTaskTestBase { override def beforeAll(): Unit = { super.beforeAll() task.start(singleTableConfig()) } override def afterAll(): Unit = { task.stop() super.afterAll() } test("boolean type") { typeConversion(Schema.BOOLEAN_SCHEMA, true, java.lang.Boolean.FALSE, Schema.BOOLEAN_SCHEMA, java.lang.Boolean.FALSE) } test("int type") { typeConversion(Schema.INT32_SCHEMA, true, new java.lang.Integer(1), Schema.INT32_SCHEMA, new Integer(1)) } test("long type") { typeConversion(Schema.INT64_SCHEMA, true, new java.lang.Long(1), Schema.INT64_SCHEMA, new java.lang.Long(1)) } test("double type") { typeConversion(Schema.FLOAT64_SCHEMA, true, new java.lang.Double(1.0), Schema.FLOAT64_SCHEMA, new java.lang.Double(1.0)) } test("string type") { typeConversion(Schema.STRING_SCHEMA, true, "'a'", Schema.STRING_SCHEMA, "a") } private def typeConversion(sqlType: Schema, nullable: Boolean, sqlValue: Object, convertedSchema: Schema, convertedValue: Object): Unit = { val fields = Seq(new Field("id", 1, sqlType)) jdbcClient.createTable(Some("TEST"), "EMPLOYEES_SOURCE", MetaSchema(null, fields), 3000) val connection = jdbcClient.getConnection val stmt = connection.createStatement() stmt.execute("insert into \"TEST\".\"EMPLOYEES_SOURCE\" values(" + sqlValue.toString + ")") val records = task.poll() validateRecords(records.asScala.toList, convertedSchema, convertedValue) stmt.execute("drop table \"TEST\".\"EMPLOYEES_SOURCE\"") } private def validateRecords(records: List[SourceRecord], expectedFieldSchema: Schema, expectedValue: Object): Unit = { assert(records.size === 1) val objValue = records.head.value() assert(objValue.isInstanceOf[Struct]) val value = objValue.asInstanceOf[Struct] val schema = value.schema() assert(Type.STRUCT === schema.`type`()) val fields = schema.fields() assert(fields.size() === 1) val fieldSchema = fields.get(0).schema() assert(expectedFieldSchema === fieldSchema) assert(expectedValue === value.get(fields.get(0))) } }
Example 64
Source File: TableQuerier.scala From kafka-connect-sap with Apache License 2.0 | 5 votes |
package com.sap.kafka.connect.source.querier import com.sap.kafka.client.hana.HANAJdbcClient import com.sap.kafka.connect.config.{BaseConfig, BaseConfigConstants} import com.sap.kafka.connect.config.hana.HANAConfig import com.sap.kafka.utils.hana.HANAJdbcTypeConverter import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.source.SourceRecord import org.slf4j.LoggerFactory import scala.util.Random abstract class TableQuerier(mode: String, tableOrQuery: String, topic: String, config: BaseConfig, var jdbcClient: Option[HANAJdbcClient]) extends Comparable[TableQuerier] { var tableName: String = if (mode.equals(BaseConfigConstants.QUERY_MODE_TABLE)) tableOrQuery else null var query: String = if (mode.equals(BaseConfigConstants.QUERY_MODE_SQL)) tableOrQuery else null var lastUpdate: Long = 0 var schema: Schema = _ var queryString: Option[String] = None var resultList: Option[List[Struct]] = None val log = LoggerFactory.getLogger(getClass) def getLastUpdate(): Long = lastUpdate def getOrCreateQueryString(): Option[String] = { createQueryString() queryString } def createQueryString(): Unit def querying(): Boolean = resultList.isDefined def maybeStartQuery(): Unit = { if (resultList.isEmpty) { schema = getSchema() queryString = getOrCreateQueryString() val batchMaxRows = config.batchMaxRows resultList = getOrCreateJdbcClient().get.executeQuery(schema, queryString.get, 0, batchMaxRows) log.info(resultList.size.toString) } } def extractRecords(): List[SourceRecord] def close(now: Long): Unit = { resultList = None schema = null lastUpdate = now } protected def getOrCreateJdbcClient(): Option[HANAJdbcClient] = { if (jdbcClient.isDefined) { return jdbcClient } config match { case hanaConfig: HANAConfig => Some(HANAJdbcClient(hanaConfig)) case _ => throw new RuntimeException("Cannot create Jdbc Client") } } private def getSchema(): Schema = { mode match { case BaseConfigConstants.QUERY_MODE_TABLE => if (getOrCreateJdbcClient().get.isInstanceOf[HANAJdbcClient]) { val metadata = getOrCreateJdbcClient().get.getMetaData(tableOrQuery, None) HANAJdbcTypeConverter.convertHANAMetadataToSchema(tableName, metadata) } else { throw new RuntimeException("Jdbc Client is not available") } case BaseConfigConstants.QUERY_MODE_SQL => if (getOrCreateJdbcClient().get.isInstanceOf[HANAJdbcClient]) { val metadata = getOrCreateJdbcClient().get.getMetadata(tableOrQuery) HANAJdbcTypeConverter.convertHANAMetadataToSchema("Query" + Random.nextInt, metadata) } else { throw new RuntimeException("Jdbc Client is not available") } case _ => throw new RuntimeException("Other Query modes are not supported") } } override def compareTo(other: TableQuerier): Int = { if (this.lastUpdate < other.lastUpdate) { -1 } else if (this.lastUpdate > other.lastUpdate) { 1 } else { this.tableName.compareTo(other.tableName) } } }
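The ordering implemented by compareTo (oldest lastUpdate first, table name as tie-break) can also be written with java.lang.Long.compare; a standalone sketch of the same comparison, using a hypothetical key class rather than the connector's types:

// A standalone sketch of the querier ordering: oldest lastUpdate first,
// falling back to the table name when the timestamps are equal.
final case class QuerierKey(lastUpdate: Long, tableName: String)

object QuerierOrdering extends Ordering[QuerierKey] {
  override def compare(a: QuerierKey, b: QuerierKey): Int = {
    val byTime = java.lang.Long.compare(a.lastUpdate, b.lastUpdate)
    if (byTime != 0) byTime else a.tableName.compareTo(b.tableName)
  }
}

// List(QuerierKey(2L, "b"), QuerierKey(1L, "a")).sorted(QuerierOrdering)
// => List(QuerierKey(1L, "a"), QuerierKey(2L, "b"))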
Example 65
Source File: SQSSourceTask.scala From sqs-kafka-connect with Apache License 2.0 | 5 votes |
package com.hivehome.kafka.connect.sqs import java.util.{List => JList, Map => JMap} import javax.jms._ import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.{SourceRecord, SourceTask} import org.slf4j.LoggerFactory import scala.collection.JavaConverters._ import scala.util.Try import scala.util.control.NonFatal object SQSSourceTask { private val SqsQueueField: String = "queue" private val MessageId: String = "messageId" private val ValueSchema = Schema.STRING_SCHEMA } class SQSSourceTask extends SourceTask { val logger = LoggerFactory.getLogger(getClass.getName) private var conf: Conf = _ private var consumer: MessageConsumer = null // MessageId to MessageHandle used to ack the message on the commitRecord method invocation private var unAcknowledgedMessages = Map[String, Message]() def version: String = Version() def start(props: JMap[String, String]): Unit = { conf = Conf.parse(props.asScala.toMap).get logger.debug("Creating consumer...") synchronized { try { consumer = SQSConsumer(conf) logger.info("Created consumer to SQS topic {} for reading", conf.queueName) } catch { case NonFatal(e) => logger.error("Exception", e) } } } import com.hivehome.kafka.connect.sqs.SQSSourceTask._ @throws(classOf[InterruptedException]) def poll: JList[SourceRecord] = { def toRecord(msg: Message): SourceRecord = { val extracted = MessageExtractor(msg) val key = Map(SqsQueueField -> conf.queueName.get).asJava val value = Map(MessageId -> msg.getJMSMessageID).asJava new SourceRecord(key, value, conf.topicName.get, ValueSchema, extracted) } assert(consumer != null) // should be initialised as part of start() Try { Option(consumer.receive).map { msg => logger.info("Received message {}", msg) // This operation is not threadsafe as a result the plugin is not threadsafe. // However KafkaConnect assigns a single thread to each task and the poll // method is always called by a single thread. unAcknowledgedMessages = unAcknowledgedMessages.updated(msg.getJMSMessageID, msg) toRecord(msg) }.toSeq }.recover { case NonFatal(e) => logger.error("Exception while processing message", e) List.empty }.get.asJava } @throws(classOf[InterruptedException]) override def commitRecord(record: SourceRecord): Unit = { val msgId = record.sourceOffset().get(MessageId).asInstanceOf[String] val maybeMsg = unAcknowledgedMessages.get(msgId) maybeMsg.foreach(_.acknowledge()) unAcknowledgedMessages = unAcknowledgedMessages - msgId } def stop() { logger.debug("Stopping task") synchronized { unAcknowledgedMessages = Map() try { if (consumer != null) { consumer.close() logger.debug("Closed input stream") } } catch { case NonFatal(e) => logger.error("Failed to close consumer stream: ", e) } this.notify() } } }
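toRecord above packs the queue name and JMS message id into the source partition and offset maps and emits the payload with a plain string value schema. A standalone sketch of that record shape, where the queue name, message id and payload are placeholders:

import java.util.{Map => JMap}
import org.apache.kafka.connect.data.Schema
import org.apache.kafka.connect.source.SourceRecord
import scala.collection.JavaConverters._

object SqsStyleRecordSketch {
  def toRecord(queueName: String, messageId: String, topic: String, payload: String): SourceRecord = {
    // The source partition identifies where the data came from; the source offset
    // lets the framework hand the position back on restart / commitRecord.
    val sourcePartition: JMap[String, String] = Map("queue" -> queueName).asJava
    val sourceOffset: JMap[String, String] = Map("messageId" -> messageId).asJava
    new SourceRecord(sourcePartition, sourceOffset, topic, Schema.STRING_SCHEMA, payload)
  }
}

// SqsStyleRecordSketch.toRecord("my-queue", "ID:1234", "my-topic", """{"hello":"world"}""")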
Example 66
Source File: FieldValueGetter.scala From kafka-connect-kcql-smt with Apache License 2.0 | 5 votes |
package com.landoop.connect.sql import org.apache.kafka.connect.data.{Schema, Struct} trait FieldValueGetter { def get(value: Any, schema: Schema, path: Seq[String]): Option[Any] = { path.headOption.map { parent => schema.`type`() match { case Schema.Type.STRUCT => if (Option(value).isEmpty) None else fromRecord(value, schema, path) case Schema.Type.MAP => if (Option(value).isEmpty) None else fromMap(value, schema, path) case _ => throw new IllegalArgumentException(s"Can't select $parent field from schema:$schema") } }.getOrElse { schema.`type`() match { case Schema.Type.BOOLEAN | Schema.Type.FLOAT64 | Schema.Type.FLOAT32 | Schema.Type.INT64 | Schema.Type.INT32 | Schema.Type.INT16 | Schema.Type.INT8 | Schema.Type.BYTES | Schema.Type.STRING => Option(value) case Schema.Type.ARRAY | Schema.Type.MAP | Schema.Type.STRUCT => throw new IllegalArgumentException(s"Can't select an element from an array(schema:$schema)") case other => throw new IllegalArgumentException(s"Invalid Avro schema type:$other") } } } private def fromRecord(value: Any, schema: Schema, path: Seq[String]) = { val field = Option(schema.field(path.head)) .getOrElse(throw new IllegalArgumentException(s"Can't find field:${path.head} in schema:$schema")) val v = value.asInstanceOf[Struct].get(path.head) get(v, field.schema(), path.tail) } private def fromMap(value: Any, schema: Schema, path: Seq[String]) = { val field = Option(schema.field(path.head)) .getOrElse(throw new IllegalArgumentException(s"Can't find field:${path.head} in schema:$schema")) val v = value.asInstanceOf[Struct].get(path.head) get(v, field.schema(), path.tail) } }
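fromRecord above resolves a dotted path one segment at a time, looking each segment up in the current schema before descending into the value. A trimmed-down sketch of the same idea for STRUCT values only (no MAP branch; names are illustrative):

import org.apache.kafka.connect.data.{Schema, Struct}
import scala.annotation.tailrec

object StructPathLookup {
  // Descend through nested Structs following the path, returning None if any
  // intermediate value is null and failing fast if a field name is unknown.
  @tailrec
  def lookup(value: Any, schema: Schema, path: Seq[String]): Option[Any] = path match {
    case Nil => Option(value)
    case head +: tail =>
      val field = Option(schema.field(head))
        .getOrElse(throw new IllegalArgumentException(s"Can't find field:$head in schema:$schema"))
      value match {
        case null => None
        case s: Struct => lookup(s.get(head), field.schema(), tail)
        case other => throw new IllegalArgumentException(s"Can't select $head from $other")
      }
  }
}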
Example 67
Source File: IotMessageConverter.scala From toketi-kafka-connect-iothub with MIT License | 5 votes |
// Copyright (c) Microsoft. All rights reserved. package com.microsoft.azure.iot.kafka.connect.source import java.time.Instant import java.util.Date import com.microsoft.azure.eventhubs.impl.AmqpConstants import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import scala.collection.JavaConverters._ import scala.reflect.ClassTag object IotMessageConverter { val offsetKey = "offset" private val schemaName = "iothub.kafka.connect" private val schemaVersion = 1 private val deviceIdKey = "deviceId" private val contentTypeKey = "contentType" private val sequenceNumberKey = "sequenceNumber" private val enqueuedTimeKey = "enqueuedTime" private val contentKey = "content" private val systemPropertiesKey = "systemProperties" private val propertiesKey = "properties" private val deviceIdIotHubKey = "iothub-connection-device-id" // Public for testing purposes lazy val schema: Schema = SchemaBuilder.struct() .name(schemaName) .version(schemaVersion) .field(deviceIdKey, Schema.STRING_SCHEMA) .field(offsetKey, Schema.STRING_SCHEMA) .field(contentTypeKey, Schema.OPTIONAL_STRING_SCHEMA) .field(enqueuedTimeKey, Schema.STRING_SCHEMA) .field(sequenceNumberKey, Schema.INT64_SCHEMA) .field(contentKey, Schema.STRING_SCHEMA) .field(systemPropertiesKey, propertiesMapSchema) .field(propertiesKey, propertiesMapSchema) private lazy val propertiesMapSchema: Schema = SchemaBuilder.map(Schema.STRING_SCHEMA, Schema.STRING_SCHEMA) def getIotMessageStruct(iotMessage: IotMessage): Struct = { val systemProperties = iotMessage.systemProperties val deviceId: String = getOrDefaultAndRemove(systemProperties, deviceIdIotHubKey, "") val offset: String = getOrDefaultAndRemove(systemProperties, AmqpConstants.OFFSET_ANNOTATION_NAME, "") val sequenceNumber: Long = getOrDefaultAndRemove(systemProperties, AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME, 0) val enqueuedTime: Option[Instant] = getEnqueuedTime(systemProperties) val enqueuedTimeStr = if(enqueuedTime.isDefined) enqueuedTime.get.toString else "" val properties = iotMessage.properties val contentType: String = getOrDefaultAndRemove(properties, contentTypeKey, "") val systemPropertiesMap = systemProperties.map(i => (i._1, i._2.toString)) new Struct(schema) .put(deviceIdKey, deviceId) .put(offsetKey, offset) .put(contentTypeKey, contentType) .put(enqueuedTimeKey, enqueuedTimeStr) .put(sequenceNumberKey, sequenceNumber) .put(contentKey, iotMessage.content) .put(systemPropertiesKey, systemPropertiesMap.asJava) .put(propertiesKey, properties.asJava) } private def getEnqueuedTime(map: scala.collection.mutable.Map[String, Object]): Option[Instant] = { val enqueuedTimeValue: Date = getOrDefaultAndRemove(map, AmqpConstants.ENQUEUED_TIME_UTC_ANNOTATION_NAME, null) if (enqueuedTimeValue != null) Some(enqueuedTimeValue.toInstant) else None } private def getOrDefaultAndRemove[T: ClassTag, S: ClassTag](map: scala.collection.mutable.Map[String, S], key: String, defaultVal: T): T = { if (map.contains(key)) { val retVal: T = map(key).asInstanceOf[T] map.remove(key) retVal } else { defaultVal } } }
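The struct schema above declares its systemProperties and properties fields with SchemaBuilder.map. A minimal sketch of declaring and populating a map-typed field with the public API (field names are illustrative):

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import scala.collection.JavaConverters._

object MapFieldSketch extends App {
  // A string-to-string map schema, as used for systemProperties/properties above
  val propsSchema: Schema = SchemaBuilder.map(Schema.STRING_SCHEMA, Schema.STRING_SCHEMA).build()

  val schema: Schema = SchemaBuilder.struct()
    .name("example.message")
    .field("deviceId", Schema.STRING_SCHEMA)
    .field("properties", propsSchema)
    .build()

  // Struct.put expects a java.util.Map for MAP-typed fields
  val struct = new Struct(schema)
    .put("deviceId", "device-1")
    .put("properties", Map("model" -> "abc", "fw" -> "1.2.3").asJava)

  println(struct)
}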
Example 68
Source File: DataServiceTest.scala From kafka-jdbc-connector with Apache License 2.0 | 5 votes |
package com.agoda.kafka.connector.jdbc.services import java.sql.{Connection, PreparedStatement, ResultSet, ResultSetMetaData} import com.agoda.kafka.connector.jdbc.utils.DataConverter import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.SourceRecord import org.scalatest.mockito.MockitoSugar import org.mockito.Mockito._ import org.scalatest.{Matchers, WordSpec} import scala.concurrent.duration._ import scala.util.Success class DataServiceTest extends WordSpec with Matchers with MockitoSugar { "Data Service" should { val spName = "stored-procedure" val connection = mock[Connection] val converter = mock[DataConverter] val sourceRecord1 = mock[SourceRecord] val sourceRecord2 = mock[SourceRecord] val resultSet = mock[ResultSet] val resultSetMetadata = mock[ResultSetMetaData] val preparedStatement = mock[PreparedStatement] val schema = mock[Schema] val dataService = new DataService { override def storedProcedureName: String = spName override protected def createPreparedStatement(connection: Connection) = Success(preparedStatement) override protected def extractRecords(resultSet: ResultSet, schema: Schema) = Success(Seq(sourceRecord1, sourceRecord2)) override def dataConverter: DataConverter = converter } "get records" in { doNothing().when(preparedStatement).setQueryTimeout(1) when(preparedStatement.executeQuery).thenReturn(resultSet) when(resultSet.getMetaData).thenReturn(resultSetMetadata) when(converter.convertSchema(spName, resultSetMetadata)).thenReturn(Success(schema)) dataService.getRecords(connection, 1.second) shouldBe Success(Seq(sourceRecord1, sourceRecord2)) verify(preparedStatement).setQueryTimeout(1) verify(preparedStatement).executeQuery verify(resultSet).getMetaData verify(converter).convertSchema(spName, resultSetMetadata) } } }
Example 69
Source File: DataService.scala From kafka-jdbc-connector with Apache License 2.0 | 5 votes |
package com.agoda.kafka.connector.jdbc.services import java.sql.{Connection, PreparedStatement, ResultSet} import com.agoda.kafka.connector.jdbc.utils.DataConverter import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.SourceRecord import scala.concurrent.duration.Duration import scala.util.Try trait DataService { def getRecords(connection: Connection, timeout: Duration): Try[Seq[SourceRecord]] = { for { preparedStatement <- createPreparedStatement(connection) resultSet <- executeStoredProcedure(preparedStatement, timeout) schema <- dataConverter.convertSchema(storedProcedureName, resultSet.getMetaData) records <- extractRecords(resultSet, schema) } yield records } protected def createPreparedStatement(connection: Connection): Try[PreparedStatement] protected def extractRecords(resultSet: ResultSet, schema: Schema): Try[Seq[SourceRecord]] private def executeStoredProcedure(preparedStatement: PreparedStatement, timeout: Duration): Try[ResultSet] = Try { preparedStatement.setQueryTimeout(timeout.toSeconds.toInt) preparedStatement.executeQuery } }
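getRecords above chains four Try-returning steps in a for-comprehension, so the first failure short-circuits the rest. A self-contained sketch of the same pattern against plain JDBC, with a placeholder query and no DataConverter:

import java.sql.Connection
import scala.util.Try

object TryPipelineSketch {
  // Each step returns a Try; the for-comprehension stops at the first Failure,
  // which is how DataService.getRecords above is structured.
  def countRows(connection: Connection): Try[Int] =
    for {
      stmt      <- Try(connection.prepareStatement("SELECT COUNT(*) FROM some_table"))
      resultSet <- Try(stmt.executeQuery())
      count     <- Try { resultSet.next(); resultSet.getInt(1) }
    } yield count
}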
Example 70
Source File: TimeBasedDataService.scala From kafka-jdbc-connector with Apache License 2.0 | 5 votes |
package com.agoda.kafka.connector.jdbc.services import java.sql.{Connection, PreparedStatement, ResultSet, Timestamp} import java.util.{Date, GregorianCalendar, TimeZone} import com.agoda.kafka.connector.jdbc.JdbcSourceConnectorConstants import com.agoda.kafka.connector.jdbc.models.DatabaseProduct import com.agoda.kafka.connector.jdbc.models.DatabaseProduct.{MsSQL, MySQL} import com.agoda.kafka.connector.jdbc.models.Mode.TimestampMode import com.agoda.kafka.connector.jdbc.utils.DataConverter import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.SourceRecord import scala.collection.JavaConverters._ import scala.collection.mutable.ListBuffer import scala.util.Try case class TimeBasedDataService(databaseProduct: DatabaseProduct, storedProcedureName: String, batchSize: Int, batchSizeVariableName: String, timestampVariableName: String, var timestampOffset: Long, timestampFieldName: String, topic: String, keyFieldOpt: Option[String], dataConverter: DataConverter, calendar: GregorianCalendar = new GregorianCalendar(TimeZone.getTimeZone("UTC")) ) extends DataService { override def createPreparedStatement(connection: Connection): Try[PreparedStatement] = Try { val preparedStatement = databaseProduct match { case MsSQL => connection.prepareStatement(s"EXECUTE $storedProcedureName @$timestampVariableName = ?, @$batchSizeVariableName = ?") case MySQL => connection.prepareStatement(s"CALL $storedProcedureName (@$timestampVariableName := ?, @$batchSizeVariableName := ?)") } preparedStatement.setTimestamp(1, new Timestamp(timestampOffset), calendar) preparedStatement.setObject(2, batchSize) preparedStatement } override def extractRecords(resultSet: ResultSet, schema: Schema): Try[Seq[SourceRecord]] = Try { val sourceRecords = ListBuffer.empty[SourceRecord] var max = timestampOffset while (resultSet.next()) { dataConverter.convertRecord(schema, resultSet) map { record => val time = record.get(timestampFieldName).asInstanceOf[Date].getTime max = if(time > max) { keyFieldOpt match { case Some(keyField) => sourceRecords += new SourceRecord( Map(JdbcSourceConnectorConstants.STORED_PROCEDURE_NAME_KEY -> storedProcedureName).asJava, Map(TimestampMode.entryName -> time).asJava, topic, null, schema, record.get(keyField), schema, record ) case None => sourceRecords += new SourceRecord( Map(JdbcSourceConnectorConstants.STORED_PROCEDURE_NAME_KEY -> storedProcedureName).asJava, Map(TimestampMode.entryName -> time).asJava, topic, schema, record ) } time } else max } } timestampOffset = max sourceRecords } override def toString: String = { s""" |{ | "name" : "${this.getClass.getSimpleName}" | "mode" : "${TimestampMode.entryName}" | "stored-procedure.name" : "$storedProcedureName" |} """.stripMargin } }
Example 71
Source File: SchemaSpec.scala From kafka-connect-cassandra with Apache License 2.0 | 5 votes |
package com.tuplejump.kafka.connect.cassandra import com.datastax.driver.core.{ DataType, TestUtil} import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.sink.SinkRecord class SchemaSpec extends AbstractFlatSpec { it should "convert a struct schema with single field" in { val topic = "topicx" val sc = sinkConfig(topic, "keyspacex", "tablex", List("id")) sc.options.consistency should be (TaskConfig.DefaultSinkConsistency) sc.schema.columnNames should === (List("id")) sc.query.cql should be ("INSERT INTO keyspacex.tablex(id) VALUES(?)") val schema = SchemaBuilder.struct.name("record").version(1).field("id", Schema.INT32_SCHEMA).build val value = new Struct(schema).put("id", 1) val record = new SinkRecord(topic, 1, SchemaBuilder.struct.build, "key", schema, value, 0) sc.schema.route.topic should be (record.topic) sc.schema.route.keyspace should be ("keyspacex") sc.schema.route.table should be ("tablex") sc.schema is record should be (true) val query = record.as(sc.schema.namespace) query.cql should be("INSERT INTO keyspacex.tablex(id) VALUES(1)") } it should "convert a struct schema with multiple fields" in { val topic = "test_kfk" val sc = sinkConfig(topic, "keyspacex", "tablex", List("available", "name", "age")) val schema = SchemaBuilder.struct.name("record").version(1) .field("available", Schema.BOOLEAN_SCHEMA) .field("name", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA).build val value = new Struct(schema).put("name", "user").put("available", false).put("age", 15) val record = new SinkRecord("test_kfk", 1, SchemaBuilder.struct.build, "key", schema, value, 0) schema.asColumnNames should be (sc.schema.columnNames) sc.schema.route.topic should be (record.topic) sc.schema is record should be (true) sc.query.cql should be ("INSERT INTO keyspacex.tablex(available,name,age) VALUES(?,?,?)") val query = record.as(sc.schema.namespace) query.cql should be("INSERT INTO keyspacex.tablex(available,name,age) VALUES(false,'user',15)") } it should "convert cassandra column defs to a source schema" in { val colDef = Map( "id" -> DataType.cint(), "name" -> DataType.varchar()) val columns = TestUtil.getColumnDef(colDef) val expectedSchema = SchemaBuilder.struct() .field("id", Schema.INT32_SCHEMA) .field("name", Schema.STRING_SCHEMA).build() columns.asSchema should be(expectedSchema) } it should "convert kafka schema and struct to cassandra columns and schema mapping" in { import scala.collection.JavaConverters._ val topic = "a" val route = InternalConfig.Route(TaskConfig.SinkRoute + topic, "ks1.t1").get val schemaMap = new InternalConfig.Schema(route, Nil, Nil, Nil, List("available","name","age"), "") val schema = SchemaBuilder.struct.name("record").version(1) .field("available", Schema.BOOLEAN_SCHEMA) .field("name", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA).build val struct = new Struct(schema).put("name", "user").put("available", false).put("age", 15) val record = new SinkRecord(topic, 1, SchemaBuilder.struct.build, "key", schema, struct, 0) schema.asColumnNames should ===(schemaMap.columnNames) schemaMap.columnNames should ===(schema.fields.asScala.map(_.name).toList) schemaMap is record should be (true) } }
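asColumnNames used above is an extension supplied by the connector; underneath it is just the field names of the Connect schema in declaration order. A sketch of that mapping with the public API alone:

import org.apache.kafka.connect.data.{Schema, SchemaBuilder}
import scala.collection.JavaConverters._

object ColumnNamesSketch extends App {
  val schema: Schema = SchemaBuilder.struct.name("record").version(1)
    .field("available", Schema.BOOLEAN_SCHEMA)
    .field("name", Schema.STRING_SCHEMA)
    .field("age", Schema.INT32_SCHEMA)
    .build

  // The column list a sink would target, in schema declaration order
  val columnNames: List[String] = schema.fields.asScala.map(_.name).toList
  println(columnNames) // List(available, name, age)
}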
Example 72
Source File: CassandraSinkTaskSpec.scala From kafka-connect-cassandra with Apache License 2.0 | 5 votes |
package com.tuplejump.kafka.connect.cassandra import scala.collection.JavaConverters._ import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.sink.{SinkRecord, SinkTaskContext} class CassandraSinkTaskSpec extends AbstractFlatSpec { val topicName = "test_kv_topic" val tableName = "test.kv" val config = sinkProperties(Map(topicName -> tableName)) it should "start sink task" in { val sinkTask = new CassandraSinkTask() val mockContext = mock[SinkTaskContext] sinkTask.initialize(mockContext) sinkTask.start(config.asJava) sinkTask.stop() } it should "save records in cassandra" in { val sinkTask = new CassandraSinkTask() val mockContext = mock[SinkTaskContext] sinkTask.initialize(mockContext) sinkTask.start(config.asJava) val valueSchema = SchemaBuilder.struct.name("record").version(1) .field("key", Schema.STRING_SCHEMA) .field("value", Schema.INT32_SCHEMA).build val value1 = new Struct(valueSchema).put("key", "pqr").put("value", 15) val value2 = new Struct(valueSchema).put("key", "abc").put("value", 17) val record1 = new SinkRecord(topicName, 1, SchemaBuilder.struct.build, "key", valueSchema, value1, 0) val record2 = new SinkRecord(topicName, 1, SchemaBuilder.struct.build, "key", valueSchema, value2, 0) sinkTask.put(List(record1, record2).asJavaCollection) sinkTask.stop() val cc = CassandraCluster.local val session = cc.session val result = session.execute(s"select count(1) from $tableName").one() val rowCount = result.getLong(0) rowCount should be(2) cc.shutdown() } }
Example 73
Source File: IgnoreEvolutionPolicy.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.evolution import com.landoop.streamreactor.connect.hive.{DatabaseName, HiveSchemas, TableName} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.kafka.connect.data.Schema import scala.collection.JavaConverters._ import scala.util.Try object IgnoreEvolutionPolicy extends EvolutionPolicy { override def evolve(dbName: DatabaseName, tableName: TableName, metastoreSchema: Schema, inputSchema: Schema) (implicit client: IMetaStoreClient): Try[Schema] = Try { HiveSchemas.toKafka(client.getTable(dbName.value, tableName.value)) }.map { schema => val compatible = schema.fields().asScala.forall { field => inputSchema.field(field.name) != null || field.schema().isOptional || field.schema().defaultValue() != null } if (compatible) schema else sys.error("Input Schema is not compatible with the metastore") } }
Example 74
Source File: MapValueConverterTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.json.sql.JacksonJson import org.apache.kafka.connect.data.{Schema, Struct} import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers import scala.collection.JavaConverters._ class MapValueConverterTest extends AnyFunSuite with Matchers { test("converts nested payload") { val json = """ |{ | "idType": 3, | "colorDepth": "", | "threshold" : 45.77, | "evars": { | "evars": { | "eVar1": "Tue Aug 27 2019 12:08:10", | "eVar2": 156692207943934897 | } | }, | "exclude": { | "id": 0, | "value": false | } |} |""".stripMargin val map = JacksonJson.toMap[Any](json) val struct = MapValueConverter.convert(map) //Jackson transforming the json to Map the fields order is not retained struct.schema().fields().asScala.map(_.name()).sorted shouldBe List("idType", "colorDepth", "threshold", "evars", "exclude").sorted struct.schema().field("idType").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA struct.schema().field("colorDepth").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA struct.schema().field("threshold").schema() shouldBe Schema.OPTIONAL_FLOAT64_SCHEMA struct.schema().field("exclude").schema().`type`() shouldBe Schema.Type.STRUCT struct.schema().field("exclude").schema().isOptional shouldBe true struct.schema().field("evars").schema().`type`() shouldBe Schema.Type.STRUCT struct.schema().field("evars").schema().isOptional shouldBe true struct.schema().field("evars").schema().fields().asScala.map(_.name()) shouldBe List("evars") val evarsInner = struct.schema().field("evars").schema().field("evars") evarsInner.schema().`type`() shouldBe Schema.Type.STRUCT evarsInner.schema().isOptional shouldBe true evarsInner.schema().fields().asScala.map(_.name()).sorted shouldBe List("eVar1", "eVar2").sorted evarsInner.schema().field("eVar1").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA evarsInner.schema().field("eVar2").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA val exclude = struct.schema().field("exclude").schema() exclude.schema().`type`() shouldBe Schema.Type.STRUCT exclude.schema().isOptional shouldBe true exclude.schema().fields().asScala.map(_.name()).sorted shouldBe List("id", "value").sorted exclude.schema().field("id").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA exclude.schema().field("value").schema() shouldBe Schema.OPTIONAL_BOOLEAN_SCHEMA struct.get("idType") shouldBe 3L struct.get("colorDepth") shouldBe "" struct.get("threshold") shouldBe 45.77D val evarsStruct = struct.get("evars").asInstanceOf[Struct].get("evars").asInstanceOf[Struct] evarsStruct.get("eVar1") shouldBe "Tue Aug 27 2019 12:08:10" evarsStruct.get("eVar2") shouldBe 156692207943934897L val excludeStruct = struct.get("exclude").asInstanceOf[Struct] excludeStruct.get("id") shouldBe 0L excludeStruct.get("value") shouldBe false } }
Example 75
Source File: package.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.column.ParquetProperties import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter} package object parquet { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def listFiles(path: Path)(implicit fs: FileSystem): List[Path] = { if (fs.isDirectory(path)) { logger.debug(s"$path is a directory, reading constituent files") val remote = fs.listFiles(path, false) new Iterator[Path] { override def hasNext: Boolean = remote.hasNext override def next(): Path = remote.next().getPath }.toList } else { logger.debug(s"Reading $path as a single file") List(path) } } def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = { ParquetReader.builder(new StructReadSupport, file) .withConf(fs.getConf) .build() } def parquetWriter(path: Path, schema: Schema, config: ParquetSinkConfig): ParquetWriter[Struct] = { new StructParquetWriterBuilder(path, schema) .withCompressionCodec(config.compressionCodec) .withDictionaryEncoding(config.enableDictionary) .withValidation(config.validation) .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) .withWriteMode(if (config.overwrite) { ParquetFileWriter.Mode.OVERWRITE } else { ParquetFileWriter.Mode.CREATE }).build() } }
Example 76
Source File: StructWriteSupport.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.landoop.streamreactor.connect.hive._ import org.apache.hadoop.conf.Configuration import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext import org.apache.parquet.io.api.{Binary, RecordConsumer} import org.apache.parquet.schema.MessageType import scala.collection.JavaConverters._ // derived from Apache Spark's parquet write support, archive and license here: // https://github.com/apache/spark/blob/21a7bfd5c324e6c82152229f1394f26afeae771c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala class StructWriteSupport(schema: Schema) extends WriteSupport[Struct] { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) private val schemaName = if (schema.name() == null) "schema" else schema.name() private val parquetSchema: MessageType = ParquetSchemas.toParquetMessage(schema, schemaName) private val metadata = new java.util.HashMap[String, String]() metadata.put("written_by", "streamreactor") // The Parquet `RecordConsumer` to which all structs are written private var consumer: RecordConsumer = _ type ValueWriter = (Any) => Unit override def init(conf: Configuration): WriteSupport.WriteContext = new WriteSupport.WriteContext(parquetSchema, new java.util.HashMap[String, String]) override def finalizeWrite(): WriteSupport.FinalizedWriteContext = new FinalizedWriteContext(metadata) override def prepareForWrite(consumer: RecordConsumer): Unit = this.consumer = consumer override def write(struct: Struct): Unit = { writeMessage { writeStructFields(struct) } } private def writeStructFields(struct: Struct): Unit = { for ((field, index) <- struct.schema.fields.asScala.zipWithIndex) { val value = struct.get(field) if (value != null) { val writer = valueWriter(field.schema()) writeField(field.name, index) { writer(value) } } } } def valueWriter(schema: Schema): ValueWriter = { // todo perhaps introduce something like spark's SpecializedGetters schema.`type`() match { case Schema.Type.BOOLEAN => value => consumer.addBoolean(value.asInstanceOf[Boolean]) case Schema.Type.INT8 | Schema.Type.INT16 | Schema.Type.INT32 => value => consumer.addInteger(value.toString.toInt) case Schema.Type.INT64 => value => consumer.addLong(value.toString.toLong) case Schema.Type.STRING => value => consumer.addBinary(Binary.fromReusedByteArray(value.toString.getBytes)) case Schema.Type.FLOAT32 => value => consumer.addFloat(value.toString.toFloat) case Schema.Type.FLOAT64 => value => consumer.addDouble(value.toString.toDouble) case Schema.Type.STRUCT => value => { logger.debug(s"Writing nested struct") val struct = value.asInstanceOf[Struct] writeGroup { schema.fields.asScala .map { field => field -> struct.get(field) } .zipWithIndex.foreach { case ((field, v), k) => writeField(field.name, k) { valueWriter(field.schema)(v) } } } } case _ => throw UnsupportedSchemaType(schema.`type`.toString) } } private def writeMessage(f: => Unit): Unit = { consumer.startMessage() f consumer.endMessage() } private def writeGroup(f: => Unit): Unit = { consumer.startGroup() // consumer.startMessage() f //consumer.endMessage() consumer.endGroup() } private def writeField(name: String, k: Int)(f: => Unit): Unit = { consumer.startField(name, k) f consumer.endField(name, k) } }
Example 77
Source File: Converters.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.landoop.streamreactor.connect.hive._ import org.apache.kafka.connect.data.{Field, Schema} import org.apache.parquet.io.api.Converter object Converters { def get(field: Field, builder: scala.collection.mutable.Map[String, Any]): Converter = { field.schema().`type`() match { case Schema.Type.STRUCT => new NestedGroupConverter(field.schema(), field, builder) case Schema.Type.INT64 | Schema.Type.INT32 | Schema.Type.INT16 | Schema.Type.INT8 => new AppendingPrimitiveConverter(field, builder) case Schema.Type.FLOAT64 | Schema.Type.FLOAT32 => new AppendingPrimitiveConverter(field, builder) // case Schema.Type.INT64 => new TimestampPrimitiveConverter(field, builder) case Schema.Type.STRING => new DictionaryStringPrimitiveConverter(field, builder) case Schema.Type.ARRAY => ??? case other => throw UnsupportedSchemaType(s"Unsupported data type $other") } } }
Example 78
Source File: NestedGroupConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.typesafe.scalalogging.StrictLogging import org.apache.kafka.connect.data.{Field, Schema} import org.apache.parquet.io.api.{Converter, GroupConverter} import scala.collection.JavaConverters._ class NestedGroupConverter(schema: Schema, field: Field, parentBuilder: scala.collection.mutable.Map[String, Any]) extends GroupConverter with StrictLogging { private[parquet] val builder = scala.collection.mutable.Map.empty[String, Any] private val converters = schema.fields.asScala.map(Converters.get(_, builder)).toIndexedSeq override def getConverter(k: Int): Converter = converters(k) override def start(): Unit = builder.clear() override def end(): Unit = parentBuilder.put(field.name, builder.result) }
Example 79
Source File: RootGroupConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.typesafe.scalalogging.StrictLogging import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.io.api.{Converter, GroupConverter} import scala.collection.JavaConverters._ class RootGroupConverter(schema: Schema) extends GroupConverter with StrictLogging { require(schema.`type`() == Schema.Type.STRUCT) var struct: Struct = _ private val builder = scala.collection.mutable.Map.empty[String, Any] private val converters = schema.fields.asScala.map(Converters.get(_, builder)).toIndexedSeq override def getConverter(k: Int): Converter = converters(k) override def start(): Unit = builder.clear() override def end(): Unit = struct = { val struct = new Struct(schema) schema.fields.asScala.map { field => val value = builder.getOrElse(field.name, null) try { struct.put(field, value) } catch { case t: Exception => throw t } } struct } }
Example 80
Source File: PartitionValueMapper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.source.mapper import com.landoop.streamreactor.connect.hive.{Partition, StructMapper} import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import scala.collection.JavaConverters._ class PartitionValueMapper(partition: Partition) extends StructMapper { override def map(input: Struct): Struct = { val builder = SchemaBuilder.struct() input.schema.fields.asScala.foreach { field => builder.field(field.name, field.schema) } partition.entries.toList.foreach { entry => builder.field(entry._1.value, Schema.STRING_SCHEMA) } val schema = builder.build() val struct = new Struct(schema) input.schema.fields.asScala.foreach { field => struct.put(field.name, input.get(field.name)) } partition.entries.toList.foreach { entry => struct.put(entry._1.value, entry._2) } struct } }
Example 81
Source File: MetastoreSchemaAlignMapper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.mapper import com.landoop.streamreactor.connect.hive.StructMapper import org.apache.kafka.connect.data.{Schema, Struct} import scala.util.Try class MetastoreSchemaAlignMapper(schema: Schema) extends StructMapper { import scala.collection.JavaConverters._ override def map(input: Struct): Struct = { //hive converts everything to lowercase val inputFieldsMapping = input.schema().fields().asScala.map { f => f.name().toLowerCase() -> f.name() }.toMap val struct = schema.fields.asScala.foldLeft(new Struct(schema)) { (struct, field) => Try(input.get(inputFieldsMapping(field.name))).toOption match { case Some(value) => struct.put(field.name, value) case None if field.schema.isOptional => struct.put(field.name, null) case None => sys.error(s"Cannot map struct to required schema; ${field.name} is missing, no default value has been supplied and null is not permitted") } } struct } }
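Because Hive lowercases column names, the mapper matches metastore fields against a lowercased view of the incoming struct's fields. A hedged usage sketch that assumes the MetastoreSchemaAlignMapper class above is on the classpath (field names and values are illustrative):

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object MetastoreAlignSketch extends App {
  // The incoming record uses camelCase field names
  val inputSchema = SchemaBuilder.struct()
    .field("userName", Schema.STRING_SCHEMA)
    .field("loginCount", Schema.INT32_SCHEMA)
    .build()
  val input = new Struct(inputSchema).put("userName", "alice").put("loginCount", 3)

  // The metastore view of the same table is all lowercase
  val metastoreSchema = SchemaBuilder.struct()
    .field("username", Schema.STRING_SCHEMA)
    .field("logincount", Schema.INT32_SCHEMA)
    .build()

  val aligned = new MetastoreSchemaAlignMapper(metastoreSchema).map(input)
  println(aligned.get("username"))   // alice
  println(aligned.get("logincount")) // 3
}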
Example 82
Source File: HiveWriterManager.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.streamreactor.connect.hive.{Offset, TopicPartition, TopicPartitionOffset} import com.landoop.streamreactor.connect.hive.formats.{HiveFormat, HiveWriter} import com.landoop.streamreactor.connect.hive.sink.staging.StageManager import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.Schema def flush(offsets: Map[TopicPartition, Offset]): Unit = { logger.info(s"Flushing offsets $offsets") // we may not have an offset for a given topic/partition if no data was written to that TP writers.foreach { case (key, writer) => writer.close() offsets.get(key.tp).foreach { offset => stageManager.commit(writer.file, key.tp.withOffset(offset)) } writers.remove(key) } } def getWriters: Seq[OpenWriter] = writers.map { case (key, writer) => OpenWriter(key.tp, key.dir, writer) }.toList }
Example 83
Source File: ValueConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.sink.SinkRecord import scala.collection.JavaConverters._ object ValueConverter { def apply(record: SinkRecord): Struct = record.value match { case struct: Struct => StructValueConverter.convert(struct) case map: Map[_, _] => MapValueConverter.convert(map) case map: java.util.Map[_, _] => MapValueConverter.convert(map.asScala.toMap) case string: String => StringValueConverter.convert(string) case other => sys.error(s"Unsupported record $other:${other.getClass.getCanonicalName}") } } trait ValueConverter[T] { def convert(value: T): Struct } object StructValueConverter extends ValueConverter[Struct] { override def convert(struct: Struct): Struct = struct } object MapValueConverter extends ValueConverter[Map[_, _]] { def convertValue(value: Any, key: String, builder: SchemaBuilder): Any = { value match { case s: String => builder.field(key, Schema.OPTIONAL_STRING_SCHEMA) s case l: Long => builder.field(key, Schema.OPTIONAL_INT64_SCHEMA) l case i: Int => builder.field(key, Schema.OPTIONAL_INT64_SCHEMA) i.toLong case b: Boolean => builder.field(key, Schema.OPTIONAL_BOOLEAN_SCHEMA) b case f: Float => builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA) f.toDouble case d: Double => builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA) d case innerMap: java.util.Map[_, _] => val innerStruct = convert(innerMap.asScala.toMap, true) builder.field(key, innerStruct.schema()) innerStruct case innerMap: Map[_, _] => val innerStruct = convert(innerMap, true) builder.field(key, innerStruct.schema()) innerStruct } } def convert(map: Map[_, _], optional: Boolean) = { val builder = SchemaBuilder.struct() val values = map.map { case (k, v) => val key = k.toString val value = convertValue(v, key, builder) key -> value }.toList if (optional) builder.optional() val schema = builder.build val struct = new Struct(schema) values.foreach { case (key, value) => struct.put(key.toString, value) } struct } override def convert(map: Map[_, _]): Struct = convert(map, false) } object StringValueConverter extends ValueConverter[String] { override def convert(string: String): Struct = { val schema = SchemaBuilder.struct().field("a", Schema.OPTIONAL_STRING_SCHEMA).name("struct").build() new Struct(schema).put("a", string) } }
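A short usage sketch of the ValueConverter object defined above: whatever the SinkRecord carries (Struct, map or raw string), the result is a Struct a sink can write. It assumes the code above is on the classpath; the topic and payloads are illustrative:

import org.apache.kafka.connect.data.Schema
import org.apache.kafka.connect.sink.SinkRecord
import scala.collection.JavaConverters._

object ValueConverterUsageSketch extends App {
  // A schemaless map payload, e.g. from the JSON converter with schemas disabled
  val mapValue: java.util.Map[String, Any] =
    Map[String, Any]("id" -> 42, "name" -> "widget", "active" -> true).asJava
  val mapRecord = new SinkRecord("orders", 0, null, null, null, mapValue, 0L)
  val structFromMap = ValueConverter(mapRecord)
  println(structFromMap.schema().fields())

  // A plain string payload ends up wrapped in a single-field struct named "a"
  val stringRecord = new SinkRecord("orders", 0, null, null, Schema.STRING_SCHEMA, "hello", 1L)
  println(ValueConverter(stringRecord).getString("a"))
}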
Example 84
Source File: AddEvolutionPolicy.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.evolution import com.landoop.streamreactor.connect.hive.{DatabaseName, HiveSchemas, TableName} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.kafka.connect.data.Schema import scala.collection.JavaConverters._ import scala.util.Try object AddEvolutionPolicy extends EvolutionPolicy { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def evolve(dbName: DatabaseName, tableName: TableName, metastoreSchema: Schema, inputSchema: Schema) (implicit client: IMetaStoreClient): Try[Schema] = Try { val missing = inputSchema.fields.asScala .filter(f => metastoreSchema.field(f.name) == null) .map(HiveSchemas.toFieldSchema) if (missing.nonEmpty) { logger.info(s"Evolving hive metastore to add: ${missing.mkString(",")}") val table = client.getTable(dbName.value, tableName.value) val cols = table.getSd.getCols missing.foreach(field => cols.add(field)) table.getSd.setCols(cols) client.alter_table(dbName.value, tableName.value, table) HiveSchemas.toKafka(client.getTable(dbName.value, tableName.value)) } else { metastoreSchema } } }
Example 85
Source File: ValidatorTask.scala From ohara with Apache License 2.0 | 5 votes |
package oharastream.ohara.connector.validation import java.util import java.util.concurrent.TimeUnit import oharastream.ohara.client.configurator.InspectApi.{RdbInfo, RdbQuery} import oharastream.ohara.client.configurator.{ErrorApi, InspectApi} import oharastream.ohara.client.database.DatabaseClient import oharastream.ohara.common.data.Serializer import oharastream.ohara.common.util.VersionUtils import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.{SourceRecord, SourceTask} import spray.json.{JsObject, _} import scala.jdk.CollectionConverters._ class ValidatorTask extends SourceTask { private[this] var done = false private[this] var props: Map[String, String] = _ private[this] val topic: String = InspectApi.INTERNAL_TOPIC_KEY.topicNameOnKafka private[this] var requestId: String = _ override def start(props: util.Map[String, String]): Unit = { this.props = props.asScala.toMap requestId = require(InspectApi.REQUEST_ID) } override def poll(): util.List[SourceRecord] = if (done) { // just wait the configurator to close this connector TimeUnit.SECONDS.sleep(2) null } else try information match { case query: RdbQuery => toSourceRecord(validate(query)) } catch { case e: Throwable => toSourceRecord(ErrorApi.of(e)) } finally done = true override def stop(): Unit = { // do nothing } override def version(): String = VersionUtils.VERSION private[this] def validate(query: RdbQuery): RdbInfo = { val client = DatabaseClient.builder.url(query.url).user(query.user).password(query.password).build try RdbInfo( name = client.databaseType, tables = client.tableQuery .catalog(query.catalogPattern.orNull) .schema(query.schemaPattern.orNull) .tableName(query.tableName.orNull) .execute() ) finally client.close() } private[this] def toJsObject: JsObject = props(InspectApi.SETTINGS_KEY).parseJson.asJsObject private[this] def information = require(InspectApi.TARGET_KEY) match { case InspectApi.RDB_KIND => InspectApi.RDB_QUERY_FORMAT.read(toJsObject) case other: String => throw new IllegalArgumentException( s"valid targets are ${InspectApi.RDB_KIND}. current is $other" ) } private[this] def toSourceRecord(data: Object): util.List[SourceRecord] = util.Arrays.asList( new SourceRecord( null, null, topic, Schema.BYTES_SCHEMA, Serializer.STRING.to(requestId), Schema.BYTES_SCHEMA, Serializer.OBJECT.to(data) ) ) private[this] def require(key: String): String = props.getOrElse(key, throw new IllegalArgumentException(s"the $key is required")) }
Example 86
Source File: StrictEvolutionPolicy.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.evolution import com.landoop.streamreactor.connect.hive.{DatabaseName, HiveSchemas, TableName} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.kafka.connect.data.Schema import scala.collection.JavaConverters._ import scala.util.Try object StrictEvolutionPolicy extends EvolutionPolicy { override def evolve(dbName: DatabaseName, tableName: TableName, metastoreSchema: Schema, inputSchema: Schema) (implicit client: IMetaStoreClient): Try[Schema] = Try { val schema = HiveSchemas.toKafka(client.getTable(dbName.value, tableName.value)) schema }.map { schema => //Hive keeps the fields in lowercase val inputFields = inputSchema.fields().asScala.map { f => f.name().toLowerCase() }.toSet schema.fields().asScala.foreach { field => val exists = inputFields.contains(field.name) val optional = field.schema().isOptional val default = field.schema().defaultValue() val compatible = exists || optional || default != null if (!compatible) { sys.error(s"Input Schema is not compatible with the metastore for field [${field.name()}]") } } schema } }
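The compatibility rule above is: every metastore field must be present in the input schema, or be optional, or carry a default value. The same check extracted into a small standalone function over two Connect schemas (a sketch, not part of the connector's API):

import org.apache.kafka.connect.data.Schema
import scala.collection.JavaConverters._

object SchemaCompatibilitySketch {
  // True when every field of `target` can be satisfied by `input`:
  // it exists in the input (case-insensitive, as Hive lowercases names),
  // or is optional, or has a default value.
  def compatible(target: Schema, input: Schema): Boolean = {
    val inputFields = input.fields().asScala.map(_.name().toLowerCase).toSet
    target.fields().asScala.forall { field =>
      inputFields.contains(field.name().toLowerCase) ||
        field.schema().isOptional ||
        field.schema().defaultValue() != null
    }
  }
}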
Example 87
Source File: HiveSinkState.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.streamreactor.connect.hive import com.landoop.streamreactor.connect.hive._ import com.landoop.streamreactor.connect.hive.sink.config.TableOptions import com.landoop.streamreactor.connect.hive.sink.mapper.{DropPartitionValuesMapper, MetastoreSchemaAlignMapper, ProjectionMapper} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.hadoop.hive.metastore.api.Table import org.apache.kafka.connect.data.{Schema, Struct} case class HiveSinkState(offsets: Map[TopicPartition, Offset], committedOffsets: Map[TopicPartition, Offset], table: Table, tableLocation: Path, plan: Option[PartitionPlan], metastoreSchema: Schema, mapper: Struct => Struct, lastSchema: Schema) { def withTopicPartitionOffset(tpo: TopicPartitionOffset): HiveSinkState = { copy(offsets = offsets + (tpo.toTopicPartition -> tpo.offset)) } def withTopicPartitionOffset(tp: TopicPartition, offset: Offset): HiveSinkState = { copy(offsets = offsets + (tp -> offset)) } def withCommittedOffset(offsets: Map[TopicPartition, Offset]): HiveSinkState = { copy(committedOffsets = committedOffsets ++ offsets) } def withCommittedOffset(tp: TopicPartition, offset: Offset): HiveSinkState = { copy(committedOffsets = committedOffsets + (tp -> offset)) } def withLastSchema(schema: Schema): HiveSinkState = copy(lastSchema = schema) } object HiveSinkState { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def from(schema: Schema, table: TableOptions, dbName: DatabaseName)(implicit client: IMetaStoreClient, fs: FileSystem) = { logger.info(s"Init sink for schema $schema") val hiveTable = getOrCreateTable(table, dbName, schema) val tableLocation = new Path(hiveTable.getSd.getLocation) val plan = hive.partitionPlan(hiveTable) val metastoreSchema = table.evolutionPolicy .evolve(dbName, table.tableName, HiveSchemas.toKafka(hiveTable), schema) .getOrElse(sys.error(s"Unable to retrieve or evolve schema for $schema")) val mapperFns: Seq[Struct => Struct] = Seq( table.projection.map(new ProjectionMapper(_)), Some(new MetastoreSchemaAlignMapper(metastoreSchema)), plan.map(new DropPartitionValuesMapper(_)) ).flatten.map(mapper => mapper.map _) val mapper = Function.chain(mapperFns) HiveSinkState(Map.empty, Map.empty, hiveTable, tableLocation, plan, metastoreSchema, mapper, schema) } def getOrCreateTable(table: TableOptions, dbName: DatabaseName, schema: Schema) (implicit client: IMetaStoreClient, fs: FileSystem): Table = { def create: Table = { val partstring = if (table.partitions.isEmpty) "<no-partitions>" else table.partitions.mkString(",") logger.info(s"Creating table in hive [${dbName.value}.${table.tableName.value}, partitions=$partstring]") hive.createTable(dbName, table.tableName, schema, table.partitions, table.location, table.format) } logger.debug(s"Fetching or creating table ${dbName.value}.${table.tableName.value}") client.tableExists(dbName.value, table.tableName.value) match { case true if table.overwriteTable => hive.dropTable(dbName, table.tableName, true) create case true => client.getTable(dbName.value, table.tableName.value) case false if table.createTable => create case false => throw new RuntimeException(s"Table ${dbName.value}.${table.tableName.value} does not exist") } } }
Example 88
Source File: domain.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive import cats.Show import cats.data.NonEmptyList import org.apache.hadoop.fs.Path import org.apache.kafka.common.{TopicPartition => KafkaTopicPartition} import org.apache.kafka.connect.data.Schema case class Topic(value: String) { require(value != null && value.trim.nonEmpty) } case class Offset(value: Long) { require(value >= 0) } case class TopicPartition(topic: Topic, partition: Int) { def withOffset(offset: Offset): TopicPartitionOffset = TopicPartitionOffset(topic, partition, offset) def toKafka = new KafkaTopicPartition(topic.value, partition) } case class TopicPartitionOffset(topic: Topic, partition: Int, offset: Offset) { def toTopicPartition = TopicPartition(topic, partition) } case class DatabaseName(value: String) { require(value != null && value.trim.nonEmpty) } case class TableName(value: String) { require(value != null && value.trim.nonEmpty) } // contains all the partition keys for a particular table case class PartitionPlan(tableName: TableName, keys: NonEmptyList[PartitionKey]) // contains a partition key, which you can think of as like a partition column name case class PartitionKey(value: String) // defines a partition key field case class PartitionField(name: String, schema: Schema = Schema.STRING_SCHEMA, comment: Option[String] = None) { require(name != null && name.trim.nonEmpty) } // contains a single partition in a table, that is one set of unique values, one per partition key case class Partition(entries: NonEmptyList[(PartitionKey, String)], location: Option[Path]) case class Serde(serializationLib: String, inputFormat: String, outputFormat: String, params: Map[String, String]) // generates the default hive metatstore location string for a partition object DefaultPartitionLocation extends Show[Partition] { override def show(t: Partition): String = { t.entries.map { case (key, value) => key.value + "=" + value }.toList.mkString("/") } }
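DefaultPartitionLocation above renders a partition as the usual Hive directory fragment. A quick usage sketch of the types defined in this file, assuming they and cats are on the classpath (keys and values are illustrative):

import cats.data.NonEmptyList

object PartitionLocationSketch extends App {
  val partition = Partition(
    NonEmptyList.of(
      PartitionKey("year") -> "2020",
      PartitionKey("month") -> "01"
    ),
    location = None
  )

  // Renders the standard Hive layout for partition directories
  println(DefaultPartitionLocation.show(partition)) // year=2020/month=01
}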
Example 89
Source File: ParquetHiveFormat.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.formats import com.landoop.streamreactor.connect.hive.Serde import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.permission.FsPermission import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.ParquetWriter import scala.util.Try object ParquetHiveFormat extends HiveFormat { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def serde = Serde( "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", Map("serialization.format" -> "1") ) override def writer(path: Path, schema: Schema) (implicit fs: FileSystem): HiveWriter = new HiveWriter { logger.debug(s"Creating parquet writer at $path") val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx"))) val createdTime: Long = System.currentTimeMillis() var lastKnownFileSize: Long = fs.getFileStatus(path).getLen var readFileSize = false var count = 0 override def write(struct: Struct): Long = { writer.write(struct) count = count + 1 readFileSize = true count } override def close(): Unit = { logger.debug(s"Closing writer at path $path") writer.close() } override def currentCount: Long = count override def file: Path = path override def fileSize: Long = { if (readFileSize) { lastKnownFileSize = fs.getFileStatus(path).getLen readFileSize = false } lastKnownFileSize } } override def reader(path: Path, startAt: Int, schema: Schema) (implicit fs: FileSystem): HiveReader = new HiveReader { logger.debug(s"Creating parquet reader for $path with offset $startAt") val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path) var offset = startAt override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct => val record = Record(struct, path, offset) offset = offset + 1 record } override def close(): Unit = reader.close() } }
Example 90
Source File: SinkRecordParser.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.influx.converters import com.datamountaineer.streamreactor.connect.influx.helpers.Util import com.datamountaineer.streamreactor.connect.influx.writers.KcqlDetails.Path import com.datamountaineer.streamreactor.connect.influx.writers.ValuesExtractor import com.fasterxml.jackson.databind.JsonNode import com.landoop.json.sql.JacksonJson import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.sink.SinkRecord import scala.util.Try object SinkRecordParser { type Field = String trait ParsedSinkRecord { def valueFields(ignored: Set[Path]): Seq[(String, Any)] def field(path: Path): Option[Any] } trait ParsedKeyValueSinkRecord extends ParsedSinkRecord { def keyFields(ignored: Set[Path]): Seq[(String, Any)] } private case class JsonSinkRecord(json: JsonNode) extends ParsedSinkRecord { override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = ValuesExtractor.extractAllFields(json, ignored.map(_.value.last)) override def field(path: Path): Option[Any] = Option(ValuesExtractor.extract(json, path.value)) } private case class StructSinkRecord(struct: Struct) extends ParsedSinkRecord { override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = ValuesExtractor.extractAllFields(struct, ignored.map(_.value.last)) override def field(path: Path): Option[Any] = Option(ValuesExtractor.extract(struct, path.value)) } private case class MapSinkRecord(map: java.util.Map[String, Any]) extends ParsedSinkRecord { override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = ValuesExtractor.extractAllFields(map, ignored.map(_.value.last)) override def field(path: Path): Option[Any] = Option(ValuesExtractor.extract(map, path.value)) } private case class KeyValueRecord(key: ParsedSinkRecord, value: ParsedSinkRecord) extends ParsedKeyValueSinkRecord { override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = value.valueFields(ignored) override def field(path: Path): Option[Any] = path.value.headOption match { case Some(fieldName) if Util.caseInsensitiveComparison(fieldName, Util.KEY_CONSTANT) => key.field(Path(path.value.tail)) case Some(_) => value.field(path) case None => throw new IllegalArgumentException("Unreachable situation detected. Path should never be empty") } override def keyFields(ignored: Set[Path]): Seq[(String, Any)] = key.valueFields(ignored) } def build(record: SinkRecord): Try[ParsedKeyValueSinkRecord] = { val key = Option(record.keySchema()).map(_.`type`()) match { case Some(Schema.Type.STRING) => Try(JsonSinkRecord(JacksonJson.asJson(record.key().asInstanceOf[String]))) case Some(Schema.Type.STRUCT) => Try(StructSinkRecord(record.key().asInstanceOf[Struct])) case None => Try(MapSinkRecord(record.key().asInstanceOf[java.util.Map[String, Any]])) } val value = Option(record.valueSchema()).map(_.`type`()) match { case Some(Schema.Type.STRING) => Try(require(record.value() != null && record.value().getClass == classOf[String], "The SinkRecord payload should be of type String")).flatMap(_ => Try(JsonSinkRecord(JacksonJson.asJson(record.value().asInstanceOf[String])))) case Some(Schema.Type.STRUCT) => Try(require(record.value() != null && record.value().getClass == classOf[Struct], "The SinkRecord payload should be of type Struct")).flatMap(_ => Try(StructSinkRecord(record.value().asInstanceOf[Struct]))) case None => Try(require(record.value() != null && record.value().isInstanceOf[java.util.Map[_, _]], "The SinkRecord payload should be of type java.util.Map[String, Any]")).flatMap(_ => Try(MapSinkRecord(record.value().asInstanceOf[java.util.Map[String, Any]]))) } key .flatMap(key => value.map(key -> _)) .map { case (k, v) => KeyValueRecord(k, v) } } }
Example 91
Source File: SinkRecordKeyRowKeyBuilderTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._ import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.sink.SinkRecord import org.mockito.MockitoSugar import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class SinkRecordKeyRowKeyBuilderTest extends AnyWordSpec with Matchers with MockitoSugar { val keyRowKeyBuilder = new SinkRecordKeyRowKeyBuilderBytes() "SinkRecordKeyRowKeyBuilder" should { "create the right key from the Schema key value - Byte" in { val b = 123.toByte val sinkRecord = new SinkRecord("", 1, Schema.INT8_SCHEMA, b, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord, "Should not matter") shouldBe Array(b) } "create the right key from the Schema key value - String" in { val s = "somekey" val sinkRecord = new SinkRecord("", 1, Schema.STRING_SCHEMA, s, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord, Nil) shouldBe s.fromString() } "create the right key from the Schema key value - Bytes" in { val bArray = Array(23.toByte, 24.toByte, 242.toByte) val sinkRecord = new SinkRecord("", 1, Schema.BYTES_SCHEMA, bArray, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord, Nil) shouldBe bArray } "create the right key from the Schema key value - Boolean" in { val bool = true val sinkRecord = new SinkRecord("", 1, Schema.BOOLEAN_SCHEMA, bool, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord, Nil) shouldBe bool.fromBoolean() } } }
Example 92
Source File: StructFieldsRowKeyBuilderTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class StructFieldsRowKeyBuilderTest extends AnyWordSpec with Matchers {
  "StructFieldsRowKeyBuilder" should {
    "raise an exception if the field is not present in the struct" in {
      intercept[IllegalArgumentException] {
        val schema = SchemaBuilder.struct().name("com.example.Person")
          .field("firstName", Schema.STRING_SCHEMA)
          .field("age", Schema.INT32_SCHEMA)
          .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

        val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

        val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
        //val field = Field("threshold", "threshold", false)
        StructFieldsRowKeyBuilderBytes(List("threshold")).build(sinkRecord, null)
      }
    }

    "create the row key based on one single field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      //val field = Field("firstName", "firstName", true)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StructFieldsRowKeyBuilderBytes(List("firstName")).build(sinkRecord, null) shouldBe "Alex".fromString
    }

    "create the row key based on more than one field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      //val field = Field("firstName", "firstName", true)
      //val field2 = Field("age", "age", true)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StructFieldsRowKeyBuilderBytes(List("firstName", "age")).build(sinkRecord, null) shouldBe
        Bytes.add("Alex".fromString(), "\n".fromString(), 30.fromInt())
    }
  }
}
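The two-field expectation above is the selected values concatenated with a single newline between them. Written out by hand, assuming BytesHelper's fromString/fromInt delegate to HBase's Bytes utilities (suggested, but not proven, by the test itself):

import org.apache.hadoop.hbase.util.Bytes

// Illustration of the expected key: UTF-8 bytes of "Alex", a '\n' separator, then the 4-byte int 30.
val expected: Array[Byte] = Bytes.add(Bytes.toBytes("Alex"), Bytes.toBytes("\n"), Bytes.toBytes(30))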
Example 93
Source File: GenericRowKeyBuilderTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.connect.data.Schema
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class GenericRowKeyBuilderTest extends AnyWordSpec with Matchers {
  "GenericRowKeyBuilder" should {
    "use the topic, partition and offset to make the key" in {
      val topic = "sometopic"
      val partition = 2
      val offset = 1243L
      val sinkRecord = new SinkRecord(topic, partition, Schema.INT32_SCHEMA, 345, Schema.STRING_SCHEMA, "", offset)

      val keyBuilder = new GenericRowKeyBuilderBytes()
      val expected = Bytes.add(Array(topic.fromString(), keyBuilder.delimiterBytes, partition.fromString(),
        keyBuilder.delimiterBytes, offset.fromString()))

      keyBuilder.build(sinkRecord, Nil) shouldBe expected
    }
  }
}
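In other words, the generic row key is just topic, partition and offset rendered as strings and joined by the builder's delimiter bytes. A rough hand-rolled equivalent as a sketch (illustration only; the concrete delimiter comes from GenericRowKeyBuilderBytes.delimiterBytes and is not shown here):

import org.apache.hadoop.hbase.util.Bytes

// Compose topic <delim> partition <delim> offset by hand with a caller-supplied delimiter.
def rowKey(topic: String, partition: Int, offset: Long, delimiter: Array[Byte]): Array[Byte] =
  Bytes.add(Array(Bytes.toBytes(topic), delimiter, Bytes.toBytes(partition.toString),
    delimiter, Bytes.toBytes(offset.toString)))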
Example 94
Source File: ObjectMessageConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.jms.sink.converters

import com.datamountaineer.streamreactor.connect.jms.config.JMSSetting
import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import javax.jms.{ObjectMessage, Session}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

class ObjectMessageConverter extends JMSMessageConverter with ConverterUtil {
  override def convert(record: SinkRecord, session: Session, setting: JMSSetting): (String, ObjectMessage) = {
    val converted = super[ConverterUtil].convert(record, setting.fields, setting.ignoreField)
    val msg = session.createObjectMessage()
    val value = converted.value()
    val schema = converted.valueSchema()
    schema.`type`() match {
      case Schema.Type.STRUCT =>
        val struct = value.asInstanceOf[Struct]
        struct.schema().fields().asScala.foreach { f =>
          ObjectMessageConverterFn(f.name(), struct.get(f), f.schema(), msg, session)
        }
      case _ => ObjectMessageConverterFn("field", value, schema, msg, session)
    }
    (setting.source, msg)
  }
}

object ObjectMessageConverterFn {
  def apply(fieldName: String, value: AnyRef, schema: Schema, msg: ObjectMessage, session: Session): Unit = {
    schema.`type`() match {
      case Schema.Type.BYTES => msg.setObjectProperty(fieldName, value.asInstanceOf[Array[Byte]].toList.asJava)
      case Schema.Type.BOOLEAN => msg.setBooleanProperty(fieldName, value.asInstanceOf[Boolean])
      case Schema.Type.FLOAT32 => msg.setFloatProperty(fieldName, value.asInstanceOf[Float])
      case Schema.Type.FLOAT64 => msg.setDoubleProperty(fieldName, value.asInstanceOf[Double])
      case Schema.Type.INT8 => msg.setByteProperty(fieldName, value.asInstanceOf[Byte])
      case Schema.Type.INT16 => msg.setShortProperty(fieldName, value.asInstanceOf[Short])
      case Schema.Type.INT32 => msg.setIntProperty(fieldName, value.asInstanceOf[Int])
      case Schema.Type.INT64 => msg.setLongProperty(fieldName, value.asInstanceOf[Long])
      case Schema.Type.STRING => msg.setStringProperty(fieldName, value.asInstanceOf[String])
      case Schema.Type.MAP => msg.setObjectProperty(fieldName, value)
      case Schema.Type.ARRAY => msg.setObjectProperty(fieldName, value)
      case Schema.Type.STRUCT =>
        val nestedMsg = session.createObjectMessage()
        val struct = value.asInstanceOf[Struct]
        struct.schema().fields().asScala.foreach { f =>
          ObjectMessageConverterFn(f.name(), struct.get(f), f.schema(), nestedMsg, session)
        }
        msg.setObjectProperty(fieldName, nestedMsg)
    }
  }
}
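To see what this converter is given, here is a sketch of a nested payload built with the Connect Schema API (names and values are invented). Per the match above, the primitive fields become object properties and the nested struct becomes a nested ObjectMessage; the actual convert call needs a live javax.jms.Session and a JMSSetting from the connector configuration, both omitted here.

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord

val customerSchema = SchemaBuilder.struct().name("com.example.Customer")
  .field("name", Schema.STRING_SCHEMA)
  .field("vip", Schema.BOOLEAN_SCHEMA)
  .build()

val orderSchema = SchemaBuilder.struct().name("com.example.Order")
  .field("id", Schema.INT64_SCHEMA)
  .field("amount", Schema.FLOAT64_SCHEMA)
  .field("customer", customerSchema)
  .build()

val order = new Struct(orderSchema)
  .put("id", 42L)
  .put("amount", 19.99)
  .put("customer", new Struct(customerSchema).put("name", "Alice").put("vip", true))

val record = new SinkRecord("orders", 0, null, null, orderSchema, order, 7L)
// new ObjectMessageConverter().convert(record, session, setting) // requires a JMS session and setting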
Example 95
Source File: MapMessageConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.jms.sink.converters

import com.datamountaineer.streamreactor.connect.jms.config.JMSSetting
import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import javax.jms.{MapMessage, Session}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

class MapMessageConverter extends JMSMessageConverter with ConverterUtil {
  override def convert(record: SinkRecord, session: Session, setting: JMSSetting): (String, MapMessage) = {
    val converted = super[ConverterUtil].convert(record, setting.fields, setting.ignoreField)
    val msg = session.createMapMessage()
    val value = converted.value()
    val schema = converted.valueSchema()
    schema.`type`() match {
      case Schema.Type.STRUCT =>
        val struct = value.asInstanceOf[Struct]
        struct.schema().fields().asScala.foreach { f =>
          MapMessageBuilderFn(f.name(), struct.get(f), f.schema(), msg, session)
        }
      case _ => MapMessageBuilderFn("field", value, schema, msg, session)
    }
    (setting.source, msg)
  }
}

object MapMessageBuilderFn {
  def apply(fieldName: String, value: AnyRef, schema: Schema, msg: MapMessage, session: Session): Unit = {
    schema.`type`() match {
      case Schema.Type.BYTES => msg.setBytes(fieldName, value.asInstanceOf[Array[Byte]])
      case Schema.Type.BOOLEAN => msg.setBoolean(fieldName, value.asInstanceOf[Boolean])
      case Schema.Type.FLOAT32 => msg.setFloat(fieldName, value.asInstanceOf[Float])
      case Schema.Type.FLOAT64 => msg.setDouble(fieldName, value.asInstanceOf[Double])
      case Schema.Type.INT8 => msg.setByte(fieldName, value.asInstanceOf[Byte])
      case Schema.Type.INT16 => msg.setShort(fieldName, value.asInstanceOf[Short])
      case Schema.Type.INT32 => msg.setInt(fieldName, value.asInstanceOf[Int])
      case Schema.Type.INT64 => msg.setLong(fieldName, value.asInstanceOf[Long])
      case Schema.Type.STRING => msg.setString(fieldName, value.asInstanceOf[String])
      case Schema.Type.MAP => msg.setObject(fieldName, value)
      case Schema.Type.ARRAY => msg.setObject(fieldName, value)
      case Schema.Type.STRUCT =>
        val nestedMsg = session.createMapMessage()
        val struct = value.asInstanceOf[Struct]
        struct.schema().fields().asScala.foreach { f =>
          MapMessageBuilderFn(f.name(), struct.get(f), f.schema(), nestedMsg, session)
        }
        msg.setObject(fieldName, nestedMsg)
    }
  }
}
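And a flat counterpart for the MapMessage converter (again invented names; the convert call itself needs a JMS session and setting, omitted here). Per the match above, "name" goes through setString, "enabled" through setBoolean, "retries" through setInt and "ratio" through setDouble.

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord

val settingsSchema = SchemaBuilder.struct().name("com.example.Settings")
  .field("name", Schema.STRING_SCHEMA)
  .field("enabled", Schema.BOOLEAN_SCHEMA)
  .field("retries", Schema.INT32_SCHEMA)
  .field("ratio", Schema.FLOAT64_SCHEMA)
  .build()

val settings = new Struct(settingsSchema)
  .put("name", "writer-1")
  .put("enabled", true)
  .put("retries", 3)
  .put("ratio", 0.75)

val record = new SinkRecord("settings", 0, null, null, settingsSchema, settings, 0L)
// new MapMessageConverter().convert(record, session, setting) // requires a JMS session and setting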