org.apache.kafka.connect.data.Struct Scala Examples

The following examples show how to use org.apache.kafka.connect.data.Struct. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Source File: StructVectorReader.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.orc.vectors

import com.landoop.streamreactor.connect.hive.orc.OrcSchemas
import org.apache.hadoop.hive.ql.exec.vector.{ColumnVector, StructColumnVector}
import org.apache.kafka.connect.data.Struct
import org.apache.orc.TypeDescription

import scala.collection.JavaConverters._

/**
  * Reads one row of an ORC struct column into a Kafka Connect [[Struct]].
  *
  * @param readers         one reader per struct field, indexed in the order of
  *                        `typeDescription.getFieldNames`
  * @param typeDescription the ORC description of the struct being read
  */
class StructVectorReader(readers: IndexedSeq[OrcVectorReader[_, _]],
                         typeDescription: TypeDescription) extends OrcVectorReader[StructColumnVector, Struct] {

  val schema = OrcSchemas.toKafka(typeDescription)

  /**
    * Builds a struct from the values found at `offset` in each child vector.
    * A repeating vector is always read at index 0.
    */
  override def read(offset: Int, vector: StructColumnVector): Option[Struct] = {
    val result = new Struct(schema)
    val row = if (vector.isRepeating) 0 else offset
    val fieldNames = typeDescription.getFieldNames.asScala
    fieldNames.indices.foreach { index =>
      val reader = readers(index).asInstanceOf[OrcVectorReader[ColumnVector, Any]]
      val column = vector.fields(index)
      // an absent value is stored as null in the struct
      result.put(fieldNames(index), reader.read(row, column).orNull)
    }
    Some(result)
  }
}
Example 2
Source File: ProjectionMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.source.mapper

import cats.data.NonEmptyList
import com.landoop.streamreactor.connect.hive.StructMapper
import com.landoop.streamreactor.connect.hive.source.config.ProjectionField
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}

/**
  * Projects an input struct onto the configured fields, renaming each
  * field to its alias.
  */
class ProjectionMapper(projection: NonEmptyList[ProjectionField]) extends StructMapper {

  /**
    * Returns a new struct containing only the projected fields.
    * Fails via `sys.error` when a projected field is not part of the input schema.
    */
  override def map(input: Struct): Struct = {
    val projectedSchema = projection.foldLeft(SchemaBuilder.struct) { (acc, projectionField) =>
      Option(input.schema.field(projectionField.name)) match {
        case Some(sourceField) => acc.field(projectionField.alias, sourceField.schema)
        case None => sys.error(s"Projection field ${projectionField.name} cannot be found in input")
      }
    }.build()

    projection.foldLeft(new Struct(projectedSchema)) { (output, projected) =>
      output.put(projected.alias, input.get(projected.name))
    }
  }
}
Example 3
Source File: PartitionValueMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.source.mapper

import com.landoop.streamreactor.connect.hive.{Partition, StructMapper}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

import scala.collection.JavaConverters._

/**
  * Appends the partition key/value pairs of `partition` to a struct as
  * extra string fields.
  */
class PartitionValueMapper(partition: Partition) extends StructMapper {

  /**
    * Returns a new struct holding every field of `input` followed by one
    * string field per partition entry.
    */
  override def map(input: Struct): Struct = {
    val dataFields = input.schema.fields.asScala
    val partitionEntries = partition.entries.toList

    // schema: the input fields first, then the partition keys
    val schemaBuilder = SchemaBuilder.struct()
    dataFields.foreach(f => schemaBuilder.field(f.name, f.schema))
    partitionEntries.foreach { case (key, _) => schemaBuilder.field(key.value, Schema.STRING_SCHEMA) }

    // values: copied in the same order as the schema was declared
    val output = new Struct(schemaBuilder.build())
    dataFields.foreach(f => output.put(f.name, input.get(f.name)))
    partitionEntries.foreach { case (key, value) => output.put(key.value, value) }
    output
  }
}
Example 4
Source File: HiveSource.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.source

import com.landoop.streamreactor.connect.hive
import com.landoop.streamreactor.connect.hive._
import com.landoop.streamreactor.connect.hive.formats.{HiveFormat, HiveReader, Record}
import com.landoop.streamreactor.connect.hive.source.config.HiveSourceConfig
import com.landoop.streamreactor.connect.hive.source.mapper.{PartitionValueMapper, ProjectionMapper}
import com.landoop.streamreactor.connect.hive.source.offset.HiveSourceOffsetStorageReader
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.kafka.connect.data.Struct
import org.apache.kafka.connect.source.SourceRecord

import scala.collection.JavaConverters._


/**
  * Iterates over the rows of a hive table as Kafka Connect [[SourceRecord]]s.
  *
  * One [[HiveReader]] is prepared for each (path, partition) pair returned by
  * `TableFileScanner.scan`; their records are concatenated and capped at the
  * configured limit.
  *
  * Construction fails via `sys.error` if `config` holds no table options for
  * the given table name and topic.
  */
class HiveSource(db: DatabaseName,
                 tableName: TableName,
                 topic: Topic,
                 offsetReader: HiveSourceOffsetStorageReader,
                 config: HiveSourceConfig)
                (implicit client: IMetaStoreClient, fs: FileSystem) extends Iterator[SourceRecord] {

  val tableConfig = config.tableOptions.filter(_.tableName == tableName).find(_.topic == topic)
    .getOrElse(sys.error(s"Cannot find table configuration for ${db.value}.${tableName.value} => ${topic.value}"))

  private val table = client.getTable(db.value, tableName.value)
  private val format = HiveFormat(hive.serde(table))
  private val metastoreSchema = HiveSchemas.toKafka(table)
  private val parts = TableFileScanner.scan(db, tableName)

  private val readers = parts.map { case (path, partition) =>

    // partition values are added first (when the file belongs to a partition),
    // then the optional projection is applied
    val fns: Seq[Struct => Struct] = Seq(
      partition.map(new PartitionValueMapper(_).map _),
      tableConfig.projection.map(new ProjectionMapper(_).map _)
    ).flatten
    val mapper: Struct => Struct = Function.chain(fns)

    // start from the stored offset for this file, or from row 0 when none is stored
    val sourceOffset = offsetReader.offset(SourcePartition(db, tableName, topic, path)).getOrElse(SourceOffset(0))

    new HiveReader {
      // lazy so that the underlying file reader is only created when first used
      lazy val reader = format.reader(path, sourceOffset.rowNumber, metastoreSchema)
      override def iterator: Iterator[Record] = reader.iterator.map { record =>
        Record(mapper(record.struct), record.path, record.offset)
      }
      override def close(): Unit = reader.close()
    }
  }

  // Fold from an empty iterator rather than `reduce`, so that a table with no
  // files yields an empty source instead of throwing UnsupportedOperationException.
  private val iterator: Iterator[Record] =
    readers.map(_.iterator).foldLeft(Iterator.empty: Iterator[Record])(_ ++ _).take(tableConfig.limit)

  override def hasNext: Boolean = iterator.hasNext

  /** Wraps the next record in a [[SourceRecord]] keyed by its file and row offset. */
  override def next(): SourceRecord = {

    val record = iterator.next()
    val sourcePartition = SourcePartition(db, tableName, topic, record.path)
    val offset = SourceOffset(record.offset)

    new SourceRecord(
      fromSourcePartition(sourcePartition).asJava,
      fromSourceOffset(offset).asJava,
      topic.value,
      record.struct.schema,
      record.struct
    )
  }

  /** Closes every reader created for this source. */
  def close(): Unit = {
    readers.foreach(_.close())
  }
}
Example 5
Source File: ProjectionMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import cats.data.NonEmptyList
import com.datamountaineer.kcql.Field
import com.landoop.streamreactor.connect.hive.StructMapper
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}


/**
  * Projects an input struct onto the KCQL-selected fields, renaming each
  * field to its alias.
  */
class ProjectionMapper(projection: NonEmptyList[Field]) extends StructMapper {

  /**
    * Returns a new struct containing only the projected fields.
    * Fails via `sys.error` when a projected field is not part of the input schema.
    */
  override def map(input: Struct): Struct = {
    // the compatible output schema built from projected fields with aliases applied
    val projectedSchema = projection.foldLeft(SchemaBuilder.struct) { (acc, kcqlField) =>
      Option(input.schema.field(kcqlField.getName)) match {
        case Some(sourceField) => acc.field(kcqlField.getAlias, sourceField.schema)
        case None => sys.error(s"Missing field $kcqlField")
      }
    }.build()

    projection.foldLeft(new Struct(projectedSchema)) { (output, projected) =>
      output.put(projected.getAlias, input.get(projected.getName))
    }
  }
}
Example 6
Source File: MetastoreSchemaAlignMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import com.landoop.streamreactor.connect.hive.StructMapper
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.util.Try


/**
  * Re-shapes an input struct so that it conforms to the given target schema.
  * Target field names are matched against the lower-cased input field names.
  */
class MetastoreSchemaAlignMapper(schema: Schema) extends StructMapper {

  import scala.collection.JavaConverters._

  /**
    * Copies each target field from `input`. A field whose value cannot be
    * read is set to null when the target field is optional; otherwise the
    * mapping fails via `sys.error`.
    */
  override def map(input: Struct): Struct = {
    //hive converts everything to lowercase
    val originalNameByLowercase: Map[String, String] =
      input.schema().fields().asScala.map(f => (f.name().toLowerCase(), f.name())).toMap

    val aligned = new Struct(schema)
    schema.fields.asScala.foreach { field =>
      // any failure while looking up or reading the value counts as "missing"
      val lookedUp = Try(input.get(originalNameByLowercase(field.name))).toOption
      lookedUp match {
        case Some(value) =>
          aligned.put(field.name, value)
        case None =>
          if (field.schema.isOptional) aligned.put(field.name, null)
          else sys.error(s"Cannot map struct to required schema; ${field.name} is missing, no default value has been supplied and null is not permitted")
      }
    }
    aligned
  }
}
Example 7
Source File: DropPartitionValuesMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import com.landoop.streamreactor.connect.hive.{PartitionPlan, StructMapper}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}


/**
  * Removes from a struct every field whose name is one of the partition
  * keys of `plan`.
  */
class DropPartitionValuesMapper(plan: PartitionPlan) extends StructMapper {

  import scala.collection.JavaConverters._

  /** Returns a new struct (with a new schema) holding only the non-partition fields. */
  override def map(input: Struct): Struct = {
    val partitionNames = plan.keys.map(_.value).toList
    val retained = input.schema.fields().asScala.filter(f => !partitionNames.contains(f.name))

    val schemaBuilder = SchemaBuilder.struct()
    retained.foreach(f => schemaBuilder.field(f.name, f.schema))

    val output = new Struct(schemaBuilder.build())
    retained.foreach(f => output.put(f.name, input.get(f.name)))
    output
  }
}
Example 8
Source File: ValueConverter.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

/** Converts the value of a [[SinkRecord]] into a [[Struct]]. */
object ValueConverter {

  /**
    * Dispatches on the runtime type of the record value: structs, Scala and
    * Java maps, and strings are supported.
    *
    * Any other value, including null, fails via `sys.error`.
    */
  def apply(record: SinkRecord): Struct = record.value match {
    case struct: Struct => StructValueConverter.convert(struct)
    case map: Map[_, _] => MapValueConverter.convert(map)
    case map: java.util.Map[_, _] => MapValueConverter.convert(map.asScala.toMap)
    case string: String => StringValueConverter.convert(string)
    // handled separately: `other.getClass` below would throw a NullPointerException
    case null => sys.error("Unsupported record: value is null")
    case other => sys.error(s"Unsupported record $other:${other.getClass.getCanonicalName}")
  }
}

/**
  * Converts a value of type `T` into a Kafka Connect [[Struct]].
  *
  * @tparam T the type of value this converter accepts
  */
trait ValueConverter[T] {
  /** Returns the struct representation of `value`. */
  def convert(value: T): Struct
}

/** Identity converter: a value that is already a [[Struct]] is returned as is. */
object StructValueConverter extends ValueConverter[Struct] {
  override def convert(struct: Struct): Struct = struct
}

/**
  * Converts a map into a [[Struct]], deriving the schema from the runtime
  * types of the map values. Nested maps become nested optional structs.
  */
object MapValueConverter extends ValueConverter[Map[_, _]] {

  /**
    * Registers a field named `key` on `builder` with a schema matching the
    * runtime type of `value`, and returns the value to store in that field.
    *
    * Ints are widened to Long and Floats to Double, matching the INT64 and
    * FLOAT64 schemas that are registered for them.
    *
    * Fails via `sys.error` for a null value or a value of any other type.
    */
  def convertValue(value: Any, key: String, builder: SchemaBuilder): Any = {
    value match {
      case s: String =>
        builder.field(key, Schema.OPTIONAL_STRING_SCHEMA)
        s
      case l: Long =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        l
      case i: Int =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        i.toLong
      case b: Boolean =>
        builder.field(key, Schema.OPTIONAL_BOOLEAN_SCHEMA)
        b
      case f: Float =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        f.toDouble
      case d: Double =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        d
      case innerMap: java.util.Map[_, _] =>
        val innerStruct = convert(innerMap.asScala.toMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct

      case innerMap: Map[_, _] =>
        val innerStruct = convert(innerMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct

      // Without this case an unsupported (or null) value surfaces as a bare
      // MatchError that does not say which field was at fault.
      case other =>
        val typeName = Option(other).map(_.getClass.getCanonicalName).getOrElse("null")
        sys.error(s"Unsupported value for field '$key': $other ($typeName)")
    }
  }

  /**
    * Converts `map` into a struct whose schema is inferred from the values.
    *
    * @param map      the entries to convert; keys are turned into field names via `toString`
    * @param optional whether the resulting struct schema is marked optional
    */
  def convert(map: Map[_, _], optional: Boolean): Struct = {
    val builder = SchemaBuilder.struct()
    // first pass: register every field on the builder and collect the converted values
    val values = map.map { case (k, v) =>
      val key = k.toString
      val value = convertValue(v, key, builder)
      key -> value
    }.toList
    if (optional) builder.optional()
    val schema = builder.build
    // second pass: populate the struct now that the schema is complete
    val struct = new Struct(schema)
    values.foreach { case (key, value) =>
      struct.put(key, value)
    }
    struct
  }

  /** Converts `map` into a struct with a non-optional schema. */
  override def convert(map: Map[_, _]): Struct = convert(map, false)
}

/** Wraps a plain string into a single-field struct. */
object StringValueConverter extends ValueConverter[String] {

  /** Returns a struct named "struct" whose only field, "a", holds `string`. */
  override def convert(string: String): Struct = {
    val withField = SchemaBuilder.struct().field("a", Schema.OPTIONAL_STRING_SCHEMA)
    val wrapperSchema = withField.name("struct").build()
    val wrapper = new Struct(wrapperSchema)
    wrapper.put("a", string)
  }
}
Example 9
Source File: HiveSinkState.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink

import com.landoop.streamreactor.connect.hive
import com.landoop.streamreactor.connect.hive._
import com.landoop.streamreactor.connect.hive.sink.config.TableOptions
import com.landoop.streamreactor.connect.hive.sink.mapper.{DropPartitionValuesMapper, MetastoreSchemaAlignMapper, ProjectionMapper}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.hadoop.hive.metastore.api.Table
import org.apache.kafka.connect.data.{Schema, Struct}

/**
  * Immutable snapshot of the state a hive sink keeps for one table.
  * Every `with…` method returns an updated copy and leaves this instance untouched.
  */
case class HiveSinkState(offsets: Map[TopicPartition, Offset],
                         committedOffsets: Map[TopicPartition, Offset],
                         table: Table,
                         tableLocation: Path,
                         plan: Option[PartitionPlan],
                         metastoreSchema: Schema,
                         mapper: Struct => Struct,
                         lastSchema: Schema) {

  /** Records the offset carried by `tpo` for its topic partition. */
  def withTopicPartitionOffset(tpo: TopicPartitionOffset): HiveSinkState =
    withTopicPartitionOffset(tpo.toTopicPartition, tpo.offset)

  /** Records `offset` for `tp`, replacing any previous entry. */
  def withTopicPartitionOffset(tp: TopicPartition, offset: Offset): HiveSinkState = {
    val updatedOffsets = offsets.updated(tp, offset)
    copy(offsets = updatedOffsets)
  }

  /** Merges `offsets` into the committed offsets; entries in `offsets` win. */
  def withCommittedOffset(offsets: Map[TopicPartition, Offset]): HiveSinkState = {
    val merged = committedOffsets ++ offsets
    copy(committedOffsets = merged)
  }

  /** Records `offset` as committed for `tp`, replacing any previous entry. */
  def withCommittedOffset(tp: TopicPartition, offset: Offset): HiveSinkState = {
    val updatedCommitted = committedOffsets.updated(tp, offset)
    copy(committedOffsets = updatedCommitted)
  }

  /** Replaces the last seen schema. */
  def withLastSchema(schema: Schema): HiveSinkState =
    copy(lastSchema = schema)
}

object HiveSinkState {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  /**
    * Builds the initial sink state for `table`: resolves (or creates) the hive
    * table, evolves the metastore schema against `schema`, and assembles the
    * chain of struct mappers applied to every record.
    *
    * Fails via `sys.error` when the evolution policy yields no schema.
    */
  def from(schema: Schema,
           table: TableOptions,
           dbName: DatabaseName)(implicit client: IMetaStoreClient, fs: FileSystem): HiveSinkState = {
    logger.info(s"Init sink for schema $schema")

    val hiveTable = getOrCreateTable(table, dbName, schema)
    val tableLocation = new Path(hiveTable.getSd.getLocation)
    val plan = hive.partitionPlan(hiveTable)

    val evolved = table.evolutionPolicy.evolve(dbName, table.tableName, HiveSchemas.toKafka(hiveTable), schema)
    val metastoreSchema = evolved.getOrElse(sys.error(s"Unable to retrieve or evolve schema for $schema"))

    // order matters: project, then align to the metastore schema, then drop partition values
    val projectionStep = table.projection.map(new ProjectionMapper(_))
    val alignStep = Some(new MetastoreSchemaAlignMapper(metastoreSchema))
    val dropPartitionsStep = plan.map(new DropPartitionValuesMapper(_))

    val steps: Seq[Struct => Struct] =
      Seq(projectionStep, alignStep, dropPartitionsStep).flatten.map(step => (in: Struct) => step.map(in))

    HiveSinkState(
      Map.empty,
      Map.empty,
      hiveTable,
      tableLocation,
      plan,
      metastoreSchema,
      Function.chain(steps),
      schema
    )
  }

  /**
    * Returns the hive table described by `table`.
    *
    * An existing table is dropped and re-created when `overwriteTable` is set,
    * otherwise it is returned as is. A missing table is created when
    * `createTable` is set, otherwise a RuntimeException is thrown.
    */
  def getOrCreateTable(table: TableOptions, dbName: DatabaseName, schema: Schema)
                      (implicit client: IMetaStoreClient, fs: FileSystem): Table = {

    def createNew: Table = {
      val partstring = if (table.partitions.isEmpty) "<no-partitions>" else table.partitions.mkString(",")
      logger.info(s"Creating table in hive [${dbName.value}.${table.tableName.value}, partitions=$partstring]")
      hive.createTable(dbName, table.tableName, schema, table.partitions, table.location, table.format)
    }

    logger.debug(s"Fetching or creating table ${dbName.value}.${table.tableName.value}")
    val exists = client.tableExists(dbName.value, table.tableName.value)
    if (exists && table.overwriteTable) {
      hive.dropTable(dbName, table.tableName, true)
      createNew
    } else if (exists) {
      client.getTable(dbName.value, table.tableName.value)
    } else if (table.createTable) {
      createNew
    } else {
      throw new RuntimeException(s"Table ${dbName.value}.${table.tableName.value} does not exist")
    }
  }
}
Example 10
Source File: OrcSink.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.orc

import com.landoop.streamreactor.connect.hive.orc.vectors.{OrcVectorWriter, StructVectorWriter}
import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, StructUtils}
import com.typesafe.scalalogging.StrictLogging
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.collection.JavaConverters._

/**
  * Buffers structs into an ORC row batch and writes the batch out whenever
  * it reaches `config.batchSize` rows, and once more on close.
  */
class OrcSink(path: Path,
              schema: Schema,
              config: OrcSinkConfig)(implicit fs: FileSystem) extends StrictLogging {

  private val typeDescription = OrcSchemas.toOrc(schema)
  private val structWriter = new StructVectorWriter(typeDescription.getChildren.asScala.map(OrcVectorWriter.fromSchema))
  private val batch = typeDescription.createRowBatch(config.batchSize)
  private val vector = new StructColumnVector(batch.numCols, batch.cols: _*)
  private val orcWriter = createOrcWriter(path, typeDescription, config)
  // number of rows currently buffered in `batch`
  private var n = 0

  /** Writes the buffered rows to the ORC writer and empties the batch. */
  def flush(): Unit = {
    logger.debug(s"Writing orc batch [size=$n, path=$path]")
    batch.size = n
    orcWriter.addRowBatch(batch)
    orcWriter.writeIntermediateFooter()
    batch.reset()
    n = 0
  }

  /** Appends `struct` to the batch, flushing once the batch is full. */
  def write(struct: Struct): Unit = {
    val values = StructUtils.extractValues(struct)
    structWriter.write(vector, n, Some(values))
    n += 1
    if (n == config.batchSize) {
      flush()
    }
  }

  /** Flushes any buffered rows, then closes the ORC writer. */
  def close(): Unit = {
    if (n > 0) {
      flush()
    }
    orcWriter.close()
  }
}
Example 11
Source File: OrcSource.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.orc

import com.landoop.streamreactor.connect.hive.OrcSourceConfig
import com.landoop.streamreactor.connect.hive.orc.vectors.OrcVectorReader.fromSchema
import com.landoop.streamreactor.connect.hive.orc.vectors.StructVectorReader
import com.typesafe.scalalogging.StrictLogging
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.ql.exec.vector.{StructColumnVector, VectorizedRowBatch}
import org.apache.kafka.connect.data.Struct
import org.apache.orc.OrcFile.ReaderOptions
import org.apache.orc.{OrcFile, Reader}

import scala.collection.JavaConverters._

/**
  * Reads the rows of an ORC file as Kafka Connect structs, one row batch
  * at a time.
  */
class OrcSource(path: Path, config: OrcSourceConfig)(implicit fs: FileSystem) extends StrictLogging {

  private val reader = OrcFile.createReader(path, new ReaderOptions(fs.getConf))

  private val typeDescription = reader.getSchema
  private val schema = OrcSchemas.toKafka(typeDescription)

  private val vectorReader =
    new StructVectorReader(typeDescription.getChildren.asScala.map(fromSchema).toIndexedSeq, typeDescription)

  private val batch = typeDescription.createRowBatch()
  private val recordReader = reader.rows(new Reader.Options())

  /** Closes the underlying record reader. */
  def close(): Unit = recordReader.close()

  /**
    * Iterates over every struct in the file. All iterators returned by this
    * method share the same row batch and record reader.
    */
  def iterator: Iterator[Struct] = new Iterator[Struct] {
    private var current = new BatchIterator(batch)

    override def hasNext: Boolean =
      if (current.hasNext) {
        true
      } else {
        // current batch exhausted: load the next one and restart the batch iterator
        batch.reset()
        recordReader.nextBatch(batch)
        current = new BatchIterator(batch)
        !batch.endOfFile && batch.size > 0 && current.hasNext
      }

    override def next(): Struct = current.next()
  }

  // iterates over a batch, be careful not to mutate the batch while it is being iterated
  class BatchIterator(batch: VectorizedRowBatch) extends Iterator[Struct] {
    var offset = 0
    val vector = new StructColumnVector(batch.numCols, batch.cols: _*)

    override def hasNext: Boolean = offset < batch.size

    override def next(): Struct = {
      val maybeStruct = vectorReader.read(offset, vector)
      offset += 1
      maybeStruct.orNull
    }
  }
}
Example 12
Source File: RootGroupConverter.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.parquet

import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.io.api.{Converter, GroupConverter}

import scala.collection.JavaConverters._

/**
  * Parquet group converter for the top-level record. Field values are
  * collected by the per-field converters into a shared map, and turned
  * into a [[Struct]] when the record ends.
  *
  * @param schema the struct schema of the records being read; must be of type STRUCT
  */
class RootGroupConverter(schema: Schema) extends GroupConverter with StrictLogging {
  require(schema.`type`() == Schema.Type.STRUCT)

  /** The most recently completed record; unset until the first call to `end`. */
  var struct: Struct = _
  // values of the record currently being read, keyed by field name
  private val builder = scala.collection.mutable.Map.empty[String, Any]
  private val converters = schema.fields.asScala.map(Converters.get(_, builder)).toIndexedSeq

  override def getConverter(k: Int): Converter = converters(k)

  /** Clears the values collected for the previous record. */
  override def start(): Unit = builder.clear()

  /**
    * Builds a struct from the collected values and publishes it in `struct`.
    * Fields for which no value was collected are set to null. `struct` is
    * only replaced once every field has been populated successfully.
    */
  override def end(): Unit = {
    val completed = new Struct(schema)
    schema.fields.asScala.foreach { field =>
      completed.put(field, builder.getOrElse(field.name, null))
    }
    struct = completed
  }
}
Example 13
Source File: OrcHiveFormat.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, OrcSourceConfig, Serde}
import com.landoop.streamreactor.connect.hive.orc.OrcSink
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.util.Try

/** [[HiveFormat]] implementation that reads and writes ORC files. */
object OrcHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.orc.OrcSerde",
    "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
    "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat",
    Map("org.apache.hadoop.hive.ql.io.orc.OrcSerde" -> "1")
  )

  /**
    * Creates a writer that stores structs of the given schema in an ORC file
    * at `path`, overwriting any existing file.
    */
  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {
    logger.debug(s"Creating orc writer at $path")

    val sink: OrcSink = com.landoop.streamreactor.connect.hive.orc.sink(path, schema, OrcSinkConfig(overwrite = true))
    // best effort: a failure to change the permissions is deliberately ignored
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTimestamp: Long = System.currentTimeMillis()
    var lastKnownFileSize:Long = fs.getFileStatus(path).getLen
    // set on every write so that the file size is re-read lazily, on demand
    var readFileSize = false
    var count = 0

    /** Writes `struct` and returns the number of structs written so far. */
    override def write(struct: Struct): Long = {
      sink.write(struct)
      count = count + 1
      readFileSize = true
      count
    }

    override def close(): Unit = {
      logger.debug(s"Closing orc writer at path $path")
      sink.close()
    }
    override def file: Path = path
    override def currentCount: Long = count
    override def createdTime: Long = createdTimestamp

    /** Returns the file length, re-reading it only if a write happened since the last call. */
    override def fileSize: Long = {
      if (readFileSize) {
        lastKnownFileSize = fs.getFileStatus(path).getLen
        readFileSize = false
      }

      lastKnownFileSize
    }
  }

  /**
    * Creates a reader over the ORC file at `path`.
    *
    * @param startAt the number of leading rows to skip; the first record
    *                returned carries this value as its offset
    */
  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {

    logger.debug(s"Creating orc reader for $path with offset $startAt")
    val reader = com.landoop.streamreactor.connect.hive.orc.source(path, OrcSourceConfig())
    var offset = startAt

    // Skip the rows before `startAt` so that the offsets assigned below line up
    // with the actual row positions, as the parquet reader does.
    override def iterator: Iterator[Record] = reader.iterator.drop(startAt).map { struct =>
      val record = Record(struct, path, offset)
      offset = offset + 1
      record
    }

    override def close(): Unit = reader.close()
  }
}
Example 14
Source File: ParquetHiveFormat.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.Serde
import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.ParquetWriter

import scala.util.Try

/** [[HiveFormat]] implementation that reads and writes Parquet files. */
object ParquetHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
    Map("serialization.format" -> "1")
  )

  /**
    * Creates a writer that stores structs of the given schema in a Parquet
    * file at `path`, overwriting any existing file.
    */
  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {

    logger.debug(s"Creating parquet writer at $path")

    val writer: ParquetWriter[Struct] =
      com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
    // best effort: a failure to change the permissions is ignored
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTimestamp: Long = System.currentTimeMillis()
    var cachedSize: Long = fs.getFileStatus(path).getLen
    // true when a write happened since the size was last read
    var sizeStale = false
    var written = 0

    /** Writes `struct` and returns the number of structs written so far. */
    override def write(struct: Struct): Long = {
      writer.write(struct)
      written += 1
      sizeStale = true
      written
    }

    override def close(): Unit = {
      logger.debug(s"Closing writer at path $path")
      writer.close()
    }

    override def currentCount: Long = written

    override def file: Path = path

    override def createdTime: Long = createdTimestamp

    /** Returns the file length, re-reading it only if a write happened since the last call. */
    override def fileSize: Long = {
      if (sizeStale) {
        cachedSize = fs.getFileStatus(path).getLen
        sizeStale = false
      }
      cachedSize
    }
  }

  /**
    * Creates a reader over the Parquet file at `path`.
    *
    * @param startAt the number of leading rows to skip; the first record
    *                returned carries this value as its offset
    */
  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {

    logger.debug(s"Creating parquet reader for $path with offset $startAt")
    val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path)
    var offset = startAt

    override def iterator: Iterator[Record] = {
      // the underlying reader signals the end of the file by returning null
      val remaining = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt)
      remaining.map { struct =>
        val record = Record(struct, path, offset)
        offset += 1
        record
      }
    }

    override def close(): Unit = reader.close()
  }
}
Example 15
Source File: RedisStreamTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.redis.sink.writer

/*
 * Copyright 2017 Datamountaineer.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util

import com.datamountaineer.streamreactor.connect.redis.sink.RedisSinkTask
import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.{Jedis, StreamEntryID}

import scala.collection.JavaConverters._

/** Checks that the Redis stream writer accepts a Kafka Connect record, using a mocked Jedis client. */
class RedisStreamTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {
//
//  val redisServer = new RedisServer(6379)
//
//  override def beforeAll() = redisServer.start()
//
//  override def afterAll() = redisServer.stop()

  "Redis Stream writer" should {

    "write Kafka records to a Redis Stream" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"INSERT INTO stream1 SELECT * from $TOPIC STOREAS STREAM"
      println("Testing KCQL : " + KCQL)

      // connector configuration pointing at a local redis with an empty password
      val props = Map(
        RedisConfigConstants.REDIS_HOST -> "localhost",
        RedisConfigConstants.REDIS_PORT -> "6379",
        RedisConfigConstants.KCQL_CONFIG -> KCQL,
        RedisConfigConstants.REDIS_PASSWORD -> ""
      ).asJava

      val config = RedisConfig(props)
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisStreams(settings)

      // a single CPU reading used as the record value
      val cpuSchema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA)
        .build()

      val cpuStruct = new Struct(cpuSchema)
      cpuStruct.put("type", "Xeon")
      cpuStruct.put("temperature", 60.4)
      cpuStruct.put("voltage", 90.1)
      cpuStruct.put("ts", 1482180657010L)

      val cpuRecord = new SinkRecord(TOPIC, 0, null, null, cpuSchema, cpuStruct, 1)

      // replace the real client with a mock
      val jedis = mock[Jedis]
      writer.jedis = jedis

      // the stream entry stubbed on the mock: every field rendered as a string
      val expectedEntry = new util.HashMap[String, String]()
      Seq(
        "type" -> "Xeon",
        "temperature" -> "60.4",
        "voltage" -> "90.1",
        "ts" -> 1482180657010L.toString
      ).foreach { case (field, rendered) => expectedEntry.put(field, rendered) }

      when(jedis.auth("")).isLenient()
      when(jedis.xadd("stream1", null, expectedEntry)).thenReturn(mock[StreamEntryID])
      writer.initialize(1, settings.errorPolicy)
      writer.write(Seq(cpuRecord))
    }
  }
}
Example 16
Source File: RedisPubSubTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.{Jedis, JedisPubSub}
import redis.embedded.RedisServer

import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer

/**
  * Spec for the `RedisPubSub` writer, run against an embedded Redis server.
  *
  * The single test publishes three records through the writer and checks the JSON payload
  * collected by a Jedis subscriber on the channels "Xeon", "i7" and "i7-i".
  */
class RedisPubSubTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {

  // Embedded Redis instance bound to port 6379; started/stopped once for the whole suite.
  val redisServer = new RedisServer(6379)

  override def beforeAll() = redisServer.start()

  override def afterAll() = redisServer.stop()

  "Redis PUBSUB writer" should {

    "write Kafka records to a Redis PubSub" in {

      val TOPIC = "cpuTopic"
      // presumably `channel=type` makes the value of the "type" field the target channel;
      // the assertions at the end of the test are consistent with that — confirm against the KCQL docs.
      val KCQL = s"SELECT * from $TOPIC STOREAS PubSub (channel=type)"
      println("Testing KCQL : " + KCQL)
      val props = Map(
        RedisConfigConstants.REDIS_HOST->"localhost",
        RedisConfigConstants.REDIS_PORT->"6379",
        RedisConfigConstants.KCQL_CONFIG->KCQL
      ).asJava

      val config = RedisConfig(props)
      // Only used below to open a second, direct Jedis connection for the subscriber.
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisPubSub(settings)
      writer.createClient(settings)

      val schema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA).build()

      // Three payloads, each with a distinct "type" value.
      val struct1 = new Struct(schema).put("type", "Xeon").put("temperature", 60.4).put("voltage", 90.1).put("ts", 1482180657010L)
      val struct2 = new Struct(schema).put("type", "i7").put("temperature", 62.1).put("voltage", 103.3).put("ts", 1482180657020L)
      val struct3 = new Struct(schema).put("type", "i7-i").put("temperature", 64.5).put("voltage", 101.1).put("ts", 1482180657030L)

      // Records on partition 0 with offsets 1..3 and no key.
      val sinkRecord1 = new SinkRecord(TOPIC, 0, null, null, schema, struct1, 1)
      val sinkRecord2 = new SinkRecord(TOPIC, 0, null, null, schema, struct2, 2)
      val sinkRecord3 = new SinkRecord(TOPIC, 0, null, null, schema, struct3, 3)

      val jedis = new Jedis(connectionInfo.host, connectionInfo.port)
      // Clean up in-memory jedis
      jedis.flushAll()

      // Messages received by the subscriber, grouped by channel name.
      // NOTE(review): written from the subscriber callback and read by the test body with no
      // synchronization visible here — confirm this is safe enough for the test.
      val messagesMap = collection.mutable.Map[String, ListBuffer[String]]()

      // Background thread that subscribes to the three channels on the direct Jedis connection.
      val t = new Thread {
        private val pubsub = new JedisPubSub {
          // Appends each received message to the buffer of its channel, creating the buffer on first use.
          override def onMessage(channel: String, message: String): Unit = {
            messagesMap.get(channel) match {
              case Some(msgs) => messagesMap.put(channel, msgs += message)
              case None => messagesMap.put(channel, ListBuffer(message))
            }
          }
        }

        override def run(): Unit = {
          jedis.subscribe(pubsub, "Xeon", "i7", "i7-i")
        }

        // NOTE(review): this issues a pattern unsubscribe ("*") although run() used a plain channel
        // `subscribe` — verify that this is the intended way to end the subscription.
        override def interrupt(): Unit = {
          pubsub.punsubscribe("*")
          super.interrupt()
        }
      }
      t.start()
      // Wait up to 5 seconds for the subscriber thread; if it is still running, interrupt it
      // before the records are written.
      t.join(5000)
      if (t.isAlive) t.interrupt()

      writer.write(Seq(sinkRecord1))
      writer.write(Seq(sinkRecord2, sinkRecord3))

      // One entry per channel is expected.
      messagesMap.size shouldBe 3

      // The first message on each channel must be the JSON rendering of the matching record.
      messagesMap("Xeon").head shouldBe """{"type":"Xeon","temperature":60.4,"voltage":90.1,"ts":1482180657010}"""
      messagesMap("i7").head shouldBe """{"type":"i7","temperature":62.1,"voltage":103.3,"ts":1482180657020}"""
      messagesMap("i7-i").head shouldBe """{"type":"i7-i","temperature":64.5,"voltage":101.1,"ts":1482180657030}"""
    }
  }
}
Example 17
Source File: RedisInsertSortedSetTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.Jedis
import redis.embedded.RedisServer

import scala.collection.JavaConverters._

/**
  * Spec for the `RedisInsertSortedSet` writer, run against an embedded Redis server.
  *
  * Three CPU readings are written through the writer into the sorted set `cpu_stats`
  * (scored by the `ts` field) and then read back with a direct Jedis connection.
  */
class RedisInsertSortedSetTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {

  // Embedded Redis instance shared by the whole suite.
  val redisServer = new RedisServer(6379)

  override def beforeAll() = redisServer.start()

  override def afterAll() = redisServer.stop()

  "Redis INSERT into Sorted Set (SS) writer" should {

    "write Kafka records to a Redis Sorted Set" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"INSERT INTO cpu_stats SELECT * from $TOPIC STOREAS SortedSet(score=ts)"
      println("Testing KCQL : " + KCQL)

      val connectorProps = Map(
        RedisConfigConstants.REDIS_HOST -> "localhost",
        RedisConfigConstants.REDIS_PORT -> "6379",
        RedisConfigConstants.KCQL_CONFIG -> KCQL
      ).asJava

      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(RedisConfig(connectorProps))
      val writer = new RedisInsertSortedSet(settings)
      writer.createClient(settings)

      val cpuSchema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA)
        .build()

      // (type, temperature, voltage, ts) — one row per record, in offset order.
      val readings = List(
        ("Xeon", 60.4, 90.1, 1482180657010L),
        ("i7", 62.1, 103.3, 1482180657020L),
        ("i7-i", 64.5, 101.1, 1482180657030L)
      )

      // Records on partition 0 with no key; offsets start at 1.
      val sinkRecords = readings.zipWithIndex.map { case ((cpuType, temperature, voltage, ts), position) =>
        val payload = new Struct(cpuSchema)
          .put("type", cpuType)
          .put("temperature", temperature)
          .put("voltage", voltage)
          .put("ts", ts)
        new SinkRecord(TOPIC, 0, null, null, cpuSchema, payload, (position + 1).toLong)
      }

      // Direct connection used to reset Redis and to inspect what the writer stored.
      val jedis = new Jedis(connectionInfo.host, connectionInfo.port)
      jedis.flushAll()

      // First a single-record batch, then the remaining two records together.
      writer.write(sinkRecords.take(1))
      writer.write(sinkRecords.drop(1))

      // Redis cardinality should now be 3
      jedis.zcard("cpu_stats") shouldBe 3

      val stored = jedis.zrange("cpu_stats", 0, 999999999999L).asScala.toList
      val expectedInScoreOrder = List(
        """{"type":"Xeon","temperature":60.4,"voltage":90.1,"ts":1482180657010}""",
        """{"type":"i7","temperature":62.1,"voltage":103.3,"ts":1482180657020}""",
        """{"type":"i7-i","temperature":64.5,"voltage":101.1,"ts":1482180657030}"""
      )
      expectedInScoreOrder.zipWithIndex.foreach { case (json, position) =>
        stored(position) shouldBe json
      }
    }
  }
}
Example 18
Source File: RedisFieldsKeyBuilder.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.rowkeys.StringKeyBuilder
import org.apache.kafka.connect.data.{Field, Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.annotation.tailrec
import scala.collection.JavaConverters._


  /**
    * Builds the row key for a record by joining the values of the configured `keys`
    * with `pkDelimiter`.
    *
    * A key may be a dotted path (`parent.child`) addressing a field of a nested Struct.
    *
    * @param record sink record whose value is cast to a Struct
    * @return the selected field values joined by `pkDelimiter`
    * @throws IllegalArgumentException if a configured key is not part of the value schema
    *                                  (raised by `require`) or if the addressed value is null
    */
  override def build(record: SinkRecord): String = {
    val payload: Struct = record.value.asInstanceOf[Struct]

    // All addressable paths: direct field names first, then the dotted paths of nested structs.
    def fieldPaths(schema: Schema): Seq[String] =
      schema.`type` match {
        case Schema.Type.STRUCT =>
          val children = schema.fields.asScala
          val direct = children.map(_.name)
          val nested = children.flatMap { child =>
            fieldPaths(child.schema).map(path => child.name + "." + path)
          }
          direct ++ nested
        case _ => Seq.empty
      }

    val availableFields = fieldPaths(payload.schema)
    val missingKeys = keys.filterNot(availableFields.contains)
    require(
      missingKeys.isEmpty,
      s"${missingKeys.mkString(",")} keys are not present in the SinkRecord payload: ${availableFields.mkString(", ")}"
    )

    // Walks the path one segment at a time; stops as soon as the current object is neither
    // a Field nor a Struct, or the path is exhausted. A null object yields None.
    @tailrec
    def descend(remaining: List[String], current: AnyRef): Option[AnyRef] =
      (current, remaining) match {
        case (field: Field, segment :: rest) => descend(rest, field.schema.field(segment))
        case (struct: Struct, segment :: rest) => descend(rest, struct.get(segment))
        case (other, _) => Option(other)
      }

    def valueOf(key: String): AnyRef =
      descend(key.split('.').toList, payload) match {
        case Some(value) => value
        case None =>
          throw new IllegalArgumentException(
            s"$key field value is null. Non null value is required for the fields creating the row key"
          )
      }

    keys.map(valueOf).mkString(pkDelimiter)
  }
} 
Example 19
Source File: RedisGeoAdd.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.kcql.Kcql
import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisKCQLSetting, RedisSinkSettings}
import com.datamountaineer.streamreactor.connect.schemas.StructFieldsExtractor
import org.apache.kafka.connect.data.Struct
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._
import scala.util.Try
import scala.util.control.Exception.allCatch

/**
  * Redis writer that stores every sink record with the GEOADD command.
  *
  * For each record and each KCQL statement whose source is the record's topic, the Redis key
  * is the (optional) KCQL target followed by the primary-key values joined with ":", the member
  * is the JSON rendering of the projected record, and the coordinates are read from the
  * longitude/latitude fields resolved through `GeoAddSupport`.
  *
  * @param sinkSettings sink configuration providing the KCQL settings
  */
class RedisGeoAdd(sinkSettings: RedisSinkSettings) extends RedisWriter with GeoAddSupport {

  val configs: Set[Kcql] = sinkSettings.kcqlSettings.map(_.kcqlConfig)

  // Validate every KCQL statement at construction time.
  for (kcql <- configs) {
    assert(kcql.getSource.trim.nonEmpty, "You need to supply a valid source kafka topic to fetch records from. Review your KCQL syntax")
    assert(kcql.getPrimaryKeys.asScala.nonEmpty, "The Redis GeoAdd mode requires at least 1 PK (Primary Key) to be defined")
    assert(kcql.getStoredAs.equalsIgnoreCase("GeoAdd"), "The Redis GeoAdd mode requires the KCQL syntax: STOREAS GeoAdd")
  }

  /** Writes a sequence of SinkRecords to Redis, grouped by topic. */
  override def write(records: Seq[SinkRecord]): Unit = {
    if (records.nonEmpty) {
      logger.debug(s"'GeoAdd' Redis writer received ${records.size} records")
      insert(records.groupBy(_.topic))
    } else {
      logger.debug("No records received on 'GeoAdd' Redis writer")
    }
  }

  /**
    * Inserts a batch of sink records.
    *
    * @param records records keyed by their topic; each topic is processed inside its own `Try`
    *                whose outcome is passed to `handleTry`
    */
  def insert(records: Map[String, Seq[SinkRecord]]): Unit = {

    // Issues one GEOADD for the given record/KCQL pair, or logs a warning and skips the record
    // when either coordinate cannot be parsed as a double.
    def geoAdd(record: SinkRecord, setting: RedisKCQLSetting): Unit = {
      val kcql = setting.kcqlConfig
      val primaryKeys = kcql.getPrimaryKeys.asScala

      val extractor = StructFieldsExtractor(includeAllFields = false, primaryKeys.map(f => f.getName -> f.getName).toMap)
      val fieldsAndValues = extractor.get(record.value.asInstanceOf[Struct]).toMap
      val pkValue = primaryKeys.map(pk => fieldsAndValues(pk.getName).toString).mkString(":")

      // Use the target (and optionally the prefix) to name the GeoAdd key
      val key = Option(kcql.getTarget).map(_.trim).getOrElse("") + pkValue

      val recordToSink = convert(record, fields = setting.fieldsAndAliases, ignoreFields = setting.ignoredFields)
      val payload = convertValueToJson(recordToSink)

      val longitude = getFieldValue(record, getLongitudeField(kcql))
      val latitude = getFieldValue(record, getLatitudeField(kcql))

      if (!(isDoubleNumber(longitude) && isDoubleNumber(latitude))) {
        logger.warn(s"GeoAdd record contains invalid longitude=$longitude and latitude=$latitude values, " +
          s"Record with key ${record.key} is skipped")
      } else {
        logger.debug(s"GEOADD $key longitude=$longitude latitude=$latitude payload = ${payload.toString}")
        val response = jedis.geoadd(key, longitude.toDouble, latitude.toDouble, payload.toString)

        if (response == 1) {
          logger.debug("New element added")
        } else if (response == 0) {
          logger.debug("The element was already a member of the sorted set and the score was updated")
        }
      }
    }

    for ((topic, sinkRecords) <- records) {
      val topicSettings: Set[RedisKCQLSetting] = sinkSettings.kcqlSettings.filter(_.kcqlConfig.getSource == topic)
      if (topicSettings.isEmpty) {
        logger.warn(s"Received a batch for topic $topic - but no KCQL supports it")
      }

      // Any failure inside the batch is captured and handed to the error handler.
      val outcome = Try {
        for {
          record <- sinkRecords
          setting <- topicSettings
        } geoAdd(record, setting)
      }
      handleTry(outcome)

      logger.debug(s"Wrote ${sinkRecords.size} rows for topic $topic")
    }
  }

  /**
    * Reads a string field from the record's Struct value.
    *
    * @param record    sink record whose value is cast to a Struct
    * @param fieldName name of the field to read with `getString`
    */
  def getFieldValue(record: SinkRecord, fieldName: String): String =
    record.value().asInstanceOf[Struct].getString(fieldName)

  /** True when `s.toDouble` completes without throwing. */
  def isDoubleNumber(s: String): Boolean = allCatch.opt(s.toDouble).isDefined
}
Example 20
Source File: PulsarWriterTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.pulsar.sink

import com.datamountaineer.streamreactor.connect.pulsar.ProducerConfigFactory
import com.datamountaineer.streamreactor.connect.pulsar.config.{PulsarConfigConstants, PulsarSinkConfig, PulsarSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.pulsar.client.api.{Message, MessageId, Producer, PulsarClient}
import org.mockito.ArgumentMatchers.any
import org.mockito.MockitoSugar
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._


/**
  * Spec for `PulsarWriter` using mocked Pulsar client, producer and message id:
  * a single record with one field of each primitive type is pushed through the writer.
  */
class PulsarWriterTest extends AnyWordSpec with MockitoSugar with Matchers {
  val pulsarTopic = "persistent://landoop/standalone/connect/kafka-topic"

  /** Struct schema with one field per primitive type; `int8` carries a default value and a doc string. */
  def getSchema: Schema = {
    val int8WithDefault = SchemaBuilder.int8().defaultValue(2.toByte).doc("int8 field").build()
    val fields: Seq[(String, Schema)] = Seq(
      "int8" -> int8WithDefault,
      "int16" -> Schema.INT16_SCHEMA,
      "int32" -> Schema.INT32_SCHEMA,
      "int64" -> Schema.INT64_SCHEMA,
      "float32" -> Schema.FLOAT32_SCHEMA,
      "float64" -> Schema.FLOAT64_SCHEMA,
      "boolean" -> Schema.BOOLEAN_SCHEMA,
      "string" -> Schema.STRING_SCHEMA
    )
    fields
      .foldLeft(SchemaBuilder.struct()) { case (builder, (name, fieldSchema)) => builder.field(name, fieldSchema) }
      .build()
  }

  /** Struct for the given schema with a fixed sample value in every field. */
  def getStruct(schema: Schema): Struct = {
    val values: Seq[(String, AnyRef)] = Seq(
      "int8" -> Byte.box(12.toByte),
      "int16" -> Short.box(12.toShort),
      "int32" -> Int.box(12),
      "int64" -> Long.box(12L),
      "float32" -> Float.box(12.2f),
      "float64" -> Double.box(12.2),
      "boolean" -> Boolean.box(true),
      "string" -> "foo"
    )
    values.foldLeft(new Struct(schema)) { case (struct, (name, value)) => struct.put(name, value) }
  }

  "should write messages" in {

    val kcql = s"INSERT INTO $pulsarTopic SELECT * FROM kafka_topic BATCH = 10 WITHPARTITIONER = SinglePartition WITHCOMPRESSION = ZLIB WITHDELAY = 1000"
    val sinkProps = Map(
      PulsarConfigConstants.HOSTS_CONFIG -> "pulsar://localhost:6650",
      PulsarConfigConstants.KCQL_CONFIG -> kcql
    )
    val config = PulsarSinkConfig(sinkProps.asJava)

    val valueSchema = getSchema
    val sinkRecord = new SinkRecord("kafka_topic", 0, null, null, valueSchema, getStruct(valueSchema), 1)

    val settings = PulsarSinkSettings(config)
    val producerConfig = ProducerConfigFactory("test", settings.kcql)

    // Mocked Pulsar side: the client hands out the producer, the producer acknowledges every send.
    val pulsarClient = mock[PulsarClient]
    val pulsarProducer = mock[Producer]
    val ackId = mock[MessageId]

    when(pulsarClient.createProducer(pulsarTopic, producerConfig(pulsarTopic))).thenReturn(pulsarProducer)
    when(pulsarProducer.send(any[Message])).thenReturn(ackId)

    PulsarWriter(pulsarClient, "test", settings).write(List(sinkRecord))
  }
}
Example 21
Source File: ChangeFeedStructBuilder.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.rethink.source

import com.fasterxml.jackson.databind.ObjectMapper
import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}



/**
  * Builds Kafka Connect Structs for change-feed entries using the fixed
  * `ReThinkChangeFeed` schema (four optional string fields).
  */
object ChangeFeedStructBuilder extends StrictLogging {

  val mapper = new ObjectMapper()

  // Field names of the change-feed schema.
  val oldVal = "old_val"
  val newVal = "new_val"
  val state = "state"
  val `type` = "type"

  /** Version 1 of the schema; every field is an optional string. Field order: state, old_val, new_val, type. */
  val schema: Schema =
    Seq(state, oldVal, newVal, `type`)
      .foldLeft(SchemaBuilder.struct.name("ReThinkChangeFeed").version(1)) { (builder, fieldName) =>
        builder.field(fieldName, Schema.OPTIONAL_STRING_SCHEMA)
      }
      .build

  /**
    * Copies the non-null entries of `hm` into a new Struct, storing each value's `toString`.
    *
    * @param hm field name to value; entries with a null value are left unset
    * @return a Struct bound to [[schema]]
    */
  def apply(hm: Map[String, Object]): Struct = {
    val struct = new Struct(schema)
    for ((fieldName, value) <- hm if value != null) {
      struct.put(fieldName, value.toString)
    }
    struct
  }
}
Example 22
Source File: ParquetWriterTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.parquet

import com.landoop.streamreactor.connect.hive.StructUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

/**
  * Round-trip spec for the parquet writer/reader helpers: structs are written to a local
  * parquet file, read back, and compared by their extracted values.
  */
class ParquetWriterTest extends AnyWordSpec with Matchers {

  implicit val conf = new Configuration()
  implicit val fs = FileSystem.getLocal(conf)

  // Schema shared by both tests: required `name`, optional `title` and `salary`.
  private def newUserSchema() =
    SchemaBuilder.struct()
      .field("name", SchemaBuilder.string().required().build())
      .field("title", SchemaBuilder.string().optional().build())
      .field("salary", SchemaBuilder.float64().optional().build())
      .build()

  // Writes `users` to sinktest.parquet, reads everything back, asserts that the values match,
  // and finally deletes the file (the delete is only reached when the assertion passes).
  private def assertRoundTrip(schema: org.apache.kafka.connect.data.Schema, users: List[Struct]): Unit = {
    val path = new Path("sinktest.parquet")

    val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
    users.foreach(writer.write)
    writer.close()

    val reader = parquetReader(path)
    // The reader signals end of file by returning null.
    val actual = Iterator.continually(reader.read).takeWhile(_ != null).toList
    reader.close()

    actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues)

    fs.delete(path, false)
  }

  "ParquetWriter" should {
    "write parquet files" in {
      val schema = newUserSchema()

      val sam = new Struct(schema).put("name", "sam").put("title", "mr").put("salary", 100.43)
      val laura = new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06)

      assertRoundTrip(schema, List(sam, laura))
    }
    "support writing nulls" in {
      val schema = newUserSchema()

      // `title` is explicitly set to null for the first user.
      val sam = new Struct(schema).put("name", "sam").put("title", null).put("salary", 100.43)
      val laura = new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06)

      assertRoundTrip(schema, List(sam, laura))
    }
  }
}
Example 23
Source File: VoltDbWriter.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.voltdb.writers

import com.datamountaineer.streamreactor.connect.errors.ErrorHandler
import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import com.datamountaineer.streamreactor.connect.sink.DbWriter
import com.datamountaineer.streamreactor.connect.voltdb.config.VoltSettings
import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.Struct
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException
import org.voltdb.client.{ClientConfig, ClientFactory}

import scala.util.Try

/**
  * Sink writer that pushes Struct records into VoltDB by calling the `<table>.insert` or
  * `<table>.upsert` procedure of the table configured for the record's topic.
  *
  * On construction it registers the error policy, connects the client and reads the
  * parameter list of every configured target table.
  *
  * @param settings sink settings (credentials, retry/error policy, per-topic field extractors)
  */
class VoltDbWriter(settings: VoltSettings) extends DbWriter with StrictLogging with ConverterUtil with ErrorHandler {

  //ValidateStringParameterFn(settings.servers, "settings")
  //ValidateStringParameterFn(settings.user, "settings")

  //initialize error tracker
  initialize(settings.maxRetries, settings.errorPolicy)

  private val voltConfig = new ClientConfig(settings.user, settings.password)
  private val client = ClientFactory.createClient(voltConfig)
  VoltConnectionConnectFn(client, settings)

  // Target table -> procedure name and its expected (upper-cased) parameter names.
  private val proceduresMap = (for (extract <- settings.fieldsExtractorMap.values) yield {
    val operation = if (extract.isUpsert) "upsert" else "insert"
    val procName = s"${extract.targetTable}.$operation"
    logger.info(s"Retrieving the metadata for $procName ...")
    val fields = VoltDbMetadataReader.getProcedureParameters(client, extract.targetTable).map(_.toUpperCase)
    logger.info(s"$procName expected arguments are: ${fields.mkString(",")}")
    extract.targetTable -> ProcAndFields(procName, fields)
  }).toMap

  /**
    * Inserts every record with a non-null value; the outcome of the whole batch is passed
    * to the error handler.
    */
  override def write(records: Seq[SinkRecord]): Unit = {
    if (records.nonEmpty) {
      val outcome = Try {
        for (record <- records if record.value() != null) insert(record)
      }
      outcome.foreach(_ => logger.info("Writing complete"))
      handleTry(outcome)
    } else {
      logger.debug("No records received.")
    }
  }

  // Calls the table procedure for one record. Fails via `require` when the value is not a
  // Struct and with a ConfigException when the topic has no configured extractor.
  private def insert(record: SinkRecord) = {
    require(record.value().getClass == classOf[Struct], "Only Struct payloads are handled")

    val topic = record.topic()
    val extractor = settings.fieldsExtractorMap.get(topic) match {
      case Some(configured) => configured
      case None =>
        throw new ConfigException(s"$topic is not handled by the configuration:${settings.fieldsExtractorMap.keys.mkString(",")}")
    }

    // Field names are upper-cased, like the procedure parameter names in proceduresMap.
    val upperCasedValues = extractor.get(record.value().asInstanceOf[Struct]).map { case (k, v) => (k.toUpperCase, v) }
    logger.info(upperCasedValues.mkString(","))

    val target: ProcAndFields = proceduresMap(extractor.targetTable)
    //get the list of arguments to pass to the table insert/upsert procedure. if the procedure expects a field and is
    //not present in the incoming SinkRecord it would use null
    //No table evolution is supported yet
    val arguments: Array[String] = PrepareProcedureFieldsFn(target.fields, upperCasedValues).toArray
    logger.info(s"Calling procedure:${target.procName} with parameters:${target.fields.mkString(",")} with arguments:${arguments.mkString(",")}")

    client.callProcedure(target.procName, arguments: _*)
  }

  override def close(): Unit = client.close()

  // Procedure name together with the ordered names of its parameters.
  private case class ProcAndFields(procName: String, fields: Seq[String])

}
Example 24
Source File: StructFieldExtractorTest.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.schemas

import org.apache.kafka.connect.data.{Date, Schema, SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

/**
  * Spec for `StructFieldsExtractor`: extraction of all fields, aliasing, field selection
  * and handling of a logical Date field.
  */
class StructFieldExtractorTest extends AnyWordSpec with Matchers {

  // Builder holding the three fields shared by every test; callers append their own fourth field.
  private def personFields: SchemaBuilder =
    SchemaBuilder.struct().name("com.example.Person")
      .field("firstName", Schema.STRING_SCHEMA)
      .field("lastName", Schema.STRING_SCHEMA)
      .field("age", Schema.INT32_SCHEMA)

  // Person schema with an optional `threshold` field, which the tests leave unset.
  private def personWithThreshold: Schema =
    personFields.field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

  // Struct for "Alex Smith", aged 30, bound to the given schema.
  private def alexSmith(schema: Schema): Struct =
    new Struct(schema)
      .put("firstName", "Alex")
      .put("lastName", "Smith")
      .put("age", 30)

  "StructFieldExtractor" should {
    "return all the fields and their bytes value" in {
      val struct = alexSmith(personWithThreshold)

      val extracted = new StructFieldsExtractor(true, Map.empty).get(struct).toMap

      extracted("firstName") shouldBe "Alex"
      extracted("lastName") shouldBe "Smith"
      extracted("age") shouldBe 30
    }

    "return all fields and apply the mapping" in {
      val struct = alexSmith(personWithThreshold)

      val aliases = Map("lastName" -> "Name", "age" -> "a")
      val extracted = new StructFieldsExtractor(true, aliases).get(struct).toMap

      extracted("firstName") shouldBe "Alex"
      extracted("Name") shouldBe "Smith"
      extracted("a") shouldBe 30
    }

    "return only the specified fields" in {
      val struct = alexSmith(personWithThreshold)

      val selection = Map("lastName" -> "Name", "age" -> "age")
      val extracted = new StructFieldsExtractor(false, selection).get(struct).toMap

      extracted("Name") shouldBe "Smith"
      extracted("age") shouldBe 30

      extracted.size shouldBe 2
    }
  }

  "handle Date fieldds" in {
    val dateSchema = Date.builder().build()
    val struct = alexSmith(personFields.field("date", dateSchema).build())

    // Extracts only the `date` field with a fresh extractor on every call.
    def extractDate() = new StructFieldsExtractor(false, Map("date" -> "date")).get(struct).toMap

    val sqlDate = java.sql.Date.valueOf("2017-04-25")
    struct.put("date", sqlDate)

    val first = extractDate()
    first("date") shouldBe sqlDate
    first.size shouldBe 1

    // Overwrite the field with a value produced by the logical type itself.
    val logicalDate = Date.toLogical(dateSchema, 10000)
    struct.put("date", logicalDate)

    val second = extractDate()
    second("date") shouldBe logicalDate
    second.size shouldBe 1
  }

}
Example 25
Source File: TestUtilsBase.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect

import java.util
import java.util.Collections

import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.kafka.connect.source.SourceTaskContext
import org.apache.kafka.connect.storage.OffsetStorageReader
import org.mockito.Mockito._
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfter
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._



    //set up partition
    val partition: util.Map[String, String] = Collections.singletonMap(lookupPartitionKey, table)
    //as a list to search for
    val partitionList: util.List[util.Map[String, String]] = List(partition).asJava
    //set up the offset
    val offset: util.Map[String, Object] = (Collections.singletonMap(offsetColumn,offsetValue ))
    //create offsets to initialize from
    val offsets :util.Map[util.Map[String, String],util.Map[String, Object]] = Map(partition -> offset).asJava

    //mock out reader and task context
    val taskContext = mock[SourceTaskContext]
    val reader = mock[OffsetStorageReader]
    when(reader.offsets(partitionList)).thenReturn(offsets)
    when(taskContext.offsetStorageReader()).thenReturn(reader)

    taskContext
  }
} 
Example 26
Source File: JsonConverterWithSchemaEvolutionTest.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.converters.source

import java.util.Collections

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import com.sksamuel.avro4s.{RecordFormat, SchemaFor}
import io.confluent.connect.avro.AvroData
import org.apache.avro.Schema
import org.apache.kafka.connect.data.Struct
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

/**
  * Spec for `JsonConverterWithSchemaEvolution`: a null payload is rejected, and a simple JSON
  * document is converted into a record whose key, value schema, value and source offset are checked.
  */
class JsonConverterWithSchemaEvolutionTest extends AnyWordSpec with Matchers {
  val topic = "the_real_topic"
  val sourceTopic = "source_topic"
  val avroData = new AvroData(4)

  "JsonConverter" should {
    "throw IllegalArgumentException if payload is null" in {
      intercept[IllegalArgumentException] {
        new JsonConverterWithSchemaEvolution().convert("topic", "somesource", "1000", null)
      }
    }

    "handle a simple json" in {
      val payload = JacksonJson.toJson(Car("LaFerrari", "Ferrari", 2015, 963, 0.0001)).getBytes
      val record = new JsonConverterWithSchemaEvolution().convert(topic, sourceTopic, "100", payload)

      // The key carries the source topic and the message id.
      record.keySchema() shouldBe MsgKey.schema
      val key = record.key().asInstanceOf[Struct]
      key.getString("topic") shouldBe sourceTopic
      key.getString("id") shouldBe "100"

      // Expected Avro schema: the one derived for CarOptional, renamed to the source topic
      // and with its namespace attribute removed.
      val namespaceAttribute =
        s""",\"namespace\":\"${getClass.getCanonicalName.dropRight(getClass.getSimpleName.length+1)}\""""
      val expectedSchemaJson = SchemaFor[CarOptional]().toString
        .replace("\"name\":\"CarOptional\"", s"""\"name\":\"$sourceTopic\"""")
        .replace(namespaceAttribute, "")
      val schema = new Schema.Parser().parse(expectedSchemaJson)

      val expectedAvro = RecordFormat[CarOptional].to(
        CarOptional(Option("LaFerrari"), Option("Ferrari"), Option(2015), Option(963), Option(0.0001))
      )

      val expectedConnectSchema = avroData.toConnectSchema(schema)
      record.valueSchema() shouldBe expectedConnectSchema

      record.value() shouldBe avroData.toConnectData(schema, expectedAvro).value()
      record.sourcePartition() shouldBe null

      val expectedOffset = Collections.singletonMap(
        JsonConverterWithSchemaEvolution.ConfigKey,
        avroData.fromConnectSchema(expectedConnectSchema).toString()
      )
      record.sourceOffset() shouldBe expectedOffset
    }
  }
}


/**
  * Test fixture: a car with all fields mandatory. The spec in this file serializes an
  * instance to JSON and feeds the bytes to the converter under test.
  */
case class Car(name: String,
               manufacturer: String,
               model: Long,
               bhp: Long,
               price: Double)


/**
  * Test fixture with the same field names as `Car`, each wrapped in `Option`. The spec in this
  * file derives the expected Avro schema and the expected record value from this type.
  */
case class CarOptional(name: Option[String],
                       manufacturer: Option[String],
                       model: Option[Long],
                       bhp: Option[Long],
                       price: Option[Double]) 
Example 27
Source File: StringStructFieldsStringKeyBuilderTest.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.sink

import com.datamountaineer.streamreactor.connect.rowkeys.StringStructFieldsStringKeyBuilder
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec


/**
  * Spec for `StringStructFieldsStringKeyBuilder`: key construction from one or several
  * struct fields, and rejection of a key field that has no value.
  */
class StringStructFieldsStringKeyBuilderTest extends AnyWordSpec with Matchers {

  // Sink record for "Alex", aged 30, with the optional `threshold` field left unset.
  // The schema used for `firstName` can be overridden by the caller.
  private def alexRecord(firstNameSchema: Schema = Schema.STRING_SCHEMA): SinkRecord = {
    val personSchema = SchemaBuilder.struct().name("com.example.Person")
      .field("firstName", firstNameSchema)
      .field("age", Schema.INT32_SCHEMA)
      .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA)
      .build()

    val person = new Struct(personSchema).put("firstName", "Alex").put("age", 30)

    new SinkRecord("sometopic", 1, null, null, personSchema, person, 1)
  }

  "StructFieldsStringKeyBuilder" should {
    "raise an exception if the field is not present in the struct" in {
      intercept[IllegalArgumentException] {
        // The record is built inside the intercepted block, together with the key builder call.
        val sinkRecord = alexRecord()
        StringStructFieldsStringKeyBuilder(Seq("threshold")).build(sinkRecord)
      }
    }

    "create the row key based on one single field in the struct" in {
      val keyBuilder = StringStructFieldsStringKeyBuilder(Seq("firstName"))
      keyBuilder.build(alexRecord()) shouldBe "Alex"
    }

    "create the row key based on one single field with doc in the struct" in {
      val documentedFirstName = SchemaBuilder.`type`(Schema.Type.STRING).doc("first name")
      val sinkRecord = alexRecord(documentedFirstName)

      StringStructFieldsStringKeyBuilder(Seq("firstName")).build(sinkRecord) shouldBe "Alex"
    }

    "create the row key based on more thant one field in the struct" in {
      val keyBuilder = StringStructFieldsStringKeyBuilder(Seq("firstName", "age"))
      keyBuilder.build(alexRecord()) shouldBe "Alex.30"
    }
  }
}
Example 28
Source File: AvroConverter.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.converters.source

import java.io.File
import java.util.Collections

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import io.confluent.connect.avro.AvroData
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import org.apache.avro.{Schema => AvroSchema}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.source.SourceRecord
import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException


/**
  * Converts Avro binary payloads into Connect [[SourceRecord]]s.
  *
  * The Avro schema (and a matching datum reader) for every source is loaded in
  * [[initialize]] from the [[AvroConverter.SCHEMA_CONFIG]] setting and is keyed by the
  * lower-cased source name, so all lookups in this class lower-case the source first.
  */
class AvroConverter extends Converter {
  private val avroData = new AvroData(8)
  private var sourceToSchemaMap: Map[String, AvroSchema] = Map.empty
  private var avroReadersMap: Map[String, GenericDatumReader[GenericRecord]] = Map.empty

  /**
    * Builds a [[SourceRecord]] for one incoming message.
    *
    * @param kafkaTopic   the Kafka topic the record is destined for
    * @param sourceTopic  the source the message came from; used (lower-cased) to find the Avro schema
    * @param messageId    identifier used for the default record key
    * @param bytes        the Avro binary payload; `null` yields a record with a null value
    * @param keys         field paths (dot separated) used to build a string key from the payload
    * @param keyDelimiter separator placed between the extracted key values
    * @return the record carrying the decoded payload and its Connect schema
    * @throws ConfigException when no schema has been configured for `sourceTopic`
    */
  override def convert(kafkaTopic: String,
                       sourceTopic: String,
                       messageId: String,
                       bytes: Array[Byte],
                       keys: Seq[String] = Seq.empty,
                       keyDelimiter: String = "."): SourceRecord = {
    // Schemas and readers are registered under the lower-cased source name, so every
    // lookup (including the null-payload branch) has to use the same normalisation.
    val sourceKey = sourceTopic.toLowerCase

    def notConfigured: Nothing =
      throw new ConfigException(s"Invalid ${AvroConverter.SCHEMA_CONFIG} is not configured for $sourceTopic")

    Option(bytes) match {
      case None =>
        val avroSchema = sourceToSchemaMap.getOrElse(sourceKey, notConfigured)
        new SourceRecord(Collections.singletonMap(Converter.TopicKey, sourceTopic),
          null,
          kafkaTopic,
          avroData.toConnectSchema(avroSchema),
          null)
      case Some(_) =>
        val reader = avroReadersMap.getOrElse(sourceKey, notConfigured)
        val decoder = DecoderFactory.get().binaryDecoder(bytes, null)
        val record = reader.read(null, decoder)
        val schemaAndValue = avroData.toConnectData(sourceToSchemaMap(sourceKey), record)
        val value = schemaAndValue.value()
        value match {
          case s: Struct if keys.nonEmpty =>
            // Key values that extract to null are skipped rather than rendered as "null".
            val keysValue = keys.flatMap { key =>
              Option(KeyExtractor.extract(s, key.split('.').toVector)).map(_.toString)
            }.mkString(keyDelimiter)
            new SourceRecord(
              Collections.singletonMap(Converter.TopicKey, sourceTopic),
              null,
              kafkaTopic,
              Schema.STRING_SCHEMA,
              keysValue,
              schemaAndValue.schema(),
              schemaAndValue.value())
          case _ =>
            // No key fields requested (or the payload is not a Struct): use the default message key.
            new SourceRecord(
              Collections.singletonMap(Converter.TopicKey, sourceTopic),
              null,
              kafkaTopic,
              MsgKey.schema,
              MsgKey.getStruct(sourceTopic, messageId),
              schemaAndValue.schema(),
              schemaAndValue.value())
        }

    }
  }

  /**
    * Loads the Avro schemas named in `config` and creates one datum reader per source.
    *
    * @param config the converter configuration; must contain [[AvroConverter.SCHEMA_CONFIG]]
    */
  override def initialize(config: Map[String, String]): Unit = {
    sourceToSchemaMap = AvroConverter.getSchemas(config)
    avroReadersMap = sourceToSchemaMap.map { case (key, schema) =>
      key -> new GenericDatumReader[GenericRecord](schema)
    }
  }
}

object AvroConverter {
  /** Configuration key holding the `source=avro_schema_file` entries, separated by `;`. */
  val SCHEMA_CONFIG = "connect.source.converter.avro.schemas"

  /**
    * Parses the [[SCHEMA_CONFIG]] entry of `config` into a map of Avro schemas.
    *
    * The value is a `;` separated list of `source=path` pairs. Blank entries are ignored,
    * the source name is trimmed and lower-cased, and the path is trimmed before the file
    * is opened.
    *
    * @param config the converter configuration
    * @return the parsed Avro schema for every configured source, keyed by lower-cased source name
    * @throws ConfigException when the setting is missing, an entry is not a `source=path` pair,
    *                         the source name is blank, or the schema file does not exist
    */
  def getSchemas(config: Map[String, String]): Map[String, AvroSchema] = {
    config.getOrElse(SCHEMA_CONFIG, throw new ConfigException(s"$SCHEMA_CONFIG is not provided"))
      .split(';')
      .filter(_.trim.nonEmpty)
      .map(_.split("="))
      .map {
        case Array(source, rawPath) =>
          // Whitespace around '=' would otherwise become part of the file name.
          val path = rawPath.trim
          val file = new File(path)
          if (!file.exists()) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The file $path doesn't exist!")
          }
          val s = source.trim.toLowerCase()
          if (s.isEmpty) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The topic is not valid for entry containing $path")
          }
          s -> new AvroSchema.Parser().parse(file)
        case _ =>
          // Entries are split on '=', so that is the separator the message must advertise.
          throw new ConfigException(s"$SCHEMA_CONFIG is not properly set. The format is Mqtt_Source=AVRO_FILE")
      }.toMap
  }
}
Example 29
Source File: RowKeyBuilderString.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.rowkeys

import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._


  /**
    * Builds the row key by joining the string form of the configured key fields.
    *
    * Fails (via `require`) when a configured key is absent from the payload schema or when
    * a key field holds a null value. Key fields whose schema type is not one of
    * `availableSchemaTypes` are left out of the resulting key.
    *
    * @param record the sink record; its value is cast to a `Struct`
    * @return the selected field values joined with `keyDelimiter`
    */
  override def build(record: SinkRecord): String = {
    val payload = record.value().asInstanceOf[Struct]
    val payloadSchema = payload.schema

    val fieldNames = payloadSchema.fields().asScala.map(_.name).toSet
    val absentKeys = keys.filterNot(fieldNames.contains)
    require(absentKeys.isEmpty, s"${absentKeys.mkString(",")} keys are not present in the SinkRecord payload:${fieldNames.mkString(",")}")

    // First resolve every key (enforcing non-null values), then keep the supported types.
    val resolved = keys.map { key =>
      val keyField = payloadSchema.field(key)
      val keyValue = payload.get(keyField)
      require(keyValue != null, s"$key field value is null. Non null value is required for the fileds creating the Hbase row key")
      keyField -> keyValue
    }

    resolved
      .collect { case (keyField, keyValue) if availableSchemaTypes.contains(keyField.schema().`type`()) => keyValue.toString }
      .mkString(keyDelimiter)
  }
} 
Example 30
Source File: SourceRecordProducers.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.ftp.source

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.source.SourceRecord


/** Factories that turn a fetched file body into a Connect [[SourceRecord]]. */
object SourceRecordProducers {
  /** Signature shared by the record factories below. */
  type SourceRecordProducer = (ConnectFileMetaDataStore, String, FileMetaData, FileBody) => SourceRecord

  /** Key schema used by [[structKeyRecord]]: the file name plus an offset. */
  val fileInfoSchema: Schema =
    SchemaBuilder.struct()
      .field("name", Schema.STRING_SCHEMA)
      .field("offset", Schema.INT64_SCHEMA)
      .build()

  /** Produces a record keyed by the file path (string key) with the raw bytes as value. */
  def stringKeyRecord(store: ConnectFileMetaDataStore, topic: String, meta: FileMetaData, body: FileBody): SourceRecord = {
    val sourcePartition = store.fileMetasToConnectPartition(meta)
    val sourceOffset = store.fileMetasToConnectOffset(meta)
    new SourceRecord(sourcePartition, sourceOffset, topic, Schema.STRING_SCHEMA, meta.attribs.path, Schema.BYTES_SCHEMA, body.bytes)
  }

  /** Produces a record keyed by a `fileInfoSchema` struct (path and body offset) with the raw bytes as value. */
  def structKeyRecord(store: ConnectFileMetaDataStore, topic: String, meta: FileMetaData, body: FileBody): SourceRecord = {
    val sourcePartition = store.fileMetasToConnectPartition(meta)
    val sourceOffset = store.fileMetasToConnectOffset(meta)
    val fileInfoKey = new Struct(fileInfoSchema)
      .put("name", meta.attribs.path)
      .put("offset", body.offset)
    new SourceRecord(sourcePartition, sourceOffset, topic, fileInfoSchema, fileInfoKey, Schema.BYTES_SCHEMA, body.bytes)
  }
}
Example 31
Source File: TestCoapMessageConverter.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.coap.domain

import com.datamountaineer.streamreactor.connect.coap.TestBase
import org.apache.kafka.connect.data.Struct
import org.scalatest.wordspec.AnyWordSpec


/** Checks that a converted CoAP response exposes its fields through the resulting Struct. */
class TestCoapMessageConverter extends AnyWordSpec with TestBase {
  "should convert a CoapResponse to a Struct " in {
    val coapResponse = getCoapResponse
    val sourceRecord = new CoapMessageConverter().convert(RESOURCE_INSECURE, TOPIC, coapResponse)
    val payload = sourceRecord.value().asInstanceOf[Struct]

    // Each Struct field must mirror the corresponding accessor on the response.
    payload.getString("payload") shouldBe coapResponse.getPayloadString
    payload.getInt32("raw_code") shouldBe coapResponse.getRawCode
    payload.getBoolean("is_last") shouldBe coapResponse.isLast
    payload.getInt32("content_format") shouldBe coapResponse.getOptions.getContentFormat
  }
}
Example 32
Source File: StructFieldsExtractorTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.voltdb

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec


/** Tests for the field selection and aliasing performed by `StructFieldsExtractor`. */
class StructFieldsExtractorTest extends AnyWordSpec with Matchers {

  // Schema shared by all cases; "threshold" is optional and deliberately left unset.
  private val personSchema: Schema = SchemaBuilder.struct().name("com.example.Person")
    .field("firstName", Schema.STRING_SCHEMA)
    .field("lastName", Schema.STRING_SCHEMA)
    .field("age", Schema.INT32_SCHEMA)
    .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

  // A fresh struct per test so cases cannot influence each other.
  private def person: Struct = new Struct(personSchema)
    .put("firstName", "Alex")
    .put("lastName", "Smith")
    .put("age", 30)

  "StructFieldsExtractor" should {
    "return all the fields and their bytes value" in {
      val map = StructFieldsExtractor("table", includeAllFields = true, Map.empty).get(person)
      map("firstName") shouldBe "Alex"
      map("lastName") shouldBe "Smith"
      map("age") shouldBe 30
    }

    "return all fields and apply the mapping" in {
      val map = StructFieldsExtractor("table", includeAllFields = true, Map("lastName" -> "Name", "age" -> "a")).get(person)
      map("firstName") shouldBe "Alex"
      map("Name") shouldBe "Smith"
      map("a") shouldBe 30
    }

    "return only the specified fields" in {
      val map = StructFieldsExtractor("table", includeAllFields = false, Map("lastName" -> "Name", "age" -> "age")).get(person)
      map("Name") shouldBe "Smith"
      map("age") shouldBe 30
      map.size shouldBe 2
    }
  }
}
Example 33
Source File: StructFieldsExtractor.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.voltdb

import java.text.SimpleDateFormat
import java.util.TimeZone

import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.{Field, Struct, _}

import scala.collection.JavaConverters._

/** Extracts values from a Connect `Struct` as a map keyed by name. */
trait FieldsValuesExtractor {
  /**
    * @param struct the record payload to read from
    * @return a map from name to extracted value
    */
  def get(struct: Struct): Map[String, Any]
}

/**
  * Extracts the values of a `Struct`, optionally restricted to and renamed by `fieldsAliasMap`.
  *
  * @param targetTable      the destination table name; must be non-null and non-blank
  * @param includeAllFields when true every schema field is returned; otherwise only the
  *                         fields that appear as keys of `fieldsAliasMap`
  * @param fieldsAliasMap   maps a field name to the name it is returned under
  * @param isUpsert         not read by this class's own code
  */
case class StructFieldsExtractor(targetTable: String,
                                 includeAllFields: Boolean,
                                 fieldsAliasMap: Map[String, String],
                                 isUpsert: Boolean = false) extends FieldsValuesExtractor with StrictLogging {
  require(targetTable != null && targetTable.trim.length > 0)

  /**
    * Returns the selected fields of `struct` keyed by their alias (or their own name when
    * no alias is configured). Null values are kept in the result.
    *
    * Decimal values supplied as bytes are converted to their logical form, and Time /
    * Timestamp values are rendered as strings with the companion object's formatters.
    *
    * @param struct the payload to read
    * @return alias -> value for every selected field
    * @throws RuntimeException         when a field named in `fieldsAliasMap` is missing from the
    *                                  schema and `includeAllFields` is false
    * @throws IllegalArgumentException when a logical-typed field holds an unsupported value type
    */
  def get(struct: Struct): Map[String, Any] = {
    val schema = struct.schema()
    val fields: Seq[Field] = {
      if (includeAllFields) {
        schema.fields().asScala
      } else {
        val schemaFields = schema.fields().asScala
        val selectedFields = schemaFields.filter(f => fieldsAliasMap.contains(f.name()))
        val diffSet = fieldsAliasMap.keySet.diff(selectedFields.map(_.name()).toSet)
        if (diffSet.nonEmpty) {
          // Report the columns the payload really has, not the ones that were requested.
          val errMsg = s"Following columns ${diffSet.mkString(",")} have not been found. Available columns:${schemaFields.map(_.name()).mkString(",")}"
          logger.error(errMsg)
          sys.error(errMsg)
        }
        selectedFields
      }
    }

    //need to select all fields including null. the stored proc needs a fixed set of params
    fields.map { field =>
      val fieldSchema = field.schema()
      val value = Option(struct.get(field))
        .map { raw =>
          //handle specific schema
          fieldSchema.name() match {
            case Decimal.LOGICAL_NAME =>
              raw.asInstanceOf[Any] match {
                case _: java.math.BigDecimal => raw
                case arr: Array[Byte] => Decimal.toLogical(fieldSchema, arr)
                case _ => throw new IllegalArgumentException(s"${field.name()} is not handled for value:$raw")
              }
            case Time.LOGICAL_NAME =>
              raw.asInstanceOf[Any] match {
                case i: Int => StructFieldsExtractor.TimeFormat.format(Time.toLogical(fieldSchema, i))
                case d: java.util.Date => StructFieldsExtractor.TimeFormat.format(d)
                case _ => throw new IllegalArgumentException(s"${field.name()} is not handled for value:$raw")
              }

            case Timestamp.LOGICAL_NAME =>
              raw.asInstanceOf[Any] match {
                case d: java.util.Date => StructFieldsExtractor.DateFormat.format(d)
                case l: Long => StructFieldsExtractor.DateFormat.format(Timestamp.toLogical(fieldSchema, l))
                case _ => throw new IllegalArgumentException(s"${field.name()} is not handled for value:$raw")
              }

            // Any other schema (including an unnamed one) is passed through untouched.
            case _ => raw
          }
        }.orNull

      fieldsAliasMap.getOrElse(field.name(), field.name()) -> value
    }.toMap
  }
}


/** Shared formatters used to render timestamp and time values as strings. */
object StructFieldsExtractor {
  /** Timestamp formatter; always renders in UTC with a literal trailing 'Z'. */
  val DateFormat = {
    val utcFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")
    utcFormat.setTimeZone(TimeZone.getTimeZone("UTC"))
    utcFormat
  }

  /** Time-of-day formatter; no time zone is set on it here. */
  val TimeFormat: SimpleDateFormat = new SimpleDateFormat("HH:mm:ss.SSSZ")
}
Example 34
Source File: MetastoreSchemaAlignMapperTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import scala.collection.JavaConverters._

/** Checks that mapped structs take on the field list of the metastore schema. */
class MetastoreSchemaAlignMapperTest extends AnyFunSuite with Matchers {

  // Fresh field schemas for every use, as in a hand-written builder chain.
  private def requiredString = SchemaBuilder.string().required().build()
  private def optionalString = SchemaBuilder.string().optional().build()

  // Record schema and payload shared by both cases: three required string fields a, b, c.
  private def abcSchema = SchemaBuilder.struct()
    .field("a", requiredString)
    .field("b", requiredString)
    .field("c", requiredString)
    .build()

  private def abcStruct = new Struct(abcSchema).put("a", "a").put("b", "b").put("c", "c")

  private def fieldNames(struct: Struct) = struct.schema().fields().asScala.map(_.name)

  test("pad optional missing fields with null") {
    val metastoreSchema = SchemaBuilder.struct()
      .field("a", requiredString)
      .field("b", requiredString)
      .field("c", requiredString)
      .field("z", optionalString)
      .build()

    val aligned = new MetastoreSchemaAlignMapper(metastoreSchema).map(abcStruct)
    fieldNames(aligned) shouldBe Seq("a", "b", "c", "z")
  }

  test("drop fields not specified in metastore") {
    val metastoreSchema = SchemaBuilder.struct()
      .field("a", requiredString)
      .field("b", requiredString)
      .build()

    val aligned = new MetastoreSchemaAlignMapper(metastoreSchema).map(abcStruct)
    fieldNames(aligned) shouldBe Seq("a", "b")
  }
}
Example 35
Source File: ParquetWriterTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.parquet

import com.landoop.streamreactor.connect.hive.StructUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

/** Round-trip tests: structs written to a parquet file must read back with the same values. */
class ParquetWriterTest extends AnyWordSpec with Matchers {

  implicit val conf: Configuration = new Configuration()
  implicit val fs: org.apache.hadoop.fs.LocalFileSystem = FileSystem.getLocal(conf)

  // Schema used by every case: a required name plus two optional columns.
  private val schema = SchemaBuilder.struct()
    .field("name", SchemaBuilder.string().required().build())
    .field("title", SchemaBuilder.string().optional().build())
    .field("salary", SchemaBuilder.float64().optional().build())
    .build()

  /**
    * Writes `users` to a local parquet file, reads them back and compares the values.
    * The writer, the reader and the file are released even when a step fails, so a
    * failing case cannot leak handles or leave "sinktest.parquet" behind for the next run.
    */
  private def assertRoundTrip(users: List[Struct]): Unit = {
    val path = new Path("sinktest.parquet")
    try {
      val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
      try users.foreach(writer.write)
      finally writer.close()

      val reader = parquetReader(path)
      val actual =
        try Iterator.continually(reader.read).takeWhile(_ != null).toList
        finally reader.close()

      actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues)
    } finally {
      fs.delete(path, false)
    }
  }

  "ParquetWriter" should {
    "write parquet files" in {
      assertRoundTrip(List(
        new Struct(schema).put("name", "sam").put("title", "mr").put("salary", 100.43),
        new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06)
      ))
    }
    "support writing nulls" in {
      assertRoundTrip(List(
        new Struct(schema).put("name", "sam").put("title", null).put("salary", 100.43),
        new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06)
      ))
    }
  }
}
Example 36
Source File: MetastoreSchemaAlignMapperTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import scala.collection.JavaConverters._

/** Checks that mapped structs take on the field list of the metastore schema. */
class MetastoreSchemaAlignMapperTest extends AnyFunSuite with Matchers {

  // Fresh field schemas for every use, as in a hand-written builder chain.
  private def requiredString = SchemaBuilder.string().required().build()
  private def optionalString = SchemaBuilder.string().optional().build()

  // Record schema and payload shared by both cases: three required string fields a, b, c.
  private def abcSchema = SchemaBuilder.struct()
    .field("a", requiredString)
    .field("b", requiredString)
    .field("c", requiredString)
    .build()

  private def abcStruct = new Struct(abcSchema).put("a", "a").put("b", "b").put("c", "c")

  private def fieldNames(struct: Struct) = struct.schema().fields().asScala.map(_.name)

  test("pad optional missing fields with null") {
    val metastoreSchema = SchemaBuilder.struct()
      .field("a", requiredString)
      .field("b", requiredString)
      .field("c", requiredString)
      .field("z", optionalString)
      .build()

    val aligned = new MetastoreSchemaAlignMapper(metastoreSchema).map(abcStruct)
    fieldNames(aligned) shouldBe Seq("a", "b", "c", "z")
  }

  test("drop fields not specified in metastore") {
    val metastoreSchema = SchemaBuilder.struct()
      .field("a", requiredString)
      .field("b", requiredString)
      .build()

    val aligned = new MetastoreSchemaAlignMapper(metastoreSchema).map(abcStruct)
    fieldNames(aligned) shouldBe Seq("a", "b")
  }
}
Example 37
Source File: DropPartitionValuesMapperTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import cats.data.NonEmptyList
import com.landoop.streamreactor.connect.hive.{PartitionKey, PartitionPlan, TableName}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import scala.collection.JavaConverters._

/** Checks that partition key fields ("p" and "q") are removed from the mapped struct's schema. */
class DropPartitionValuesMapperTest extends AnyFunSuite with Matchers {

  private def requiredString = SchemaBuilder.string().required().build()

  // Partition plan shared by both cases: table "foo" partitioned by p and q.
  private def fooPlan = PartitionPlan(TableName("foo"), NonEmptyList.of(PartitionKey("p"), PartitionKey("q")))

  private def fieldNames(struct: Struct) = struct.schema().fields().asScala.map(_.name)

  test("strip partition values") {
    val inputSchema = SchemaBuilder.struct()
      .field("a", requiredString)
      .field("p", requiredString)
      .field("q", requiredString)
      .field("z", requiredString)
      .build()

    val plan = fooPlan
    val input = new Struct(inputSchema).put("a", "a").put("p", "p").put("q", "q").put("z", "z")
    val stripped = new DropPartitionValuesMapper(plan).map(input)
    fieldNames(stripped) shouldBe Seq("a", "z")
  }

  test("handle partition field is missing in input") {
    // The input has no "p" field even though the plan partitions on it.
    val inputSchema = SchemaBuilder.struct()
      .field("a", requiredString)
      .field("q", requiredString)
      .field("z", requiredString)
      .build()

    val plan = fooPlan
    val input = new Struct(inputSchema).put("a", "a").put("q", "q").put("z", "z")
    val stripped = new DropPartitionValuesMapper(plan).map(input)
    fieldNames(stripped) shouldBe Seq("a", "z")
  }
}
Example 38
Source File: DefaultCommitPolicyTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.staging

import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartitionOffset}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.concurrent.duration._

/** Tests the three roll-over triggers of `DefaultCommitPolicy`: interval, record count and file size. */
class DefaultCommitPolicyTest extends AnyWordSpec with Matchers {

  val schema: Schema = SchemaBuilder.struct()
    .field("name", SchemaBuilder.string().required().build())
    .build()

  val struct = new Struct(schema)

  implicit val conf: Configuration = new Configuration()
  implicit val fs: LocalFileSystem = FileSystem.getLocal(conf)
  val tpo = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100))

  // Asks the policy whether to flush, using the file's current length and modification time.
  private def shouldFlush(policy: CommitPolicy, path: Path, count: Long) = {
    val status = fs.getFileStatus(path)
    policy.shouldFlush(CommitContext(tpo, path, count, status.getLen, status.getModificationTime))
  }

  /**
    * Creates an empty file called `name`, runs `body` against it and always deletes it.
    * The stream returned by `fs.create` is closed straight away so it is not leaked, and the
    * delete sits in a `finally` so a failed assertion does not leave the file behind.
    */
  private def withEmptyFile(name: String)(body: Path => Unit): Unit = {
    val path = new Path(name)
    try {
      fs.create(path).close()
      body(path)
    } finally {
      fs.delete(path, false)
    }
  }

  "DefaultCommitPolicy" should {
    "roll over after interval" in {
      val policy = DefaultCommitPolicy(None, Option(2.seconds), None)
      withEmptyFile("foo") { path =>
        shouldFlush(policy, path, 10) shouldBe false
        Thread.sleep(2000)
        shouldFlush(policy, path, 10) shouldBe true
      }
    }
    "roll over after file count" in {
      val policy = DefaultCommitPolicy(None, None, Some(9))
      withEmptyFile("foo") { path =>
        shouldFlush(policy, path, 7) shouldBe false
        shouldFlush(policy, path, 8) shouldBe false
        shouldFlush(policy, path, 9) shouldBe true
        shouldFlush(policy, path, 10) shouldBe true
      }
    }
    "roll over after file size" in {
      val policy = DefaultCommitPolicy(Some(10), None, None)
      val path = new Path("foo")
      try {
        // The first check runs while the stream is still open and nothing has been written.
        val out = fs.create(path)
        try {
          shouldFlush(policy, path, 7) shouldBe false
          out.writeBytes("wibble wobble wabble wubble")
        } finally {
          out.close()
        }
        shouldFlush(policy, path, 9) shouldBe true
      } finally {
        fs.delete(path, false)
      }
    }
  }
}
Example 39
Source File: MapValueConverterTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink

import com.landoop.json.sql.JacksonJson
import org.apache.kafka.connect.data.{Schema, Struct}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import scala.collection.JavaConverters._

/** Verifies the schema and values `MapValueConverter` derives from a nested JSON map. */
class MapValueConverterTest extends AnyFunSuite with Matchers {
  test("converts nested payload") {
    val json =
      """
        |{
        |  "idType": 3,
        |  "colorDepth": "",
        |  "threshold" : 45.77,
        |  "evars": {
        |    "evars": {
        |      "eVar1": "Tue Aug 27 2019 12:08:10",
        |      "eVar2": 156692207943934897
        |    }
        |  },
        |  "exclude": {
        |    "id": 0,
        |    "value": false
        |  }
        |}
        |""".stripMargin

    val struct = MapValueConverter.convert(JacksonJson.toMap[Any](json))
    val rootSchema = struct.schema()

    def fieldNames(schema: Schema) = schema.fields().asScala.map(_.name())

    // --- top level schema ---------------------------------------------------------------
    //Jackson transforming the json to Map the fields order is not retained
    fieldNames(rootSchema).sorted shouldBe List("idType", "colorDepth", "threshold", "evars", "exclude").sorted

    rootSchema.field("idType").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA
    rootSchema.field("colorDepth").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA
    rootSchema.field("threshold").schema() shouldBe Schema.OPTIONAL_FLOAT64_SCHEMA

    val excludeSchema = rootSchema.field("exclude").schema()
    excludeSchema.`type`() shouldBe Schema.Type.STRUCT
    excludeSchema.isOptional shouldBe true

    val evarsOuterSchema = rootSchema.field("evars").schema()
    evarsOuterSchema.`type`() shouldBe Schema.Type.STRUCT
    evarsOuterSchema.isOptional shouldBe true

    // --- nested "evars.evars" schema ----------------------------------------------------
    fieldNames(evarsOuterSchema) shouldBe List("evars")
    val evarsInnerSchema = evarsOuterSchema.field("evars").schema()
    evarsInnerSchema.`type`() shouldBe Schema.Type.STRUCT
    evarsInnerSchema.isOptional shouldBe true
    fieldNames(evarsInnerSchema).sorted shouldBe List("eVar1", "eVar2").sorted
    evarsInnerSchema.field("eVar1").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA
    evarsInnerSchema.field("eVar2").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA

    // --- nested "exclude" schema --------------------------------------------------------
    excludeSchema.`type`() shouldBe Schema.Type.STRUCT
    excludeSchema.isOptional shouldBe true
    fieldNames(excludeSchema).sorted shouldBe List("id", "value").sorted
    excludeSchema.field("id").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA
    excludeSchema.field("value").schema() shouldBe Schema.OPTIONAL_BOOLEAN_SCHEMA

    // --- values -------------------------------------------------------------------------
    struct.get("idType") shouldBe 3L
    struct.get("colorDepth") shouldBe ""
    struct.get("threshold") shouldBe 45.77D

    val evarsOuter = struct.get("evars").asInstanceOf[Struct]
    val evarsInner = evarsOuter.get("evars").asInstanceOf[Struct]
    evarsInner.get("eVar1") shouldBe "Tue Aug 27 2019 12:08:10"
    evarsInner.get("eVar2") shouldBe 156692207943934897L

    val exclude = struct.get("exclude").asInstanceOf[Struct]
    exclude.get("id") shouldBe 0L
    exclude.get("value") shouldBe false
  }

}
Example 40
Source File: OrcTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.orc

import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, OrcSourceConfig, StructUtils, orc}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

/** Round-trip test: structs written through the ORC sink are read back by the ORC source. */
class OrcTest extends AnyFlatSpec with Matchers {

  implicit val conf = new Configuration()
  implicit val fs = FileSystem.getLocal(conf)

  "Orc" should "read and write orc files" in {

    val schema = SchemaBuilder.struct()
      .field("name", SchemaBuilder.string().optional().build())
      .field("age", SchemaBuilder.int32().optional().build())
      .field("salary", SchemaBuilder.float64().optional().build())
      .name("from_orc")
      .build()

    // Raw rows drive both the structs that are written and the values expected back.
    val rows = List(("sammy", 38, 54.67), ("laura", 37, 91.84))
    val users = rows.map { case (name, age, salary) =>
      new Struct(schema).put("name", name).put("age", age).put("salary", salary)
    }

    val path = new Path("orctest.orc")
    val sink = orc.sink(path, schema, OrcSinkConfig(overwrite = true))
    users.foreach(sink.write)
    sink.close()

    val readBack = orc.source(path, OrcSourceConfig()).iterator.toList
    readBack.head.schema shouldBe schema
    readBack.map(StructUtils.extractValues) shouldBe
      rows.map { case (name, age, salary) => Vector(name, age, salary) }

    fs.delete(path, false)
  }
}
Example 41
Source File: package.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter}

/** Helpers for listing parquet files and for creating Struct-based parquet readers and writers. */
package object parquet {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  /**
    * Returns the files directly under `path` when it is a directory (non-recursive),
    * otherwise a single-element list containing `path` itself.
    */
  def listFiles(path: Path)(implicit fs: FileSystem): List[Path] =
    if (!fs.isDirectory(path)) {
      logger.debug(s"Reading $path as a single file")
      List(path)
    } else {
      logger.debug(s"$path is a directory, reading constituent files")
      val statuses = fs.listFiles(path, false)
      // Drain the Hadoop remote iterator: check hasNext, then take the next entry's path.
      Iterator.continually(statuses).takeWhile(_.hasNext).map(_.next().getPath).toList
    }

  /** Creates a reader yielding `Struct`s from `file`, configured with the file system's configuration. */
  def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = {
    val builder = ParquetReader.builder(new StructReadSupport, file)
    builder.withConf(fs.getConf).build()
  }

  /**
    * Creates a writer of `Struct`s for `path`.
    *
    * @param path   the file to write
    * @param schema the Connect schema of the structs to be written
    * @param config compression, dictionary, validation and overwrite settings
    */
  def parquetWriter(path: Path,
                    schema: Schema,
                    config: ParquetSinkConfig): ParquetWriter[Struct] = {
    // Overwrite an existing file only when the configuration asks for it.
    val writeMode =
      if (config.overwrite) ParquetFileWriter.Mode.OVERWRITE
      else ParquetFileWriter.Mode.CREATE

    new StructParquetWriterBuilder(path, schema)
      .withCompressionCodec(config.compressionCodec)
      .withDictionaryEncoding(config.enableDictionary)
      .withValidation(config.validation)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
      .withWriteMode(writeMode)
      .build()
  }
}
Example 42
Source File: StructWriteSupport.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.parquet

import com.landoop.streamreactor.connect.hive._
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext
import org.apache.parquet.io.api.{Binary, RecordConsumer}
import org.apache.parquet.schema.MessageType

import scala.collection.JavaConverters._

// derived from Apache Spark's parquet write support, archive and license here:
// https://github.com/apache/spark/blob/21a7bfd5c324e6c82152229f1394f26afeae771c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala
/**
  * Parquet [[WriteSupport]] that writes Kafka Connect [[Struct]]s to a [[RecordConsumer]].
  *
  * @param schema the Connect schema; it is converted to the parquet message type returned
  *               from `init`
  */
class StructWriteSupport(schema: Schema) extends WriteSupport[Struct] {

  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)
  private val schemaName = if (schema.name() == null) "schema" else schema.name()
  private val parquetSchema: MessageType = ParquetSchemas.toParquetMessage(schema, schemaName)

  // Extra key/value metadata returned from finalizeWrite
  private val metadata = new java.util.HashMap[String, String]()
  metadata.put("written_by", "streamreactor")

  // The Parquet `RecordConsumer` to which all structs are written; set by prepareForWrite
  private var consumer: RecordConsumer = _

  type ValueWriter = (Any) => Unit

  override def init(conf: Configuration): WriteSupport.WriteContext = new WriteSupport.WriteContext(parquetSchema, new java.util.HashMap[String, String])
  override def finalizeWrite(): WriteSupport.FinalizedWriteContext = new FinalizedWriteContext(metadata)
  override def prepareForWrite(consumer: RecordConsumer): Unit = this.consumer = consumer

  /** Writes one struct as a complete parquet message. */
  override def write(struct: Struct): Unit = {
    writeMessage {
      writeStructFields(struct)
    }
  }

  private def writeStructFields(struct: Struct): Unit = writeFields(struct.schema, struct)

  /**
    * Writes every non-null field of `struct`, using the field order of `structSchema` as the
    * parquet field index. Null values are omitted entirely rather than written as empty fields.
    */
  private def writeFields(structSchema: Schema, struct: Struct): Unit = {
    for ((field, index) <- structSchema.fields.asScala.zipWithIndex) {
      val value = struct.get(field)
      if (value != null) {
        val writer = valueWriter(field.schema())
        writeField(field.name, index) {
          writer(value)
        }
      }
    }
  }

  /**
    * Returns a function that writes a single non-null value of the given schema type to the
    * current consumer.
    *
    * @param schema Connect schema of the value to write
    * @throws UnsupportedSchemaType for schema types other than the primitives and STRUCT handled here
    */
  def valueWriter(schema: Schema): ValueWriter = {
    // todo perhaps introduce something like spark's SpecializedGetters
    schema.`type`() match {
      case Schema.Type.BOOLEAN => value => consumer.addBoolean(value.asInstanceOf[Boolean])
      case Schema.Type.INT8 | Schema.Type.INT16 | Schema.Type.INT32 => value => consumer.addInteger(value.toString.toInt)
      case Schema.Type.INT64 => value => consumer.addLong(value.toString.toLong)
      // Encode explicitly as UTF-8 instead of relying on the platform default charset
      case Schema.Type.STRING => value =>
        consumer.addBinary(Binary.fromReusedByteArray(value.toString.getBytes(java.nio.charset.StandardCharsets.UTF_8)))
      case Schema.Type.FLOAT32 => value => consumer.addFloat(value.toString.toFloat)
      case Schema.Type.FLOAT64 => value => consumer.addDouble(value.toString.toDouble)
      case Schema.Type.STRUCT => value => {
        logger.debug(s"Writing nested struct")
        val struct = value.asInstanceOf[Struct]
        writeGroup {
          // Same null-skipping logic as for the top level message, so that unset optional
          // members of a nested struct do not reach the primitive writers as null.
          writeFields(schema, struct)
        }
      }
      case _ => throw UnsupportedSchemaType(schema.`type`.toString)
    }
  }

  private def writeMessage(f: => Unit): Unit = {
    consumer.startMessage()
    f
    consumer.endMessage()
  }

  private def writeGroup(f: => Unit): Unit = {
    consumer.startGroup()
    // consumer.startMessage()
    f
    //consumer.endMessage()
    consumer.endGroup()
  }

  private def writeField(name: String, k: Int)(f: => Unit): Unit = {
    consumer.startField(name, k)
    f
    consumer.endField(name, k)
  }
}
Example 43
Source File: StructReadSupport.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.parquet

import java.util

import org.apache.hadoop.conf.Configuration
import org.apache.kafka.connect.data.Struct
import org.apache.parquet.hadoop.api.{InitContext, ReadSupport}
import org.apache.parquet.io.api.RecordMaterializer
import org.apache.parquet.schema.MessageType

/**
  * [[ReadSupport]] that materializes parquet records as Kafka Connect [[Struct]]s.
  */
class StructReadSupport extends ReadSupport[Struct] {

  /** Creates a read context from the schema reported by the init context. */
  override def init(context: InitContext): ReadSupport.ReadContext = {
    val requestedSchema = context.getFileSchema
    new ReadSupport.ReadContext(requestedSchema)
  }

  /** Converts the parquet file schema to a Connect schema and builds the materializer for it. */
  override def prepareForRead(configuration: Configuration,
                              metaData: util.Map[String, String],
                              fileSchema: MessageType,
                              context: ReadSupport.ReadContext): RecordMaterializer[Struct] =
    // the file schema in here comes from the footer of the parquet file
    new StructMaterializer(ParquetSchemas.toKafka(fileSchema))
}
Example 44
Source File: Output.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.blockchain.data

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

/**
  * The output part of a transaction.
  *
  * `addr_tag_link`, `addr_tag` and `addr` are the members that may be absent; they are
  * declared as optional strings in `Output.ConnectSchema`.
  */
case class Output(addr_tag_link: Option[String],
                  addr_tag: Option[String],
                  spent: Boolean,
                  tx_index: Long,
                  `type`: Int,
                  addr: Option[String],
                  value: Long,
                  n: Int,
                  script: String)

object Output {

  /** Kafka Connect schema of an [[Output]]; fields are registered in declaration order. */
  val ConnectSchema: Schema = {
    val members: Seq[(String, Schema)] = Seq(
      "addr_tag_link" -> Schema.OPTIONAL_STRING_SCHEMA,
      "addr_tag" -> Schema.OPTIONAL_STRING_SCHEMA,
      "spent" -> Schema.BOOLEAN_SCHEMA,
      "tx_index" -> Schema.INT64_SCHEMA,
      "type" -> Schema.INT32_SCHEMA,
      "addr" -> Schema.OPTIONAL_STRING_SCHEMA,
      "value" -> Schema.INT64_SCHEMA,
      "n" -> Schema.INT32_SCHEMA,
      "script" -> Schema.STRING_SCHEMA
    )
    val described = SchemaBuilder.struct
      .name("datamountaineer.blockchain.output")
      .doc("The output instance part of a transaction.")
    members.foldLeft(described) { case (builder, (fieldName, fieldSchema)) =>
      builder.field(fieldName, fieldSchema)
    }.build()
  }

  implicit class OutputToStructConverter(val output: Output) extends AnyVal {
    /** Copies the output into a [[Struct]]; optional members are set only when defined. */
    def toStruct(): Struct = {
      val result = new Struct(ConnectSchema)
      result.put("spent", output.spent)
      result.put("tx_index", output.tx_index)
      result.put("type", output.`type`)
      result.put("value", output.value)
      result.put("n", output.n)
      result.put("script", output.script)
      for (address <- output.addr) result.put("addr", address)
      for (tag <- output.addr_tag) result.put("addr_tag", tag)
      for (link <- output.addr_tag_link) result.put("addr_tag_link", link)
      result
    }
  }

}
Example 45
Source File: Input.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.cassandra.sink

import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

/** The input part of a transaction. */
case class Input(sequence: Long, prev_out: Option[Output], script: String) {
  /** Returns the members as a Java map; "prev_out" is present only when defined. */
  def toHashMap: util.HashMap[String, Any] = {
    val result = new util.HashMap[String, Any]()
    result.put("sequence", sequence)
    for (previous <- prev_out) result.put("prev_out", previous.toHashMap)
    result.put("script", script)
    result
  }
}

object Input {
  /** Kafka Connect schema of an [[Input]]. */
  val ConnectSchema: Schema = {
    val members: Seq[(String, Schema)] = Seq(
      "sequence" -> Schema.INT64_SCHEMA,
      "prev_out" -> Output.ConnectSchema,
      "script" -> Schema.STRING_SCHEMA
    )
    val described = SchemaBuilder.struct
      .name("datamountaineer.blockchain.input")
      .doc("The input instance part of a transaction.")
    members.foldLeft(described) { case (builder, (fieldName, fieldSchema)) =>
      builder.field(fieldName, fieldSchema)
    }.build()
  }

  implicit class InputToStructConverter(val input: Input) extends AnyVal {
    /** Copies the input into a [[Struct]]; "prev_out" is set only when defined. */
    def toStruct(): Struct = {
      // NOTE(review): "prev_out" uses Output.ConnectSchema as declared, yet it is left unset
      // when input.prev_out is None - confirm that this is intended.
      val result = new Struct(ConnectSchema)
      result.put("sequence", input.sequence)
      result.put("script", input.script)
      for (previous <- input.prev_out) result.put("prev_out", previous.toStruct())
      result
    }
  }

}
Example 46
Source File: KeyUtils.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.cassandra.utils

import com.jayway.jsonpath.{Configuration, JsonPath}
import org.apache.kafka.connect.data.{Schema, Struct}

object KeyUtils {

  /**
    * Looks up each of the given field names in the struct.
    *
    * @param struct     the struct to read from
    * @param schema     not used by this method
    * @param fieldNames field names; a dot separates a nested struct from the field inside it
    * @return the values in the same order as `fieldNames`
    */
  def keysFromStruct(struct: Struct, schema: Schema, fieldNames: Seq[String]): Seq[Object] =
    for (name <- fieldNames) yield getKeyFromStruct(struct, name)

  // Splits on the first dot only, so "a.b.c" descends into "a" and then resolves "b.c".
  @scala.annotation.tailrec
  private def getKeyFromStruct(struct: Struct, fieldName: String): Object =
    fieldName.split("\\.", 2) match {
      case Array(parent, remainder) =>
        getKeyFromStruct(struct.get(parent).asInstanceOf[Struct], remainder)
      case _ =>
        struct.get(fieldName)
    }
}
Example 47
Source File: Output.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.mongodb

import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

/** The output part of a transaction. */
case class Output(addr_tag_link: Option[String],
                  addr_tag: Option[String],
                  spent: Boolean,
                  tx_index: Long,
                  `type`: Int,
                  addr: Option[String],
                  value: Long,
                  n: Int,
                  script: String) {

  /**
    * Returns the members as a Java map keyed by member name.
    * Optional members are present only when defined.
    */
  def toHashMap: util.HashMap[String, Any] = {
    val map = new util.HashMap[String, Any]()
    addr_tag_link.foreach(map.put("addr_tag_link", _))
    // "addr_tag" must come from addr_tag, not from addr_tag_link
    addr_tag.foreach(map.put("addr_tag", _))
    map.put("spent", spent)
    map.put("tx_index", tx_index)
    map.put("type", `type`)
    addr.foreach(map.put("addr", _))
    map.put("value", value)
    map.put("n", n)
    map.put("script", script)
    map
  }

}

object Output {

  /** Kafka Connect schema of an [[Output]]; fields are registered in declaration order. */
  val ConnectSchema: Schema = {
    val members: Seq[(String, Schema)] = Seq(
      "addr_tag_link" -> Schema.OPTIONAL_STRING_SCHEMA,
      "addr_tag" -> Schema.OPTIONAL_STRING_SCHEMA,
      "spent" -> Schema.BOOLEAN_SCHEMA,
      "tx_index" -> Schema.INT64_SCHEMA,
      "type" -> Schema.OPTIONAL_INT32_SCHEMA,
      "addr" -> Schema.OPTIONAL_STRING_SCHEMA,
      "value" -> Schema.INT64_SCHEMA,
      "n" -> Schema.INT32_SCHEMA,
      "script" -> Schema.STRING_SCHEMA
    )
    val described = SchemaBuilder.struct
      .name("datamountaineer.blockchain.output")
      .doc("The output instance part of a transaction.")
    members.foldLeft(described) { case (builder, (fieldName, fieldSchema)) =>
      builder.field(fieldName, fieldSchema)
    }.build()
  }

  implicit class OutputToStructConverter(val output: Output) extends AnyVal {
    /** Copies the output into a [[Struct]]; optional members are set only when defined. */
    def toStruct(): Struct = {
      val result = new Struct(ConnectSchema)
      result.put("spent", output.spent)
      result.put("tx_index", output.tx_index)
      result.put("type", output.`type`)
      result.put("value", output.value)
      result.put("n", output.n)
      result.put("script", output.script)
      for (address <- output.addr) result.put("addr", address)
      for (tag <- output.addr_tag) result.put("addr_tag", tag)
      for (link <- output.addr_tag_link) result.put("addr_tag_link", link)
      result
    }
  }

}
Example 48
Source File: Input.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.mongodb

import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

/** The input part of a transaction. */
case class Input(sequence: Long, prev_out: Option[Output], script: String) {
  /** Returns the members as a Java map; "prev_out" is present only when defined. */
  def toHashMap: util.HashMap[String, Any] = {
    val result = new util.HashMap[String, Any]()
    result.put("sequence", sequence)
    for (previous <- prev_out) result.put("prev_out", previous.toHashMap)
    result.put("script", script)
    result
  }
}

object Input {
  /** Kafka Connect schema of an [[Input]]. */
  val ConnectSchema: Schema = {
    val members: Seq[(String, Schema)] = Seq(
      "sequence" -> Schema.INT64_SCHEMA,
      "prev_out" -> Output.ConnectSchema,
      "script" -> Schema.STRING_SCHEMA
    )
    val described = SchemaBuilder.struct
      .name("datamountaineer.blockchain.input")
      .doc("The input instance part of a transaction.")
    members.foldLeft(described) { case (builder, (fieldName, fieldSchema)) =>
      builder.field(fieldName, fieldSchema)
    }.build()
  }

  implicit class InputToStructConverter(val input: Input) extends AnyVal {
    /** Copies the input into a [[Struct]]; "prev_out" is set only when defined. */
    def toStruct(): Struct = {
      // NOTE(review): "prev_out" uses Output.ConnectSchema as declared, yet it is left unset
      // when input.prev_out is None - confirm that this is intended.
      val result = new Struct(ConnectSchema)
      result.put("sequence", input.sequence)
      result.put("script", input.script)
      for (previous <- input.prev_out) result.put("prev_out", previous.toStruct())
      result
    }
  }

}
Example 49
Source File: SinkRecordToDocument.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.mongodb.sink

import com.datamountaineer.streamreactor.connect.mongodb.config.MongoSettings
import com.datamountaineer.streamreactor.connect.mongodb.converters.SinkRecordConverter
import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.bson.Document

object SinkRecordToDocument extends ConverterUtil {

  /**
    * Converts a sink record into a [[Document]] plus the values of the requested key fields.
    *
    * Schemaless records are accepted when the value is a Java map or a JSON string; records
    * with a schema are accepted for the STRING and STRUCT schema types. Anything else fails
    * via `sys.error`.
    *
    * @param record   the record to convert
    * @param keys     names of the fields to extract as keys; nothing is extracted when empty
    * @param settings supplies the per-topic field selection and ignored fields
    * @return the document and the extracted key/value pairs
    */
  def apply(record: SinkRecord, keys: Set[String] = Set.empty)(implicit settings: MongoSettings): (Document, Iterable[(String, Any)]) = {
    val schema = record.valueSchema()
    val value = record.value()
    val fields = settings.fields.getOrElse(record.topic(), Map.empty)

    // A single "*" selection means every field is kept
    val allFields = fields.size == 1 && fields.head._1 == "*"

    // Shared by the schemaless String payload and the STRING schema branch
    def fromJsonString: (Document, Iterable[(String, Any)]) = {
      val extracted = convertStringSchemaAndJson(
        record,
        fields,
        settings.ignoredField.getOrElse(record.topic(), Set.empty),
        includeAllFields = allFields)
      SinkRecordConverter.fromJson(extracted) ->
        keys.headOption.map(_ => KeysExtractor.fromJson(extracted, keys)).getOrElse(Iterable.empty)
    }

    if (schema == null) {
      //try to take it as string
      value match {
        case _: java.util.Map[_, _] =>
          val extracted = convertSchemalessJson(
            record,
            fields,
            settings.ignoredField.getOrElse(record.topic(), Set.empty)
          )
          //not ideal; but the compile is hashmap anyway

          SinkRecordConverter.fromMap(extracted.asInstanceOf[java.util.Map[String, AnyRef]]) ->
            keys.headOption.map(_ => KeysExtractor.fromMap(extracted, keys)).getOrElse(Iterable.empty)

        // The error message below advertises String support, so handle it explicitly
        case _: String => fromJsonString

        case _ => sys.error("For schemaless record only String and Map types are supported")
      }
    } else {
      schema.`type`() match {
        case Schema.Type.STRING => fromJsonString

        case Schema.Type.STRUCT =>
          val extracted = convert(
            record,
            fields,
            settings.ignoredField.getOrElse(record.topic(), Set.empty)
          )
          SinkRecordConverter.fromStruct(extracted) ->
            keys.headOption.map(_ => KeysExtractor.fromStruct(extracted.value().asInstanceOf[Struct], keys)).getOrElse(Iterable.empty)

        case other => sys.error(s"$other schema is not supported")
      }
    }
  }
}
Example 50
Source File: KeysExtractorTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink

import java.util

import com.datamountaineer.streamreactor.connect.azure.documentdb.Json
import com.sksamuel.avro4s.RecordFormat
import io.confluent.connect.avro.AvroData
import org.apache.kafka.common.config.ConfigException
import org.apache.kafka.connect.data.Struct
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._

/** Tests for `KeysExtractor` against JSON, Java map and Connect struct inputs. */
class KeysExtractorTest extends AnyWordSpec with Matchers {
  private val avroData = new AvroData(4)

  case class WithNested(id: Int, nested: SomeTest)

  case class SomeTest(name: String, value: Double, flags: Seq[Int], map: Map[String, String])

  // Parses the transaction fixture and closes the underlying file once it has been read.
  private def parseTransactionFixture() = {
    val source = scala.io.Source.fromFile(getClass.getResource("/transaction1.json").toURI.getPath)
    try Json.parseJson(source.mkString) finally source.close()
  }

  "KeysExtractor" should {
    "extract keys from JSON" in {
      val jvalue = parseTransactionFixture()

      val actual = KeysExtractor.fromJson(jvalue, Set("lock_time", "rbf"))
      actual shouldBe List("lock_time" -> 9223372036854775807L, "rbf" -> true)
    }

    "throw exception when extracting the keys from JSON" in {
      val jvalue = parseTransactionFixture()
      intercept[ConfigException] {
        KeysExtractor.fromJson(jvalue, Set("inputs"))
      }
    }


    "extract keys from a Map" in {
      val actual = KeysExtractor.fromMap(Map("key1" -> 12, "key2" -> 10L, "key3" -> "tripple").asJava, Set("key1", "key3"))
      actual shouldBe Set("key1" -> 12, "key3" -> "tripple")
    }

    "extract keys from a Map should throw an exception if the key is another map" in {
      intercept[ConfigException] {
        KeysExtractor.fromMap(Map("key1" -> 12, "key2" -> 10L, "key3" -> Map.empty[String, String]).asJava, Set("key1", "key3"))
      }
    }

    "extract keys from a Map should throw an exception if the key is an array" in {
      intercept[ConfigException] {
        KeysExtractor.fromMap(Map("key1" -> 12, "key2" -> 10L, "key3" -> new util.ArrayList[String]).asJava, Set("key1", "key3"))
      }
    }

    "extract from a struct" in {
      val format = RecordFormat[SomeTest]
      val avro = format.to(SomeTest("abc", 12.5, Seq.empty, Map.empty))
      val struct = avroData.toConnectData(avro.getSchema, avro)
      KeysExtractor.fromStruct(struct.value().asInstanceOf[Struct], Set("name")) shouldBe
        Set("name" -> "abc")
    }

    "extract from a struct should throw an exception if a key is an array" in {
      val format = RecordFormat[SomeTest]
      val avro = format.to(SomeTest("abc", 12.5, Seq.empty, Map.empty))
      intercept[ConfigException] {
        val struct = avroData.toConnectData(avro.getSchema, avro)
        KeysExtractor.fromStruct(struct.value().asInstanceOf[Struct], Set("flags"))
      }
    }

    "extract from a struct should throw an exception if a key is a map" in {
      val format = RecordFormat[SomeTest]
      val avro = format.to(SomeTest("abc", 12.5, Seq.empty, Map.empty))
      intercept[ConfigException] {
        val struct = avroData.toConnectData(avro.getSchema, avro)
        KeysExtractor.fromStruct(struct.value().asInstanceOf[Struct], Set("map"))
      }
    }

    "extract from a struct should throw an exception if a key is a struct" in {
      val format = RecordFormat[WithNested]
      val avro = format.to(WithNested(1, SomeTest("abc", 12.5, Seq.empty, Map.empty)))
      intercept[ConfigException] {
        val struct = avroData.toConnectData(avro.getSchema, avro)
        KeysExtractor.fromStruct(struct.value().asInstanceOf[Struct], Set("nested"))
      }
    }
  }
}
Example 51
Source File: Output.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink

import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

/** The output part of a transaction. */
case class Output(addr_tag_link: Option[String],
                  addr_tag: Option[String],
                  spent: Boolean,
                  tx_index: Long,
                  `type`: Int,
                  addr: Option[String],
                  value: Long,
                  n: Int,
                  script: String) {

  /**
    * Returns the members as a Java map keyed by member name.
    * Optional members are present only when defined.
    */
  def toHashMap: util.HashMap[String, Any] = {
    val map = new util.HashMap[String, Any]()
    addr_tag_link.foreach(map.put("addr_tag_link", _))
    // "addr_tag" must come from addr_tag, not from addr_tag_link
    addr_tag.foreach(map.put("addr_tag", _))
    map.put("spent", spent)
    map.put("tx_index", tx_index)
    map.put("type", `type`)
    addr.foreach(map.put("addr", _))
    map.put("value", value)
    map.put("n", n)
    map.put("script", script)
    map
  }

}

object Output {

  /** Kafka Connect schema of an [[Output]]; fields are registered in declaration order. */
  val ConnectSchema: Schema = {
    val members: Seq[(String, Schema)] = Seq(
      "addr_tag_link" -> Schema.OPTIONAL_STRING_SCHEMA,
      "addr_tag" -> Schema.OPTIONAL_STRING_SCHEMA,
      "spent" -> Schema.BOOLEAN_SCHEMA,
      "tx_index" -> Schema.INT64_SCHEMA,
      "type" -> Schema.OPTIONAL_INT32_SCHEMA,
      "addr" -> Schema.OPTIONAL_STRING_SCHEMA,
      "value" -> Schema.INT64_SCHEMA,
      "n" -> Schema.INT32_SCHEMA,
      "script" -> Schema.STRING_SCHEMA
    )
    val described = SchemaBuilder.struct
      .name("output")
      .doc("The output instance part of a transaction.")
    members.foldLeft(described) { case (builder, (fieldName, fieldSchema)) =>
      builder.field(fieldName, fieldSchema)
    }.build()
  }

  implicit class OutputToStructConverter(val output: Output) extends AnyVal {
    /** Copies the output into a [[Struct]]; optional members are set only when defined. */
    def toStruct(): Struct = {
      val result = new Struct(ConnectSchema)
      result.put("spent", output.spent)
      result.put("tx_index", output.tx_index)
      result.put("type", output.`type`)
      result.put("value", output.value)
      result.put("n", output.n)
      result.put("script", output.script)
      for (address <- output.addr) result.put("addr", address)
      for (tag <- output.addr_tag) result.put("addr_tag", tag)
      for (link <- output.addr_tag_link) result.put("addr_tag_link", link)
      result
    }
  }

}
Example 52
Source File: Input.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink

import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

/** The input part of a transaction. */
case class Input(sequence: Long, prev_out: Option[Output], script: String) {
  /** Returns the members as a Java map; "prev_out" is present only when defined. */
  def toHashMap: util.HashMap[String, Any] = {
    val result = new util.HashMap[String, Any]()
    result.put("sequence", sequence)
    for (previous <- prev_out) result.put("prev_out", previous.toHashMap)
    result.put("script", script)
    result
  }
}

object Input {
  /** Kafka Connect schema of an [[Input]]. */
  val ConnectSchema: Schema = {
    val members: Seq[(String, Schema)] = Seq(
      "sequence" -> Schema.INT64_SCHEMA,
      "prev_out" -> Output.ConnectSchema,
      "script" -> Schema.STRING_SCHEMA
    )
    val described = SchemaBuilder.struct
      .name("input")
      .doc("The input instance part of a transaction.")
    members.foldLeft(described) { case (builder, (fieldName, fieldSchema)) =>
      builder.field(fieldName, fieldSchema)
    }.build()
  }

  implicit class InputToStructConverter(val input: Input) extends AnyVal {
    /** Copies the input into a [[Struct]]; "prev_out" is set only when defined. */
    def toStruct(): Struct = {
      // NOTE(review): "prev_out" uses Output.ConnectSchema as declared, yet it is left unset
      // when input.prev_out is None - confirm that this is intended.
      val result = new Struct(ConnectSchema)
      result.put("sequence", input.sequence)
      result.put("script", input.script)
      for (previous <- input.prev_out) result.put("prev_out", previous.toStruct())
      result
    }
  }

}
Example 53
Source File: SinkRecordToDocument.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink

import com.datamountaineer.streamreactor.connect.azure.documentdb.config.DocumentDbSinkSettings
import com.datamountaineer.streamreactor.connect.azure.documentdb.converters.SinkRecordConverter
import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import com.microsoft.azure.documentdb.Document
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

object SinkRecordToDocument extends ConverterUtil {

  /**
    * Converts a sink record into a [[Document]] plus the values of the requested key fields.
    *
    * Schemaless records are accepted when the value is a Java map or a String; records with a
    * schema are accepted for the STRING and STRUCT schema types. Anything else fails via
    * `sys.error`.
    *
    * @param record   the record to convert
    * @param keys     names of the fields to extract as keys; nothing is extracted when empty
    * @param settings supplies the per-topic field selection and ignored fields
    * @return the document and the extracted key/value pairs
    */
  def apply(record: SinkRecord, keys: Set[String] = Set.empty)(implicit settings: DocumentDbSinkSettings): (Document, Iterable[(String, Any)]) = {
    val schema = record.valueSchema()
    val value = record.value()
    val topic = record.topic()

    // Resolved on use only, so unsupported payloads still fail with their own error message
    // before any per-topic lookup is attempted.
    def fields = settings.fields(topic)
    def ignored = settings.ignoredField(topic)

    // Shared by the schemaless String payload and the STRING schema branch
    def fromJsonString: (Document, Iterable[(String, Any)]) = {
      val extracted = convertStringSchemaAndJson(record, fields, ignored)
      SinkRecordConverter.fromJson(extracted) ->
        keys.headOption.map(_ => KeysExtractor.fromJson(extracted, keys)).getOrElse(Iterable.empty)
    }

    if (schema == null) {
      //try to take it as string
      value match {
        case _: java.util.Map[_, _] =>
          val extracted = convertSchemalessJson(record, fields, ignored)
          //not ideal; but the compile is hashmap anyway

          SinkRecordConverter.fromMap(extracted.asInstanceOf[java.util.Map[String, AnyRef]]) ->
            keys.headOption.map(_ => KeysExtractor.fromMap(extracted, keys)).getOrElse(Iterable.empty)

        case _: String => fromJsonString

        case _ => sys.error("For schemaless record only String and Map types are supported")
      }
    } else {
      schema.`type`() match {
        case Schema.Type.STRING => fromJsonString

        case Schema.Type.STRUCT =>
          val extracted = convert(record, fields, ignored)
          SinkRecordConverter.fromStruct(extracted) ->
            keys.headOption.map(_ => KeysExtractor.fromStruct(extracted.value().asInstanceOf[Struct], keys)).getOrElse(Iterable.empty)

        case other => sys.error(s"$other schema is not supported")
      }
    }
  }
}
Example 54
Source File: Transaction.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.blockchain.data

import java.util

import com.datamountaineer.streamreactor.connect.blockchain.data.Input._
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.source.SourceRecord

/**
  * A transaction together with its inputs and outputs.
  *
  * `rbf` is the only optional scalar member; `inputs` and `out` may be empty sequences.
  */
case class Transaction(lock_time: Long,
                       ver: Int,
                       size: Long,
                       inputs: Seq[Input],
                       rbf: Option[Boolean],
                       time: Long,
                       tx_index: Long,
                       vin_sz: Int,
                       hash: String,
                       vout_sz: Int,
                       relayed_by: String,
                       out: Seq[Output])


object Transaction {
  /** Kafka Connect schema of a [[Transaction]]; "inputs", "rbf" and "out" are optional. */
  val ConnectSchema: Schema = SchemaBuilder.struct
    .name("datamountaineer.blockchain.transaction")
    .field("lock_time", Schema.INT64_SCHEMA)
    .field("ver", Schema.INT32_SCHEMA)
    .field("size", Schema.INT64_SCHEMA)
    .field("inputs", SchemaBuilder.array(Input.ConnectSchema).optional().build())
    .field("rbf", Schema.OPTIONAL_BOOLEAN_SCHEMA)
    .field("time", Schema.INT64_SCHEMA)
    .field("tx_index", Schema.INT64_SCHEMA)
    .field("vin_sz", Schema.INT32_SCHEMA)
    .field("hash", Schema.STRING_SCHEMA)
    .field("vout_sz", Schema.INT32_SCHEMA)
    .field("relayed_by", Schema.STRING_SCHEMA)
    .field("out", SchemaBuilder.array(Output.ConnectSchema).optional().build())
    .build()

  implicit class TransactionToSourceRecordConverter(val tx: Transaction) extends AnyVal {
    /**
      * Wraps the transaction in a [[SourceRecord]] with null source partition and offset.
      *
      * @param topic     destination topic
      * @param partition destination partition
      * @param key       optional record key; when defined it is sent with a string key schema
      */
    def toSourceRecord(topic: String, partition: Int, key: Option[String]): SourceRecord = {
      new SourceRecord(
        null,
        null,
        topic,
        partition,
        key.map(_ => Schema.STRING_SCHEMA).orNull,
        key.orNull,
        ConnectSchema,
        tx.toStruct()
      )
    }

    //private def getOffset() = Collections.singletonMap("position", System.currentTimeMillis())

    /**
      * Copies the transaction into a [[Struct]]. "out", "rbf" and "inputs" are set only when
      * non-empty / defined.
      */
    def toStruct(): Struct = {
      import scala.collection.JavaConverters._

      val struct = new Struct(ConnectSchema)
        .put("lock_time", tx.lock_time)
        .put("ver", tx.ver)
        .put("size", tx.size)
        .put("time", tx.time)
        .put("tx_index", tx.tx_index)
        .put("vin_sz", tx.vin_sz)
        .put("hash", tx.hash)
        .put("vout_sz", tx.vout_sz)
        .put("relayed_by", tx.relayed_by)

      if (tx.out.nonEmpty) {
        struct.put("out", tx.out.map(_.toStruct()).asJava)
      }
      tx.rbf.foreach(struct.put("rbf", _))
      if (tx.inputs.nonEmpty) {
        val inputs = new util.ArrayList[Struct]
        tx.inputs.foreach(i => inputs.add(i.toStruct()))
        struct.put("inputs", inputs)
      }

      struct
    }
  }

}
Example 55
Source File: Output.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.cassandra.sink

import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

/** The output part of a transaction. */
case class Output(addr_tag_link: Option[String],
                  addr_tag: Option[String],
                  spent: Boolean,
                  tx_index: Long,
                  `type`: Int,
                  addr: Option[String],
                  value: Long,
                  n: Int,
                  script: String) {

  /**
    * Returns the members as a Java map keyed by member name.
    * Optional members are present only when defined.
    */
  def toHashMap: util.HashMap[String, Any] = {
    val map = new util.HashMap[String, Any]()
    addr_tag_link.foreach(map.put("addr_tag_link", _))
    // "addr_tag" must come from addr_tag, not from addr_tag_link
    addr_tag.foreach(map.put("addr_tag", _))
    map.put("spent", spent)
    map.put("tx_index", tx_index)
    map.put("type", `type`)
    addr.foreach(map.put("addr", _))
    map.put("value", value)
    map.put("n", n)
    map.put("script", script)
    map
  }

}

object Output {

  /** Kafka Connect schema of an [[Output]]; fields are registered in declaration order. */
  val ConnectSchema: Schema = {
    val members: Seq[(String, Schema)] = Seq(
      "addr_tag_link" -> Schema.OPTIONAL_STRING_SCHEMA,
      "addr_tag" -> Schema.OPTIONAL_STRING_SCHEMA,
      "spent" -> Schema.BOOLEAN_SCHEMA,
      "tx_index" -> Schema.INT64_SCHEMA,
      "type" -> Schema.OPTIONAL_INT32_SCHEMA,
      "addr" -> Schema.OPTIONAL_STRING_SCHEMA,
      "value" -> Schema.INT64_SCHEMA,
      "n" -> Schema.INT32_SCHEMA,
      "script" -> Schema.STRING_SCHEMA
    )
    val described = SchemaBuilder.struct
      .name("datamountaineer.blockchain.output")
      .doc("The output instance part of a transaction.")
    members.foldLeft(described) { case (builder, (fieldName, fieldSchema)) =>
      builder.field(fieldName, fieldSchema)
    }.build()
  }

  implicit class OutputToStructConverter(val output: Output) extends AnyVal {
    /** Copies the output into a [[Struct]]; optional members are set only when defined. */
    def toStruct(): Struct = {
      val result = new Struct(ConnectSchema)
      result.put("spent", output.spent)
      result.put("tx_index", output.tx_index)
      result.put("type", output.`type`)
      result.put("value", output.value)
      result.put("n", output.n)
      result.put("script", output.script)
      for (address <- output.addr) result.put("addr", address)
      for (tag <- output.addr_tag) result.put("addr_tag", tag)
      for (link <- output.addr_tag_link) result.put("addr_tag_link", link)
      result
    }
  }

}
Example 56
Source File: Input.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.blockchain.data

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

/** The input part of a transaction; `prev_out` is the only member that may be absent. */
case class Input(sequence: Long, prev_out: Option[Output], script: String)

object Input {
  /** Kafka Connect schema of an [[Input]]. */
  val ConnectSchema: Schema = {
    val members: Seq[(String, Schema)] = Seq(
      "sequence" -> Schema.INT64_SCHEMA,
      "prev_out" -> Output.ConnectSchema,
      "script" -> Schema.STRING_SCHEMA
    )
    val described = SchemaBuilder.struct
      .name("datamountaineer.blockchain.input")
      .doc("The input instance part of a transaction.")
    members.foldLeft(described) { case (builder, (fieldName, fieldSchema)) =>
      builder.field(fieldName, fieldSchema)
    }.build()
  }

  implicit class InputToStructConverter(val input: Input) extends AnyVal {
    /** Copies the input into a [[Struct]]; "prev_out" is set only when defined. */
    def toStruct(): Struct = {
      // NOTE(review): "prev_out" uses Output.ConnectSchema as declared, yet it is left unset
      // when input.prev_out is None - confirm that this is intended.
      val result = new Struct(ConnectSchema)
      result.put("sequence", input.sequence)
      result.put("script", input.script)
      for (previous <- input.prev_out) result.put("prev_out", previous.toStruct())
      result
    }
  }

}
Example 57
Source File: ConnectMongoConverterSpec.scala    From kafka-connect-mongodb   with Apache License 2.0 5 votes vote down vote up
package com.startapp.data

import java.lang.Boolean
import java.util

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.scalatest.{FlatSpec, Matchers}

/**
  * Checks that the schema and no-schema converters produce the expected JSON map for data of
  * the matching kind and raise a `ClassCastException` for data of the other kind.
  */
class ConnectMongoConverterSpec extends FlatSpec with Matchers{
  private val FIELD1_NAME = "fieldInt"
  // valueOf / TRUE replace the deprecated boxed-primitive constructors
  private val FIELD1_VALUE = Integer.valueOf(5)
  private val FIELD2_NAME = "fieldString"
  private val FIELD2_VALUE = "str"
  private val FIELD3_NAME = "fieldBoolean"
  private val FIELD3_VALUE = Boolean.TRUE

  val schema = SchemaBuilder.struct().name("test schema")
    .field(FIELD1_NAME, Schema.INT32_SCHEMA)
    .field(FIELD2_NAME, Schema.STRING_SCHEMA)
    .field(FIELD3_NAME, Schema.BOOLEAN_SCHEMA)
    .build()

  "No Schema Connect Mongo Converter Bad Data" should "throw an exception" in {
    // A Struct is not valid input for the schemaless converter
    val badData = new Struct(schema)

    intercept[java.lang.ClassCastException] {
      checkJsonMap(NoSchemaConnectMongoConverter, badData)
    }
  }

  "No Schema Connect Mongo Converter Good Data" should "return the same map" in {
    val jsonMap = new util.HashMap[String, Object]()
    jsonMap.put(FIELD1_NAME, FIELD1_VALUE)
    jsonMap.put(FIELD2_NAME, FIELD2_VALUE)
    jsonMap.put(FIELD3_NAME, FIELD3_VALUE)

    checkJsonMap(NoSchemaConnectMongoConverter, jsonMap)
  }

  "Schema Connect Mongo Converter Bad Data" should "throw an exception" in {
    // A plain map is not valid input for the schema-based converter
    val badData = new util.HashMap[String, Object]()
    badData.put(FIELD1_NAME, FIELD1_VALUE)

    intercept[java.lang.ClassCastException] {
      checkJsonMap(SchemaConnectMongoConverter, badData)
    }
  }

  "Schema Connect Mongo Converter Good Data" should "convert data to json map" in {
    val data = new Struct(schema)
      .put(FIELD1_NAME, FIELD1_VALUE)
      .put(FIELD2_NAME, FIELD2_VALUE)
      .put(FIELD3_NAME, FIELD3_VALUE)

    checkJsonMap(SchemaConnectMongoConverter, data)
  }

  // Converts `value` and checks that all three fixture fields come back unchanged.
  private def checkJsonMap(converter : ConnectMongoConverter, value: Object): Unit ={
    val newJsonMap = converter.toJsonMap(value).toMap

    newJsonMap(FIELD1_NAME) should be(FIELD1_VALUE)
    newJsonMap(FIELD2_NAME) should be(FIELD2_VALUE)
    newJsonMap(FIELD3_NAME) should be(FIELD3_VALUE)
  }

}
Example 58
Source File: HANASourceTaskConversionTest.scala    From kafka-connect-sap   with Apache License 2.0 5 votes vote down vote up
package com.sap.kafka.connect.source

import com.sap.kafka.client.MetaSchema
import org.apache.kafka.connect.data.Schema.Type
import org.apache.kafka.connect.data.{Field, Schema, Struct}
import org.apache.kafka.connect.source.SourceRecord

import scala.collection.JavaConverters._

class HANASourceTaskConversionTest extends HANASourceTaskTestBase {

  override def beforeAll(): Unit = {
    super.beforeAll()
    task.start(singleTableConfig())
  }

  override def afterAll(): Unit = {
    task.stop()
    super.afterAll()
  }

  test("boolean type") {
    typeConversion(Schema.BOOLEAN_SCHEMA, true, java.lang.Boolean.FALSE,
      Schema.BOOLEAN_SCHEMA, java.lang.Boolean.FALSE)
  }

  test("int type") {
    typeConversion(Schema.INT32_SCHEMA, true, java.lang.Integer.valueOf(1),
      Schema.INT32_SCHEMA, java.lang.Integer.valueOf(1))
  }

  test("long type") {
    typeConversion(Schema.INT64_SCHEMA, true, java.lang.Long.valueOf(1L),
      Schema.INT64_SCHEMA, java.lang.Long.valueOf(1L))
  }

  test("double type") {
    typeConversion(Schema.FLOAT64_SCHEMA, true, java.lang.Double.valueOf(1.0),
      Schema.FLOAT64_SCHEMA, java.lang.Double.valueOf(1.0))
  }

  test("string type") {
    typeConversion(Schema.STRING_SCHEMA, true, "'a'",
      Schema.STRING_SCHEMA, "a")
  }

  /**
    * Creates a single-column table of `sqlType`, inserts `sqlValue`, polls the
    * task once and checks that the one returned record carries
    * `convertedSchema` / `convertedValue`.
    *
    * The table is dropped and the statement closed even when the insert or an
    * assertion fails, so one failing test does not leave the table behind for
    * the next one.
    *
    * @param nullable currently unused; kept for signature compatibility
    */
  private def typeConversion(sqlType: Schema, nullable: Boolean,
                             sqlValue: Object, convertedSchema: Schema,
                             convertedValue: Object): Unit = {
    val fields = Seq(new Field("id", 1, sqlType))
    jdbcClient.createTable(Some("TEST"), "EMPLOYEES_SOURCE", MetaSchema(null, fields),
      3000)
    val stmt = jdbcClient.getConnection.createStatement()
    try {
      stmt.execute("insert into \"TEST\".\"EMPLOYEES_SOURCE\" values(" + sqlValue.toString + ")")
      val records = task.poll()
      validateRecords(records.asScala.toList, convertedSchema, convertedValue)
    } finally {
      try {
        stmt.execute("drop table \"TEST\".\"EMPLOYEES_SOURCE\"")
      } finally {
        stmt.close()
      }
    }
  }

  /**
    * Asserts that `records` holds exactly one Struct-valued record whose schema
    * has a single field with the expected schema and value.
    */
  private def validateRecords(records: List[SourceRecord], expectedFieldSchema: Schema,
                              expectedValue: Object): Unit = {
    assert(records.size === 1)
    val objValue = records.head.value()
    assert(objValue.isInstanceOf[Struct])
    val value = objValue.asInstanceOf[Struct]

    val schema = value.schema()
    assert(Type.STRUCT === schema.`type`())
    val fields = schema.fields()

    assert(fields.size() === 1)

    val onlyField = fields.get(0)
    assert(expectedFieldSchema === onlyField.schema())

    assert(expectedValue === value.get(onlyField))
  }
}
Example 59
Source File: TableQuerier.scala    From kafka-connect-sap   with Apache License 2.0 5 votes vote down vote up
package com.sap.kafka.connect.source.querier

import com.sap.kafka.client.hana.HANAJdbcClient
import com.sap.kafka.connect.config.{BaseConfig, BaseConfigConstants}
import com.sap.kafka.connect.config.hana.HANAConfig
import com.sap.kafka.utils.hana.HANAJdbcTypeConverter
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.source.SourceRecord
import org.slf4j.LoggerFactory

import scala.util.Random

/**
  * Base class for queriers that fetch rows either from a table or from a
  * free-form SQL query, depending on `mode`.
  *
  * Instances are ordered by `lastUpdate`, with ties broken by table name
  * (or by the query text when no table name is set).
  *
  * @param mode         one of the `BaseConfigConstants.QUERY_MODE_*` values
  * @param tableOrQuery table name in table mode, SQL text in query mode
  * @param topic        target topic; not used by this base class itself
  * @param config       connector configuration; must be a `HANAConfig` for a client to be created
  * @param jdbcClient   optional pre-built client; created on first use when empty
  */
abstract class TableQuerier(mode: String, tableOrQuery: String,
                            topic: String, config: BaseConfig,
                            var jdbcClient: Option[HANAJdbcClient])
                extends Comparable[TableQuerier] {
  var tableName: String = if (mode.equals(BaseConfigConstants.QUERY_MODE_TABLE)) tableOrQuery else null
  var query: String = if (mode.equals(BaseConfigConstants.QUERY_MODE_SQL)) tableOrQuery else null

  var lastUpdate: Long = 0
  var schema: Schema = _
  var queryString: Option[String] = None
  var resultList: Option[List[Struct]] = None

  val log = LoggerFactory.getLogger(getClass)

  def getLastUpdate(): Long = lastUpdate

  /** Asks the subclass to (re)build the query string, then returns it. */
  def getOrCreateQueryString(): Option[String] = {
    createQueryString()
    queryString
  }

  /** Implementations are expected to populate `queryString`. */
  def createQueryString(): Unit

  /** True while a result list is held, i.e. between `maybeStartQuery` and `close`. */
  def querying(): Boolean = resultList.isDefined

  /** Resolves the schema and runs the query, unless a result list is already held. */
  def maybeStartQuery(): Unit = {
    if (resultList.isEmpty) {
      schema = getSchema()
      queryString = getOrCreateQueryString()

      val batchMaxRows = config.batchMaxRows
      resultList = requireJdbcClient().executeQuery(schema, queryString.get,
        0, batchMaxRows)
      // Log the number of fetched rows, not the size of the Option (always 0 or 1).
      log.info(resultList.fold(0)(_.size).toString)
    }
  }

  def extractRecords(): List[SourceRecord]

  /** Drops the held results and schema and records `now` as the last update time. */
  def close(now: Long): Unit = {
    resultList = None
    schema = null

    lastUpdate = now
  }

  /**
    * Returns the JDBC client, creating it from `config` on first use and
    * keeping it in `jdbcClient` so later calls reuse the same instance.
    *
    * @throws RuntimeException when no client exists and `config` is not a `HANAConfig`
    */
  protected def getOrCreateJdbcClient(): Option[HANAJdbcClient] = {
    if (jdbcClient.isEmpty) {
      jdbcClient = config match {
        case hanaConfig: HANAConfig => Some(HANAJdbcClient(hanaConfig))
        case _ => throw new RuntimeException("Cannot create Jdbc Client")
      }
    }
    jdbcClient
  }

  /** Unwraps the client, treating an absent or null client as unavailable. */
  private def requireJdbcClient(): HANAJdbcClient =
    getOrCreateJdbcClient().flatMap(Option(_))
      .getOrElse(throw new RuntimeException("Jdbc Client is not available"))

  /** Builds the Connect schema from table or query metadata, depending on `mode`. */
  private def getSchema(): Schema = {
    mode match {
      case BaseConfigConstants.QUERY_MODE_TABLE =>
        val metadata = requireJdbcClient().getMetaData(tableOrQuery, None)
        HANAJdbcTypeConverter.convertHANAMetadataToSchema(tableName, metadata)
      case BaseConfigConstants.QUERY_MODE_SQL =>
        val metadata = requireJdbcClient().getMetadata(tableOrQuery)
        HANAJdbcTypeConverter.convertHANAMetadataToSchema("Query" + Random.nextInt, metadata)
      case _ =>
        throw new RuntimeException("Other Query modes are not supported")
    }
  }

  /** Tie-break key: the table name when set, otherwise the query text. */
  private def tieBreakKey: String =
    Option(tableName).orElse(Option(query)).getOrElse("")

  /**
    * Orders queriers by `lastUpdate` (oldest first). A later `lastUpdate`
    * yields a positive result, as the `Comparable` contract requires.
    */
  override def compareTo(other: TableQuerier): Int = {
    val byLastUpdate = java.lang.Long.compare(this.lastUpdate, other.lastUpdate)
    if (byLastUpdate != 0) {
      byLastUpdate
    } else {
      // tableName is null outside table mode, so compare via a null-safe key.
      this.tieBreakKey.compareTo(other.tieBreakKey)
    }
  }
}
Example 60
Source File: FieldValueGetter.scala    From kafka-connect-kcql-smt   with Apache License 2.0 5 votes vote down vote up
package com.landoop.connect.sql

import org.apache.kafka.connect.data.{Schema, Struct}

trait FieldValueGetter {

  /**
    * Resolves `path` against `value`, descending one path segment at a time
    * through STRUCT and MAP schemas.
    *
    * @param value  the current value; may be null
    * @param schema the Connect schema describing `value`
    * @param path   remaining field names / map keys to follow
    * @return the selected value wrapped in `Some`, or `None` when a null is met
    *         on the way or at the leaf
    * @throws IllegalArgumentException when a segment is selected from a schema that
    *         is neither STRUCT nor MAP, when a struct field is missing, or when the
    *         path ends on an ARRAY, MAP or STRUCT
    */
  def get(value: Any, schema: Schema, path: Seq[String]): Option[Any] = {
    path.headOption match {
      case Some(parent) =>
        schema.`type`() match {
          case Schema.Type.STRUCT => Option(value).flatMap(fromRecord(_, schema, path))
          case Schema.Type.MAP => Option(value).flatMap(fromMap(_, schema, path))
          case _ => throw new IllegalArgumentException(s"Can't select $parent field from schema:$schema")
        }

      case None =>
        schema.`type`() match {
          case Schema.Type.BOOLEAN |
               Schema.Type.FLOAT64 | Schema.Type.FLOAT32 |
               Schema.Type.INT64 | Schema.Type.INT32 | Schema.Type.INT16 | Schema.Type.INT8 |
               Schema.Type.BYTES | Schema.Type.STRING => Option(value)

          case Schema.Type.ARRAY | Schema.Type.MAP | Schema.Type.STRUCT =>
            throw new IllegalArgumentException(s"Can't select an element from an array(schema:$schema)")

          case other => throw new IllegalArgumentException(s"Invalid Avro schema type:$other")
        }
    }
  }


  /** Follows `path.head` as a field of the Struct `value`, then recurses on the rest. */
  private def fromRecord(value: Any, schema: Schema, path: Seq[String]): Option[Any] = {
    val field = Option(schema.field(path.head))
      .getOrElse(throw new IllegalArgumentException(s"Can't find field:${path.head} in schema:$schema"))
    val v = value.asInstanceOf[Struct].get(path.head)
    get(v, field.schema(), path.tail)
  }


  /**
    * Follows `path.head` as a key of the map `value`, then recurses on the rest
    * using the map's value schema.
    *
    * A MAP schema has no named fields and its values are `java.util.Map`
    * instances, so the key is looked up in the map itself rather than via
    * `schema.field` / `Struct.get`. A missing key yields a null value, which the
    * recursive call turns into `None`.
    */
  private def fromMap(value: Any, schema: Schema, path: Seq[String]): Option[Any] = {
    val v = value match {
      case map: java.util.Map[_, _] => map.get(path.head)
      case other =>
        throw new IllegalArgumentException(
          s"Can't select ${path.head} from a non-map value of type ${other.getClass.getName} (schema:$schema)")
    }
    get(v, schema.valueSchema(), path.tail)
  }

}
Example 61
Source File: IotHubSourceTaskTest.scala    From toketi-kafka-connect-iothub   with MIT License 5 votes vote down vote up
// Copyright (c) Microsoft. All rights reserved.

package com.microsoft.azure.iot.kafka.connect.source

import java.time.{Duration, Instant}
import java.util

import com.microsoft.azure.iot.kafka.connect.source.testhelpers.{DeviceTemperature, MockDataReceiver, TestConfig, TestIotHubSourceTask}
import org.apache.kafka.connect.data.Struct
import org.json4s.jackson.Serialization.read
import org.scalatest.{FlatSpec, GivenWhenThen}

class IotHubSourceTaskTest extends FlatSpec with GivenWhenThen with JsonSerialization {

  // Polls a started test task once and checks the shape of each returned record.
  "IotHubSourceTask poll" should "return a list of SourceRecords with the right format" in {

    Given("IotHubSourceTask instance")

    val sourceTask = new TestIotHubSourceTask
    sourceTask.start(TestConfig.sourceTaskTestProps)

    When("IotHubSourceTask.poll is called")
    val polled = sourceTask.poll()

    Then("It returns a list of SourceRecords")
    assert(polled != null)
    assert(polled.size() == 15)
    (0 until 15).foreach { index =>
      val current = polled.get(index)
      assert(current.topic() == TestConfig.sourceTaskTestProps.get(IotHubSourceConfig.KafkaTopic))
      assert(current.valueSchema() == IotMessageConverter.schema)

      val payload = current.value().asInstanceOf[Struct]
      assert(payload.getString("deviceId").startsWith("device"))
      assert(payload.getString("contentType") == "temperature")
      val enqueuedAt = Instant.parse(payload.getString("enqueuedTime"))
      assert(enqueuedAt.isAfter(Instant.parse("2016-11-20T00:00:00Z")))

      val systemProps = payload.getMap[String, String]("systemProperties")
      assert(systemProps != null)
      assert(systemProps.get("sequenceNumber") != "")
      assert(systemProps.get("correlationId") != "")

      val userProps = payload.getMap[String, String]("properties")
      assert(userProps != null)
      assert(userProps.get("timestamp") != "")

      // The content field carries the JSON-serialized device reading.
      val reading = read[DeviceTemperature](payload.get("content").asInstanceOf[String])
      assert(reading != null)
      assert(reading.unit == "F")
      assert(reading.value != 0)
    }
  }

  // Without a start time in the config, each receiver gets an offset and no start time.
  "IotHubSourceTask start" should "initialize all properties" in {

    Given("A list of properties for IotHubSourceTask")
    val props: util.Map[String, String] = TestConfig.sourceTaskTestProps

    When("IotHubSourceTask is started")
    val startedTask = new TestIotHubSourceTask
    startedTask.start(props)

    Then("Data receiver should be properly initialized")
    assert(startedTask.partitionSources.length == 3)
    assert(!startedTask.partitionSources.exists(s => s.dataReceiver == null))
    startedTask.partitionSources.foreach { source =>
      val receiver = source.dataReceiver.asInstanceOf[MockDataReceiver]
      assert(receiver.offset.isDefined)
      assert(receiver.startTime.isEmpty)
      assert(receiver.connectionString != "")
      assert(receiver.receiverConsumerGroup != "")
      assert(receiver.receiveTimeout == Duration.ofSeconds(5))
    }
  }

  // With a start time in the config, each receiver gets that start time and no offset.
  it should "initialize start time correctly on the data receiver when it is passed in the config" in {

    Given("A list of properties with StartTime for IotHubSourceTask")
    val props: util.Map[String, String] = TestConfig.sourceTaskTestPropsStartTime

    When("IotHubSourceTask is started")
    val startedTask = new TestIotHubSourceTask
    startedTask.start(props)

    Then("Data receiver should be properly initialized, with StartTime, while Offsets value should be ignored")
    assert(startedTask.partitionSources.length == 3)
    assert(!startedTask.partitionSources.exists(s => s.dataReceiver == null))
    startedTask.partitionSources.foreach { source =>
      val receiver = source.dataReceiver.asInstanceOf[MockDataReceiver]
      assert(receiver.offset.isEmpty)
      assert(receiver.startTime.isDefined)
      assert(receiver.startTime.get == Instant.parse("2016-12-10T00:00:00Z"))
      assert(receiver.connectionString != "")
      assert(receiver.receiverConsumerGroup != "")
    }
  }
}
Example 62
Source File: IotMessageConverterTest.scala    From toketi-kafka-connect-iothub   with MIT License 5 votes vote down vote up
// Copyright (c) Microsoft. All rights reserved.

package com.microsoft.azure.iot.kafka.connect.source

import java.text.SimpleDateFormat
import java.time.Instant

import com.microsoft.azure.eventhubs.impl.AmqpConstants
import com.microsoft.azure.iot.kafka.connect.source.testhelpers.DeviceTemperature
import org.apache.kafka.connect.data.Struct
import org.json4s.jackson.Serialization._
import org.scalatest.{FlatSpec, GivenWhenThen}

import scala.collection.mutable
import scala.util.Random

class IotMessageConverterTest extends FlatSpec with GivenWhenThen with JsonSerialization {

  private val random: Random = new Random

  // Builds a fully populated IotMessage and checks every field of the resulting struct.
  "IotMessage Converter" should "populate right values for kafka message struct fields" in {

    Given("IotMessage object")
    val content = write(DeviceTemperature(100.01, "F"))

    val sequenceNumber = random.nextLong()
    val correlationId = random.nextString(10)
    val offset = random.nextString(10)
    val enqueuedDate = new SimpleDateFormat("MM/dd/yyyy").parse("12/01/2016")
    val systemProperties = mutable.Map[String, Object](
      "iothub-connection-device-id" -> "device10",
      AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME -> sequenceNumber.asInstanceOf[Object],
      AmqpConstants.AMQP_PROPERTY_CORRELATION_ID -> correlationId,
      AmqpConstants.OFFSET_ANNOTATION_NAME -> offset,
      AmqpConstants.ENQUEUED_TIME_UTC_ANNOTATION_NAME -> enqueuedDate)

    val timestamp = Instant.now().toString
    val messageProperties = mutable.Map[String, Object](
      "timestamp" -> timestamp,
      "contentType" -> "temperature"
    )

    val message = IotMessage(content, systemProperties, messageProperties)

    When("getIotMessageStruct is called with IotMessage object")
    val struct: Struct = IotMessageConverter.getIotMessageStruct(message)

    Then("The struct has all the expected properties")
    assert(struct.getString("deviceId") == "device10")
    assert(struct.getString("offset") == offset)
    assert(struct.getString("contentType") == "temperature")
    assert(struct.getString("enqueuedTime") == enqueuedDate.toInstant.toString)
    assert(struct.getInt64("sequenceNumber") == sequenceNumber)
    assert(struct.getString("content") == content)

    // Only the correlation id is expected to remain among the system properties.
    val remainingSystemProps = struct.getMap[String, String]("systemProperties")
    assert(remainingSystemProps != null)
    assert(remainingSystemProps.size == 1)
    assert(remainingSystemProps.get(AmqpConstants.AMQP_PROPERTY_CORRELATION_ID) == correlationId)

    // Only the timestamp is expected to remain among the message properties.
    val remainingProps = struct.getMap[String, String]("properties")
    assert(remainingProps != null)
    assert(remainingProps.size == 1)
    assert(remainingProps.get("timestamp") == timestamp)
  }

  // With empty property maps every extracted field falls back to its default.
  it should "use default values for missing properties" in {

    val content = write(DeviceTemperature(100.01, "F"))

    val noSystemProperties = mutable.Map.empty[String, Object]
    val noMessageProperties = mutable.Map.empty[String, Object]

    val message = IotMessage(content, noSystemProperties, noMessageProperties)

    When("getIotMessageStruct is called with IotMessage object")
    val struct: Struct = IotMessageConverter.getIotMessageStruct(message)

    Then("The struct has all the expected properties")
    assert(struct.getString("deviceId") == "")
    assert(struct.getString("offset") == "")
    assert(struct.getString("contentType") == "")
    assert(struct.getString("enqueuedTime") == "")
    assert(struct.getInt64("sequenceNumber") == 0)
    assert(struct.getString("content") == content)

    val remainingSystemProps = struct.getMap[String, String]("systemProperties")
    assert(remainingSystemProps != null)
    assert(remainingSystemProps.size == 0)

    val remainingProps = struct.getMap[String, String]("properties")
    assert(remainingProps != null)
    assert(remainingProps.size == 0)
  }
}
Example 63
Source File: IotHubPartitionSource.scala    From toketi-kafka-connect-iothub   with MIT License 5 votes vote down vote up
// Copyright (c) Microsoft. All rights reserved.

package com.microsoft.azure.iot.kafka.connect.source

import java.util.{Collections, Map}

import com.typesafe.scalalogging.LazyLogging
import org.apache.kafka.connect.data.Struct
import org.apache.kafka.connect.errors.ConnectException
import org.apache.kafka.connect.source.SourceRecord

import scala.collection.mutable.ListBuffer
import scala.util.control.NonFatal

/**
  * Reads batches of messages for one partition from a `DataReceiver` and turns
  * them into Kafka Connect `SourceRecord`s for `topic`.
  */
class IotHubPartitionSource(val dataReceiver: DataReceiver,
    val partition: String,
    val topic: String,
    val batchSize: Int,
    val eventHubName: String,
    val sourcePartition: Map[String, String])
  extends LazyLogging
    with JsonSerialization {

  /**
    * Receives up to `batchSize` messages and converts each into a `SourceRecord`.
    *
    * @return the converted records, empty when nothing was received
    * @throws ConnectException wrapping any non-fatal failure while receiving or converting
    */
  def getRecords: List[SourceRecord] = {

    logger.debug(s"Polling for data from eventHub $eventHubName partition $partition")
    val records: List[SourceRecord] =
      try {
        val messages: Iterable[IotMessage] = this.dataReceiver.receiveData(batchSize)

        if (messages.isEmpty) {
          logger.debug(s"Finished processing all messages from eventHub $eventHubName " +
            s"partition ${this.partition}")
        } else {
          logger.debug(s"Received ${messages.size} messages from eventHub $eventHubName " +
            s"partition ${this.partition} (requested $batchSize batch)")
        }

        messages.map(toSourceRecord).toList
      } catch {
        case NonFatal(e) =>
          val errorMsg = s"Error while getting SourceRecords for eventHub $eventHubName " +
            s"partition $partition. Exception - ${e.toString}"
          // Hand the throwable to the logger so the stack trace lands in the log;
          // printStackTrace() returns Unit and would only interpolate "()".
          logger.error(errorMsg, e)
          throw new ConnectException(errorMsg, e)
      }
    logger.debug(s"Obtained ${records.length} SourceRecords from IotHub")
    records
  }

  /** Converts one message into a record whose source offset is the message's offset field. */
  private def toSourceRecord(msg: IotMessage): SourceRecord = {
    val kafkaMessage: Struct = IotMessageConverter.getIotMessageStruct(msg)
    val sourceOffset = Collections.singletonMap("EventHubOffset",
      kafkaMessage.getString(IotMessageConverter.offsetKey))
    new SourceRecord(sourcePartition, sourceOffset, this.topic, kafkaMessage.schema(),
      kafkaMessage)
  }
}
Example 64
Source File: IotMessageConverter.scala    From toketi-kafka-connect-iothub   with MIT License 5 votes vote down vote up
// Copyright (c) Microsoft. All rights reserved.

package com.microsoft.azure.iot.kafka.connect.source

import java.time.Instant
import java.util.Date

import com.microsoft.azure.eventhubs.impl.AmqpConstants
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

/** Converts an `IotMessage` into a Kafka Connect `Struct` with a fixed schema. */
object IotMessageConverter {

  val offsetKey = "offset"

  private val schemaName          = "iothub.kafka.connect"
  private val schemaVersion       = 1
  private val deviceIdKey         = "deviceId"
  private val contentTypeKey      = "contentType"
  private val sequenceNumberKey   = "sequenceNumber"
  private val enqueuedTimeKey     = "enqueuedTime"
  private val contentKey          = "content"
  private val systemPropertiesKey = "systemProperties"
  private val propertiesKey       = "properties"
  private val deviceIdIotHubKey   = "iothub-connection-device-id"

  // Public for testing purposes
  lazy val schema: Schema = SchemaBuilder.struct()
    .name(schemaName)
    .version(schemaVersion)
    .field(deviceIdKey, Schema.STRING_SCHEMA)
    .field(offsetKey, Schema.STRING_SCHEMA)
    .field(contentTypeKey, Schema.OPTIONAL_STRING_SCHEMA)
    .field(enqueuedTimeKey, Schema.STRING_SCHEMA)
    .field(sequenceNumberKey, Schema.INT64_SCHEMA)
    .field(contentKey, Schema.STRING_SCHEMA)
    .field(systemPropertiesKey, propertiesMapSchema)
    .field(propertiesKey, propertiesMapSchema)

  // Both property fields are string-to-string maps.
  private lazy val propertiesMapSchema: Schema = SchemaBuilder.map(Schema.STRING_SCHEMA, Schema.STRING_SCHEMA)

  /**
    * Builds the Connect struct for `iotMessage`.
    *
    * Well-known entries (device id, offset, sequence number, enqueued time,
    * content type) are removed from the message's property maps and become
    * dedicated fields; missing ones default to "" or 0. Whatever remains in
    * the two maps is stored as string-to-string maps.
    *
    * Note: the property maps of `iotMessage` are mutated by the removals.
    *
    * @param iotMessage the message to convert
    * @return a struct conforming to [[schema]]
    */
  def getIotMessageStruct(iotMessage: IotMessage): Struct = {

    val systemProperties = iotMessage.systemProperties
    val deviceId: String = getOrDefaultAndRemove(systemProperties, deviceIdIotHubKey, "")
    val offset: String = getOrDefaultAndRemove(systemProperties, AmqpConstants.OFFSET_ANNOTATION_NAME, "")
    val sequenceNumber: Long = getOrDefaultAndRemove(systemProperties, AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME, 0)
    val enqueuedTimeStr = getEnqueuedTime(systemProperties).fold("")(_.toString)

    val properties = iotMessage.properties
    val contentType: String = getOrDefaultAndRemove(properties, contentTypeKey, "")

    // Both fields are declared as map<string,string>, so every remaining value is
    // stringified; a non-String value would otherwise fail Struct validation.
    val systemPropertiesMap = systemProperties.map { case (key, value) => (key, value.toString) }
    val propertiesMap = properties.map { case (key, value) => (key, value.toString) }

    new Struct(schema)
      .put(deviceIdKey, deviceId)
      .put(offsetKey, offset)
      .put(contentTypeKey, contentType)
      .put(enqueuedTimeKey, enqueuedTimeStr)
      .put(sequenceNumberKey, sequenceNumber)
      .put(contentKey, iotMessage.content)
      .put(systemPropertiesKey, systemPropertiesMap.asJava)
      .put(propertiesKey, propertiesMap.asJava)
  }

  /** Removes the enqueued-time entry from `map` and returns it as an `Instant`, if present. */
  private def getEnqueuedTime(map: scala.collection.mutable.Map[String, Object]): Option[Instant] = {
    val enqueuedTimeValue: Date = getOrDefaultAndRemove(map, AmqpConstants.ENQUEUED_TIME_UTC_ANNOTATION_NAME, null)
    Option(enqueuedTimeValue).map(_.toInstant)
  }

  /**
    * Removes `key` from `map` and returns its value cast to `T`, or `defaultVal`
    * when the key is absent.
    */
  private def getOrDefaultAndRemove[T: ClassTag, S: ClassTag](map: scala.collection.mutable.Map[String, S],
      key: String, defaultVal: T): T = {

    map.remove(key) match {
      case Some(found) => found.asInstanceOf[T]
      case None => defaultVal
    }
  }
}
Example 65
Source File: SchemaSpec.scala    From kafka-connect-cassandra   with Apache License 2.0 5 votes vote down vote up
package com.tuplejump.kafka.connect.cassandra

import com.datastax.driver.core.{ DataType, TestUtil}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord

class SchemaSpec extends AbstractFlatSpec {

  it should "convert a struct schema with single field" in {
    val topic = "topicx"

    val sc = sinkConfig(topic, "keyspacex", "tablex", List("id"))
    sc.options.consistency should be (TaskConfig.DefaultSinkConsistency)
    sc.schema.columnNames should === (List("id"))
    sc.query.cql should be ("INSERT INTO keyspacex.tablex(id) VALUES(?)")

    val schema = SchemaBuilder.struct.name("record").version(1).field("id", Schema.INT32_SCHEMA).build
    val value = new Struct(schema).put("id", 1)
    val record = new SinkRecord(topic, 1, SchemaBuilder.struct.build, "key", schema, value, 0)

    sc.schema.route.topic should be (record.topic)
    sc.schema.route.keyspace should be ("keyspacex")
    sc.schema.route.table should be ("tablex")

    sc.schema is record should be (true)
    val query = record.as(sc.schema.namespace)
    query.cql should be("INSERT INTO keyspacex.tablex(id) VALUES(1)")
  }

  it should "convert a struct schema with multiple fields" in {
    val topic = "test_kfk"
    val sc = sinkConfig(topic, "keyspacex", "tablex", List("available", "name", "age"))

    val schema = SchemaBuilder.struct.name("record").version(1)
      .field("available", Schema.BOOLEAN_SCHEMA)
      .field("name", Schema.STRING_SCHEMA)
      .field("age", Schema.INT32_SCHEMA).build
    val value = new Struct(schema).put("name", "user").put("available", false).put("age", 15)
    // Reuse `topic` so the record and the sink config cannot drift apart.
    val record = new SinkRecord(topic, 1, SchemaBuilder.struct.build, "key", schema, value, 0)

    schema.asColumnNames should be (sc.schema.columnNames)

    sc.schema.route.topic should be (record.topic)
    sc.schema is record should be (true)

    sc.query.cql should be ("INSERT INTO keyspacex.tablex(available,name,age) VALUES(?,?,?)")
    val query = record.as(sc.schema.namespace)
    query.cql should be("INSERT INTO keyspacex.tablex(available,name,age) VALUES(false,'user',15)")
  }

  it should "convert cassandra column defs to a source schema" in {
    val colDef = Map(
      "id" -> DataType.cint(),
      "name" -> DataType.varchar())

    val columns = TestUtil.getColumnDef(colDef)
    val expectedSchema = SchemaBuilder.struct()
      .field("id", Schema.INT32_SCHEMA)
      .field("name", Schema.STRING_SCHEMA).build()

    columns.asSchema should be(expectedSchema)
  }

  it should "convert kafka schema and struct to cassandra columns and schema mapping" in {
    import scala.collection.JavaConverters._
    val topic = "a"
    val route = InternalConfig.Route(TaskConfig.SinkRoute + topic, "ks1.t1").get
    val schemaMap = new InternalConfig.Schema(route, Nil, Nil, Nil, List("available","name","age"), "")

    val schema = SchemaBuilder.struct.name("record").version(1)
      .field("available", Schema.BOOLEAN_SCHEMA)
      .field("name", Schema.STRING_SCHEMA)
      .field("age", Schema.INT32_SCHEMA).build
    val struct = new Struct(schema).put("name", "user").put("available", false).put("age", 15)
    // The record must carry the struct built above. No local `value` is defined in
    // this test, so the previous argument could only refer to an inherited member.
    val record = new SinkRecord(topic, 1, SchemaBuilder.struct.build, "key", schema, struct, 0)

    schema.asColumnNames should ===(schemaMap.columnNames)
    schemaMap.columnNames should ===(schema.fields.asScala.map(_.name).toList)
    schemaMap is record should be (true)
  }
}
Example 66
Source File: ProjectionMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import cats.data.NonEmptyList
import com.datamountaineer.kcql.Field
import com.landoop.streamreactor.connect.hive.StructMapper
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}


/**
  * Maps an input struct onto a new struct that contains only the projected
  * fields, each stored under its KCQL alias.
  */
class ProjectionMapper(projection: NonEmptyList[Field]) extends StructMapper {

  /**
    * @param input the struct to project
    * @return a struct whose schema holds one field per projection entry, named by alias
    * @throws RuntimeException (via `sys.error`) when a projected field is absent from the input schema
    */
  override def map(input: Struct): Struct = {
    val inputSchema = input.schema

    // First pass: assemble the output schema, reusing each source field's schema.
    val schemaBuilder = projection.foldLeft(SchemaBuilder.struct) { (acc, kcqlField) =>
      val sourceField = Option(inputSchema.field(kcqlField.getName))
        .getOrElse(sys.error(s"Missing field $kcqlField"))
      acc.field(kcqlField.getAlias, sourceField.schema)
    }
    val outputSchema = schemaBuilder.build()

    // Second pass: copy each value across under its alias.
    projection.foldLeft(new Struct(outputSchema)) { (out, kcqlField) =>
      out.put(kcqlField.getAlias, input.get(kcqlField.getName))
    }
  }
}
Example 67
Source File: DropPartitionValuesMapperTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import cats.data.NonEmptyList
import com.landoop.streamreactor.connect.hive.{PartitionKey, PartitionPlan, TableName}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import scala.collection.JavaConverters._

class DropPartitionValuesMapperTest extends AnyFunSuite with Matchers {

  // Partition plan used by both tests: table "foo" partitioned by p and q.
  private def planOnPAndQ: PartitionPlan =
    PartitionPlan(TableName("foo"), NonEmptyList.of(PartitionKey("p"), PartitionKey("q")))

  // Builds a struct of required string fields in which each field's value equals its name.
  private def structOf(names: String*): Struct = {
    val schemaBuilder = names.foldLeft(SchemaBuilder.struct()) { (acc, name) =>
      acc.field(name, SchemaBuilder.string().required().build())
    }
    val schema = schemaBuilder.build()
    names.foldLeft(new Struct(schema)) { (acc, name) => acc.put(name, name) }
  }

  test("strip partition values") {
    val mapped = new DropPartitionValuesMapper(planOnPAndQ).map(structOf("a", "p", "q", "z"))
    mapped.schema().fields().asScala.map(_.name) shouldBe Seq("a", "z")
  }

  test("handle partition field is missing in input") {
    // "p" is part of the plan but absent from the input struct.
    val mapped = new DropPartitionValuesMapper(planOnPAndQ).map(structOf("a", "q", "z"))
    mapped.schema().fields().asScala.map(_.name) shouldBe Seq("a", "z")
  }
}
Example 68
Source File: DefaultCommitPolicyTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.staging

import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartitionOffset}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.concurrent.duration._

class DefaultCommitPolicyTest extends AnyWordSpec with Matchers {

  val schema: Schema = SchemaBuilder.struct()
    .field("name", SchemaBuilder.string().required().build())
    .build()

  val struct = new Struct(schema)

  implicit val conf: Configuration = new Configuration()
  implicit val fs: LocalFileSystem = FileSystem.getLocal(conf)
  val tpo = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100))

  // Evaluates the policy against the file's current length and modification time.
  private def shouldFlush(policy: CommitPolicy, path: Path, count: Long) = {
    val status = fs.getFileStatus(path)
    policy.shouldFlush(CommitContext(tpo, path, count, status.getLen, status.getModificationTime))
  }

  "DefaultCommitPolicy" should {
    "roll over after interval" in {

      val policy = DefaultCommitPolicy(None, Option(2.seconds), None)
      val path = new Path("foo")
      // Only the file's existence matters here, so close the stream right away.
      fs.create(path).close()

      try {
        shouldFlush(policy, path, 10) shouldBe false
        Thread.sleep(2000)
        shouldFlush(policy, path, 10) shouldBe true
      } finally {
        // Remove the file even when an assertion fails, so later tests start clean.
        fs.delete(path, false)
      }
    }
    "roll over after file count" in {
      val policy = DefaultCommitPolicy(None, None, Some(9))
      val path = new Path("foo")
      fs.create(path).close()

      try {
        shouldFlush(policy, path, 7) shouldBe false
        shouldFlush(policy, path, 8) shouldBe false
        shouldFlush(policy, path, 9) shouldBe true
        shouldFlush(policy, path, 10) shouldBe true
      } finally {
        fs.delete(path, false)
      }
    }
    "roll over after file size" in {
      val policy = DefaultCommitPolicy(Some(10), None, None)
      val path = new Path("foo")
      val out = fs.create(path)
      try {
        try {
          shouldFlush(policy, path, 7) shouldBe false
          out.writeBytes("wibble wobble wabble wubble")
        } finally {
          out.close()
        }
        shouldFlush(policy, path, 9) shouldBe true
      } finally {
        fs.delete(path, false)
      }
    }
  }
}
Example 69
Source File: MapValueConverterTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink

import com.landoop.json.sql.JacksonJson
import org.apache.kafka.connect.data.{Schema, Struct}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import scala.collection.JavaConverters._

/** Checks the schema and values produced by `MapValueConverter.convert` for a nested JSON payload. */
class MapValueConverterTest extends AnyFunSuite with Matchers {

  /** Names of the fields declared by `schema`, in declaration order. */
  private def fieldNames(schema: Schema) = schema.fields().asScala.map(_.name())

  test("converts nested payload") {
    val json =
      """
        |{
        |  "idType": 3,
        |  "colorDepth": "",
        |  "threshold" : 45.77,
        |  "evars": {
        |    "evars": {
        |      "eVar1": "Tue Aug 27 2019 12:08:10",
        |      "eVar2": 156692207943934897
        |    }
        |  },
        |  "exclude": {
        |    "id": 0,
        |    "value": false
        |  }
        |}
        |""".stripMargin

    val struct = MapValueConverter.convert(JacksonJson.toMap[Any](json))
    val root = struct.schema()

    // Field order is not retained when Jackson turns the json into a Map, so compare sorted names
    fieldNames(root).sorted shouldBe List("idType", "colorDepth", "threshold", "evars", "exclude").sorted

    // top-level primitives
    root.field("idType").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA
    root.field("colorDepth").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA
    root.field("threshold").schema() shouldBe Schema.OPTIONAL_FLOAT64_SCHEMA

    // top-level nested objects
    val excludeSchema = root.field("exclude").schema()
    excludeSchema.`type`() shouldBe Schema.Type.STRUCT
    excludeSchema.isOptional shouldBe true

    val evarsSchema = root.field("evars").schema()
    evarsSchema.`type`() shouldBe Schema.Type.STRUCT
    evarsSchema.isOptional shouldBe true

    // evars.evars
    fieldNames(evarsSchema) shouldBe List("evars")
    val evarsInner = evarsSchema.field("evars")
    evarsInner.schema().`type`() shouldBe Schema.Type.STRUCT
    evarsInner.schema().isOptional shouldBe true
    fieldNames(evarsInner.schema()).sorted shouldBe List("eVar1", "eVar2").sorted
    evarsInner.schema().field("eVar1").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA
    evarsInner.schema().field("eVar2").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA

    // exclude
    excludeSchema.schema().`type`() shouldBe Schema.Type.STRUCT
    excludeSchema.schema().isOptional shouldBe true
    fieldNames(excludeSchema.schema()).sorted shouldBe List("id", "value").sorted
    excludeSchema.schema().field("id").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA
    excludeSchema.schema().field("value").schema() shouldBe Schema.OPTIONAL_BOOLEAN_SCHEMA

    // values
    struct.get("idType") shouldBe 3L
    struct.get("colorDepth") shouldBe ""
    struct.get("threshold") shouldBe 45.77D

    val outerEvars = struct.get("evars").asInstanceOf[Struct]
    val evarsStruct = outerEvars.get("evars").asInstanceOf[Struct]
    evarsStruct.get("eVar1") shouldBe "Tue Aug 27 2019 12:08:10"
    evarsStruct.get("eVar2") shouldBe 156692207943934897L

    val excludeStruct = struct.get("exclude").asInstanceOf[Struct]
    excludeStruct.get("id") shouldBe 0L
    excludeStruct.get("value") shouldBe false
  }

}
Example 70
Source File: package.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter}

/** Helpers for listing files and building Parquet readers/writers of Kafka Connect structs. */
package object parquet {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  /**
    * Returns the files found directly under `path` (non-recursive listing) when it is a
    * directory, otherwise a single-element list containing `path` itself.
    */
  def listFiles(path: Path)(implicit fs: FileSystem): List[Path] = {
    if (!fs.isDirectory(path)) {
      logger.debug(s"Reading $path as a single file")
      List(path)
    } else {
      logger.debug(s"$path is a directory, reading constituent files")
      val remote = fs.listFiles(path, false)
      // drain the Hadoop RemoteIterator: check hasNext, then take the next path
      Iterator.continually(remote)
        .takeWhile(_.hasNext)
        .map(_.next().getPath)
        .toList
    }
  }

  /** Builds a reader of structs for `file`, configured with the file system's configuration. */
  def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = {
    val readSupport = new StructReadSupport
    ParquetReader.builder(readSupport, file)
      .withConf(fs.getConf)
      .build()
  }

  /**
    * Builds a writer of structs for `path` using `schema`; codec, dictionary encoding,
    * validation and overwrite behaviour are taken from `config`.
    */
  def parquetWriter(path: Path,
                    schema: Schema,
                    config: ParquetSinkConfig): ParquetWriter[Struct] = {
    val writeMode =
      if (config.overwrite) ParquetFileWriter.Mode.OVERWRITE
      else ParquetFileWriter.Mode.CREATE

    new StructParquetWriterBuilder(path, schema)
      .withCompressionCodec(config.compressionCodec)
      .withDictionaryEncoding(config.enableDictionary)
      .withValidation(config.validation)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
      .withWriteMode(writeMode)
      .build()
  }
}
Example 71
Source File: StructWriteSupport.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.parquet

import com.landoop.streamreactor.connect.hive._
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext
import org.apache.parquet.io.api.{Binary, RecordConsumer}
import org.apache.parquet.schema.MessageType

import scala.collection.JavaConverters._

// derived from Apache Spark's parquet write support, archive and license here:
// https://github.com/apache/spark/blob/21a7bfd5c324e6c82152229f1394f26afeae771c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala
/**
  * Parquet `WriteSupport` that writes Kafka Connect [[Struct]] values.
  *
  * The Parquet message type is derived once from `schema`; each struct passed to [[write]] is
  * emitted as one message on the `RecordConsumer` supplied through [[prepareForWrite]].
  *
  * @param schema the Kafka Connect schema of the structs to be written
  */
class StructWriteSupport(schema: Schema) extends WriteSupport[Struct] {

  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)
  // fall back to a fixed name when the connect schema is anonymous
  private val schemaName = if (schema.name() == null) "schema" else schema.name()
  private val parquetSchema: MessageType = ParquetSchemas.toParquetMessage(schema, schemaName)

  // extra key/value metadata returned from finalizeWrite
  private val metadata = new java.util.HashMap[String, String]()
  metadata.put("written_by", "streamreactor")

  // The Parquet `RecordConsumer` to which all structs are written
  private var consumer: RecordConsumer = _

  /** Writes a single (non-null) value to the current `consumer`. */
  type ValueWriter = (Any) => Unit

  override def init(conf: Configuration): WriteSupport.WriteContext = new WriteSupport.WriteContext(parquetSchema, new java.util.HashMap[String, String])
  override def finalizeWrite(): WriteSupport.FinalizedWriteContext = new FinalizedWriteContext(metadata)
  override def prepareForWrite(consumer: RecordConsumer): Unit = this.consumer = consumer

  /** Writes `struct` as one Parquet message. */
  override def write(struct: Struct): Unit = {
    writeMessage {
      writeStructFields(struct)
    }
  }

  /**
    * Writes every non-null field of `struct`, using the field's position in the struct schema as
    * the Parquet field index. Null fields are skipped entirely: no startField/endField pair is
    * emitted for them.
    */
  private def writeStructFields(struct: Struct): Unit = {
    for ((field, index) <- struct.schema.fields.asScala.zipWithIndex) {
      val value = struct.get(field)
      if (value != null) {
        val writer = valueWriter(field.schema())
        writeField(field.name, index) {
          writer(value)
        }
      }
    }
  }

  /**
    * Returns a writer for values of the given connect `schema` type.
    *
    * The returned function expects a non-null value. Nested structs are written as a Parquet
    * group whose null fields are skipped, the same way as for top-level fields.
    *
    * @throws UnsupportedSchemaType for any schema type not listed below
    */
  def valueWriter(schema: Schema): ValueWriter = {
    // todo perhaps introduce something like spark's SpecializedGetters
    schema.`type`() match {
      case Schema.Type.BOOLEAN => value => consumer.addBoolean(value.asInstanceOf[Boolean])
      case Schema.Type.INT8 | Schema.Type.INT16 | Schema.Type.INT32 => value => consumer.addInteger(value.toString.toInt)
      case Schema.Type.INT64 => value => consumer.addLong(value.toString.toLong)
      // encode explicitly as UTF-8 rather than with the JVM's platform default charset
      case Schema.Type.STRING => value =>
        consumer.addBinary(Binary.fromReusedByteArray(value.toString.getBytes(java.nio.charset.StandardCharsets.UTF_8)))
      case Schema.Type.FLOAT32 => value => consumer.addFloat(value.toString.toFloat)
      case Schema.Type.FLOAT64 => value => consumer.addDouble(value.toString.toDouble)
      case Schema.Type.STRUCT => value => {
        logger.debug(s"Writing nested struct")
        // reuse the top-level field writer so that null nested fields are skipped instead of
        // being passed to a value writer that would dereference them
        writeGroup {
          writeStructFields(value.asInstanceOf[Struct])
        }
      }
      case _ => throw UnsupportedSchemaType(schema.`type`.toString)
    }
  }

  // Wraps `f` in a startMessage/endMessage pair.
  private def writeMessage(f: => Unit): Unit = {
    consumer.startMessage()
    f
    consumer.endMessage()
  }

  // Wraps `f` in a startGroup/endGroup pair.
  private def writeGroup(f: => Unit): Unit = {
    consumer.startGroup()
    f
    consumer.endGroup()
  }

  // Wraps `f` in a startField/endField pair for the field `name` at index `k`.
  private def writeField(name: String, k: Int)(f: => Unit): Unit = {
    consumer.startField(name, k)
    f
    consumer.endField(name, k)
  }
}
Example 72
Source File: StructReadSupport.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.parquet

import java.util

import org.apache.hadoop.conf.Configuration
import org.apache.kafka.connect.data.Struct
import org.apache.parquet.hadoop.api.{InitContext, ReadSupport}
import org.apache.parquet.io.api.RecordMaterializer
import org.apache.parquet.schema.MessageType

/** Parquet `ReadSupport` that materializes records as Kafka Connect structs. */
class StructReadSupport extends ReadSupport[Struct] {

  /** Requests the file's own schema as the read schema. */
  override def init(context: InitContext): ReadSupport.ReadContext =
    new ReadSupport.ReadContext(context.getFileSchema)

  /**
    * Creates the materializer for a file. `fileSchema` is the schema from the footer of the
    * parquet file; it is converted to a connect schema before being handed to the materializer.
    */
  override def prepareForRead(configuration: Configuration,
                              metaData: util.Map[String, String],
                              fileSchema: MessageType,
                              context: ReadSupport.ReadContext): RecordMaterializer[Struct] =
    new StructMaterializer(ParquetSchemas.toKafka(fileSchema))
}
Example 73
Source File: RootGroupConverter.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.parquet

import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.io.api.{Converter, GroupConverter}

import scala.collection.JavaConverters._

/**
  * Parquet `GroupConverter` for the root record.
  *
  * Field converters collect values into a shared mutable map; when the record ends, the map is
  * turned into a [[Struct]] of `schema` and exposed through `struct`.
  *
  * @param schema the connect schema of the record; must be of type STRUCT
  */
class RootGroupConverter(schema: Schema) extends GroupConverter with StrictLogging {
  require(schema.`type`() == Schema.Type.STRUCT)

  // the most recently completed record; unset until the first call to end()
  var struct: Struct = _
  // values gathered for the record currently being read, keyed by field name
  private val builder = scala.collection.mutable.Map.empty[String, Any]
  private val converters = schema.fields.asScala.map(Converters.get(_, builder)).toIndexedSeq

  override def getConverter(k: Int): Converter = converters(k)

  /** Clears values left over from the previous record. */
  override def start(): Unit = builder.clear()

  /**
    * Builds the struct for the finished record. Fields for which no value was collected are put
    * as null. `struct` is only reassigned once every field has been put successfully.
    */
  override def end(): Unit = {
    val completed = new Struct(schema)
    schema.fields.asScala.foreach { field =>
      completed.put(field, builder.getOrElse(field.name, null))
    }
    struct = completed
  }
}
Example 74
Source File: ProjectionMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.source.mapper

import cats.data.NonEmptyList
import com.landoop.streamreactor.connect.hive.StructMapper
import com.landoop.streamreactor.connect.hive.source.config.ProjectionField
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}

/**
  * Projects an input struct onto the configured fields, renaming each to its alias.
  *
  * @param projection the fields to keep, each carrying a source name and an output alias
  */
class ProjectionMapper(projection: NonEmptyList[ProjectionField]) extends StructMapper {

  /**
    * Returns a new struct holding only the projected fields, in projection order.
    * Fails via `sys.error` when a projected field is not present in the input schema.
    */
  override def map(input: Struct): Struct = {
    val schemaBuilder = projection.foldLeft(SchemaBuilder.struct) { (acc, projected) =>
      Option(input.schema.field(projected.name)) match {
        case Some(sourceField) => acc.field(projected.alias, sourceField.schema)
        case None => sys.error(s"Projection field ${projected.name} cannot be found in input")
      }
    }
    val projectedSchema = schemaBuilder.build()

    projection.foldLeft(new Struct(projectedSchema)) { (output, projected) =>
      output.put(projected.alias, input.get(projected.name))
    }
  }
}
Example 75
Source File: PartitionValueMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.source.mapper

import com.landoop.streamreactor.connect.hive.{Partition, StructMapper}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

import scala.collection.JavaConverters._

/**
  * Appends the entries of a partition to a struct as extra string fields.
  *
  * @param partition the partition whose entries are added after the input's own fields
  */
class PartitionValueMapper(partition: Partition) extends StructMapper {

  /** Returns a copy of `input` extended with one required string field per partition entry. */
  override def map(input: Struct): Struct = {
    val inputFields = input.schema.fields.asScala
    val partitionEntries = partition.entries.toList

    // schema: the input's fields first, then the partition entries
    val schemaBuilder = SchemaBuilder.struct()
    for (field <- inputFields) schemaBuilder.field(field.name, field.schema)
    for (entry <- partitionEntries) schemaBuilder.field(entry._1.value, Schema.STRING_SCHEMA)

    // values: copied in the same order as the schema was assembled
    val output = new Struct(schemaBuilder.build())
    for (field <- inputFields) output.put(field.name, input.get(field.name))
    for (entry <- partitionEntries) output.put(entry._1.value, entry._2)
    output
  }
}
Example 76
Source File: HiveSource.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.source

import com.landoop.streamreactor.connect.hive
import com.landoop.streamreactor.connect.hive._
import com.landoop.streamreactor.connect.hive.formats.{HiveFormat, HiveReader, Record}
import com.landoop.streamreactor.connect.hive.source.config.HiveSourceConfig
import com.landoop.streamreactor.connect.hive.source.mapper.{PartitionValueMapper, ProjectionMapper}
import com.landoop.streamreactor.connect.hive.source.offset.HiveSourceOffsetStorageReader
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.kafka.connect.data.Struct
import org.apache.kafka.connect.source.SourceRecord

import scala.collection.JavaConverters._


/**
  * Iterator of [[SourceRecord]]s read from the files of a single hive table.
  *
  * One reader is created per file returned by the table scan. Records pass through the optional
  * partition-value and projection mappers, and the combined stream is capped at the configured
  * limit.
  *
  * @param db           database containing the table
  * @param tableName    table to read
  * @param topic        topic the records are produced to
  * @param offsetReader supplies the previously stored offset for each file
  * @param config       source configuration; must contain options for this table and topic
  */
class HiveSource(db: DatabaseName,
                 tableName: TableName,
                 topic: Topic,
                 offsetReader: HiveSourceOffsetStorageReader,
                 config: HiveSourceConfig)
                (implicit client: IMetaStoreClient, fs: FileSystem) extends Iterator[SourceRecord] {

  val tableConfig = config.tableOptions.filter(_.tableName == tableName).find(_.topic == topic)
    .getOrElse(sys.error(s"Cannot find table configuration for ${db.value}.${tableName.value} => ${topic.value}"))

  private val table = client.getTable(db.value, tableName.value)
  private val format = HiveFormat(hive.serde(table))
  private val metastoreSchema = HiveSchemas.toKafka(table)
  private val parts = TableFileScanner.scan(db, tableName)

  private val readers = parts.map { case (path, partition) =>

    // only the mappers that are actually configured are chained
    val fns: Seq[Struct => Struct] = Seq(
      partition.map(new PartitionValueMapper(_).map _),
      tableConfig.projection.map(new ProjectionMapper(_).map _)
    ).flatten
    val mapper: Struct => Struct = Function.chain(fns)

    // resume from the stored offset for this file, or from the first row
    val sourceOffset = offsetReader.offset(SourcePartition(db, tableName, topic, path)).getOrElse(SourceOffset(0))

    new HiveReader {
      lazy val reader = format.reader(path, sourceOffset.rowNumber, metastoreSchema)
      override def iterator: Iterator[Record] = reader.iterator.map { record =>
        Record(mapper(record.struct), record.path, record.offset)
      }
      override def close(): Unit = reader.close()
    }
  }

  // reduceOption rather than reduce: a table scan that yields no files must produce an empty
  // source instead of failing with "empty.reduce"
  private val iterator: Iterator[Record] = readers.map(_.iterator)
    .reduceOption(_ ++ _)
    .getOrElse(Iterator.empty)
    .take(tableConfig.limit)

  override def hasNext: Boolean = iterator.hasNext

  /** Returns the next record together with the source partition and offset it was read from. */
  override def next(): SourceRecord = {

    val record = iterator.next()
    val sourcePartition = SourcePartition(db, tableName, topic, record.path)
    val offset = SourceOffset(record.offset)

    new SourceRecord(
      fromSourcePartition(sourcePartition).asJava,
      fromSourceOffset(offset).asJava,
      topic.value,
      record.struct.schema,
      record.struct
    )
  }

  /** Closes every file reader. */
  def close(): Unit = {
    readers.foreach(_.close())
  }
}
Example 77
Source File: CassandraSinkTaskSpec.scala    From kafka-connect-cassandra   with Apache License 2.0 5 votes vote down vote up
package com.tuplejump.kafka.connect.cassandra

import scala.collection.JavaConverters._
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.{SinkRecord, SinkTaskContext}

/** Starts a `CassandraSinkTask` and checks that records put to it end up in the mapped table. */
class CassandraSinkTaskSpec extends AbstractFlatSpec {

  val topicName = "test_kv_topic"
  val tableName = "test.kv"
  val config = sinkProperties(Map(topicName -> tableName))

  it should "start sink task" in {
    val task = new CassandraSinkTask()

    task.initialize(mock[SinkTaskContext])
    task.start(config.asJava)
    task.stop()
  }

  it should "save records in cassandra" in {
    val task = new CassandraSinkTask()

    task.initialize(mock[SinkTaskContext])
    task.start(config.asJava)

    val valueSchema = SchemaBuilder.struct.name("record").version(1)
      .field("key", Schema.STRING_SCHEMA)
      .field("value", Schema.INT32_SCHEMA).build

    // two records on partition 1, each with an empty struct key schema and the literal key "key"
    val records = List("pqr" -> 15, "abc" -> 17).map { case (k, v) =>
      val payload = new Struct(valueSchema).put("key", k).put("value", v)
      new SinkRecord(topicName, 1, SchemaBuilder.struct.build, "key", valueSchema, payload, 0)
    }

    task.put(records.asJavaCollection)

    task.stop()

    // verify through a separate local cluster connection
    val cc = CassandraCluster.local
    val row = cc.session.execute(s"select count(1) from $tableName").one()
    row.getLong(0) should be(2)
    cc.shutdown()
  }
}
Example 78
Source File: MetastoreSchemaAlignMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import com.landoop.streamreactor.connect.hive.StructMapper
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.util.Try


/**
  * Re-shapes an input struct so that it conforms to the given (metastore) schema.
  *
  * @param schema the target schema; the output struct has exactly these fields
  */
class MetastoreSchemaAlignMapper(schema: Schema) extends StructMapper {

  import scala.collection.JavaConverters._

  /**
    * Builds a struct of the target schema from `input`.
    *
    * Each target field is looked up in the input through its lower-cased name. A field missing
    * from the input is set to null when the target field is optional; otherwise the mapping
    * fails via `sys.error`.
    */
  override def map(input: Struct): Struct = {
    //hive converts everything to lowercase
    // Locale.ROOT keeps the lower-casing independent of the JVM's default locale
    val inputFieldsMapping = input.schema().fields().asScala
      .map { f => f.name().toLowerCase(java.util.Locale.ROOT) -> f.name() }
      .toMap

    schema.fields.asScala.foldLeft(new Struct(schema)) { (struct, field) =>
      // plain lookup instead of catching the exception thrown for a missing key
      inputFieldsMapping.get(field.name) match {
        case Some(inputName) => struct.put(field.name, input.get(inputName))
        case None if field.schema.isOptional => struct.put(field.name, null)
        case None => sys.error(s"Cannot map struct to required schema; ${field.name} is missing, no default value has been supplied and null is not permitted")
      }
    }
  }
}
Example 79
Source File: DropPartitionValuesMapper.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink.mapper

import com.landoop.streamreactor.connect.hive.{PartitionPlan, StructMapper}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}


/**
  * Removes the fields named by the partition plan's keys from a struct.
  *
  * @param plan the partition plan whose key names identify the fields to drop
  */
class DropPartitionValuesMapper(plan: PartitionPlan) extends StructMapper {

  import scala.collection.JavaConverters._

  /** Returns a new struct containing every field of `input` that is not a partition key. */
  override def map(input: Struct): Struct = {
    val partitionKeys = plan.keys.map(_.value).toList
    val retained = input.schema.fields().asScala.filter(field => !partitionKeys.contains(field.name))

    val schemaBuilder = retained.foldLeft(SchemaBuilder.struct) { (acc, field) =>
      acc.field(field.name, field.schema)
    }

    val output = new Struct(schemaBuilder.build())
    retained.foreach { field =>
      output.put(field.name, input.get(field.name))
    }
    output
  }
}
Example 80
Source File: ValueConverter.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

/** Entry point that converts the value of a sink record into a [[Struct]]. */
object ValueConverter {

  /**
    * Converts `record.value` using the converter matching its runtime type: structs, Scala and
    * Java maps, and strings are supported. Any other value, including null, fails via
    * `sys.error`.
    */
  def apply(record: SinkRecord): Struct = record.value match {
    // handled explicitly: the catch-all below dereferences the value to report its class
    case null => sys.error("Unsupported record: the record value is null")
    case struct: Struct => StructValueConverter.convert(struct)
    case map: Map[_, _] => MapValueConverter.convert(map)
    case map: java.util.Map[_, _] => MapValueConverter.convert(map.asScala.toMap)
    case string: String => StringValueConverter.convert(string)
    case other => sys.error(s"Unsupported record $other:${other.getClass.getCanonicalName}")
  }
}

/** Converts a value of type `T` into a Kafka Connect [[Struct]]. */
trait ValueConverter[T] {
  /** Returns the struct representation of `value`. */
  def convert(value: T): Struct
}

/** Converter for values that are already structs. */
object StructValueConverter extends ValueConverter[Struct] {
  // the input is returned as-is
  override def convert(struct: Struct): Struct = struct
}

/**
  * Converts a (possibly nested) map into a [[Struct]], inferring an optional schema for every
  * entry from the runtime type of its value.
  */
object MapValueConverter extends ValueConverter[Map[_, _]] {

  /**
    * Registers a field named `key` on `builder` with a schema inferred from `value`, and returns
    * the value to store for that field.
    *
    * Strings and booleans are kept as they are, integral numbers are widened to Long, floating
    * point numbers to Double, and nested maps become optional nested structs. Any other value,
    * including null, fails via `sys.error` with a message naming the offending field.
    */
  def convertValue(value: Any, key: String, builder: SchemaBuilder): Any = {
    value match {
      case s: String =>
        builder.field(key, Schema.OPTIONAL_STRING_SCHEMA)
        s
      case l: Long =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        l
      case i: Int =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        i.toLong
      case b: Boolean =>
        builder.field(key, Schema.OPTIONAL_BOOLEAN_SCHEMA)
        b
      case f: Float =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        f.toDouble
      case d: Double =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        d
      case innerMap: java.util.Map[_, _] =>
        val innerStruct = convert(innerMap.asScala.toMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct

      case innerMap: Map[_, _] =>
        val innerStruct = convert(innerMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct

      // no schema can be inferred from a null, and it has no class to report
      case null =>
        sys.error(s"Unsupported null value for field [$key]; cannot infer a schema")

      // previously surfaced as a bare MatchError that did not name the field
      case other =>
        sys.error(s"Unsupported value for field [$key] of type ${other.getClass.getCanonicalName}")
    }
  }

  /**
    * Converts `map` into a struct whose fields are the map's keys (via `toString`).
    *
    * @param optional whether the resulting struct schema is marked optional
    */
  def convert(map: Map[_, _], optional: Boolean): Struct = {
    val builder = SchemaBuilder.struct()
    // first pass: register every field on the builder while converting its value
    val values = map.map { case (k, v) =>
      val key = k.toString
      val value = convertValue(v, key, builder)
      key -> value
    }.toList
    if (optional) builder.optional()
    val schema = builder.build
    // second pass: populate the struct now that the schema is complete
    val struct = new Struct(schema)
    values.foreach { case (key, value) =>
      struct.put(key, value)
    }
    struct
  }

  /** Converts `map` into a struct with a non-optional top-level schema. */
  override def convert(map: Map[_, _]): Struct = convert(map, false)
}

/** Wraps a plain string in a struct named "struct" with a single optional string field "a". */
object StringValueConverter extends ValueConverter[String] {
  override def convert(string: String): Struct = {
    val wrapperSchema = SchemaBuilder.struct()
      .field("a", Schema.OPTIONAL_STRING_SCHEMA)
      .name("struct")
      .build()
    val wrapper = new Struct(wrapperSchema)
    wrapper.put("a", string)
  }
}
Example 81
Source File: HiveSinkState.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.sink

import com.landoop.streamreactor.connect.hive
import com.landoop.streamreactor.connect.hive._
import com.landoop.streamreactor.connect.hive.sink.config.TableOptions
import com.landoop.streamreactor.connect.hive.sink.mapper.{DropPartitionValuesMapper, MetastoreSchemaAlignMapper, ProjectionMapper}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.hadoop.hive.metastore.api.Table
import org.apache.kafka.connect.data.{Schema, Struct}

/**
  * Immutable snapshot of the state the hive sink keeps for one table.
  *
  * Every `with...` method returns an updated copy and leaves this instance untouched.
  */
case class HiveSinkState(offsets: Map[TopicPartition, Offset],
                         committedOffsets: Map[TopicPartition, Offset],
                         table: Table,
                         tableLocation: Path,
                         plan: Option[PartitionPlan],
                         metastoreSchema: Schema,
                         mapper: Struct => Struct,
                         lastSchema: Schema) {

  /** Records the offset carried by `tpo` for its topic-partition. */
  def withTopicPartitionOffset(tpo: TopicPartitionOffset): HiveSinkState =
    withTopicPartitionOffset(tpo.toTopicPartition, tpo.offset)

  /** Records `offset` for `tp`, replacing any offset previously held for it. */
  def withTopicPartitionOffset(tp: TopicPartition, offset: Offset): HiveSinkState =
    copy(offsets = offsets.updated(tp, offset))

  /** Merges the given offsets into the committed offsets; the given entries win on conflict. */
  def withCommittedOffset(offsets: Map[TopicPartition, Offset]): HiveSinkState = {
    val merged = committedOffsets ++ offsets
    copy(committedOffsets = merged)
  }

  /** Records `offset` as committed for `tp`. */
  def withCommittedOffset(tp: TopicPartition, offset: Offset): HiveSinkState =
    copy(committedOffsets = committedOffsets.updated(tp, offset))

  /** Replaces the last seen schema. */
  def withLastSchema(schema: Schema): HiveSinkState = copy(lastSchema = schema)
}

/** Factory for the initial [[HiveSinkState]] of a table. */
object HiveSinkState {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  /**
    * Builds the initial state for `table`: fetches or creates the hive table, evolves the
    * metastore schema against `schema`, and chains the struct mappers that apply
    * (projection, metastore alignment, dropping of partition values).
    */
  def from(schema: Schema,
           table: TableOptions,
           dbName: DatabaseName)(implicit client: IMetaStoreClient, fs: FileSystem): HiveSinkState = {
    logger.info(s"Init sink for schema $schema")

    val hiveTable = getOrCreateTable(table, dbName, schema)
    val tableLocation = new Path(hiveTable.getSd.getLocation)
    val plan = hive.partitionPlan(hiveTable)
    val metastoreSchema = table.evolutionPolicy
      .evolve(dbName, table.tableName, HiveSchemas.toKafka(hiveTable), schema)
      .getOrElse(sys.error(s"Unable to retrieve or evolve schema for $schema"))

    // projection and partition-value dropping are only present when configured / planned
    val activeMappers = Seq(
      table.projection.map(new ProjectionMapper(_)),
      Some(new MetastoreSchemaAlignMapper(metastoreSchema)),
      plan.map(new DropPartitionValuesMapper(_))
    ).flatten
    val mapperFns: Seq[Struct => Struct] = activeMappers.map(m => m.map _)

    HiveSinkState(
      Map.empty,
      Map.empty,
      hiveTable,
      tableLocation,
      plan,
      metastoreSchema,
      Function.chain(mapperFns),
      schema
    )
  }

  /**
    * Returns the hive table for `table`, creating it first when needed.
    *
    * An existing table is dropped and re-created when overwrite is enabled; a missing table is
    * created only when creation is enabled, otherwise a RuntimeException is thrown.
    */
  def getOrCreateTable(table: TableOptions, dbName: DatabaseName, schema: Schema)
                      (implicit client: IMetaStoreClient, fs: FileSystem): Table = {

    def createNew: Table = {
      val partstring = if (table.partitions.isEmpty) "<no-partitions>" else table.partitions.mkString(",")
      logger.info(s"Creating table in hive [${dbName.value}.${table.tableName.value}, partitions=$partstring]")
      hive.createTable(dbName, table.tableName, schema, table.partitions, table.location, table.format)
    }

    logger.debug(s"Fetching or creating table ${dbName.value}.${table.tableName.value}")
    val exists = client.tableExists(dbName.value, table.tableName.value)
    if (exists && table.overwriteTable) {
      hive.dropTable(dbName, table.tableName, true)
      createNew
    } else if (exists) {
      client.getTable(dbName.value, table.tableName.value)
    } else if (table.createTable) {
      createNew
    } else {
      throw new RuntimeException(s"Table ${dbName.value}.${table.tableName.value} does not exist")
    }
  }
}
Example 82
Source File: ParquetHiveFormat.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.Serde
import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.ParquetWriter

import scala.util.Try

/** [[HiveFormat]] implementation that reads and writes structs as Parquet files. */
object ParquetHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  // serde, input format and output format class names registered for parquet tables
  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
    Map("serialization.format" -> "1")
  )

  /**
    * Creates a writer that appends structs of `schema` to a parquet file at `path`.
    * The underlying parquet writer is created with overwrite enabled.
    */
  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {

    logger.debug(s"Creating parquet writer at $path")

    val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
    // best effort: the result is discarded, so a failure to set permissions is ignored
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTime: Long = System.currentTimeMillis()
    // cached file length; refreshed lazily by fileSize
    var lastKnownFileSize: Long = fs.getFileStatus(path).getLen
    // set by write to signal that the cached length is stale
    var readFileSize = false
    var count = 0

    // Writes the struct and returns the number of structs written so far by this writer.
    override def write(struct: Struct): Long = {
      writer.write(struct)
      count = count + 1
      readFileSize = true
      count
    }

    override def close(): Unit = {
      logger.debug(s"Closing writer at path $path")
      writer.close()
    }

    override def currentCount: Long = count
    override def file: Path = path
    // Queries the file system only when a write has happened since the last call;
    // otherwise returns the cached length.
    override def fileSize: Long = {
      if (readFileSize) {
        lastKnownFileSize = fs.getFileStatus(path).getLen
        readFileSize = false
      }

      lastKnownFileSize
    }
  }

  /**
    * Creates a reader over the parquet file at `path` that skips the first `startAt` records.
    * NOTE(review): the `schema` parameter is not used in this method body.
    */
  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {

    logger.debug(s"Creating parquet reader for $path with offset $startAt")
    val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path)
    // offset assigned to the next emitted record; shared across calls to iterator
    var offset = startAt

    // Reads until the parquet reader returns null, dropping the first `startAt` records and
    // tagging each remaining struct with its path and a running offset.
    // NOTE(review): every call builds a new iterator over the same underlying reader and
    // applies drop(startAt) again - presumably intended to be called once; confirm with callers.
    override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct =>
      val record = Record(struct, path, offset)
      offset = offset + 1
      record
    }

    override def close(): Unit = reader.close()
  }
}
Example 83
Source File: SinkRecordParser.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.influx.converters

import com.datamountaineer.streamreactor.connect.influx.helpers.Util
import com.datamountaineer.streamreactor.connect.influx.writers.KcqlDetails.Path
import com.datamountaineer.streamreactor.connect.influx.writers.ValuesExtractor
import com.fasterxml.jackson.databind.JsonNode
import com.landoop.json.sql.JacksonJson
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.util.Try

/**
  * Parses the key and value of a [[SinkRecord]] into a uniform view over JSON strings, structs
  * and schemaless maps.
  */
object SinkRecordParser {
  type Field = String

  /** Read access to the fields of a parsed payload. */
  trait ParsedSinkRecord {
    /** All fields of the payload, leaving out those named by the last segment of each ignored path. */
    def valueFields(ignored: Set[Path]): Seq[(String, Any)]

    /** The value found at `path`, or None when the extractor returns null. */
    def field(path: Path): Option[Any]
  }

  /** A parsed record that also exposes the fields of its key. */
  trait ParsedKeyValueSinkRecord extends ParsedSinkRecord {
    def keyFields(ignored: Set[Path]): Seq[(String, Any)]
  }

  private case class JsonSinkRecord(json: JsonNode) extends ParsedSinkRecord {
    override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = ValuesExtractor.extractAllFields(json, ignored.map(_.value.last))

    override def field(path: Path): Option[Any] = Option(ValuesExtractor.extract(json, path.value))
  }

  private case class StructSinkRecord(struct: Struct) extends ParsedSinkRecord {
    override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = ValuesExtractor.extractAllFields(struct, ignored.map(_.value.last))

    override def field(path: Path): Option[Any] = Option(ValuesExtractor.extract(struct, path.value))
  }

  private case class MapSinkRecord(map: java.util.Map[String, Any]) extends ParsedSinkRecord {
    override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = ValuesExtractor.extractAllFields(map, ignored.map(_.value.last))

    override def field(path: Path): Option[Any] = Option(ValuesExtractor.extract(map, path.value))
  }

  private case class KeyValueRecord(key: ParsedSinkRecord, value: ParsedSinkRecord) extends ParsedKeyValueSinkRecord {
    override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = value.valueFields(ignored)

    // A path whose first segment is the key constant (compared case-insensitively) is resolved
    // against the key using the remaining segments; any other path is resolved against the value.
    override def field(path: Path): Option[Any] = path.value.headOption match {
      case Some(fieldName) if Util.caseInsensitiveComparison(fieldName, Util.KEY_CONSTANT) => key.field(Path(path.value.tail))
      case Some(_) => value.field(path)
      case None => throw new IllegalArgumentException("Unreachable situation detected. Path should never be empty")
    }

    override def keyFields(ignored: Set[Path]): Seq[(String, Any)] = key.valueFields(ignored)
  }

  /**
    * Parses the key and value of `record`.
    *
    * A STRING schema is parsed as JSON, a STRUCT schema is used as a struct, and a missing schema
    * is treated as a `java.util.Map`. The value must additionally be non-null and of the
    * matching runtime type.
    *
    * @return a Failure when either side cannot be parsed, including when its schema is of any
    *         other type
    */
  def build(record: SinkRecord): Try[ParsedKeyValueSinkRecord] = {

    val key: Try[ParsedSinkRecord] = Option(record.keySchema()).map(_.`type`()) match {
      case Some(Schema.Type.STRING) => Try(JsonSinkRecord(JacksonJson.asJson(record.key().asInstanceOf[String])))
      case Some(Schema.Type.STRUCT) => Try(StructSinkRecord(record.key().asInstanceOf[Struct]))
      case None => Try(MapSinkRecord(record.key().asInstanceOf[java.util.Map[String, Any]]))
      // reported through the returned Try instead of escaping as a MatchError
      case Some(other) =>
        scala.util.Failure(new IllegalArgumentException(s"Unsupported key schema type $other; expected STRING, STRUCT or no schema"))
    }

    val value: Try[ParsedSinkRecord] = Option(record.valueSchema()).map(_.`type`()) match {
      case Some(Schema.Type.STRING) =>
        Try {
          require(record.value() != null && record.value().getClass == classOf[String], "The SinkRecord payload should be of type String")
          JsonSinkRecord(JacksonJson.asJson(record.value().asInstanceOf[String]))
        }
      case Some(Schema.Type.STRUCT) =>
        Try {
          require(record.value() != null && record.value().getClass == classOf[Struct], "The SinkRecord payload should be of type Struct")
          StructSinkRecord(record.value().asInstanceOf[Struct])
        }
      case None =>
        Try {
          require(record.value() != null && record.value().isInstanceOf[java.util.Map[_, _]], "The SinkRecord payload should be of type java.util.Map[String, Any]")
          MapSinkRecord(record.value().asInstanceOf[java.util.Map[String, Any]])
        }
      // reported through the returned Try instead of escaping as a MatchError
      case Some(other) =>
        scala.util.Failure(new IllegalArgumentException(s"Unsupported value schema type $other; expected STRING, STRUCT or no schema"))
    }

    // a key failure takes precedence over a value failure
    for {
      k <- key
      v <- value
    } yield KeyValueRecord(k, v)
  }
}
Example 84
Source File: StructFieldsRowKeyBuilderTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.hbase

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

/**
  * Exercises `StructFieldsRowKeyBuilderBytes` against a small "Person" struct whose optional
  * `threshold` field is never populated.
  */
class StructFieldsRowKeyBuilderTest extends AnyWordSpec with Matchers {

  /**
    * Builds a fresh sink record on every call so that each test works on its own
    * schema, struct and record instances.
    */
  private def personRecord(): SinkRecord = {
    val personSchema = SchemaBuilder.struct().name("com.example.Person")
      .field("firstName", Schema.STRING_SCHEMA)
      .field("age", Schema.INT32_SCHEMA)
      .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA)
      .build()

    val person = new Struct(personSchema)
      .put("firstName", "Alex")
      .put("age", 30)

    new SinkRecord("sometopic", 1, null, null, personSchema, person, 1)
  }

  "StructFieldsRowKeyBuilder" should {

    "raise an exception if the field is not present in the struct" in {
      // The fixture is created inside the intercepted block, exactly where the key is built.
      intercept[IllegalArgumentException] {
        val sinkRecord = personRecord()
        StructFieldsRowKeyBuilderBytes(List("threshold")).build(sinkRecord, null)
      }
    }

    "create the row key based on one single field in the struct" in {
      val sinkRecord = personRecord()
      val rowKey = StructFieldsRowKeyBuilderBytes(List("firstName")).build(sinkRecord, null)
      rowKey shouldBe "Alex".fromString
    }

    "create the row key based on more thant one field in the struct" in {
      val sinkRecord = personRecord()
      val rowKey = StructFieldsRowKeyBuilderBytes(List("firstName", "age")).build(sinkRecord, null)
      // Expected key: both field encodings joined by a newline separator.
      val expected = Bytes.add("Alex".fromString(), "\n".fromString(), 30.fromInt())
      rowKey shouldBe expected
    }
  }
}
Example 85
Source File: ObjectMessageConverter.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.jms.sink.converters

import com.datamountaineer.streamreactor.connect.jms.config.JMSSetting
import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import javax.jms.{ObjectMessage, Session}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

/**
  * Converts a Connect sink record into a JMS `ObjectMessage`, copying the record's
  * fields onto the message through `ObjectMessageConverterFn`.
  */
class ObjectMessageConverter extends JMSMessageConverter with ConverterUtil {

  /**
    * @param record  the sink record to convert
    * @param session JMS session used to create the message
    * @param setting supplies the `fields` / `ignoreField` passed to `ConverterUtil.convert`
    *                and the `source` returned with the message
    * @return `setting.source` paired with the populated message
    */
  override def convert(record: SinkRecord, session: Session, setting: JMSSetting): (String, ObjectMessage) = {
    val prepared = super[ConverterUtil].convert(record, setting.fields, setting.ignoreField)
    val message = session.createObjectMessage()
    val payload = prepared.value()
    val payloadSchema = prepared.valueSchema()

    if (payloadSchema.`type`() == Schema.Type.STRUCT) {
      // A struct contributes one entry per declared field.
      val struct = payload.asInstanceOf[Struct]
      for (f <- struct.schema().fields().asScala) {
        ObjectMessageConverterFn(f.name(), struct.get(f), f.schema(), message, session)
      }
    } else {
      // Anything else is stored as a single entry under the fixed name "field".
      ObjectMessageConverterFn("field", payload, payloadSchema, message, session)
    }

    setting.source -> message
  }
}

/**
  * Writes a single Connect value onto a JMS `ObjectMessage` as a message property,
  * choosing the property setter from the value's Connect schema type.
  */
object ObjectMessageConverterFn {

  /**
    * @param fieldName name of the property to set
    * @param value     the Connect value; a `null` (e.g. an unset optional field) is skipped
    * @param schema    Connect schema describing `value`
    * @param msg       message receiving the property
    * @param session   JMS session, used to create nested messages for STRUCT values
    */
  def apply(fieldName: String, value: AnyRef, schema: Schema, msg: ObjectMessage, session: Session): Unit = {
    // A null value is left off the message: casting it to a primitive would silently
    // write 0/false, and the BYTES and STRUCT branches would throw a NullPointerException.
    if (value != null) {
      schema.`type`() match {
        case Schema.Type.BYTES => msg.setObjectProperty(fieldName, toByteArray(value).toList.asJava)
        case Schema.Type.BOOLEAN => msg.setBooleanProperty(fieldName, value.asInstanceOf[Boolean])
        case Schema.Type.FLOAT32 => msg.setFloatProperty(fieldName, value.asInstanceOf[Float])
        case Schema.Type.FLOAT64 => msg.setDoubleProperty(fieldName, value.asInstanceOf[Double])
        case Schema.Type.INT8 => msg.setByteProperty(fieldName, value.asInstanceOf[Byte])
        case Schema.Type.INT16 => msg.setShortProperty(fieldName, value.asInstanceOf[Short])
        case Schema.Type.INT32 => msg.setIntProperty(fieldName, value.asInstanceOf[Int])
        case Schema.Type.INT64 => msg.setLongProperty(fieldName, value.asInstanceOf[Long])
        case Schema.Type.STRING => msg.setStringProperty(fieldName, value.asInstanceOf[String])
        case Schema.Type.MAP => msg.setObjectProperty(fieldName, value)
        case Schema.Type.ARRAY => msg.setObjectProperty(fieldName, value)
        case Schema.Type.STRUCT =>
          // Nested structs are flattened into their own message, which is then attached as a property.
          val nestedMsg = session.createObjectMessage()
          val struct = value.asInstanceOf[Struct]
          struct.schema().fields().asScala.foreach { f =>
            ObjectMessageConverterFn(f.name(), struct.get(f), f.schema(), nestedMsg, session)
          }
          // NOTE(review): the JMS API documents setObjectProperty as accepting only boxed
          // primitives and String; confirm the target provider accepts this value.
          msg.setObjectProperty(fieldName, nestedMsg)
      }
    }
  }

  /**
    * Returns the bytes of a BYTES value. Connect allows either a byte array or a
    * `ByteBuffer` for this schema type; a buffer is copied without moving its position.
    * Any other runtime type fails with the same `ClassCastException` as a plain cast.
    */
  private def toByteArray(value: AnyRef): Array[Byte] = value match {
    case buffer: java.nio.ByteBuffer =>
      val copy = new Array[Byte](buffer.remaining())
      buffer.duplicate().get(copy)
      copy
    case other => other.asInstanceOf[Array[Byte]]
  }
}
Example 86
Source File: MapMessageConverter.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.jms.sink.converters

import com.datamountaineer.streamreactor.connect.jms.config.JMSSetting
import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import javax.jms.{MapMessage, Session}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

/**
  * Converts a Connect sink record into a JMS `MapMessage`, copying the record's
  * fields into the message through `MapMessageBuilderFn`.
  */
class MapMessageConverter extends JMSMessageConverter with ConverterUtil {

  /**
    * @param record  the sink record to convert
    * @param session JMS session used to create the message
    * @param setting supplies the `fields` / `ignoreField` passed to `ConverterUtil.convert`
    *                and the `source` returned with the message
    * @return `setting.source` paired with the populated message
    */
  override def convert(record: SinkRecord, session: Session, setting: JMSSetting): (String, MapMessage) = {
    val prepared = super[ConverterUtil].convert(record, setting.fields, setting.ignoreField)
    val message = session.createMapMessage()
    val payload = prepared.value()
    val payloadSchema = prepared.valueSchema()

    if (payloadSchema.`type`() == Schema.Type.STRUCT) {
      // A struct contributes one map entry per declared field.
      val struct = payload.asInstanceOf[Struct]
      for (f <- struct.schema().fields().asScala) {
        MapMessageBuilderFn(f.name(), struct.get(f), f.schema(), message, session)
      }
    } else {
      // Anything else is stored as a single entry under the fixed name "field".
      MapMessageBuilderFn("field", payload, payloadSchema, message, session)
    }

    setting.source -> message
  }
}


/**
  * Writes a single Connect value into a JMS `MapMessage` entry, choosing the setter
  * from the value's Connect schema type.
  */
object MapMessageBuilderFn {

  /**
    * @param fieldName name of the map entry to set
    * @param value     the Connect value; a `null` (e.g. an unset optional field) is skipped
    * @param schema    Connect schema describing `value`
    * @param msg       message receiving the entry
    * @param session   JMS session, used to create nested messages for STRUCT values
    */
  def apply(fieldName: String, value: AnyRef, schema: Schema, msg: MapMessage, session: Session): Unit = {
    // A null value is left out of the map: casting it to a primitive would silently
    // write 0/false, and the STRUCT branch would throw a NullPointerException.
    if (value != null) {
      schema.`type`() match {
        case Schema.Type.BYTES => msg.setBytes(fieldName, toByteArray(value))
        case Schema.Type.BOOLEAN => msg.setBoolean(fieldName, value.asInstanceOf[Boolean])
        case Schema.Type.FLOAT32 => msg.setFloat(fieldName, value.asInstanceOf[Float])
        case Schema.Type.FLOAT64 => msg.setDouble(fieldName, value.asInstanceOf[Double])
        case Schema.Type.INT8 => msg.setByte(fieldName, value.asInstanceOf[Byte])
        case Schema.Type.INT16 => msg.setShort(fieldName, value.asInstanceOf[Short])
        case Schema.Type.INT32 => msg.setInt(fieldName, value.asInstanceOf[Int])
        case Schema.Type.INT64 => msg.setLong(fieldName, value.asInstanceOf[Long])
        case Schema.Type.STRING => msg.setString(fieldName, value.asInstanceOf[String])
        case Schema.Type.MAP => msg.setObject(fieldName, value)
        case Schema.Type.ARRAY => msg.setObject(fieldName, value)
        case Schema.Type.STRUCT =>
          // Nested structs are flattened into their own map message, which is then attached as an entry.
          val nestedMsg = session.createMapMessage()
          val struct = value.asInstanceOf[Struct]
          struct.schema().fields().asScala.foreach { f =>
            MapMessageBuilderFn(f.name(), struct.get(f), f.schema(), nestedMsg, session)
          }
          // NOTE(review): the JMS API documents MapMessage.setObject as accepting only boxed
          // primitives, String and byte arrays; confirm the target provider accepts this value.
          msg.setObject(fieldName, nestedMsg)
      }
    }
  }

  /**
    * Returns the bytes of a BYTES value. Connect allows either a byte array or a
    * `ByteBuffer` for this schema type; a buffer is copied without moving its position.
    * Any other runtime type fails with the same `ClassCastException` as a plain cast.
    */
  private def toByteArray(value: AnyRef): Array[Byte] = value match {
    case buffer: java.nio.ByteBuffer =>
      val copy = new Array[Byte](buffer.remaining())
      buffer.duplicate().get(copy)
      copy
    case other => other.asInstanceOf[Array[Byte]]
  }
}