org.apache.kafka.connect.data.Schema Scala Examples
The following examples show how to use org.apache.kafka.connect.data.Schema.
Each example is taken from an open-source project; the source file, project, and license are noted above each listing.
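For orientation before the project examples, here is a minimal, self-contained sketch of the core API they all rely on: a Schema is usually assembled with SchemaBuilder and then used to populate and validate a Struct. The schema name "com.example.User" and its fields below are illustrative only and are not taken from any of the projects.

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object SchemaUsageSketch extends App {
  // Build a STRUCT schema with one required and one optional field.
  val userSchema: Schema = SchemaBuilder.struct()
    .name("com.example.User")
    .field("name", Schema.STRING_SCHEMA)
    .field("age", Schema.OPTIONAL_INT32_SCHEMA)
    .build()

  // Populate a Struct conforming to the schema; validate() throws if a required field is missing.
  val user = new Struct(userSchema)
    .put("name", "Alice")
    .put("age", 42)
  user.validate()

  // Schemas can be inspected at runtime, which is what most of the examples below do.
  println(userSchema.`type`())                          // STRUCT
  println(userSchema.field("age").schema().isOptional)  // true
}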
Example 1
Source File: OrcSink.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.orc

import com.landoop.streamreactor.connect.hive.orc.vectors.{OrcVectorWriter, StructVectorWriter}
import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, StructUtils}
import com.typesafe.scalalogging.StrictLogging
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.collection.JavaConverters._

class OrcSink(path: Path,
              schema: Schema,
              config: OrcSinkConfig)(implicit fs: FileSystem) extends StrictLogging {

  private val typeDescription = OrcSchemas.toOrc(schema)
  private val structWriter = new StructVectorWriter(typeDescription.getChildren.asScala.map(OrcVectorWriter.fromSchema))
  private val batch = typeDescription.createRowBatch(config.batchSize)
  private val vector = new StructColumnVector(batch.numCols, batch.cols: _*)
  private val orcWriter = createOrcWriter(path, typeDescription, config)
  private var n = 0

  def flush(): Unit = {
    logger.debug(s"Writing orc batch [size=$n, path=$path]")
    batch.size = n
    orcWriter.addRowBatch(batch)
    orcWriter.writeIntermediateFooter
    batch.reset()
    n = 0
  }

  def write(struct: Struct): Unit = {
    structWriter.write(vector, n, Some(StructUtils.extractValues(struct)))
    n = n + 1
    if (n == config.batchSize) flush()
  }

  def close(): Unit = {
    if (n > 0) flush()
    orcWriter.close()
  }
}
Example 2
Source File: RootGroupConverter.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.parquet

import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.io.api.{Converter, GroupConverter}

import scala.collection.JavaConverters._

class RootGroupConverter(schema: Schema) extends GroupConverter with StrictLogging {
  require(schema.`type`() == Schema.Type.STRUCT)

  var struct: Struct = _
  private val builder = scala.collection.mutable.Map.empty[String, Any]
  private val converters = schema.fields.asScala.map(Converters.get(_, builder)).toIndexedSeq

  override def getConverter(k: Int): Converter = converters(k)
  override def start(): Unit = builder.clear()
  override def end(): Unit = struct = {
    val struct = new Struct(schema)
    schema.fields.asScala.map { field =>
      val value = builder.getOrElse(field.name, null)
      try {
        struct.put(field, value)
      } catch {
        case t: Exception => throw t
      }
    }
    struct
  }
}
Example 3
Source File: PartitionValueMapper.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.source.mapper

import com.landoop.streamreactor.connect.hive.{Partition, StructMapper}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

import scala.collection.JavaConverters._

class PartitionValueMapper(partition: Partition) extends StructMapper {
  override def map(input: Struct): Struct = {

    val builder = SchemaBuilder.struct()
    input.schema.fields.asScala.foreach { field =>
      builder.field(field.name, field.schema)
    }
    partition.entries.toList.foreach { entry =>
      builder.field(entry._1.value, Schema.STRING_SCHEMA)
    }
    val schema = builder.build()

    val struct = new Struct(schema)
    input.schema.fields.asScala.foreach { field =>
      struct.put(field.name, input.get(field.name))
    }
    partition.entries.toList.foreach { entry =>
      struct.put(entry._1.value, entry._2)
    }
    struct
  }
}
Example 4
Source File: MetastoreSchemaAlignMapper.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink.mapper

import com.landoop.streamreactor.connect.hive.StructMapper
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.util.Try

class MetastoreSchemaAlignMapper(schema: Schema) extends StructMapper {

  import scala.collection.JavaConverters._

  override def map(input: Struct): Struct = {
    // hive converts everything to lowercase
    val inputFieldsMapping = input.schema().fields().asScala.map { f =>
      f.name().toLowerCase() -> f.name()
    }.toMap

    val struct = schema.fields.asScala.foldLeft(new Struct(schema)) { (struct, field) =>
      Try(input.get(inputFieldsMapping(field.name))).toOption match {
        case Some(value) => struct.put(field.name, value)
        case None if field.schema.isOptional => struct.put(field.name, null)
        case None => sys.error(s"Cannot map struct to required schema; ${field.name} is missing, no default value has been supplied and null is not permitted")
      }
    }
    struct
  }
}
Example 5
Source File: HiveWriterManager.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink

import com.landoop.streamreactor.connect.hive.{Offset, TopicPartition, TopicPartitionOffset}
import com.landoop.streamreactor.connect.hive.formats.{HiveFormat, HiveWriter}
import com.landoop.streamreactor.connect.hive.sink.staging.StageManager
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.Schema

  def flush(offsets: Map[TopicPartition, Offset]): Unit = {
    logger.info(s"Flushing offsets $offsets")
    // we may not have an offset for a given topic/partition if no data was written to that TP
    writers.foreach { case (key, writer) =>
      writer.close()
      offsets.get(key.tp).foreach { offset =>
        stageManager.commit(writer.file, key.tp.withOffset(offset))
      }
      writers.remove(key)
    }
  }

  def getWriters: Seq[OpenWriter] = writers.map { case (key, writer) => OpenWriter(key.tp, key.dir, writer) }.toList
}
Example 6
Source File: ValueConverter.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

object ValueConverter {
  def apply(record: SinkRecord): Struct = record.value match {
    case struct: Struct => StructValueConverter.convert(struct)
    case map: Map[_, _] => MapValueConverter.convert(map)
    case map: java.util.Map[_, _] => MapValueConverter.convert(map.asScala.toMap)
    case string: String => StringValueConverter.convert(string)
    case other => sys.error(s"Unsupported record $other:${other.getClass.getCanonicalName}")
  }
}

trait ValueConverter[T] {
  def convert(value: T): Struct
}

object StructValueConverter extends ValueConverter[Struct] {
  override def convert(struct: Struct): Struct = struct
}

object MapValueConverter extends ValueConverter[Map[_, _]] {
  def convertValue(value: Any, key: String, builder: SchemaBuilder): Any = {
    value match {
      case s: String =>
        builder.field(key, Schema.OPTIONAL_STRING_SCHEMA)
        s
      case l: Long =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        l
      case i: Int =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        i.toLong
      case b: Boolean =>
        builder.field(key, Schema.OPTIONAL_BOOLEAN_SCHEMA)
        b
      case f: Float =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        f.toDouble
      case d: Double =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        d
      case innerMap: java.util.Map[_, _] =>
        val innerStruct = convert(innerMap.asScala.toMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct
      case innerMap: Map[_, _] =>
        val innerStruct = convert(innerMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct
    }
  }

  def convert(map: Map[_, _], optional: Boolean) = {
    val builder = SchemaBuilder.struct()
    val values = map.map { case (k, v) =>
      val key = k.toString
      val value = convertValue(v, key, builder)
      key -> value
    }.toList
    if (optional) builder.optional()
    val schema = builder.build
    val struct = new Struct(schema)
    values.foreach { case (key, value) =>
      struct.put(key.toString, value)
    }
    struct
  }

  override def convert(map: Map[_, _]): Struct = convert(map, false)
}

object StringValueConverter extends ValueConverter[String] {
  override def convert(string: String): Struct = {
    val schema = SchemaBuilder.struct().field("a", Schema.OPTIONAL_STRING_SCHEMA).name("struct").build()
    new Struct(schema).put("a", string)
  }
}
Example 7
Source File: AddEvolutionPolicy.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink.evolution

import com.landoop.streamreactor.connect.hive.{DatabaseName, HiveSchemas, TableName}
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.kafka.connect.data.Schema

import scala.collection.JavaConverters._
import scala.util.Try

object AddEvolutionPolicy extends EvolutionPolicy {

  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def evolve(dbName: DatabaseName,
                      tableName: TableName,
                      metastoreSchema: Schema,
                      inputSchema: Schema)
                     (implicit client: IMetaStoreClient): Try[Schema] = Try {

    val missing = inputSchema.fields.asScala
      .filter(f => metastoreSchema.field(f.name) == null)
      .map(HiveSchemas.toFieldSchema)

    if (missing.nonEmpty) {
      logger.info(s"Evolving hive metastore to add: ${missing.mkString(",")}")

      val table = client.getTable(dbName.value, tableName.value)
      val cols = table.getSd.getCols
      missing.foreach(field => cols.add(field))
      table.getSd.setCols(cols)
      client.alter_table(dbName.value, tableName.value, table)

      HiveSchemas.toKafka(client.getTable(dbName.value, tableName.value))
    } else {
      metastoreSchema
    }
  }
}
Example 8
Source File: IgnoreEvolutionPolicy.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink.evolution

import com.landoop.streamreactor.connect.hive.{DatabaseName, HiveSchemas, TableName}
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.kafka.connect.data.Schema

import scala.collection.JavaConverters._
import scala.util.Try

object IgnoreEvolutionPolicy extends EvolutionPolicy {

  override def evolve(dbName: DatabaseName,
                      tableName: TableName,
                      metastoreSchema: Schema,
                      inputSchema: Schema)
                     (implicit client: IMetaStoreClient): Try[Schema] = Try {
    HiveSchemas.toKafka(client.getTable(dbName.value, tableName.value))
  }.map { schema =>
    val compatible = schema.fields().asScala.forall { field =>
      inputSchema.field(field.name) != null ||
        field.schema().isOptional ||
        field.schema().defaultValue() != null
    }
    if (compatible) schema else sys.error("Input Schema is not compatible with the metastore")
  }
}
Example 9
Source File: StrictEvolutionPolicy.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink.evolution

import com.landoop.streamreactor.connect.hive.{DatabaseName, HiveSchemas, TableName}
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.kafka.connect.data.Schema

import scala.collection.JavaConverters._
import scala.util.Try

object StrictEvolutionPolicy extends EvolutionPolicy {

  override def evolve(dbName: DatabaseName,
                      tableName: TableName,
                      metastoreSchema: Schema,
                      inputSchema: Schema)
                     (implicit client: IMetaStoreClient): Try[Schema] = Try {
    val schema = HiveSchemas.toKafka(client.getTable(dbName.value, tableName.value))
    schema
  }.map { schema =>
    // Hive keeps the fields in lowercase
    val inputFields = inputSchema.fields().asScala.map { f =>
      f.name().toLowerCase()
    }.toSet

    schema.fields().asScala.foreach { field =>
      val exists = inputFields.contains(field.name)
      val optional = field.schema().isOptional
      val default = field.schema().defaultValue()
      val compatible = exists || optional || default != null
      if (!compatible) {
        sys.error(s"Input Schema is not compatible with the metastore for field [${field.name()}]")
      }
    }
    schema
  }
}
Example 10
Source File: HiveSinkState.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.sink

import com.landoop.streamreactor.connect.hive
import com.landoop.streamreactor.connect.hive._
import com.landoop.streamreactor.connect.hive.sink.config.TableOptions
import com.landoop.streamreactor.connect.hive.sink.mapper.{DropPartitionValuesMapper, MetastoreSchemaAlignMapper, ProjectionMapper}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.hadoop.hive.metastore.api.Table
import org.apache.kafka.connect.data.{Schema, Struct}

case class HiveSinkState(offsets: Map[TopicPartition, Offset],
                         committedOffsets: Map[TopicPartition, Offset],
                         table: Table,
                         tableLocation: Path,
                         plan: Option[PartitionPlan],
                         metastoreSchema: Schema,
                         mapper: Struct => Struct,
                         lastSchema: Schema) {

  def withTopicPartitionOffset(tpo: TopicPartitionOffset): HiveSinkState = {
    copy(offsets = offsets + (tpo.toTopicPartition -> tpo.offset))
  }

  def withTopicPartitionOffset(tp: TopicPartition, offset: Offset): HiveSinkState = {
    copy(offsets = offsets + (tp -> offset))
  }

  def withCommittedOffset(offsets: Map[TopicPartition, Offset]): HiveSinkState = {
    copy(committedOffsets = committedOffsets ++ offsets)
  }

  def withCommittedOffset(tp: TopicPartition, offset: Offset): HiveSinkState = {
    copy(committedOffsets = committedOffsets + (tp -> offset))
  }

  def withLastSchema(schema: Schema): HiveSinkState = copy(lastSchema = schema)
}

object HiveSinkState {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  def from(schema: Schema,
           table: TableOptions,
           dbName: DatabaseName)(implicit client: IMetaStoreClient, fs: FileSystem) = {
    logger.info(s"Init sink for schema $schema")

    val hiveTable = getOrCreateTable(table, dbName, schema)
    val tableLocation = new Path(hiveTable.getSd.getLocation)
    val plan = hive.partitionPlan(hiveTable)
    val metastoreSchema = table.evolutionPolicy
      .evolve(dbName, table.tableName, HiveSchemas.toKafka(hiveTable), schema)
      .getOrElse(sys.error(s"Unable to retrieve or evolve schema for $schema"))

    val mapperFns: Seq[Struct => Struct] = Seq(
      table.projection.map(new ProjectionMapper(_)),
      Some(new MetastoreSchemaAlignMapper(metastoreSchema)),
      plan.map(new DropPartitionValuesMapper(_))
    ).flatten.map(mapper => mapper.map _)

    val mapper = Function.chain(mapperFns)

    HiveSinkState(Map.empty, Map.empty, hiveTable, tableLocation, plan, metastoreSchema, mapper, schema)
  }

  def getOrCreateTable(table: TableOptions, dbName: DatabaseName, schema: Schema)
                      (implicit client: IMetaStoreClient, fs: FileSystem): Table = {

    def create: Table = {
      val partstring = if (table.partitions.isEmpty) "<no-partitions>" else table.partitions.mkString(",")
      logger.info(s"Creating table in hive [${dbName.value}.${table.tableName.value}, partitions=$partstring]")
      hive.createTable(dbName, table.tableName, schema, table.partitions, table.location, table.format)
    }

    logger.debug(s"Fetching or creating table ${dbName.value}.${table.tableName.value}")

    client.tableExists(dbName.value, table.tableName.value) match {
      case true if table.overwriteTable =>
        hive.dropTable(dbName, table.tableName, true)
        create
      case true => client.getTable(dbName.value, table.tableName.value)
      case false if table.createTable => create
      case false => throw new RuntimeException(s"Table ${dbName.value}.${table.tableName.value} does not exist")
    }
  }
}
Example 11
Source File: domain.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive

import cats.Show
import cats.data.NonEmptyList
import org.apache.hadoop.fs.Path
import org.apache.kafka.common.{TopicPartition => KafkaTopicPartition}
import org.apache.kafka.connect.data.Schema

case class Topic(value: String) {
  require(value != null && value.trim.nonEmpty)
}

case class Offset(value: Long) {
  require(value >= 0)
}

case class TopicPartition(topic: Topic, partition: Int) {
  def withOffset(offset: Offset): TopicPartitionOffset = TopicPartitionOffset(topic, partition, offset)
  def toKafka = new KafkaTopicPartition(topic.value, partition)
}

case class TopicPartitionOffset(topic: Topic, partition: Int, offset: Offset) {
  def toTopicPartition = TopicPartition(topic, partition)
}

case class DatabaseName(value: String) {
  require(value != null && value.trim.nonEmpty)
}

case class TableName(value: String) {
  require(value != null && value.trim.nonEmpty)
}

// contains all the partition keys for a particular table
case class PartitionPlan(tableName: TableName, keys: NonEmptyList[PartitionKey])

// contains a partition key, which you can think of as like a partition column name
case class PartitionKey(value: String)

// defines a partition key field
case class PartitionField(name: String, schema: Schema = Schema.STRING_SCHEMA, comment: Option[String] = None) {
  require(name != null && name.trim.nonEmpty)
}

// contains a single partition in a table, that is one set of unique values, one per partition key
case class Partition(entries: NonEmptyList[(PartitionKey, String)], location: Option[Path])

case class Serde(serializationLib: String, inputFormat: String, outputFormat: String, params: Map[String, String])

// generates the default hive metastore location string for a partition
object DefaultPartitionLocation extends Show[Partition] {
  override def show(t: Partition): String = {
    t.entries.map { case (key, value) => key.value + "=" + value }.toList.mkString("/")
  }
}
Example 12
Source File: package.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.Schema
import org.apache.orc.OrcFile.EncodingStrategy
import org.apache.orc._

package object orc {

  def createOrcWriter(path: Path, schema: TypeDescription, config: OrcSinkConfig)
                     (implicit fs: FileSystem): Writer = {

    val options = OrcFile.writerOptions(null, fs.getConf).setSchema(schema)

    options.compress(config.compressionKind)
    options.encodingStrategy(config.encodingStrategy)
    options.blockPadding(config.blockPadding)
    options.version(OrcFile.Version.V_0_12)

    config.bloomFilterColumns.map(_.mkString(",")).foreach(options.bloomFilterColumns)
    config.rowIndexStride.foreach(options.rowIndexStride)
    config.blockSize.foreach(options.blockSize)
    config.stripeSize.foreach(options.stripeSize)

    if (config.overwrite && fs.exists(path)) fs.delete(path, false)

    OrcFile.createWriter(path, options)
  }

  def source(path: Path, config: OrcSourceConfig)(implicit fs: FileSystem) = new OrcSource(path, config)

  def sink(path: Path, schema: Schema, config: OrcSinkConfig)(implicit fs: FileSystem) = new OrcSink(path, schema, config)
}

case class OrcSourceConfig()

case class OrcSinkConfig(overwrite: Boolean = false,
                         batchSize: Int = 1024, // orc default is 1024
                         encodingStrategy: EncodingStrategy = EncodingStrategy.COMPRESSION,
                         compressionKind: CompressionKind = CompressionKind.SNAPPY,
                         blockPadding: Boolean = true,
                         blockSize: Option[Long] = None,
                         stripeSize: Option[Long] = None,
                         bloomFilterColumns: Seq[String] = Nil,
                         rowIndexStride: Option[Int] = None)
Example 13
Source File: NestedGroupConverter.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.parquet

import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.{Field, Schema}
import org.apache.parquet.io.api.{Converter, GroupConverter}

import scala.collection.JavaConverters._

class NestedGroupConverter(schema: Schema,
                           field: Field,
                           parentBuilder: scala.collection.mutable.Map[String, Any])
  extends GroupConverter with StrictLogging {

  private[parquet] val builder = scala.collection.mutable.Map.empty[String, Any]
  private val converters = schema.fields.asScala.map(Converters.get(_, builder)).toIndexedSeq

  override def getConverter(k: Int): Converter = converters(k)
  override def start(): Unit = builder.clear()
  override def end(): Unit = parentBuilder.put(field.name, builder.result)
}
Example 14
Source File: OrcSchemas.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.orc

import com.landoop.streamreactor.connect.hive.UnsupportedSchemaType
import org.apache.kafka.connect.data.{Decimal, Schema, SchemaBuilder}
import org.apache.orc.TypeDescription
import org.apache.orc.TypeDescription.Category

import scala.collection.JavaConverters._

object OrcSchemas {

  def toKafka(schema: TypeDescription): Schema = schema.getCategory match {
    case Category.BOOLEAN => Schema.OPTIONAL_BOOLEAN_SCHEMA
    case Category.BYTE => Schema.OPTIONAL_INT8_SCHEMA
    case Category.DOUBLE => Schema.OPTIONAL_FLOAT64_SCHEMA
    case Category.INT => Schema.OPTIONAL_INT32_SCHEMA
    case Category.FLOAT => Schema.OPTIONAL_FLOAT32_SCHEMA
    case Category.LONG => Schema.OPTIONAL_INT64_SCHEMA
    case Category.SHORT => Schema.OPTIONAL_INT16_SCHEMA
    case Category.STRING => Schema.OPTIONAL_STRING_SCHEMA
    case Category.VARCHAR => Schema.OPTIONAL_STRING_SCHEMA
    case Category.CHAR => Schema.OPTIONAL_STRING_SCHEMA
    case Category.DATE => Schema.OPTIONAL_STRING_SCHEMA
    case Category.TIMESTAMP => Schema.OPTIONAL_STRING_SCHEMA
    // ORC binary maps to Connect BYTES; the listing showed an unreachable duplicate Category.BYTE case here
    case Category.BINARY => Schema.OPTIONAL_BYTES_SCHEMA
    case Category.STRUCT => toKafkaStruct(schema)
  }

  def toKafkaStruct(schema: TypeDescription): Schema = {
    import scala.collection.JavaConverters._
    val builder = SchemaBuilder.struct().name("from_orc")
    schema.getFieldNames.asScala.zipWithIndex.foreach { case (field, k) =>
      builder.field(field, toKafka(schema.getChildren.get(k)))
    }
    builder.build()
  }

  def toOrc(schema: Schema): TypeDescription = {
    schema.`type`() match {
      case Schema.Type.STRING if schema.name() == Decimal.LOGICAL_NAME => TypeDescription.createDecimal()
      case Schema.Type.STRING => TypeDescription.createString()
      case Schema.Type.BOOLEAN => TypeDescription.createBoolean()
      case Schema.Type.FLOAT32 => TypeDescription.createFloat()
      case Schema.Type.FLOAT64 => TypeDescription.createDouble()
      case Schema.Type.INT8 => TypeDescription.createByte()
      case Schema.Type.INT16 => TypeDescription.createShort()
      case Schema.Type.INT32 => TypeDescription.createInt()
      case Schema.Type.INT64 => TypeDescription.createLong()
      case Schema.Type.BYTES if schema.name() == Decimal.LOGICAL_NAME => TypeDescription.createDecimal()
      case Schema.Type.BYTES => TypeDescription.createBinary()
      case Schema.Type.ARRAY => TypeDescription.createList(toOrc(schema.valueSchema()))
      case Schema.Type.MAP => TypeDescription.createMap(toOrc(schema.keySchema()), toOrc(schema.valueSchema()))
      case Schema.Type.STRUCT =>
        schema.fields().asScala.foldLeft(TypeDescription.createStruct) { case (struct, field) =>
          struct.addField(field.name, toOrc(field.schema))
        }
      case unsupportedDataType => throw UnsupportedSchemaType(unsupportedDataType.toString)
    }
  }
}
Example 15
Source File: OrcHiveFormat.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, OrcSourceConfig, Serde}
import com.landoop.streamreactor.connect.hive.orc.OrcSink
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.util.Try

object OrcHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.orc.OrcSerde",
    "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
    "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat",
    Map("org.apache.hadoop.hive.ql.io.orc.OrcSerde" -> "1")
  )

  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {

    logger.debug(s"Creating orc writer at $path")

    val sink: OrcSink = com.landoop.streamreactor.connect.hive.orc.sink(path, schema, OrcSinkConfig(overwrite = true))
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTimestamp: Long = System.currentTimeMillis()
    var lastKnownFileSize: Long = fs.getFileStatus(path).getLen
    var readFileSize = false
    var count = 0

    override def write(struct: Struct): Long = {
      sink.write(struct)
      count = count + 1
      readFileSize = true
      count
    }

    override def close(): Unit = {
      logger.debug(s"Closing orc writer at path $path")
      sink.close()
    }

    override def file: Path = path
    override def currentCount: Long = count
    override def createdTime: Long = createdTimestamp

    override def fileSize: Long = {
      if (readFileSize) {
        lastKnownFileSize = fs.getFileStatus(path).getLen
        readFileSize = false
      }
      lastKnownFileSize
    }
  }

  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {

    logger.debug(s"Creating orc reader for $path with offset $startAt")

    val reader = com.landoop.streamreactor.connect.hive.orc.source(path, OrcSourceConfig())
    var offset = startAt

    override def iterator: Iterator[Record] = reader.iterator.map { struct =>
      val record = Record(struct, path, offset)
      offset = offset + 1
      record
    }

    override def close(): Unit = reader.close()
  }
}
Example 16
Source File: ParquetHiveFormat.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.Serde
import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.ParquetWriter

import scala.util.Try

object ParquetHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
    Map("serialization.format" -> "1")
  )

  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {

    logger.debug(s"Creating parquet writer at $path")

    val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTimestamp: Long = System.currentTimeMillis()
    var lastKnownFileSize: Long = fs.getFileStatus(path).getLen
    var readFileSize = false
    var count = 0

    override def write(struct: Struct): Long = {
      writer.write(struct)
      count = count + 1
      readFileSize = true
      count
    }

    override def close(): Unit = {
      logger.debug(s"Closing writer at path $path")
      writer.close()
    }

    override def currentCount: Long = count
    override def file: Path = path
    override def createdTime: Long = createdTimestamp

    override def fileSize: Long = {
      if (readFileSize) {
        lastKnownFileSize = fs.getFileStatus(path).getLen
        readFileSize = false
      }
      lastKnownFileSize
    }
  }

  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {

    logger.debug(s"Creating parquet reader for $path with offset $startAt")

    val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path)
    var offset = startAt

    override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct =>
      val record = Record(struct, path, offset)
      offset = offset + 1
      record
    }

    override def close(): Unit = reader.close()
  }
}
Example 17
Source File: RedisStreamTest.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.redis.sink.writer

/*
 * Copyright 2017 Datamountaineer.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util

import com.datamountaineer.streamreactor.connect.redis.sink.RedisSinkTask
import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.{Jedis, StreamEntryID}

import scala.collection.JavaConverters._

class RedisStreamTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {

  // val redisServer = new RedisServer(6379)
  //
  // override def beforeAll() = redisServer.start()
  //
  // override def afterAll() = redisServer.stop()

  "Redis Stream writer" should {
    "write Kafka records to a Redis Stream" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"INSERT INTO stream1 SELECT * from $TOPIC STOREAS STREAM"
      println("Testing KCQL : " + KCQL)
      val props = Map(
        RedisConfigConstants.REDIS_HOST -> "localhost",
        RedisConfigConstants.REDIS_PORT -> "6379",
        RedisConfigConstants.KCQL_CONFIG -> KCQL,
        RedisConfigConstants.REDIS_PASSWORD -> ""
      ).asJava

      val config = RedisConfig(props)
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisStreams(settings)

      val schema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA).build()

      val struct1 = new Struct(schema).put("type", "Xeon").put("temperature", 60.4).put("voltage", 90.1).put("ts", 1482180657010L)

      val sinkRecord1 = new SinkRecord(TOPIC, 0, null, null, schema, struct1, 1)

      val jedis = mock[Jedis]
      writer.jedis = jedis

      val map = new util.HashMap[String, String]()
      map.put("type", "Xeon")
      map.put("temperature", "60.4")
      map.put("voltage", "90.1")
      map.put("ts", 1482180657010L.toString)

      when(jedis.auth("")).isLenient()
      when(jedis.xadd("stream1", null, map)).thenReturn(mock[StreamEntryID])
      writer.initialize(1, settings.errorPolicy)

      writer.write(Seq(sinkRecord1))
    }
  }
}
Example 18
Source File: RedisPubSubTest.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.{Jedis, JedisPubSub}
import redis.embedded.RedisServer

import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer

class RedisPubSubTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {

  val redisServer = new RedisServer(6379)

  override def beforeAll() = redisServer.start()

  override def afterAll() = redisServer.stop()

  "Redis PUBSUB writer" should {
    "write Kafka records to a Redis PubSub" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"SELECT * from $TOPIC STOREAS PubSub (channel=type)"
      println("Testing KCQL : " + KCQL)
      val props = Map(
        RedisConfigConstants.REDIS_HOST -> "localhost",
        RedisConfigConstants.REDIS_PORT -> "6379",
        RedisConfigConstants.KCQL_CONFIG -> KCQL
      ).asJava

      val config = RedisConfig(props)
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisPubSub(settings)
      writer.createClient(settings)

      val schema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA).build()

      val struct1 = new Struct(schema).put("type", "Xeon").put("temperature", 60.4).put("voltage", 90.1).put("ts", 1482180657010L)
      val struct2 = new Struct(schema).put("type", "i7").put("temperature", 62.1).put("voltage", 103.3).put("ts", 1482180657020L)
      val struct3 = new Struct(schema).put("type", "i7-i").put("temperature", 64.5).put("voltage", 101.1).put("ts", 1482180657030L)

      val sinkRecord1 = new SinkRecord(TOPIC, 0, null, null, schema, struct1, 1)
      val sinkRecord2 = new SinkRecord(TOPIC, 0, null, null, schema, struct2, 2)
      val sinkRecord3 = new SinkRecord(TOPIC, 0, null, null, schema, struct3, 3)

      val jedis = new Jedis(connectionInfo.host, connectionInfo.port)
      // Clean up in-memory jedis
      jedis.flushAll()

      val messagesMap = collection.mutable.Map[String, ListBuffer[String]]()

      val t = new Thread {
        private val pubsub = new JedisPubSub {
          override def onMessage(channel: String, message: String): Unit = {
            messagesMap.get(channel) match {
              case Some(msgs) => messagesMap.put(channel, msgs += message)
              case None => messagesMap.put(channel, ListBuffer(message))
            }
          }
        }

        override def run(): Unit = {
          jedis.subscribe(pubsub, "Xeon", "i7", "i7-i")
        }

        override def interrupt(): Unit = {
          pubsub.punsubscribe("*")
          super.interrupt()
        }
      }
      t.start()
      t.join(5000)
      if (t.isAlive) t.interrupt()

      writer.write(Seq(sinkRecord1))
      writer.write(Seq(sinkRecord2, sinkRecord3))

      messagesMap.size shouldBe 3
      messagesMap("Xeon").head shouldBe """{"type":"Xeon","temperature":60.4,"voltage":90.1,"ts":1482180657010}"""
      messagesMap("i7").head shouldBe """{"type":"i7","temperature":62.1,"voltage":103.3,"ts":1482180657020}"""
      messagesMap("i7-i").head shouldBe """{"type":"i7-i","temperature":64.5,"voltage":101.1,"ts":1482180657030}"""
    }
  }
}
Example 19
Source File: RedisInsertSortedSetTest.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.Jedis
import redis.embedded.RedisServer

import scala.collection.JavaConverters._

class RedisInsertSortedSetTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {

  val redisServer = new RedisServer(6379)

  override def beforeAll() = redisServer.start()

  override def afterAll() = redisServer.stop()

  "Redis INSERT into Sorted Set (SS) writer" should {
    "write Kafka records to a Redis Sorted Set" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"INSERT INTO cpu_stats SELECT * from $TOPIC STOREAS SortedSet(score=ts)"
      println("Testing KCQL : " + KCQL)
      val props = Map(
        RedisConfigConstants.REDIS_HOST -> "localhost",
        RedisConfigConstants.REDIS_PORT -> "6379",
        RedisConfigConstants.KCQL_CONFIG -> KCQL
      ).asJava

      val config = RedisConfig(props)
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisInsertSortedSet(settings)
      writer.createClient(settings)

      val schema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA).build()

      val struct1 = new Struct(schema).put("type", "Xeon").put("temperature", 60.4).put("voltage", 90.1).put("ts", 1482180657010L)
      val struct2 = new Struct(schema).put("type", "i7").put("temperature", 62.1).put("voltage", 103.3).put("ts", 1482180657020L)
      val struct3 = new Struct(schema).put("type", "i7-i").put("temperature", 64.5).put("voltage", 101.1).put("ts", 1482180657030L)

      val sinkRecord1 = new SinkRecord(TOPIC, 0, null, null, schema, struct1, 1)
      val sinkRecord2 = new SinkRecord(TOPIC, 0, null, null, schema, struct2, 2)
      val sinkRecord3 = new SinkRecord(TOPIC, 0, null, null, schema, struct3, 3)

      val jedis = new Jedis(connectionInfo.host, connectionInfo.port)
      // Clean up in-memory jedis
      jedis.flushAll()

      writer.write(Seq(sinkRecord1))
      writer.write(Seq(sinkRecord2, sinkRecord3))

      // Redis cardinality should now be 3
      jedis.zcard("cpu_stats") shouldBe 3

      val allSSrecords = jedis.zrange("cpu_stats", 0, 999999999999L)
      val results = allSSrecords.asScala.toList
      results.head shouldBe """{"type":"Xeon","temperature":60.4,"voltage":90.1,"ts":1482180657010}"""
      results(1) shouldBe """{"type":"i7","temperature":62.1,"voltage":103.3,"ts":1482180657020}"""
      results(2) shouldBe """{"type":"i7-i","temperature":64.5,"voltage":101.1,"ts":1482180657030}"""
    }
  }
}
Example 20
Source File: RedisFieldsKeyBuilder.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.rowkeys.StringKeyBuilder
import org.apache.kafka.connect.data.{Field, Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.annotation.tailrec
import scala.collection.JavaConverters._

  override def build(record: SinkRecord): String = {
    val struct: Struct = record.value.asInstanceOf[Struct]
    val schema: Schema = struct.schema

    def extractAvailableFieldNames(schema: Schema): Seq[String] = {
      if (schema.`type` == Schema.Type.STRUCT) {
        val fields = schema.fields
        fields.asScala.map(_.name) ++ fields.asScala.flatMap { f =>
          extractAvailableFieldNames(f.schema).map(name => f.name + "." + name)
        }
      } else Seq.empty
    }

    val availableFields = extractAvailableFieldNames(schema)
    val missingKeys = keys.filterNot(availableFields.contains)
    require(
      missingKeys.isEmpty,
      s"${missingKeys.mkString(",")} keys are not present in the SinkRecord payload: ${availableFields.mkString(", ")}"
    )

    def getValue(key: String): AnyRef = {
      @tailrec
      def findValue(keyParts: List[String], obj: AnyRef): Option[AnyRef] = (obj, keyParts) match {
        case (f: Field, k :: tail) => findValue(tail, f.schema.field(k))
        case (s: Struct, k :: tail) => findValue(tail, s.get(k))
        case (v, _) => Option(v)
      }

      findValue(key.split('.').toList, struct).getOrElse {
        throw new IllegalArgumentException(
          s"$key field value is null. Non null value is required for the fields creating the row key"
        )
      }
    }

    keys.map(getValue).mkString(pkDelimiter)
  }
}
Example 21
Source File: PulsarWriterTest.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.pulsar.sink

import com.datamountaineer.streamreactor.connect.pulsar.ProducerConfigFactory
import com.datamountaineer.streamreactor.connect.pulsar.config.{PulsarConfigConstants, PulsarSinkConfig, PulsarSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.pulsar.client.api.{Message, MessageId, Producer, PulsarClient}
import org.mockito.ArgumentMatchers.any
import org.mockito.MockitoSugar
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._

class PulsarWriterTest extends AnyWordSpec with MockitoSugar with Matchers {

  val pulsarTopic = "persistent://landoop/standalone/connect/kafka-topic"

  def getSchema: Schema = {
    SchemaBuilder.struct
      .field("int8", SchemaBuilder.int8().defaultValue(2.toByte).doc("int8 field").build())
      .field("int16", Schema.INT16_SCHEMA)
      .field("int32", Schema.INT32_SCHEMA)
      .field("int64", Schema.INT64_SCHEMA)
      .field("float32", Schema.FLOAT32_SCHEMA)
      .field("float64", Schema.FLOAT64_SCHEMA)
      .field("boolean", Schema.BOOLEAN_SCHEMA)
      .field("string", Schema.STRING_SCHEMA)
      .build()
  }

  def getStruct(schema: Schema): Struct = {
    new Struct(schema)
      .put("int8", 12.toByte)
      .put("int16", 12.toShort)
      .put("int32", 12)
      .put("int64", 12L)
      .put("float32", 12.2f)
      .put("float64", 12.2)
      .put("boolean", true)
      .put("string", "foo")
  }

  "should write messages" in {

    val config = PulsarSinkConfig(Map(
      PulsarConfigConstants.HOSTS_CONFIG -> "pulsar://localhost:6650",
      PulsarConfigConstants.KCQL_CONFIG -> s"INSERT INTO $pulsarTopic SELECT * FROM kafka_topic BATCH = 10 WITHPARTITIONER = SinglePartition WITHCOMPRESSION = ZLIB WITHDELAY = 1000"
    ).asJava)

    val schema = getSchema
    val struct = getStruct(schema)
    val record1 = new SinkRecord("kafka_topic", 0, null, null, schema, struct, 1)

    val settings = PulsarSinkSettings(config)
    val producerConfig = ProducerConfigFactory("test", settings.kcql)

    val client = mock[PulsarClient]
    val producer = mock[Producer]
    val messageId = mock[MessageId]

    when(client.createProducer(pulsarTopic, producerConfig(pulsarTopic))).thenReturn(producer)
    when(producer.send(any[Message])).thenReturn(messageId)

    val writer = PulsarWriter(client, "test", settings)
    writer.write(List(record1))
  }
}
Example 22
Source File: ChangeFeedStructBuilder.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.rethink.source

import com.fasterxml.jackson.databind.ObjectMapper
import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object ChangeFeedStructBuilder extends StrictLogging {

  val mapper = new ObjectMapper()
  val oldVal = "old_val"
  val newVal = "new_val"
  val state = "state"
  val `type` = "type"

  val schema: Schema = SchemaBuilder.struct.name("ReThinkChangeFeed")
    .version(1)
    .field(state, Schema.OPTIONAL_STRING_SCHEMA)
    .field(oldVal, Schema.OPTIONAL_STRING_SCHEMA)
    .field(newVal, Schema.OPTIONAL_STRING_SCHEMA)
    .field(`type`, Schema.OPTIONAL_STRING_SCHEMA)
    .build

  def apply(hm: Map[String, Object]): Struct = {
    val struct = new Struct(schema)
    hm.foreach({ case (k, v) => if (v != null) struct.put(k, v.toString) })
    struct
  }
}
Example 23
Source File: ConnectSchema.scala, from stream-reactor (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.bloomberg

import org.apache.kafka.connect.data.{Schema, SchemaBuilder}

import scala.collection.JavaConverters._

  def createSchema(name: String, value: Any): Schema = {
    value match {
      case _: Boolean => Schema.BOOLEAN_SCHEMA
      case _: Int => Schema.INT32_SCHEMA
      case _: Long => Schema.INT64_SCHEMA
      case _: Double => Schema.FLOAT64_SCHEMA
      case _: Char => Schema.STRING_SCHEMA
      case _: String => Schema.STRING_SCHEMA
      case _: Float => Schema.FLOAT32_SCHEMA
      case list: java.util.List[_] =>
        val firstItemSchema = if (list.isEmpty) Schema.OPTIONAL_STRING_SCHEMA else createSchema(name, list.get(0))
        SchemaBuilder.array(firstItemSchema).build()
      case map: java.util.LinkedHashMap[String @unchecked, _] =>
        val recordBuilder = SchemaBuilder.struct()
        recordBuilder.name(name)
        map.entrySet().asScala.foreach(kvp => recordBuilder.field(kvp.getKey, createSchema(kvp.getKey, kvp.getValue)))
        recordBuilder.build()
      case v => sys.error(s"${v.getClass} is not handled.")
    }
  }
}

object ConnectSchema {
  val namespace = "com.datamountaineer.streamreactor.connect.bloomberg"

  val connectSchema = new ConnectSchema(namespace)

  implicit class BloombergDataToConnectSchema(val data: BloombergData) {
    def getConnectSchema: Schema = {
      connectSchema.createSchema("BloombergData", data.data)
    }
  }
}
Example 24
Source File: HiveSchemaTest.scala, from stream-reactor (Apache License 2.0)

package com.landoop.streamreactor.hive.it

import java.util.concurrent.TimeUnit

import com.landoop.streamreactor.connect.hive.{DatabaseName, TableName}
import org.apache.kafka.connect.data.Schema
import org.scalatest.concurrent.Eventually
import org.scalatest.matchers.should.Matchers
import org.scalatest.time.{Millis, Span}
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._
import scala.io.Source
import scala.util.Random

class HiveSchemaTest extends AnyWordSpec with Matchers with PersonTestData with Eventually with HiveTests {

  private implicit val patience: PatienceConfig = PatienceConfig(Span(60000, Millis), Span(5000, Millis))

  case class Foo(s: String, l: Long, b: Boolean, d: Double)
  def foo = Foo("string", Random.nextLong, Random.nextBoolean, Random.nextDouble)

  "Hive" should {
    "create correct schema for table" in {

      val topic = createTopic()
      val taskDef = Source.fromInputStream(getClass.getResourceAsStream("/hive_sink_task_no_partitions.json")).getLines().mkString("\n")
        .replace("{{TOPIC}}", topic)
        .replace("{{TABLE}}", topic)
        .replace("{{NAME}}", topic)
      postTask(taskDef)

      val producer = stringStringProducer()
      writeRecords(producer, topic, JacksonSupport.mapper.writeValueAsString(foo), 2000)
      producer.close(30, TimeUnit.SECONDS)

      // wait for some data to have been flushed
      eventually {
        withConn { conn =>
          val stmt = conn.createStatement
          val rs = stmt.executeQuery(s"select count(*) FROM $topic")
          rs.next()
          rs.getLong(1) should be > 0L
        }
      }

      // check that the schema is correct
      val schema = com.landoop.streamreactor.connect.hive.schema(DatabaseName("default"), TableName(topic))
      schema.fields().asScala.map(_.name).toSet shouldBe Set("s", "b", "l", "d")
      schema.field("s").schema().`type`() shouldBe Schema.Type.STRING
      schema.field("l").schema().`type`() shouldBe Schema.Type.INT64
      schema.field("d").schema().`type`() shouldBe Schema.Type.FLOAT64
      schema.field("b").schema().`type`() shouldBe Schema.Type.BOOLEAN

      stopTask(topic)
    }
  }
}
Example 25
Source File: SinkRecordToJson.scala, from kafka-connect-common (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.converters.source

import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import com.fasterxml.jackson.databind.ObjectMapper
import com.landoop.json.sql.JacksonJson
import org.apache.kafka.connect.data.Schema
import org.apache.kafka.connect.sink.SinkRecord
import org.json4s.jackson.JsonMethods._

import scala.util.Try

object SinkRecordToJson extends ConverterUtil {

  private val mapper = new ObjectMapper()

  def apply(record: SinkRecord,
            fields: Map[String, Map[String, String]],
            ignoreFields: Map[String, Set[String]]): String = {

    val schema = record.valueSchema()
    val value = record.value()

    if (schema == null) {
      if (value == null) {
        throw new IllegalArgumentException(
          s"The sink record value is null.(topic=${record.topic()} partition=${record.kafkaPartition()} offset=${record.kafkaOffset()})".stripMargin)
      }
      // try to take it as string
      value match {
        case map: java.util.Map[_, _] =>
          val extracted = convertSchemalessJson(record,
            fields.getOrElse(record.topic(), Map.empty),
            ignoreFields.getOrElse(record.topic(), Set.empty))
            .asInstanceOf[java.util.Map[String, Any]]
          // not ideal; but the implementation is hashmap anyway
          mapper.writeValueAsString(extracted)

        case other => sys.error(
          s"""
             |For schemaless record only String and Map types are supported. Class =${Option(other).map(_.getClass.getCanonicalName).getOrElse("unknown(null value)}")}
             |Record info:
             |topic=${record.topic()} partition=${record.kafkaPartition()} offset=${record.kafkaOffset()}
             |${Try(JacksonJson.toJson(value)).getOrElse("")}""".stripMargin)
      }
    } else {
      schema.`type`() match {
        case Schema.Type.STRING =>
          val extracted = convertStringSchemaAndJson(record,
            fields.getOrElse(record.topic(), Map.empty),
            ignoreFields.getOrElse(record.topic(), Set.empty))
          compact(render(extracted))

        case Schema.Type.STRUCT =>
          val extracted = convert(record,
            fields.getOrElse(record.topic(), Map.empty),
            ignoreFields.getOrElse(record.topic(), Set.empty))
          simpleJsonConverter.fromConnectData(extracted.valueSchema(), extracted.value()).toString

        case other => sys.error(s"$other schema is not supported")
      }
    }
  }
}
Example 26
Source File: StructFieldExtractorTest.scala, from kafka-connect-common (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.schemas

import org.apache.kafka.connect.data.{Date, Schema, SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class StructFieldExtractorTest extends AnyWordSpec with Matchers {

  "StructFieldExtractor" should {
    "return all the fields and their bytes value" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = new StructFieldsExtractor(true, Map.empty).get(struct).toMap

      map.get("firstName").get shouldBe "Alex"
      map.get("lastName").get shouldBe "Smith"
      map.get("age").get shouldBe 30
    }

    "return all fields and apply the mapping" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = new StructFieldsExtractor(true, Map("lastName" -> "Name", "age" -> "a")).get(struct).toMap

      map.get("firstName").get shouldBe "Alex"
      map.get("Name").get shouldBe "Smith"
      map.get("a").get shouldBe 30
    }

    "return only the specified fields" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = new StructFieldsExtractor(false, Map("lastName" -> "Name", "age" -> "age")).get(struct).toMap

      map.get("Name").get shouldBe "Smith"
      map.get("age").get shouldBe 30
      map.size shouldBe 2
    }
  }

  "handle Date fields" in {
    val dateSchema = Date.builder().build()
    val schema = SchemaBuilder.struct().name("com.example.Person")
      .field("firstName", Schema.STRING_SCHEMA)
      .field("lastName", Schema.STRING_SCHEMA)
      .field("age", Schema.INT32_SCHEMA)
      .field("date", dateSchema).build()

    val date = java.sql.Date.valueOf("2017-04-25")
    val struct = new Struct(schema)
      .put("firstName", "Alex")
      .put("lastName", "Smith")
      .put("age", 30)
      .put("date", date)

    val map1 = new StructFieldsExtractor(false, Map("date" -> "date")).get(struct).toMap
    map1.get("date").get shouldBe date
    map1.size shouldBe 1

    val d = Date.toLogical(dateSchema, 10000)
    struct.put("date", d)

    val map2 = new StructFieldsExtractor(false, Map("date" -> "date")).get(struct).toMap
    map2.get("date").get shouldBe d
    map2.size shouldBe 1
  }
}
Example 27
Source File: TestUtilsBase.scala, from kafka-connect-common (Apache License 2.0)

package com.datamountaineer.streamreactor.connect

import java.util
import java.util.Collections

import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.kafka.connect.source.SourceTaskContext
import org.apache.kafka.connect.storage.OffsetStorageReader
import org.mockito.Mockito._
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfter
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._

    //set up partition
    val partition: util.Map[String, String] = Collections.singletonMap(lookupPartitionKey, table)
    //as a list to search for
    val partitionList: util.List[util.Map[String, String]] = List(partition).asJava
    //set up the offset
    val offset: util.Map[String, Object] = (Collections.singletonMap(offsetColumn, offsetValue))
    //create offsets to initialize from
    val offsets: util.Map[util.Map[String, String], util.Map[String, Object]] = Map(partition -> offset).asJava

    //mock out reader and task context
    val taskContext = mock[SourceTaskContext]
    val reader = mock[OffsetStorageReader]
    when(reader.offsets(partitionList)).thenReturn(offsets)
    when(taskContext.offsetStorageReader()).thenReturn(reader)
    taskContext
  }
}
Example 28
Source File: BytesConverterTest.scala, from kafka-connect-common (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.converters.source

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import org.apache.kafka.connect.data.Schema
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class BytesConverterTest extends AnyWordSpec with Matchers {
  private val converter = new BytesConverter()
  private val topic = "topicA"

  "BytesConverter" should {
    "handle null payloads" in {
      val sourceRecord = converter.convert(topic, "somesource", "100", null)

      sourceRecord.keySchema() shouldBe MsgKey.schema
      sourceRecord.key() shouldBe MsgKey.getStruct("somesource", "100")
      sourceRecord.valueSchema() shouldBe Schema.BYTES_SCHEMA
      sourceRecord.value() shouldBe null
    }

    "handle non-null payloads" in {
      val expectedPayload: Array[Byte] = Array(245, 2, 10, 200, 22, 0, 0, 11).map(_.toByte)
      val sourceRecord = converter.convert(topic, "somesource", "1001", expectedPayload)

      sourceRecord.keySchema() shouldBe MsgKey.schema
      sourceRecord.key() shouldBe MsgKey.getStruct("somesource", "1001")
      sourceRecord.valueSchema() shouldBe Schema.BYTES_SCHEMA
      sourceRecord.value() shouldBe expectedPayload
    }
  }
}
Example 29
Source File: BytesConverterTest.scala, from kafka-connect-common (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.converters.sink

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import org.apache.kafka.connect.data.Schema
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class BytesConverterTest extends AnyWordSpec with Matchers {
  private val converter = new BytesConverter()
  private val topic = "topicA"

  "Sink BytesConverter" should {
    "handle null payloads" in {
      val sinkRecord = converter.convert(topic, null)

      sinkRecord.keySchema() shouldBe null
      sinkRecord.key() shouldBe null
      sinkRecord.valueSchema() shouldBe Schema.BYTES_SCHEMA
      sinkRecord.value() shouldBe null
    }

    "handle non-null payloads" in {
      val expectedPayload: Array[Byte] = Array(245, 2, 10, 200, 22, 0, 0, 11).map(_.toByte)
      val data = new SinkRecord(topic, 0, null, "keyA", null, expectedPayload, 0)
      val sinkRecord = converter.convert(topic, data)

      sinkRecord.keySchema() shouldBe MsgKey.schema
      sinkRecord.key() shouldBe MsgKey.getStruct("topicA", "keyA")
      sinkRecord.valueSchema() shouldBe Schema.BYTES_SCHEMA
      sinkRecord.value() shouldBe expectedPayload
    }
  }
}
Example 30
Source File: StringStructFieldsStringKeyBuilderTest.scala, from kafka-connect-common (Apache License 2.0)

package com.datamountaineer.streamreactor.connect.sink

import com.datamountaineer.streamreactor.connect.rowkeys.StringStructFieldsStringKeyBuilder
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class StringStructFieldsStringKeyBuilderTest extends AnyWordSpec with Matchers {
  "StructFieldsStringKeyBuilder" should {
    "raise an exception if the field is not present in the struct" in {
      intercept[IllegalArgumentException] {
        val schema = SchemaBuilder.struct().name("com.example.Person")
          .field("firstName", Schema.STRING_SCHEMA)
          .field("age", Schema.INT32_SCHEMA)
          .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

        val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

        val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
        StringStructFieldsStringKeyBuilder(Seq("threshold")).build(sinkRecord)
      }
    }

    "create the row key based on one single field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StringStructFieldsStringKeyBuilder(Seq("firstName")).build(sinkRecord) shouldBe "Alex"
    }

    "create the row key based on one single field with doc in the struct" in {
      val firstNameSchema = SchemaBuilder.`type`(Schema.Type.STRING).doc("first name")
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", firstNameSchema)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StringStructFieldsStringKeyBuilder(Seq("firstName")).build(sinkRecord) shouldBe "Alex"
    }

    "create the row key based on more than one field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StringStructFieldsStringKeyBuilder(Seq("firstName", "age")).build(sinkRecord) shouldBe "Alex.30"
    }
  }
}
Example 31
Source File: StringGenericRowKeyBuilderTest.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.sink import com.datamountaineer.streamreactor.connect.rowkeys.StringGenericRowKeyBuilder import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.sink.SinkRecord import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class StringGenericRowKeyBuilderTest extends AnyWordSpec with Matchers { "StringGenericRowKeyBuilder" should { "use the topic, partition and offset to make the key" in { val topic = "sometopic" val partition = 2 val offset = 1243L val sinkRecord = new SinkRecord(topic, partition, Schema.INT32_SCHEMA, 345, Schema.STRING_SCHEMA, "", offset) val keyBuilder = new StringGenericRowKeyBuilder() val expected = Seq(topic, partition, offset).mkString("|") keyBuilder.build(sinkRecord) shouldBe expected } } }
Example 32
Source File: StringSinkRecordKeyBuilderTest.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.sink import com.datamountaineer.streamreactor.connect.rowkeys.StringSinkRecordKeyBuilder import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.sink.SinkRecord import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class StringSinkRecordKeyBuilderTest extends AnyWordSpec with Matchers { val keyRowKeyBuilder = new StringSinkRecordKeyBuilder() "SinkRecordKeyStringKeyBuilder" should { "create the right key from the Schema key value - Byte" in { val b = 123.toByte val sinkRecord = new SinkRecord("", 1, Schema.INT8_SCHEMA, b, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord) shouldBe "123" } "create the right key from the Schema key value - String" in { val s = "somekey" val sinkRecord = new SinkRecord("", 1, Schema.STRING_SCHEMA, s, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord) shouldBe s } "create the right key from the Schema key value - Bytes" in { val bArray = Array(23.toByte, 24.toByte, 242.toByte) val sinkRecord = new SinkRecord("", 1, Schema.BYTES_SCHEMA, bArray, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord) shouldBe bArray.toString } "create the right key from the Schema key value - Boolean" in { val bool = true val sinkRecord = new SinkRecord("", 1, Schema.BOOLEAN_SCHEMA, bool, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord) shouldBe "true" } } }
Example 33
Source File: BytesConverter.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.converters.source import java.util.Collections import com.datamountaineer.streamreactor.connect.converters.MsgKey import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.SourceRecord class BytesConverter extends Converter { override def convert(kafkaTopic: String, sourceTopic: String, messageId: String, bytes: Array[Byte], keys: Seq[String] = Seq.empty, keyDelimiter: String = "."): SourceRecord = { new SourceRecord(Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, MsgKey.schema, MsgKey.getStruct(sourceTopic, messageId), Schema.BYTES_SCHEMA, bytes) } }
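A minimal usage sketch of the source-side converter above; the topic names, message id and payload are made up for illustration:

val converter = new BytesConverter()
val record = converter.convert(
  kafkaTopic = "telemetry",          // hypothetical target Kafka topic
  sourceTopic = "mqtt/sensors/1",    // hypothetical source topic
  messageId = "msg-001",
  bytes = Array[Byte](1, 2, 3))
// key: MsgKey struct carrying the source topic and message id; value: the raw bytes with Schema.BYTES_SCHEMA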
Example 34
Source File: JsonResilientConverter.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.converters.source import java.util import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.data.SchemaAndValue import org.apache.kafka.connect.json.JsonConverter class JsonResilientConverter extends JsonConverter { override def configure(configs: util.Map[String, _], isKey: Boolean) { super.configure(configs, isKey) } override def fromConnectData(topic: String, schema: Schema, value: Object): Array[Byte] = { try { super.fromConnectData(topic, schema, value) } catch { case t: Throwable => t.printStackTrace() // Ignore exceptions null } } override def toConnectData(topic: String, value: Array[Byte]): SchemaAndValue = { try { super.toConnectData(topic, value) } catch { case t: Throwable => t.printStackTrace() // Ignore exceptions SchemaAndValue.NULL } } }
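A short sketch of the failure behaviour of the resilient converter above: malformed bytes yield SchemaAndValue.NULL instead of an exception (topic and payloads are illustrative):

import java.util.Collections

val converter = new JsonResilientConverter()
converter.configure(Collections.singletonMap("schemas.enable", "false"), false)
val ok  = converter.toConnectData("some-topic", """{"id": 1}""".getBytes)  // normal JsonConverter result
val bad = converter.toConnectData("some-topic", "not-json".getBytes)       // SchemaAndValue.NULL, no exception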
Example 35
Source File: AvroConverter.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.converters.source import java.io.File import java.util.Collections import com.datamountaineer.streamreactor.connect.converters.MsgKey import io.confluent.connect.avro.AvroData import org.apache.avro.generic.{GenericDatumReader, GenericRecord} import org.apache.avro.io.DecoderFactory import org.apache.avro.{Schema => AvroSchema} import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.source.SourceRecord import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException class AvroConverter extends Converter { private val avroData = new AvroData(8) private var sourceToSchemaMap: Map[String, AvroSchema] = Map.empty private var avroReadersMap: Map[String, GenericDatumReader[GenericRecord]] = Map.empty override def convert(kafkaTopic: String, sourceTopic: String, messageId: String, bytes: Array[Byte], keys: Seq[String] = Seq.empty, keyDelimiter: String = "."): SourceRecord = { Option(bytes) match { case None => new SourceRecord(Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, avroData.toConnectSchema(sourceToSchemaMap(sourceTopic)), null) case Some(_) => val reader = avroReadersMap.getOrElse(sourceTopic.toLowerCase, throw new ConfigException(s"Invalid ${AvroConverter.SCHEMA_CONFIG} is not configured for $sourceTopic")) val decoder = DecoderFactory.get().binaryDecoder(bytes, null) val record = reader.read(null, decoder) val schemaAndValue = avroData.toConnectData(sourceToSchemaMap(sourceTopic.toLowerCase), record) val value = schemaAndValue.value() value match { case s: Struct if keys.nonEmpty => val keysValue = keys.flatMap { key => Option(KeyExtractor.extract(s, key.split('.').toVector)).map(_.toString) }.mkString(keyDelimiter) new SourceRecord( Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, Schema.STRING_SCHEMA, keysValue, schemaAndValue.schema(), schemaAndValue.value()) case _ => new SourceRecord( Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, MsgKey.schema, MsgKey.getStruct(sourceTopic, messageId), schemaAndValue.schema(), schemaAndValue.value()) } } } override def initialize(config: Map[String, String]): Unit = { sourceToSchemaMap = AvroConverter.getSchemas(config) avroReadersMap = sourceToSchemaMap.map { case (key, schema) => key -> new GenericDatumReader[GenericRecord](schema) } } } object AvroConverter { val SCHEMA_CONFIG = "connect.source.converter.avro.schemas" def getSchemas(config: Map[String, String]): Map[String, AvroSchema] = { config.getOrElse(SCHEMA_CONFIG, throw new ConfigException(s"$SCHEMA_CONFIG is not provided")) .toString .split(';') .filter(_.trim.nonEmpty) .map(_.split("=")) .map { case Array(source, path) => val file = new File(path) if (!file.exists()) { throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The file $path doesn't exist!") } val s = source.trim.toLowerCase() if (s.isEmpty) { throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The topic is not valid for entry containing $path") } s -> new AvroSchema.Parser().parse(file) case other => throw new ConfigException(s"$SCHEMA_CONFIG is not properly set. The format is Mqtt_Source->AVRO_FILE") }.toMap } }
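The converter above is driven by a single property mapping each source to an Avro schema file; a hedged wiring sketch where the source name and file path are hypothetical, following the source=path format that getSchemas parses:

val converter = new AvroConverter()
converter.initialize(Map(
  AvroConverter.SCHEMA_CONFIG -> "/sensors=/etc/kafka-connect/schemas/sensor.avsc"  // hypothetical "source=path" entry; the file must exist at initialize() time
))
// afterwards each payload is decoded with the schema registered for its source topic:
// converter.convert("kafka-topic", "/sensors", "message-1", avroBytes)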
Example 36
Source File: BytesConverter.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.converters.sink import com.datamountaineer.streamreactor.connect.converters.MsgKey import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.sink.SinkRecord class BytesConverter extends Converter { override def convert(sinkTopic: String, data: SinkRecord): SinkRecord = { Option(data) match { case None => new SinkRecord( sinkTopic, 0, null, null, Schema.BYTES_SCHEMA, null, 0 ) case Some(_) => new SinkRecord( data.topic(), data.kafkaPartition(), MsgKey.schema, MsgKey.getStruct(sinkTopic, data.key().toString()), Schema.BYTES_SCHEMA, data.value(), 0 ) } } }
Example 37
Source File: DefaultCommitPolicyTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartitionOffset} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec import scala.concurrent.duration._ class DefaultCommitPolicyTest extends AnyWordSpec with Matchers { val schema: Schema = SchemaBuilder.struct() .field("name", SchemaBuilder.string().required().build()) .build() val struct = new Struct(schema) implicit val conf: Configuration = new Configuration() implicit val fs: LocalFileSystem = FileSystem.getLocal(conf) val tpo = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100)) private def shouldFlush(policy: CommitPolicy, path: Path, count: Long) = { val status = fs.getFileStatus(path) policy.shouldFlush(CommitContext(tpo, path, count, status.getLen, status.getModificationTime)) } "DefaultCommitPolicy" should { "roll over after interval" in { val policy = DefaultCommitPolicy(None, Option(2.seconds), None) val path = new Path("foo") fs.create(path) shouldFlush(policy, path, 10) shouldBe false Thread.sleep(2000) shouldFlush(policy, path, 10) shouldBe true fs.delete(path, false) } "roll over after file count" in { val policy = DefaultCommitPolicy(None, None, Some(9)) val path = new Path("foo") fs.create(path) shouldFlush(policy, path, 7) shouldBe false shouldFlush(policy, path, 8) shouldBe false shouldFlush(policy, path, 9) shouldBe true shouldFlush(policy, path, 10) shouldBe true fs.delete(path, false) } "roll over after file size" in { val policy = DefaultCommitPolicy(Some(10), None, None) val path = new Path("foo") val out = fs.create(path) shouldFlush(policy, path, 7) shouldBe false out.writeBytes("wibble wobble wabble wubble") out.close() shouldFlush(policy, path, 9) shouldBe true fs.delete(path, false) } } }
Example 38
Source File: RowKeyBuilderString.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.rowkeys

import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

// The enclosing class declaration was missing from this excerpt; it is reconstructed here from the
// tests above (a builder over the selected struct field names and a key delimiter).
case class StringStructFieldsStringKeyBuilder(keys: Seq[String], keyDelimiter: String = ".") {

  // Assumed set of schema types that may contribute to the row key; the original list is not shown in this excerpt.
  private val availableSchemaTypes = Set(
    Schema.Type.BOOLEAN, Schema.Type.STRING,
    Schema.Type.INT8, Schema.Type.INT16, Schema.Type.INT32, Schema.Type.INT64,
    Schema.Type.FLOAT32, Schema.Type.FLOAT64
  )

  def build(record: SinkRecord): String = {
    val struct = record.value().asInstanceOf[Struct]
    val schema = struct.schema
    val availableFields = schema.fields().asScala.map(_.name).toSet
    val missingKeys = keys.filterNot(availableFields.contains)
    require(missingKeys.isEmpty,
      s"${missingKeys.mkString(",")} keys are not present in the SinkRecord payload: ${availableFields.mkString(",")}")

    keys.flatMap { key =>
      val field = schema.field(key)
      val value = struct.get(field)
      require(value != null, s"$key field value is null. A non-null value is required for the fields creating the HBase row key")
      if (availableSchemaTypes.contains(field.schema().`type`())) Some(value.toString) else None
    }.mkString(keyDelimiter)
  }
}
Example 39
Source File: TwitterStatusReader.scala From kafka-connect-twitter with Apache License 2.0 | 5 votes |
package com.eneco.trading.kafka.connect.twitter import java.util import java.util.concurrent.{TimeUnit, LinkedBlockingQueue, Executors} import com.eneco.trading.kafka.connect.twitter.domain.TwitterStatus import com.twitter.hbc.httpclient.BasicClient import com.twitter.hbc.twitter4j.Twitter4jStatusClient import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.SourceRecord import twitter4j._ import scala.collection.JavaConverters._ import Extensions._ class StatusEnqueuer(queue: LinkedBlockingQueue[Status]) extends StatusListener with Logging { override def onStallWarning(stallWarning: StallWarning) = log.warn("onStallWarning") override def onDeletionNotice(statusDeletionNotice: StatusDeletionNotice) = log.info("onDeletionNotice") override def onScrubGeo(l: Long, l1: Long) = { log.debug(s"onScrubGeo $l $l1") } override def onStatus(status: Status) = { log.debug("onStatus") queue.put(status) } override def onTrackLimitationNotice(i: Int) = log.info(s"onTrackLimitationNotice $i") override def onException(e: Exception)= log.warn("onException " + e.toString) } trait StatusToSourceRecord { def convert(status: Status, topic: String): SourceRecord } object StatusToStringKeyValue extends StatusToSourceRecord { def convert (status: Status, topic: String): SourceRecord = { new SourceRecord( Map("tweetSource" -> status.getSource).asJava, //source partitions? Map("tweetId" -> status.getId).asJava, //source offsets? topic, null, Schema.STRING_SCHEMA, status.getUser.getScreenName, Schema.STRING_SCHEMA, status.getText, status.getCreatedAt.getTime) } } object StatusToTwitterStatusStructure extends StatusToSourceRecord { def convert(status: Status, topic: String): SourceRecord = { //val ts = TwitterStatus.struct(TwitterStatus(status)) new SourceRecord( Map("tweetSource" -> status.getSource).asJava, //source partitions? Map("tweetId" -> status.getId).asJava, //source offsets? topic, null, Schema.STRING_SCHEMA, status.getUser.getScreenName, TwitterStatus.schema, TwitterStatus.struct(status), status.getCreatedAt.getTime) } } def stop() = { log.info("Stop Twitter client") client.stop() } }
Example 40
Source File: TwitterStatusReader.scala From kafka-tweet-producer with Apache License 2.0 | 5 votes |
package com.eneco.trading.kafka.connect.twitter import java.util import java.util.concurrent.{TimeUnit, LinkedBlockingQueue, Executors} import com.eneco.trading.kafka.connect.twitter.domain.TwitterStatus import com.twitter.hbc.httpclient.BasicClient import com.twitter.hbc.twitter4j.Twitter4jStatusClient import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.SourceRecord import twitter4j._ import scala.collection.JavaConverters._ import Extensions._ class StatusEnqueuer(queue: LinkedBlockingQueue[Status]) extends StatusListener with Logging { override def onStallWarning(stallWarning: StallWarning) = log.warn("onStallWarning") override def onDeletionNotice(statusDeletionNotice: StatusDeletionNotice) = log.info("onDeletionNotice") override def onScrubGeo(l: Long, l1: Long) = { log.debug(s"onScrubGeo $l $l1") } override def onStatus(status: Status) = { log.debug("onStatus") queue.put(status) } override def onTrackLimitationNotice(i: Int) = log.info(s"onTrackLimitationNotice $i") override def onException(e: Exception)= log.warn("onException " + e.toString) } trait StatusToSourceRecord { def convert(status: Status, topic: String): SourceRecord } object StatusToStringKeyValue extends StatusToSourceRecord { def convert (status: Status, topic: String): SourceRecord = { new SourceRecord( Map("tweetSource" -> status.getSource).asJava, //source partitions? Map("tweetId" -> status.getId).asJava, //source offsets? topic, null, Schema.STRING_SCHEMA, status.getUser.getScreenName, Schema.STRING_SCHEMA, status.getText) } } object StatusToTwitterStatusStructure extends StatusToSourceRecord { def convert(status: Status, topic: String): SourceRecord = { //val ts = TwitterStatus.struct(TwitterStatus(status)) new SourceRecord( Map("tweetSource" -> status.getSource).asJava, //source partitions? Map("tweetId" -> status.getId).asJava, //source offsets? topic, TwitterStatus.schema, TwitterStatus.struct(status)) } } def stop() = { log.info("Stop Twitter client") client.stop() } }
Example 41
Source File: SourceRecordProducers.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.ftp.source import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.source.SourceRecord object SourceRecordProducers { type SourceRecordProducer = (ConnectFileMetaDataStore, String, FileMetaData, FileBody) => SourceRecord val fileInfoSchema = SchemaBuilder.struct() .field("name", Schema.STRING_SCHEMA) .field("offset", Schema.INT64_SCHEMA) .build() def stringKeyRecord(store: ConnectFileMetaDataStore, topic: String, meta: FileMetaData, body: FileBody): SourceRecord = new SourceRecord( store.fileMetasToConnectPartition(meta), // source part store.fileMetasToConnectOffset(meta), // source off topic, //topic Schema.STRING_SCHEMA, // key sch meta.attribs.path, // key Schema.BYTES_SCHEMA, // val sch body.bytes // val ) def structKeyRecord(store: ConnectFileMetaDataStore, topic: String, meta: FileMetaData, body: FileBody): SourceRecord = { new SourceRecord( store.fileMetasToConnectPartition(meta), // source part store.fileMetasToConnectOffset(meta), // source off topic, //topic fileInfoSchema, // key sch new Struct(fileInfoSchema) .put("name",meta.attribs.path) .put("offset",body.offset), Schema.BYTES_SCHEMA, // val sch body.bytes // val ) } }
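As a small illustration of the key schema declared above, a struct key would be populated roughly like this (the path and offset are made up):

import org.apache.kafka.connect.data.Struct

val key = new Struct(SourceRecordProducers.fileInfoSchema)
  .put("name", "/incoming/data-0001.csv")  // hypothetical file path
  .put("offset", 1024L)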
Example 42
Source File: StructFieldsExtractorTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.voltdb import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class StructFieldsExtractorTest extends AnyWordSpec with Matchers { "StructFieldsExtractor" should { "return all the fields and their bytes value" in { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("lastName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema) .put("firstName", "Alex") .put("lastName", "Smith") .put("age", 30) val min = System.currentTimeMillis() val record = StructFieldsExtractor("table", true, Map.empty).get(struct) val map = record map("firstName") shouldBe "Alex" map("lastName") shouldBe "Smith" map("age") shouldBe 30 } "return all fields and apply the mapping" in { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("lastName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema) .put("firstName", "Alex") .put("lastName", "Smith") .put("age", 30) val map = StructFieldsExtractor("table", includeAllFields = true, Map("lastName" -> "Name", "age" -> "a")).get(struct) map("firstName") shouldBe "Alex" map("Name") shouldBe "Smith" map("a") shouldBe 30 } "return only the specified fields" in { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("lastName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema) .put("firstName", "Alex") .put("lastName", "Smith") .put("age", 30) val map = StructFieldsExtractor("table", includeAllFields = false, Map("lastName" -> "Name", "age" -> "age")).get(struct) map("Name") shouldBe "Smith" map("age") shouldBe 30 map.size shouldBe 2 } } }
Example 43
Source File: HiveSchemaTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.hive.it import java.util.concurrent.TimeUnit import com.landoop.streamreactor.connect.hive.{DatabaseName, TableName} import org.apache.kafka.connect.data.Schema import org.scalatest.concurrent.Eventually import org.scalatest.matchers.should.Matchers import org.scalatest.time.{Millis, Span} import org.scalatest.wordspec.AnyWordSpec import scala.collection.JavaConverters._ import scala.io.Source import scala.util.Random class HiveSchemaTest extends AnyWordSpec with Matchers with PersonTestData with Eventually with HiveTests { private implicit val patience: PatienceConfig = PatienceConfig(Span(60000, Millis), Span(5000, Millis)) case class Foo(s: String, l: Long, b: Boolean, d: Double) def foo = Foo("string", Random.nextLong, Random.nextBoolean, Random.nextDouble) "Hive" should { "create correct schema for table" in { val topic = createTopic() val taskDef = Source.fromInputStream(getClass.getResourceAsStream("/hive_sink_task_no_partitions.json")).getLines().mkString("\n") .replace("{{TOPIC}}", topic) .replace("{{TABLE}}", topic) .replace("{{NAME}}", topic) postTask(taskDef) val producer = stringStringProducer() writeRecords(producer, topic, JacksonSupport.mapper.writeValueAsString(foo), 2000) producer.close(30, TimeUnit.SECONDS) // wait for some data to have been flushed eventually { withConn { conn => val stmt = conn.createStatement val rs = stmt.executeQuery(s"select count(*) FROM $topic") rs.next() rs.getLong(1) should be > 0L } } // check that the schema is correct val schema = com.landoop.streamreactor.connect.hive.schema(DatabaseName("default"), TableName(topic)) schema.fields().asScala.map(_.name).toSet shouldBe Set("s", "b", "l", "d") schema.field("s").schema().`type`() shouldBe Schema.Type.STRING schema.field("l").schema().`type`() shouldBe Schema.Type.INT64 schema.field("d").schema().`type`() shouldBe Schema.Type.FLOAT64 schema.field("b").schema().`type`() shouldBe Schema.Type.BOOLEAN stopTask(topic) } } }
Example 44
Source File: MapValueConverterTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.json.sql.JacksonJson import org.apache.kafka.connect.data.{Schema, Struct} import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers import scala.collection.JavaConverters._ class MapValueConverterTest extends AnyFunSuite with Matchers { test("converts nested payload") { val json = """ |{ | "idType": 3, | "colorDepth": "", | "threshold" : 45.77, | "evars": { | "evars": { | "eVar1": "Tue Aug 27 2019 12:08:10", | "eVar2": 156692207943934897 | } | }, | "exclude": { | "id": 0, | "value": false | } |} |""".stripMargin val map = JacksonJson.toMap[Any](json) val struct = MapValueConverter.convert(map) //Jackson transforming the json to Map the fields order is not retained struct.schema().fields().asScala.map(_.name()).sorted shouldBe List("idType", "colorDepth", "threshold", "evars", "exclude").sorted struct.schema().field("idType").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA struct.schema().field("colorDepth").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA struct.schema().field("threshold").schema() shouldBe Schema.OPTIONAL_FLOAT64_SCHEMA struct.schema().field("exclude").schema().`type`() shouldBe Schema.Type.STRUCT struct.schema().field("exclude").schema().isOptional shouldBe true struct.schema().field("evars").schema().`type`() shouldBe Schema.Type.STRUCT struct.schema().field("evars").schema().isOptional shouldBe true struct.schema().field("evars").schema().fields().asScala.map(_.name()) shouldBe List("evars") val evarsInner = struct.schema().field("evars").schema().field("evars") evarsInner.schema().`type`() shouldBe Schema.Type.STRUCT evarsInner.schema().isOptional shouldBe true evarsInner.schema().fields().asScala.map(_.name()).sorted shouldBe List("eVar1", "eVar2").sorted evarsInner.schema().field("eVar1").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA evarsInner.schema().field("eVar2").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA val exclude = struct.schema().field("exclude").schema() exclude.schema().`type`() shouldBe Schema.Type.STRUCT exclude.schema().isOptional shouldBe true exclude.schema().fields().asScala.map(_.name()).sorted shouldBe List("id", "value").sorted exclude.schema().field("id").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA exclude.schema().field("value").schema() shouldBe Schema.OPTIONAL_BOOLEAN_SCHEMA struct.get("idType") shouldBe 3L struct.get("colorDepth") shouldBe "" struct.get("threshold") shouldBe 45.77D val evarsStruct = struct.get("evars").asInstanceOf[Struct].get("evars").asInstanceOf[Struct] evarsStruct.get("eVar1") shouldBe "Tue Aug 27 2019 12:08:10" evarsStruct.get("eVar2") shouldBe 156692207943934897L val excludeStruct = struct.get("exclude").asInstanceOf[Struct] excludeStruct.get("id") shouldBe 0L excludeStruct.get("value") shouldBe false } }
Example 45
Source File: package.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.column.ParquetProperties import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter} package object parquet { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def listFiles(path: Path)(implicit fs: FileSystem): List[Path] = { if (fs.isDirectory(path)) { logger.debug(s"$path is a directory, reading constituent files") val remote = fs.listFiles(path, false) new Iterator[Path] { override def hasNext: Boolean = remote.hasNext override def next(): Path = remote.next().getPath }.toList } else { logger.debug(s"Reading $path as a single file") List(path) } } def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = { ParquetReader.builder(new StructReadSupport, file) .withConf(fs.getConf) .build() } def parquetWriter(path: Path, schema: Schema, config: ParquetSinkConfig): ParquetWriter[Struct] = { new StructParquetWriterBuilder(path, schema) .withCompressionCodec(config.compressionCodec) .withDictionaryEncoding(config.enableDictionary) .withValidation(config.validation) .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) .withWriteMode(if (config.overwrite) { ParquetFileWriter.Mode.OVERWRITE } else { ParquetFileWriter.Mode.CREATE }).build() } }
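A minimal read-side sketch using the helpers above; the location is hypothetical and the writer-side config (ParquetSinkConfig) is defined elsewhere in the project:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import com.landoop.streamreactor.connect.hive.parquet._

implicit val fs: FileSystem = FileSystem.getLocal(new Configuration())
val files = listFiles(new Path("/tmp/mytable"))  // hypothetical directory of parquet files
files.headOption.foreach { file =>
  val reader = parquetReader(file)
  val first = reader.read()                      // a Struct, or null if the file is empty
  reader.close()
}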
Example 46
Source File: StructWriteSupport.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.landoop.streamreactor.connect.hive._ import org.apache.hadoop.conf.Configuration import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext import org.apache.parquet.io.api.{Binary, RecordConsumer} import org.apache.parquet.schema.MessageType import scala.collection.JavaConverters._ // derived from Apache Spark's parquet write support, archive and license here: // https://github.com/apache/spark/blob/21a7bfd5c324e6c82152229f1394f26afeae771c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala class StructWriteSupport(schema: Schema) extends WriteSupport[Struct] { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) private val schemaName = if (schema.name() == null) "schema" else schema.name() private val parquetSchema: MessageType = ParquetSchemas.toParquetMessage(schema, schemaName) private val metadata = new java.util.HashMap[String, String]() metadata.put("written_by", "streamreactor") // The Parquet `RecordConsumer` to which all structs are written private var consumer: RecordConsumer = _ type ValueWriter = (Any) => Unit override def init(conf: Configuration): WriteSupport.WriteContext = new WriteSupport.WriteContext(parquetSchema, new java.util.HashMap[String, String]) override def finalizeWrite(): WriteSupport.FinalizedWriteContext = new FinalizedWriteContext(metadata) override def prepareForWrite(consumer: RecordConsumer): Unit = this.consumer = consumer override def write(struct: Struct): Unit = { writeMessage { writeStructFields(struct) } } private def writeStructFields(struct: Struct): Unit = { for ((field, index) <- struct.schema.fields.asScala.zipWithIndex) { val value = struct.get(field) if (value != null) { val writer = valueWriter(field.schema()) writeField(field.name, index) { writer(value) } } } } def valueWriter(schema: Schema): ValueWriter = { // todo perhaps introduce something like spark's SpecializedGetters schema.`type`() match { case Schema.Type.BOOLEAN => value => consumer.addBoolean(value.asInstanceOf[Boolean]) case Schema.Type.INT8 | Schema.Type.INT16 | Schema.Type.INT32 => value => consumer.addInteger(value.toString.toInt) case Schema.Type.INT64 => value => consumer.addLong(value.toString.toLong) case Schema.Type.STRING => value => consumer.addBinary(Binary.fromReusedByteArray(value.toString.getBytes)) case Schema.Type.FLOAT32 => value => consumer.addFloat(value.toString.toFloat) case Schema.Type.FLOAT64 => value => consumer.addDouble(value.toString.toDouble) case Schema.Type.STRUCT => value => { logger.debug(s"Writing nested struct") val struct = value.asInstanceOf[Struct] writeGroup { schema.fields.asScala .map { field => field -> struct.get(field) } .zipWithIndex.foreach { case ((field, v), k) => writeField(field.name, k) { valueWriter(field.schema)(v) } } } } case _ => throw UnsupportedSchemaType(schema.`type`.toString) } } private def writeMessage(f: => Unit): Unit = { consumer.startMessage() f consumer.endMessage() } private def writeGroup(f: => Unit): Unit = { consumer.startGroup() // consumer.startMessage() f //consumer.endMessage() consumer.endGroup() } private def writeField(name: String, k: Int)(f: => Unit): Unit = { consumer.startField(name, k) f consumer.endField(name, k) } }
Example 47
Source File: Converters.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.landoop.streamreactor.connect.hive._ import org.apache.kafka.connect.data.{Field, Schema} import org.apache.parquet.io.api.Converter object Converters { def get(field: Field, builder: scala.collection.mutable.Map[String, Any]): Converter = { field.schema().`type`() match { case Schema.Type.STRUCT => new NestedGroupConverter(field.schema(), field, builder) case Schema.Type.INT64 | Schema.Type.INT32 | Schema.Type.INT16 | Schema.Type.INT8 => new AppendingPrimitiveConverter(field, builder) case Schema.Type.FLOAT64 | Schema.Type.FLOAT32 => new AppendingPrimitiveConverter(field, builder) // case Schema.Type.INT64 => new TimestampPrimitiveConverter(field, builder) case Schema.Type.STRING => new DictionaryStringPrimitiveConverter(field, builder) case Schema.Type.ARRAY => ??? case other => throw UnsupportedSchemaType(s"Unsupported data type $other") } } }
Example 48
Source File: Input.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.blockchain.data import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Input(sequence: Long, prev_out: Option[Output], script: String) object Input { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.input") .doc("The input instance part of a transaction.") .field("sequence", Schema.INT64_SCHEMA) .field("prev_out", Output.ConnectSchema) .field("script", Schema.STRING_SCHEMA) .build() implicit class InputToStructConverter(val input: Input) extends AnyVal { def toStruct(): Struct = { val struct = new Struct(ConnectSchema) .put("sequence", input.sequence) .put("script", input.script) input.prev_out.foreach(po=>struct.put("prev_out", po.toStruct())) struct } } }
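A quick sketch of building the struct defined above (the field values are made up):

val input = Input(sequence = 4294967295L, prev_out = None, script = "76a914...")  // hypothetical values
val struct = input.toStruct()
// struct.schema() is Input.ConnectSchema; "prev_out" is left null because the option is empty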
Example 49
Source File: Input.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.cassandra.sink import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Input(sequence: Long, prev_out: Option[Output], script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() map.put("sequence", sequence) prev_out.foreach(p => map.put("prev_out", p.toHashMap)) map.put("script", script) map } } object Input { val ConnectSchema = SchemaBuilder.struct .name("datamountaineer.blockchain.input") .doc("The input instance part of a transaction.") .field("sequence", Schema.INT64_SCHEMA) .field("prev_out", Output.ConnectSchema) .field("script", Schema.STRING_SCHEMA) .build() implicit class InputToStructConverter(val input: Input) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("sequence", input.sequence) .put("script", input.script) input.prev_out.foreach(po => struct.put("prev_out", po.toStruct())) struct } } }
Example 50
Source File: KeyUtils.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.cassandra.utils import com.jayway.jsonpath.{Configuration, JsonPath} import org.apache.kafka.connect.data.{Schema, Struct} object KeyUtils { def keysFromStruct(struct: Struct, schema: Schema, fieldNames: Seq[String]): Seq[Object] = fieldNames.map(getKeyFromStruct(struct, _)) private def getKeyFromStruct(struct: Struct, fieldName: String): Object = { if (fieldName.contains(".")) { val Array(nestedObject, nestedField) = fieldName.split("\\.", 2) getKeyFromStruct(struct.get(nestedObject).asInstanceOf[Struct], nestedField) } else { struct.get(fieldName) } } }
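The helper above resolves dot-separated paths through nested structs; a small self-contained sketch with made-up schema and values:

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

val addressSchema = SchemaBuilder.struct().field("city", Schema.STRING_SCHEMA).build()
val personSchema = SchemaBuilder.struct()
  .field("id", Schema.INT64_SCHEMA)
  .field("address", addressSchema)
  .build()
val person = new Struct(personSchema)
  .put("id", 42L)
  .put("address", new Struct(addressSchema).put("city", "Lisbon"))

KeyUtils.keysFromStruct(person, personSchema, Seq("id", "address.city"))  // Seq(42, "Lisbon")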
Example 51
Source File: Output.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.mongodb import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Output(addr_tag_link: Option[String], addr_tag: Option[String], spent: Boolean, tx_index: Long, `type`: Int, addr: Option[String], value: Long, n: Int, script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() addr_tag_link.foreach(map.put("addr_tag_link", _)) addr_tag.foreach(map.put("addr_tag", _)) map.put("spent", spent) map.put("tx_index", tx_index) map.put("type", `type`) addr.foreach(map.put("addr", _)) map.put("value", value) map.put("n", n) map.put("script", script) map } } object Output { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.output") .doc("The output instance part of a transaction.") .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA) .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA) .field("spent", Schema.BOOLEAN_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("type", Schema.OPTIONAL_INT32_SCHEMA) .field("addr", Schema.OPTIONAL_STRING_SCHEMA) .field("value", Schema.INT64_SCHEMA) .field("n", Schema.INT32_SCHEMA) .field("script", Schema.STRING_SCHEMA) .build() implicit class OutputToStructConverter(val output: Output) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("spent", output.spent) .put("tx_index", output.tx_index) .put("type", output.`type`) .put("value", output.value) .put("n", output.n) .put("script", output.script) output.addr.foreach(struct.put("addr", _)) output.addr_tag.foreach(struct.put("addr_tag", _)) output.addr_tag_link.foreach(struct.put("addr_tag_link", _)) struct } } }
Example 52
Source File: Input.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.mongodb import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Input(sequence: Long, prev_out: Option[Output], script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() map.put("sequence", sequence) prev_out.foreach(p => map.put("prev_out", p.toHashMap)) map.put("script", script) map } } object Input { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.input") .doc("The input instance part of a transaction.") .field("sequence", Schema.INT64_SCHEMA) .field("prev_out", Output.ConnectSchema) .field("script", Schema.STRING_SCHEMA) .build() implicit class InputToStructConverter(val input: Input) extends AnyVal { def toStruct(): Struct = { val struct = new Struct(ConnectSchema) .put("sequence", input.sequence) .put("script", input.script) input.prev_out.foreach(po => struct.put("prev_out", po.toStruct())) struct } } }
Example 53
Source File: SinkRecordToDocument.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.mongodb.sink import com.datamountaineer.streamreactor.connect.mongodb.config.MongoSettings import com.datamountaineer.streamreactor.connect.mongodb.converters.SinkRecordConverter import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.sink.SinkRecord import org.bson.Document object SinkRecordToDocument extends ConverterUtil { def apply(record: SinkRecord, keys: Set[String] = Set.empty)(implicit settings: MongoSettings): (Document, Iterable[(String, Any)]) = { val schema = record.valueSchema() val value = record.value() val fields = settings.fields.getOrElse(record.topic(), Map.empty) val allFields = if (fields.size == 1 && fields.head._1 == "*") true else false if (schema == null) { //try to take it as string value match { case _: java.util.Map[_, _] => val extracted = convertSchemalessJson( record, fields, settings.ignoredField.getOrElse(record.topic(), Set.empty) ) //not ideal; but the compile is hashmap anyway SinkRecordConverter.fromMap(extracted.asInstanceOf[java.util.Map[String, AnyRef]]) -> keys.headOption.map(_ => KeysExtractor.fromMap(extracted, keys)).getOrElse(Iterable.empty) case _ => sys.error("For schemaless record only String and Map types are supported") } } else { schema.`type`() match { case Schema.Type.STRING => val extracted = convertStringSchemaAndJson( record, fields, settings.ignoredField.getOrElse(record.topic(), Set.empty), includeAllFields = allFields) SinkRecordConverter.fromJson(extracted) -> keys.headOption.map(_ => KeysExtractor.fromJson(extracted, keys)).getOrElse(Iterable.empty) case Schema.Type.STRUCT => val extracted = convert( record, fields, settings.ignoredField.getOrElse(record.topic(), Set.empty) ) SinkRecordConverter.fromStruct(extracted) -> keys.headOption.map(_ => KeysExtractor.fromStruct(extracted.value().asInstanceOf[Struct], keys)).getOrElse(Iterable.empty) case other => sys.error(s"$other schema is not supported") } } } }
Example 54
Source File: Output.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Output(addr_tag_link: Option[String], addr_tag: Option[String], spent: Boolean, tx_index: Long, `type`: Int, addr: Option[String], value: Long, n: Int, script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() addr_tag_link.foreach(map.put("addr_tag_link", _)) addr_tag.foreach(map.put("addr_tag", _)) map.put("spent", spent) map.put("tx_index", tx_index) map.put("type", `type`) addr.foreach(map.put("addr", _)) map.put("value", value) map.put("n", n) map.put("script", script) map } } object Output { val ConnectSchema: Schema = SchemaBuilder.struct .name("output") .doc("The output instance part of a transaction.") .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA) .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA) .field("spent", Schema.BOOLEAN_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("type", Schema.OPTIONAL_INT32_SCHEMA) .field("addr", Schema.OPTIONAL_STRING_SCHEMA) .field("value", Schema.INT64_SCHEMA) .field("n", Schema.INT32_SCHEMA) .field("script", Schema.STRING_SCHEMA) .build() implicit class OutputToStructConverter(val output: Output) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("spent", output.spent) .put("tx_index", output.tx_index) .put("type", output.`type`) .put("value", output.value) .put("n", output.n) .put("script", output.script) output.addr.foreach(struct.put("addr", _)) output.addr_tag.foreach(struct.put("addr_tag", _)) output.addr_tag_link.foreach(struct.put("addr_tag_link", _)) struct } } }
Example 55
Source File: Input.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Input(sequence: Long, prev_out: Option[Output], script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() map.put("sequence", sequence) prev_out.foreach(p => map.put("prev_out", p.toHashMap)) map.put("script", script) map } } object Input { val ConnectSchema = SchemaBuilder.struct .name("input") .doc("The input instance part of a transaction.") .field("sequence", Schema.INT64_SCHEMA) .field("prev_out", Output.ConnectSchema) .field("script", Schema.STRING_SCHEMA) .build() implicit class InputToStructConverter(val input: Input) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("sequence", input.sequence) .put("script", input.script) input.prev_out.foreach(po => struct.put("prev_out", po.toStruct())) struct } } }
Example 56
Source File: SinkRecordToDocumentTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink import com.datamountaineer.streamreactor.connect.azure.documentdb.Json import com.datamountaineer.streamreactor.connect.azure.documentdb.config.DocumentDbSinkSettings import com.datamountaineer.streamreactor.connect.errors.NoopErrorPolicy import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil import com.microsoft.azure.documentdb.{ConsistencyLevel, Document} import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.sink.SinkRecord import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class SinkRecordToDocumentTest extends AnyWordSpec with Matchers with ConverterUtil { private val connection = "https://accountName.documents.azure.com:443/" "SinkRecordToDocument" should { "convert Kafka Struct to a Azure Document Db Document" in { for (i <- 1 to 4) { val json = scala.io.Source.fromFile(getClass.getResource(s"/transaction$i.json").toURI.getPath).mkString val tx = Json.fromJson[Transaction](json) val record = new SinkRecord("topic1", 0, null, null, Transaction.ConnectSchema, tx.toStruct(), 0) implicit val settings = DocumentDbSinkSettings( connection, "secret", "database", Seq.empty, Map("topic1" -> Set.empty[String]), Map("topic1" -> Map.empty), Map("topic1" -> Set.empty), NoopErrorPolicy(), ConsistencyLevel.Session, false, None) val (document, _) = SinkRecordToDocument(record) val expected = new Document(json) //comparing string representation; we have more specific types given the schema document.toString shouldBe expected.toString } } "convert String Schema + Json payload to a Azure Document DB Document" in { for (i <- 1 to 4) { val json = scala.io.Source.fromFile(getClass.getResource(s"/transaction$i.json").toURI.getPath).mkString val record = new SinkRecord("topic1", 0, null, null, Schema.STRING_SCHEMA, json, 0) implicit val settings = DocumentDbSinkSettings( connection, "secret", "database", Seq.empty, Map("topic1" -> Set.empty[String]), Map("topic1" -> Map.empty), Map("topic1" -> Set.empty), NoopErrorPolicy(), ConsistencyLevel.Session, false, None) val (document, _) = SinkRecordToDocument(record) val expected = new Document(json) //comparing string representation; we have more specific types given the schema document.toString() shouldBe expected.toString } } "convert Schemaless + Json payload to a Azure Document DB Document" in { for (i <- 1 to 4) { val json = scala.io.Source.fromFile(getClass.getResource(s"/transaction$i.json").toURI.getPath).mkString val record = new SinkRecord("topic1", 0, null, null, Schema.STRING_SCHEMA, json, 0) implicit val settings = DocumentDbSinkSettings( connection, "secret", "database", Seq.empty, Map("topic1" -> Set.empty[String]), Map("topic1" -> Map.empty), Map("topic1" -> Set.empty), NoopErrorPolicy(), ConsistencyLevel.Session, false, None) val (document, _) = SinkRecordToDocument(record) val expected = new Document(json) //comparing string representation; we have more specific types given the schema document.toString() shouldBe expected.toString } } } }
Example 57
Source File: SinkRecordToDocument.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink import com.datamountaineer.streamreactor.connect.azure.documentdb.config.DocumentDbSinkSettings import com.datamountaineer.streamreactor.connect.azure.documentdb.converters.SinkRecordConverter import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil import com.microsoft.azure.documentdb.Document import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.sink.SinkRecord object SinkRecordToDocument extends ConverterUtil { def apply(record: SinkRecord, keys: Set[String] = Set.empty)(implicit settings: DocumentDbSinkSettings): (Document, Iterable[(String, Any)]) = { val schema = record.valueSchema() val value = record.value() if (schema == null) { //try to take it as string value match { case _: java.util.Map[_, _] => val fields = settings.fields(record.topic()) val extracted = convertSchemalessJson(record, settings.fields(record.topic()), settings.ignoredField(record.topic())) //not ideal; but the compile is hashmap anyway SinkRecordConverter.fromMap(extracted.asInstanceOf[java.util.Map[String, AnyRef]]) -> keys.headOption.map(_ => KeysExtractor.fromMap(extracted, keys)).getOrElse(Iterable.empty) case _: String => val extracted = convertStringSchemaAndJson(record, settings.fields(record.topic()), settings.ignoredField(record.topic())) SinkRecordConverter.fromJson(extracted) -> keys.headOption.map(_ => KeysExtractor.fromJson(extracted, keys)).getOrElse(Iterable.empty) case _ => sys.error("For schemaless record only String and Map types are supported") } } else { schema.`type`() match { case Schema.Type.STRING => val extracted = convertStringSchemaAndJson(record, settings.fields(record.topic()), settings.ignoredField(record.topic())) SinkRecordConverter.fromJson(extracted) -> keys.headOption.map(_ => KeysExtractor.fromJson(extracted, keys)).getOrElse(Iterable.empty) case Schema.Type.STRUCT => val extracted = convert(record, settings.fields(record.topic()), settings.ignoredField(record.topic())) SinkRecordConverter.fromStruct(extracted) -> keys.headOption.map(_ => KeysExtractor.fromStruct(extracted.value().asInstanceOf[Struct], keys)).getOrElse(Iterable.empty) case other => sys.error(s"$other schema is not supported") } } } }
Example 58
Source File: Transaction.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.blockchain.data import java.util import com.datamountaineer.streamreactor.connect.blockchain.data.Input._ import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.source.SourceRecord case class Transaction(lock_time: Long, ver: Int, size: Long, inputs: Seq[Input], rbf: Option[Boolean], time: Long, tx_index: Long, vin_sz: Int, hash: String, vout_sz: Int, relayed_by: String, out: Seq[Output]) object Transaction { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.transaction") .field("lock_time", Schema.INT64_SCHEMA) .field("ver", Schema.INT32_SCHEMA) .field("size", Schema.INT64_SCHEMA) .field("inputs", SchemaBuilder.array(Input.ConnectSchema).optional().build()) .field("rbf", Schema.OPTIONAL_BOOLEAN_SCHEMA) .field("time", Schema.INT64_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("vin_sz", Schema.INT32_SCHEMA) .field("hash", Schema.STRING_SCHEMA) .field("vout_sz", Schema.INT32_SCHEMA) .field("relayed_by", Schema.STRING_SCHEMA) .field("out", SchemaBuilder.array(Output.ConnectSchema).optional().build()) .build() implicit class TransactionToSourceRecordConverter(val tx: Transaction) extends AnyVal { def toSourceRecord(topic: String, partition: Int, key: Option[String]): SourceRecord = { new SourceRecord( null, null, topic, partition, key.map(_ => Schema.STRING_SCHEMA).orNull, key.orNull, ConnectSchema, tx.toStruct() ) } //private def getOffset() = Collections.singletonMap("position", System.currentTimeMillis()) def toStruct(): Struct = { val struct = new Struct(ConnectSchema) .put("lock_time", tx.lock_time) .put("ver", tx.ver) .put("size", tx.size) .put("time", tx.time) .put("tx_index", tx.tx_index) .put("vin_sz", tx.vin_sz) .put("hash", tx.hash) .put("vout_sz", tx.vout_sz) .put("relayed_by", tx.relayed_by) tx.out.headOption.foreach { _ => import scala.collection.JavaConverters._ struct.put("out", tx.out.map(_.toStruct()).asJava) } tx.rbf.foreach(struct.put("rbf", _)) tx.inputs.headOption.foreach { _ => val inputs = new util.ArrayList[Struct] tx.inputs.foreach(i => inputs.add(i.toStruct())) struct.put("inputs", inputs) } tx.out.headOption.foreach { _ => val outputs = new util.ArrayList[Struct] tx.out.foreach(output => outputs.add(output.toStruct())) } struct } } }
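A short sketch of turning the case class above into a Connect SourceRecord; topic, partition and field values are made up:

val tx = Transaction(lock_time = 0L, ver = 1, size = 250L, inputs = Seq.empty, rbf = None,
  time = 1566900000L, tx_index = 123L, vin_sz = 1, hash = "abc123", vout_sz = 1,
  relayed_by = "0.0.0.0", out = Seq.empty)  // hypothetical values
val record = tx.toSourceRecord("blockchain.transactions", 0, Some("abc123"))
// record.valueSchema() is Transaction.ConnectSchema; the key uses Schema.STRING_SCHEMA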
Example 59
Source File: Output.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.blockchain.data import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Output(addr_tag_link: Option[String], addr_tag: Option[String], spent: Boolean, tx_index: Long, `type`: Int, addr: Option[String], value: Long, n: Int, script: String) object Output { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.output") .doc("The output instance part of a transaction.") .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA) .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA) .field("spent", Schema.BOOLEAN_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("type", Schema.INT32_SCHEMA) .field("addr", Schema.OPTIONAL_STRING_SCHEMA) .field("value", Schema.INT64_SCHEMA) .field("n", Schema.INT32_SCHEMA) .field("script", Schema.STRING_SCHEMA) .build() implicit class OutputToStructConverter(val output: Output) extends AnyVal { def toStruct(): Struct = { val struct = new Struct(ConnectSchema) .put("spent", output.spent) .put("tx_index", output.tx_index) .put("type", output.`type`) .put("value", output.value) .put("n", output.n) .put("script", output.script) output.addr.foreach(struct.put("addr", _)) output.addr_tag.foreach(struct.put("addr_tag", _)) output.addr_tag_link.foreach(struct.put("addr_tag_link", _)) struct } } }
Example 60
Source File: Output.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.cassandra.sink import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Output(addr_tag_link: Option[String], addr_tag: Option[String], spent: Boolean, tx_index: Long, `type`: Int, addr: Option[String], value: Long, n: Int, script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() addr_tag_link.foreach(map.put("addr_tag_link", _)) addr_tag.foreach(map.put("addr_tag", _)) map.put("spent", spent) map.put("tx_index", tx_index) map.put("type", `type`) addr.foreach(map.put("addr", _)) map.put("value", value) map.put("n", n) map.put("script", script) map } } object Output { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.output") .doc("The output instance part of a transaction.") .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA) .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA) .field("spent", Schema.BOOLEAN_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("type", Schema.OPTIONAL_INT32_SCHEMA) .field("addr", Schema.OPTIONAL_STRING_SCHEMA) .field("value", Schema.INT64_SCHEMA) .field("n", Schema.INT32_SCHEMA) .field("script", Schema.STRING_SCHEMA) .build() implicit class OutputToStructConverter(val output: Output) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("spent", output.spent) .put("tx_index", output.tx_index) .put("type", output.`type`) .put("value", output.value) .put("n", output.n) .put("script", output.script) output.addr.foreach(struct.put("addr", _)) output.addr_tag.foreach(struct.put("addr_tag", _)) output.addr_tag_link.foreach(struct.put("addr_tag_link", _)) struct } } }
Example 61
Source File: ConnectMongoConverterSpec.scala From kafka-connect-mongodb with Apache License 2.0 | 5 votes |
package com.startapp.data import java.lang.Boolean import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.scalatest.{FlatSpec, Matchers} class ConnectMongoConverterSpec extends FlatSpec with Matchers{ private val FIELD1_NAME = "fieldInt" private val FIELD1_VALUE = new Integer(5) private val FIELD2_NAME = "fieldString" private val FIELD2_VALUE = "str" private val FIELD3_NAME = "fieldBoolean" private val FIELD3_VALUE = new Boolean(true) val schema = SchemaBuilder.struct().name("test schema") .field(FIELD1_NAME, Schema.INT32_SCHEMA) .field(FIELD2_NAME, Schema.STRING_SCHEMA) .field(FIELD3_NAME, Schema.BOOLEAN_SCHEMA) .build() "No Schema Connect Mongo Converter Bad Data" should "throw an exception" in { var exceptionThrown = false val badData = new Struct(schema) try{ checkJsonMap(NoSchemaConnectMongoConverter, badData) } catch { case _ : java.lang.ClassCastException => exceptionThrown = true } exceptionThrown should be(true) } "No Schema Connect Mongo Converter Good Data" should "return the same map" in { val jsonMap = new util.HashMap[String, Object]() jsonMap.put(FIELD1_NAME, FIELD1_VALUE) jsonMap.put(FIELD2_NAME, FIELD2_VALUE) jsonMap.put(FIELD3_NAME, FIELD3_VALUE) checkJsonMap(NoSchemaConnectMongoConverter, jsonMap) } "Schema Connect Mongo Converter Bad Data" should "throw an exception" in { var exceptionThrown = false val badData = new util.HashMap[String, Object]() badData.put(FIELD1_NAME, FIELD1_VALUE) try { checkJsonMap(SchemaConnectMongoConverter, badData) } catch { case _ : java.lang.ClassCastException => exceptionThrown = true } exceptionThrown should be(true) } "Schema Connect Mongo Converter Good Data" should "convert data to json map" in { val data = new Struct(schema) .put(FIELD1_NAME, FIELD1_VALUE) .put(FIELD2_NAME, FIELD2_VALUE) .put(FIELD3_NAME, FIELD3_VALUE) checkJsonMap(SchemaConnectMongoConverter, data) } private def checkJsonMap(converter : ConnectMongoConverter, value: Object): Unit ={ val newJsonMap = converter.toJsonMap(value).toMap newJsonMap(FIELD1_NAME) should be(FIELD1_VALUE) newJsonMap(FIELD2_NAME) should be(FIELD2_VALUE) newJsonMap(FIELD3_NAME) should be(FIELD3_VALUE) } }
Example 62
Source File: HANASourceTaskConversionTest.scala From kafka-connect-sap with Apache License 2.0 | 5 votes |
package com.sap.kafka.connect.source import com.sap.kafka.client.MetaSchema import org.apache.kafka.connect.data.Schema.Type import org.apache.kafka.connect.data.{Field, Schema, Struct} import org.apache.kafka.connect.source.SourceRecord import scala.collection.JavaConverters._ class HANASourceTaskConversionTest extends HANASourceTaskTestBase { override def beforeAll(): Unit = { super.beforeAll() task.start(singleTableConfig()) } override def afterAll(): Unit = { task.stop() super.afterAll() } test("boolean type") { typeConversion(Schema.BOOLEAN_SCHEMA, true, java.lang.Boolean.FALSE, Schema.BOOLEAN_SCHEMA, java.lang.Boolean.FALSE) } test("int type") { typeConversion(Schema.INT32_SCHEMA, true, new java.lang.Integer(1), Schema.INT32_SCHEMA, new Integer(1)) } test("long type") { typeConversion(Schema.INT64_SCHEMA, true, new java.lang.Long(1), Schema.INT64_SCHEMA, new java.lang.Long(1)) } test("double type") { typeConversion(Schema.FLOAT64_SCHEMA, true, new java.lang.Double(1.0), Schema.FLOAT64_SCHEMA, new java.lang.Double(1.0)) } test("string type") { typeConversion(Schema.STRING_SCHEMA, true, "'a'", Schema.STRING_SCHEMA, "a") } private def typeConversion(sqlType: Schema, nullable: Boolean, sqlValue: Object, convertedSchema: Schema, convertedValue: Object): Unit = { val fields = Seq(new Field("id", 1, sqlType)) jdbcClient.createTable(Some("TEST"), "EMPLOYEES_SOURCE", MetaSchema(null, fields), 3000) val connection = jdbcClient.getConnection val stmt = connection.createStatement() stmt.execute("insert into \"TEST\".\"EMPLOYEES_SOURCE\" values(" + sqlValue.toString + ")") val records = task.poll() validateRecords(records.asScala.toList, convertedSchema, convertedValue) stmt.execute("drop table \"TEST\".\"EMPLOYEES_SOURCE\"") } private def validateRecords(records: List[SourceRecord], expectedFieldSchema: Schema, expectedValue: Object): Unit = { assert(records.size === 1) val objValue = records.head.value() assert(objValue.isInstanceOf[Struct]) val value = objValue.asInstanceOf[Struct] val schema = value.schema() assert(Type.STRUCT === schema.`type`()) val fields = schema.fields() assert(fields.size() === 1) val fieldSchema = fields.get(0).schema() assert(expectedFieldSchema === fieldSchema) assert(expectedValue === value.get(fields.get(0))) } }
Example 64
Source File: TableQuerier.scala From kafka-connect-sap with Apache License 2.0 | 5 votes |
package com.sap.kafka.connect.source.querier import com.sap.kafka.client.hana.HANAJdbcClient import com.sap.kafka.connect.config.{BaseConfig, BaseConfigConstants} import com.sap.kafka.connect.config.hana.HANAConfig import com.sap.kafka.utils.hana.HANAJdbcTypeConverter import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.source.SourceRecord import org.slf4j.LoggerFactory import scala.util.Random abstract class TableQuerier(mode: String, tableOrQuery: String, topic: String, config: BaseConfig, var jdbcClient: Option[HANAJdbcClient]) extends Comparable[TableQuerier] { var tableName: String = if (mode.equals(BaseConfigConstants.QUERY_MODE_TABLE)) tableOrQuery else null var query: String = if (mode.equals(BaseConfigConstants.QUERY_MODE_SQL)) tableOrQuery else null var lastUpdate: Long = 0 var schema: Schema = _ var queryString: Option[String] = None var resultList: Option[List[Struct]] = None val log = LoggerFactory.getLogger(getClass) def getLastUpdate(): Long = lastUpdate def getOrCreateQueryString(): Option[String] = { createQueryString() queryString } def createQueryString(): Unit def querying(): Boolean = resultList.isDefined def maybeStartQuery(): Unit = { if (resultList.isEmpty) { schema = getSchema() queryString = getOrCreateQueryString() val batchMaxRows = config.batchMaxRows resultList = getOrCreateJdbcClient().get.executeQuery(schema, queryString.get, 0, batchMaxRows) log.info(resultList.size.toString) } } def extractRecords(): List[SourceRecord] def close(now: Long): Unit = { resultList = None schema = null lastUpdate = now } protected def getOrCreateJdbcClient(): Option[HANAJdbcClient] = { if (jdbcClient.isDefined) { return jdbcClient } config match { case hanaConfig: HANAConfig => Some(HANAJdbcClient(hanaConfig)) case _ => throw new RuntimeException("Cannot create Jdbc Client") } } private def getSchema(): Schema = { mode match { case BaseConfigConstants.QUERY_MODE_TABLE => if (getOrCreateJdbcClient().get.isInstanceOf[HANAJdbcClient]) { val metadata = getOrCreateJdbcClient().get.getMetaData(tableOrQuery, None) HANAJdbcTypeConverter.convertHANAMetadataToSchema(tableName, metadata) } else { throw new RuntimeException("Jdbc Client is not available") } case BaseConfigConstants.QUERY_MODE_SQL => if (getOrCreateJdbcClient().get.isInstanceOf[HANAJdbcClient]) { val metadata = getOrCreateJdbcClient().get.getMetadata(tableOrQuery) HANAJdbcTypeConverter.convertHANAMetadataToSchema("Query" + Random.nextInt, metadata) } else { throw new RuntimeException("Jdbc Client is not available") } case _ => throw new RuntimeException("Other Query modes are not supported") } } override def compareTo(other: TableQuerier): Int = { if (this.lastUpdate < other.lastUpdate) { -1 } else if (this.lastUpdate > other.lastUpdate) { 1 } else { this.tableName.compareTo(other.tableName) } } }
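The ordering implemented by compareTo (oldest lastUpdate first, table name as tie-break) can also be written with java.lang.Long.compare; a standalone sketch of the same comparison, using a hypothetical key class rather than the connector's types:

// A standalone sketch of the querier ordering: oldest lastUpdate first,
// falling back to the table name when the timestamps are equal.
final case class QuerierKey(lastUpdate: Long, tableName: String)

object QuerierOrdering extends Ordering[QuerierKey] {
  override def compare(a: QuerierKey, b: QuerierKey): Int = {
    val byTime = java.lang.Long.compare(a.lastUpdate, b.lastUpdate)
    if (byTime != 0) byTime else a.tableName.compareTo(b.tableName)
  }
}

// List(QuerierKey(2L, "b"), QuerierKey(1L, "a")).sorted(QuerierOrdering)
// => List(QuerierKey(1L, "a"), QuerierKey(2L, "b"))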
Example 65
Source File: SQSSourceTask.scala From sqs-kafka-connect with Apache License 2.0 | 5 votes |
package com.hivehome.kafka.connect.sqs import java.util.{List => JList, Map => JMap} import javax.jms._ import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.{SourceRecord, SourceTask} import org.slf4j.LoggerFactory import scala.collection.JavaConverters._ import scala.util.Try import scala.util.control.NonFatal object SQSSourceTask { private val SqsQueueField: String = "queue" private val MessageId: String = "messageId" private val ValueSchema = Schema.STRING_SCHEMA } class SQSSourceTask extends SourceTask { val logger = LoggerFactory.getLogger(getClass.getName) private var conf: Conf = _ private var consumer: MessageConsumer = null // MessageId to MessageHandle used to ack the message on the commitRecord method invocation private var unAcknowledgedMessages = Map[String, Message]() def version: String = Version() def start(props: JMap[String, String]): Unit = { conf = Conf.parse(props.asScala.toMap).get logger.debug("Creating consumer...") synchronized { try { consumer = SQSConsumer(conf) logger.info("Created consumer to SQS topic {} for reading", conf.queueName) } catch { case NonFatal(e) => logger.error("Exception", e) } } } import com.hivehome.kafka.connect.sqs.SQSSourceTask._ @throws(classOf[InterruptedException]) def poll: JList[SourceRecord] = { def toRecord(msg: Message): SourceRecord = { val extracted = MessageExtractor(msg) val key = Map(SqsQueueField -> conf.queueName.get).asJava val value = Map(MessageId -> msg.getJMSMessageID).asJava new SourceRecord(key, value, conf.topicName.get, ValueSchema, extracted) } assert(consumer != null) // should be initialised as part of start() Try { Option(consumer.receive).map { msg => logger.info("Received message {}", msg) // This operation is not threadsafe as a result the plugin is not threadsafe. // However KafkaConnect assigns a single thread to each task and the poll // method is always called by a single thread. unAcknowledgedMessages = unAcknowledgedMessages.updated(msg.getJMSMessageID, msg) toRecord(msg) }.toSeq }.recover { case NonFatal(e) => logger.error("Exception while processing message", e) List.empty }.get.asJava } @throws(classOf[InterruptedException]) override def commitRecord(record: SourceRecord): Unit = { val msgId = record.sourceOffset().get(MessageId).asInstanceOf[String] val maybeMsg = unAcknowledgedMessages.get(msgId) maybeMsg.foreach(_.acknowledge()) unAcknowledgedMessages = unAcknowledgedMessages - msgId } def stop() { logger.debug("Stopping task") synchronized { unAcknowledgedMessages = Map() try { if (consumer != null) { consumer.close() logger.debug("Closed input stream") } } catch { case NonFatal(e) => logger.error("Failed to close consumer stream: ", e) } this.notify() } } }
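toRecord above packs the queue name and JMS message id into the source partition and offset maps and emits the payload with a plain string value schema. A standalone sketch of that record shape, where the queue name, message id and payload are placeholders:

import java.util.{Map => JMap}
import org.apache.kafka.connect.data.Schema
import org.apache.kafka.connect.source.SourceRecord
import scala.collection.JavaConverters._

object SqsStyleRecordSketch {
  def toRecord(queueName: String, messageId: String, topic: String, payload: String): SourceRecord = {
    // The source partition identifies where the data came from; the source offset
    // lets the framework hand the position back on restart / commitRecord.
    val sourcePartition: JMap[String, String] = Map("queue" -> queueName).asJava
    val sourceOffset: JMap[String, String] = Map("messageId" -> messageId).asJava
    new SourceRecord(sourcePartition, sourceOffset, topic, Schema.STRING_SCHEMA, payload)
  }
}

// SqsStyleRecordSketch.toRecord("my-queue", "ID:1234", "my-topic", """{"hello":"world"}""")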
Example 66
Source File: FieldValueGetter.scala From kafka-connect-kcql-smt with Apache License 2.0 | 5 votes |
package com.landoop.connect.sql import org.apache.kafka.connect.data.{Schema, Struct} trait FieldValueGetter { def get(value: Any, schema: Schema, path: Seq[String]): Option[Any] = { path.headOption.map { parent => schema.`type`() match { case Schema.Type.STRUCT => if (Option(value).isEmpty) None else fromRecord(value, schema, path) case Schema.Type.MAP => if (Option(value).isEmpty) None else fromMap(value, schema, path) case _ => throw new IllegalArgumentException(s"Can't select $parent field from schema:$schema") } }.getOrElse { schema.`type`() match { case Schema.Type.BOOLEAN | Schema.Type.FLOAT64 | Schema.Type.FLOAT32 | Schema.Type.INT64 | Schema.Type.INT32 | Schema.Type.INT16 | Schema.Type.INT8 | Schema.Type.BYTES | Schema.Type.STRING => Option(value) case Schema.Type.ARRAY | Schema.Type.MAP | Schema.Type.STRUCT => throw new IllegalArgumentException(s"Can't select an element from an array(schema:$schema)") case other => throw new IllegalArgumentException(s"Invalid Avro schema type:$other") } } } private def fromRecord(value: Any, schema: Schema, path: Seq[String]) = { val field = Option(schema.field(path.head)) .getOrElse(throw new IllegalArgumentException(s"Can't find field:${path.head} in schema:$schema")) val v = value.asInstanceOf[Struct].get(path.head) get(v, field.schema(), path.tail) } private def fromMap(value: Any, schema: Schema, path: Seq[String]) = { val field = Option(schema.field(path.head)) .getOrElse(throw new IllegalArgumentException(s"Can't find field:${path.head} in schema:$schema")) val v = value.asInstanceOf[Struct].get(path.head) get(v, field.schema(), path.tail) } }
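fromRecord above resolves a dotted path one segment at a time, looking each segment up in the current schema before descending into the value. A trimmed-down sketch of the same idea for STRUCT values only (no MAP branch; names are illustrative):

import org.apache.kafka.connect.data.{Schema, Struct}
import scala.annotation.tailrec

object StructPathLookup {
  // Descend through nested Structs following the path, returning None if any
  // intermediate value is null and failing fast if a field name is unknown.
  @tailrec
  def lookup(value: Any, schema: Schema, path: Seq[String]): Option[Any] = path match {
    case Nil => Option(value)
    case head +: tail =>
      val field = Option(schema.field(head))
        .getOrElse(throw new IllegalArgumentException(s"Can't find field:$head in schema:$schema"))
      value match {
        case null => None
        case s: Struct => lookup(s.get(head), field.schema(), tail)
        case other => throw new IllegalArgumentException(s"Can't select $head from $other")
      }
  }
}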
Example 67
Source File: IotMessageConverter.scala From toketi-kafka-connect-iothub with MIT License | 5 votes |
// Copyright (c) Microsoft. All rights reserved. package com.microsoft.azure.iot.kafka.connect.source import java.time.Instant import java.util.Date import com.microsoft.azure.eventhubs.impl.AmqpConstants import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import scala.collection.JavaConverters._ import scala.reflect.ClassTag object IotMessageConverter { val offsetKey = "offset" private val schemaName = "iothub.kafka.connect" private val schemaVersion = 1 private val deviceIdKey = "deviceId" private val contentTypeKey = "contentType" private val sequenceNumberKey = "sequenceNumber" private val enqueuedTimeKey = "enqueuedTime" private val contentKey = "content" private val systemPropertiesKey = "systemProperties" private val propertiesKey = "properties" private val deviceIdIotHubKey = "iothub-connection-device-id" // Public for testing purposes lazy val schema: Schema = SchemaBuilder.struct() .name(schemaName) .version(schemaVersion) .field(deviceIdKey, Schema.STRING_SCHEMA) .field(offsetKey, Schema.STRING_SCHEMA) .field(contentTypeKey, Schema.OPTIONAL_STRING_SCHEMA) .field(enqueuedTimeKey, Schema.STRING_SCHEMA) .field(sequenceNumberKey, Schema.INT64_SCHEMA) .field(contentKey, Schema.STRING_SCHEMA) .field(systemPropertiesKey, propertiesMapSchema) .field(propertiesKey, propertiesMapSchema) private lazy val propertiesMapSchema: Schema = SchemaBuilder.map(Schema.STRING_SCHEMA, Schema.STRING_SCHEMA) def getIotMessageStruct(iotMessage: IotMessage): Struct = { val systemProperties = iotMessage.systemProperties val deviceId: String = getOrDefaultAndRemove(systemProperties, deviceIdIotHubKey, "") val offset: String = getOrDefaultAndRemove(systemProperties, AmqpConstants.OFFSET_ANNOTATION_NAME, "") val sequenceNumber: Long = getOrDefaultAndRemove(systemProperties, AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME, 0) val enqueuedTime: Option[Instant] = getEnqueuedTime(systemProperties) val enqueuedTimeStr = if(enqueuedTime.isDefined) enqueuedTime.get.toString else "" val properties = iotMessage.properties val contentType: String = getOrDefaultAndRemove(properties, contentTypeKey, "") val systemPropertiesMap = systemProperties.map(i => (i._1, i._2.toString)) new Struct(schema) .put(deviceIdKey, deviceId) .put(offsetKey, offset) .put(contentTypeKey, contentType) .put(enqueuedTimeKey, enqueuedTimeStr) .put(sequenceNumberKey, sequenceNumber) .put(contentKey, iotMessage.content) .put(systemPropertiesKey, systemPropertiesMap.asJava) .put(propertiesKey, properties.asJava) } private def getEnqueuedTime(map: scala.collection.mutable.Map[String, Object]): Option[Instant] = { val enqueuedTimeValue: Date = getOrDefaultAndRemove(map, AmqpConstants.ENQUEUED_TIME_UTC_ANNOTATION_NAME, null) if (enqueuedTimeValue != null) Some(enqueuedTimeValue.toInstant) else None } private def getOrDefaultAndRemove[T: ClassTag, S: ClassTag](map: scala.collection.mutable.Map[String, S], key: String, defaultVal: T): T = { if (map.contains(key)) { val retVal: T = map(key).asInstanceOf[T] map.remove(key) retVal } else { defaultVal } } }
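The struct schema above declares its systemProperties and properties fields with SchemaBuilder.map. A minimal sketch of declaring and populating a map-typed field with the public API (field names are illustrative):

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import scala.collection.JavaConverters._

object MapFieldSketch extends App {
  // A string-to-string map schema, as used for systemProperties/properties above
  val propsSchema: Schema = SchemaBuilder.map(Schema.STRING_SCHEMA, Schema.STRING_SCHEMA).build()

  val schema: Schema = SchemaBuilder.struct()
    .name("example.message")
    .field("deviceId", Schema.STRING_SCHEMA)
    .field("properties", propsSchema)
    .build()

  // Struct.put expects a java.util.Map for MAP-typed fields
  val struct = new Struct(schema)
    .put("deviceId", "device-1")
    .put("properties", Map("model" -> "abc", "fw" -> "1.2.3").asJava)

  println(struct)
}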
Example 68
Source File: DataServiceTest.scala From kafka-jdbc-connector with Apache License 2.0 | 5 votes |
package com.agoda.kafka.connector.jdbc.services import java.sql.{Connection, PreparedStatement, ResultSet, ResultSetMetaData} import com.agoda.kafka.connector.jdbc.utils.DataConverter import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.SourceRecord import org.scalatest.mockito.MockitoSugar import org.mockito.Mockito._ import org.scalatest.{Matchers, WordSpec} import scala.concurrent.duration._ import scala.util.Success class DataServiceTest extends WordSpec with Matchers with MockitoSugar { "Data Service" should { val spName = "stored-procedure" val connection = mock[Connection] val converter = mock[DataConverter] val sourceRecord1 = mock[SourceRecord] val sourceRecord2 = mock[SourceRecord] val resultSet = mock[ResultSet] val resultSetMetadata = mock[ResultSetMetaData] val preparedStatement = mock[PreparedStatement] val schema = mock[Schema] val dataService = new DataService { override def storedProcedureName: String = spName override protected def createPreparedStatement(connection: Connection) = Success(preparedStatement) override protected def extractRecords(resultSet: ResultSet, schema: Schema) = Success(Seq(sourceRecord1, sourceRecord2)) override def dataConverter: DataConverter = converter } "get records" in { doNothing().when(preparedStatement).setQueryTimeout(1) when(preparedStatement.executeQuery).thenReturn(resultSet) when(resultSet.getMetaData).thenReturn(resultSetMetadata) when(converter.convertSchema(spName, resultSetMetadata)).thenReturn(Success(schema)) dataService.getRecords(connection, 1.second) shouldBe Success(Seq(sourceRecord1, sourceRecord2)) verify(preparedStatement).setQueryTimeout(1) verify(preparedStatement).executeQuery verify(resultSet).getMetaData verify(converter).convertSchema(spName, resultSetMetadata) } } }
Example 69
Source File: DataService.scala From kafka-jdbc-connector with Apache License 2.0 | 5 votes |
package com.agoda.kafka.connector.jdbc.services import java.sql.{Connection, PreparedStatement, ResultSet} import com.agoda.kafka.connector.jdbc.utils.DataConverter import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.SourceRecord import scala.concurrent.duration.Duration import scala.util.Try trait DataService { def getRecords(connection: Connection, timeout: Duration): Try[Seq[SourceRecord]] = { for { preparedStatement <- createPreparedStatement(connection) resultSet <- executeStoredProcedure(preparedStatement, timeout) schema <- dataConverter.convertSchema(storedProcedureName, resultSet.getMetaData) records <- extractRecords(resultSet, schema) } yield records } protected def createPreparedStatement(connection: Connection): Try[PreparedStatement] protected def extractRecords(resultSet: ResultSet, schema: Schema): Try[Seq[SourceRecord]] private def executeStoredProcedure(preparedStatement: PreparedStatement, timeout: Duration): Try[ResultSet] = Try { preparedStatement.setQueryTimeout(timeout.toSeconds.toInt) preparedStatement.executeQuery } }
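getRecords above chains four Try-returning steps in a for-comprehension, so the first failure short-circuits the rest. A self-contained sketch of the same pattern against plain JDBC, with a placeholder query and no DataConverter:

import java.sql.Connection
import scala.util.Try

object TryPipelineSketch {
  // Each step returns a Try; the for-comprehension stops at the first Failure,
  // which is how DataService.getRecords above is structured.
  def countRows(connection: Connection): Try[Int] =
    for {
      stmt      <- Try(connection.prepareStatement("SELECT COUNT(*) FROM some_table"))
      resultSet <- Try(stmt.executeQuery())
      count     <- Try { resultSet.next(); resultSet.getInt(1) }
    } yield count
}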
Example 70
Source File: TimeBasedDataService.scala From kafka-jdbc-connector with Apache License 2.0 | 5 votes |
package com.agoda.kafka.connector.jdbc.services import java.sql.{Connection, PreparedStatement, ResultSet, Timestamp} import java.util.{Date, GregorianCalendar, TimeZone} import com.agoda.kafka.connector.jdbc.JdbcSourceConnectorConstants import com.agoda.kafka.connector.jdbc.models.DatabaseProduct import com.agoda.kafka.connector.jdbc.models.DatabaseProduct.{MsSQL, MySQL} import com.agoda.kafka.connector.jdbc.models.Mode.TimestampMode import com.agoda.kafka.connector.jdbc.utils.DataConverter import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.SourceRecord import scala.collection.JavaConverters._ import scala.collection.mutable.ListBuffer import scala.util.Try case class TimeBasedDataService(databaseProduct: DatabaseProduct, storedProcedureName: String, batchSize: Int, batchSizeVariableName: String, timestampVariableName: String, var timestampOffset: Long, timestampFieldName: String, topic: String, keyFieldOpt: Option[String], dataConverter: DataConverter, calendar: GregorianCalendar = new GregorianCalendar(TimeZone.getTimeZone("UTC")) ) extends DataService { override def createPreparedStatement(connection: Connection): Try[PreparedStatement] = Try { val preparedStatement = databaseProduct match { case MsSQL => connection.prepareStatement(s"EXECUTE $storedProcedureName @$timestampVariableName = ?, @$batchSizeVariableName = ?") case MySQL => connection.prepareStatement(s"CALL $storedProcedureName (@$timestampVariableName := ?, @$batchSizeVariableName := ?)") } preparedStatement.setTimestamp(1, new Timestamp(timestampOffset), calendar) preparedStatement.setObject(2, batchSize) preparedStatement } override def extractRecords(resultSet: ResultSet, schema: Schema): Try[Seq[SourceRecord]] = Try { val sourceRecords = ListBuffer.empty[SourceRecord] var max = timestampOffset while (resultSet.next()) { dataConverter.convertRecord(schema, resultSet) map { record => val time = record.get(timestampFieldName).asInstanceOf[Date].getTime max = if(time > max) { keyFieldOpt match { case Some(keyField) => sourceRecords += new SourceRecord( Map(JdbcSourceConnectorConstants.STORED_PROCEDURE_NAME_KEY -> storedProcedureName).asJava, Map(TimestampMode.entryName -> time).asJava, topic, null, schema, record.get(keyField), schema, record ) case None => sourceRecords += new SourceRecord( Map(JdbcSourceConnectorConstants.STORED_PROCEDURE_NAME_KEY -> storedProcedureName).asJava, Map(TimestampMode.entryName -> time).asJava, topic, schema, record ) } time } else max } } timestampOffset = max sourceRecords } override def toString: String = { s""" |{ | "name" : "${this.getClass.getSimpleName}" | "mode" : "${TimestampMode.entryName}" | "stored-procedure.name" : "$storedProcedureName" |} """.stripMargin } }
Example 71
Source File: SchemaSpec.scala From kafka-connect-cassandra with Apache License 2.0 | 5 votes |
package com.tuplejump.kafka.connect.cassandra import com.datastax.driver.core.{ DataType, TestUtil} import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.sink.SinkRecord class SchemaSpec extends AbstractFlatSpec { it should "convert a struct schema with single field" in { val topic = "topicx" val sc = sinkConfig(topic, "keyspacex", "tablex", List("id")) sc.options.consistency should be (TaskConfig.DefaultSinkConsistency) sc.schema.columnNames should === (List("id")) sc.query.cql should be ("INSERT INTO keyspacex.tablex(id) VALUES(?)") val schema = SchemaBuilder.struct.name("record").version(1).field("id", Schema.INT32_SCHEMA).build val value = new Struct(schema).put("id", 1) val record = new SinkRecord(topic, 1, SchemaBuilder.struct.build, "key", schema, value, 0) sc.schema.route.topic should be (record.topic) sc.schema.route.keyspace should be ("keyspacex") sc.schema.route.table should be ("tablex") sc.schema is record should be (true) val query = record.as(sc.schema.namespace) query.cql should be("INSERT INTO keyspacex.tablex(id) VALUES(1)") } it should "convert a struct schema with multiple fields" in { val topic = "test_kfk" val sc = sinkConfig(topic, "keyspacex", "tablex", List("available", "name", "age")) val schema = SchemaBuilder.struct.name("record").version(1) .field("available", Schema.BOOLEAN_SCHEMA) .field("name", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA).build val value = new Struct(schema).put("name", "user").put("available", false).put("age", 15) val record = new SinkRecord("test_kfk", 1, SchemaBuilder.struct.build, "key", schema, value, 0) schema.asColumnNames should be (sc.schema.columnNames) sc.schema.route.topic should be (record.topic) sc.schema is record should be (true) sc.query.cql should be ("INSERT INTO keyspacex.tablex(available,name,age) VALUES(?,?,?)") val query = record.as(sc.schema.namespace) query.cql should be("INSERT INTO keyspacex.tablex(available,name,age) VALUES(false,'user',15)") } it should "convert cassandra column defs to a source schema" in { val colDef = Map( "id" -> DataType.cint(), "name" -> DataType.varchar()) val columns = TestUtil.getColumnDef(colDef) val expectedSchema = SchemaBuilder.struct() .field("id", Schema.INT32_SCHEMA) .field("name", Schema.STRING_SCHEMA).build() columns.asSchema should be(expectedSchema) } it should "convert kafka schema and struct to cassandra columns and schema mapping" in { import scala.collection.JavaConverters._ val topic = "a" val route = InternalConfig.Route(TaskConfig.SinkRoute + topic, "ks1.t1").get val schemaMap = new InternalConfig.Schema(route, Nil, Nil, Nil, List("available","name","age"), "") val schema = SchemaBuilder.struct.name("record").version(1) .field("available", Schema.BOOLEAN_SCHEMA) .field("name", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA).build val struct = new Struct(schema).put("name", "user").put("available", false).put("age", 15) val record = new SinkRecord(topic, 1, SchemaBuilder.struct.build, "key", schema, struct, 0) schema.asColumnNames should ===(schemaMap.columnNames) schemaMap.columnNames should ===(schema.fields.asScala.map(_.name).toList) schemaMap is record should be (true) } }
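asColumnNames used above is an extension supplied by the connector; underneath it is just the field names of the Connect schema in declaration order. A sketch of that mapping with the public API alone:

import org.apache.kafka.connect.data.{Schema, SchemaBuilder}
import scala.collection.JavaConverters._

object ColumnNamesSketch extends App {
  val schema: Schema = SchemaBuilder.struct.name("record").version(1)
    .field("available", Schema.BOOLEAN_SCHEMA)
    .field("name", Schema.STRING_SCHEMA)
    .field("age", Schema.INT32_SCHEMA)
    .build

  // The column list a sink would target, in schema declaration order
  val columnNames: List[String] = schema.fields.asScala.map(_.name).toList
  println(columnNames) // List(available, name, age)
}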
Example 72
Source File: CassandraSinkTaskSpec.scala From kafka-connect-cassandra with Apache License 2.0 | 5 votes |
package com.tuplejump.kafka.connect.cassandra import scala.collection.JavaConverters._ import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.sink.{SinkRecord, SinkTaskContext} class CassandraSinkTaskSpec extends AbstractFlatSpec { val topicName = "test_kv_topic" val tableName = "test.kv" val config = sinkProperties(Map(topicName -> tableName)) it should "start sink task" in { val sinkTask = new CassandraSinkTask() val mockContext = mock[SinkTaskContext] sinkTask.initialize(mockContext) sinkTask.start(config.asJava) sinkTask.stop() } it should "save records in cassandra" in { val sinkTask = new CassandraSinkTask() val mockContext = mock[SinkTaskContext] sinkTask.initialize(mockContext) sinkTask.start(config.asJava) val valueSchema = SchemaBuilder.struct.name("record").version(1) .field("key", Schema.STRING_SCHEMA) .field("value", Schema.INT32_SCHEMA).build val value1 = new Struct(valueSchema).put("key", "pqr").put("value", 15) val value2 = new Struct(valueSchema).put("key", "abc").put("value", 17) val record1 = new SinkRecord(topicName, 1, SchemaBuilder.struct.build, "key", valueSchema, value1, 0) val record2 = new SinkRecord(topicName, 1, SchemaBuilder.struct.build, "key", valueSchema, value2, 0) sinkTask.put(List(record1, record2).asJavaCollection) sinkTask.stop() val cc = CassandraCluster.local val session = cc.session val result = session.execute(s"select count(1) from $tableName").one() val rowCount = result.getLong(0) rowCount should be(2) cc.shutdown() } }
Example 73
Source File: IgnoreEvolutionPolicy.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.evolution import com.landoop.streamreactor.connect.hive.{DatabaseName, HiveSchemas, TableName} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.kafka.connect.data.Schema import scala.collection.JavaConverters._ import scala.util.Try object IgnoreEvolutionPolicy extends EvolutionPolicy { override def evolve(dbName: DatabaseName, tableName: TableName, metastoreSchema: Schema, inputSchema: Schema) (implicit client: IMetaStoreClient): Try[Schema] = Try { HiveSchemas.toKafka(client.getTable(dbName.value, tableName.value)) }.map { schema => val compatible = schema.fields().asScala.forall { field => inputSchema.field(field.name) != null || field.schema().isOptional || field.schema().defaultValue() != null } if (compatible) schema else sys.error("Input Schema is not compatible with the metastore") } }
Example 74
Source File: MapValueConverterTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.json.sql.JacksonJson import org.apache.kafka.connect.data.{Schema, Struct} import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers import scala.collection.JavaConverters._ class MapValueConverterTest extends AnyFunSuite with Matchers { test("converts nested payload") { val json = """ |{ | "idType": 3, | "colorDepth": "", | "threshold" : 45.77, | "evars": { | "evars": { | "eVar1": "Tue Aug 27 2019 12:08:10", | "eVar2": 156692207943934897 | } | }, | "exclude": { | "id": 0, | "value": false | } |} |""".stripMargin val map = JacksonJson.toMap[Any](json) val struct = MapValueConverter.convert(map) //Jackson transforming the json to Map the fields order is not retained struct.schema().fields().asScala.map(_.name()).sorted shouldBe List("idType", "colorDepth", "threshold", "evars", "exclude").sorted struct.schema().field("idType").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA struct.schema().field("colorDepth").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA struct.schema().field("threshold").schema() shouldBe Schema.OPTIONAL_FLOAT64_SCHEMA struct.schema().field("exclude").schema().`type`() shouldBe Schema.Type.STRUCT struct.schema().field("exclude").schema().isOptional shouldBe true struct.schema().field("evars").schema().`type`() shouldBe Schema.Type.STRUCT struct.schema().field("evars").schema().isOptional shouldBe true struct.schema().field("evars").schema().fields().asScala.map(_.name()) shouldBe List("evars") val evarsInner = struct.schema().field("evars").schema().field("evars") evarsInner.schema().`type`() shouldBe Schema.Type.STRUCT evarsInner.schema().isOptional shouldBe true evarsInner.schema().fields().asScala.map(_.name()).sorted shouldBe List("eVar1", "eVar2").sorted evarsInner.schema().field("eVar1").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA evarsInner.schema().field("eVar2").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA val exclude = struct.schema().field("exclude").schema() exclude.schema().`type`() shouldBe Schema.Type.STRUCT exclude.schema().isOptional shouldBe true exclude.schema().fields().asScala.map(_.name()).sorted shouldBe List("id", "value").sorted exclude.schema().field("id").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA exclude.schema().field("value").schema() shouldBe Schema.OPTIONAL_BOOLEAN_SCHEMA struct.get("idType") shouldBe 3L struct.get("colorDepth") shouldBe "" struct.get("threshold") shouldBe 45.77D val evarsStruct = struct.get("evars").asInstanceOf[Struct].get("evars").asInstanceOf[Struct] evarsStruct.get("eVar1") shouldBe "Tue Aug 27 2019 12:08:10" evarsStruct.get("eVar2") shouldBe 156692207943934897L val excludeStruct = struct.get("exclude").asInstanceOf[Struct] excludeStruct.get("id") shouldBe 0L excludeStruct.get("value") shouldBe false } }
Example 75
Source File: package.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.column.ParquetProperties import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter} package object parquet { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def listFiles(path: Path)(implicit fs: FileSystem): List[Path] = { if (fs.isDirectory(path)) { logger.debug(s"$path is a directory, reading constituent files") val remote = fs.listFiles(path, false) new Iterator[Path] { override def hasNext: Boolean = remote.hasNext override def next(): Path = remote.next().getPath }.toList } else { logger.debug(s"Reading $path as a single file") List(path) } } def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = { ParquetReader.builder(new StructReadSupport, file) .withConf(fs.getConf) .build() } def parquetWriter(path: Path, schema: Schema, config: ParquetSinkConfig): ParquetWriter[Struct] = { new StructParquetWriterBuilder(path, schema) .withCompressionCodec(config.compressionCodec) .withDictionaryEncoding(config.enableDictionary) .withValidation(config.validation) .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) .withWriteMode(if (config.overwrite) { ParquetFileWriter.Mode.OVERWRITE } else { ParquetFileWriter.Mode.CREATE }).build() } }
Example 76
Source File: StructWriteSupport.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.landoop.streamreactor.connect.hive._ import org.apache.hadoop.conf.Configuration import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext import org.apache.parquet.io.api.{Binary, RecordConsumer} import org.apache.parquet.schema.MessageType import scala.collection.JavaConverters._ // derived from Apache Spark's parquet write support, archive and license here: // https://github.com/apache/spark/blob/21a7bfd5c324e6c82152229f1394f26afeae771c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala class StructWriteSupport(schema: Schema) extends WriteSupport[Struct] { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) private val schemaName = if (schema.name() == null) "schema" else schema.name() private val parquetSchema: MessageType = ParquetSchemas.toParquetMessage(schema, schemaName) private val metadata = new java.util.HashMap[String, String]() metadata.put("written_by", "streamreactor") // The Parquet `RecordConsumer` to which all structs are written private var consumer: RecordConsumer = _ type ValueWriter = (Any) => Unit override def init(conf: Configuration): WriteSupport.WriteContext = new WriteSupport.WriteContext(parquetSchema, new java.util.HashMap[String, String]) override def finalizeWrite(): WriteSupport.FinalizedWriteContext = new FinalizedWriteContext(metadata) override def prepareForWrite(consumer: RecordConsumer): Unit = this.consumer = consumer override def write(struct: Struct): Unit = { writeMessage { writeStructFields(struct) } } private def writeStructFields(struct: Struct): Unit = { for ((field, index) <- struct.schema.fields.asScala.zipWithIndex) { val value = struct.get(field) if (value != null) { val writer = valueWriter(field.schema()) writeField(field.name, index) { writer(value) } } } } def valueWriter(schema: Schema): ValueWriter = { // todo perhaps introduce something like spark's SpecializedGetters schema.`type`() match { case Schema.Type.BOOLEAN => value => consumer.addBoolean(value.asInstanceOf[Boolean]) case Schema.Type.INT8 | Schema.Type.INT16 | Schema.Type.INT32 => value => consumer.addInteger(value.toString.toInt) case Schema.Type.INT64 => value => consumer.addLong(value.toString.toLong) case Schema.Type.STRING => value => consumer.addBinary(Binary.fromReusedByteArray(value.toString.getBytes)) case Schema.Type.FLOAT32 => value => consumer.addFloat(value.toString.toFloat) case Schema.Type.FLOAT64 => value => consumer.addDouble(value.toString.toDouble) case Schema.Type.STRUCT => value => { logger.debug(s"Writing nested struct") val struct = value.asInstanceOf[Struct] writeGroup { schema.fields.asScala .map { field => field -> struct.get(field) } .zipWithIndex.foreach { case ((field, v), k) => writeField(field.name, k) { valueWriter(field.schema)(v) } } } } case _ => throw UnsupportedSchemaType(schema.`type`.toString) } } private def writeMessage(f: => Unit): Unit = { consumer.startMessage() f consumer.endMessage() } private def writeGroup(f: => Unit): Unit = { consumer.startGroup() // consumer.startMessage() f //consumer.endMessage() consumer.endGroup() } private def writeField(name: String, k: Int)(f: => Unit): Unit = { consumer.startField(name, k) f consumer.endField(name, k) } }
Example 77
Source File: Converters.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.landoop.streamreactor.connect.hive._ import org.apache.kafka.connect.data.{Field, Schema} import org.apache.parquet.io.api.Converter object Converters { def get(field: Field, builder: scala.collection.mutable.Map[String, Any]): Converter = { field.schema().`type`() match { case Schema.Type.STRUCT => new NestedGroupConverter(field.schema(), field, builder) case Schema.Type.INT64 | Schema.Type.INT32 | Schema.Type.INT16 | Schema.Type.INT8 => new AppendingPrimitiveConverter(field, builder) case Schema.Type.FLOAT64 | Schema.Type.FLOAT32 => new AppendingPrimitiveConverter(field, builder) // case Schema.Type.INT64 => new TimestampPrimitiveConverter(field, builder) case Schema.Type.STRING => new DictionaryStringPrimitiveConverter(field, builder) case Schema.Type.ARRAY => ??? case other => throw UnsupportedSchemaType(s"Unsupported data type $other") } } }
Example 78
Source File: NestedGroupConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.typesafe.scalalogging.StrictLogging import org.apache.kafka.connect.data.{Field, Schema} import org.apache.parquet.io.api.{Converter, GroupConverter} import scala.collection.JavaConverters._ class NestedGroupConverter(schema: Schema, field: Field, parentBuilder: scala.collection.mutable.Map[String, Any]) extends GroupConverter with StrictLogging { private[parquet] val builder = scala.collection.mutable.Map.empty[String, Any] private val converters = schema.fields.asScala.map(Converters.get(_, builder)).toIndexedSeq override def getConverter(k: Int): Converter = converters(k) override def start(): Unit = builder.clear() override def end(): Unit = parentBuilder.put(field.name, builder.result) }
Example 79
Source File: RootGroupConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.typesafe.scalalogging.StrictLogging import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.io.api.{Converter, GroupConverter} import scala.collection.JavaConverters._ class RootGroupConverter(schema: Schema) extends GroupConverter with StrictLogging { require(schema.`type`() == Schema.Type.STRUCT) var struct: Struct = _ private val builder = scala.collection.mutable.Map.empty[String, Any] private val converters = schema.fields.asScala.map(Converters.get(_, builder)).toIndexedSeq override def getConverter(k: Int): Converter = converters(k) override def start(): Unit = builder.clear() override def end(): Unit = struct = { val struct = new Struct(schema) schema.fields.asScala.map { field => val value = builder.getOrElse(field.name, null) try { struct.put(field, value) } catch { case t: Exception => throw t } } struct } }
Example 80
Source File: PartitionValueMapper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.source.mapper import com.landoop.streamreactor.connect.hive.{Partition, StructMapper} import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import scala.collection.JavaConverters._ class PartitionValueMapper(partition: Partition) extends StructMapper { override def map(input: Struct): Struct = { val builder = SchemaBuilder.struct() input.schema.fields.asScala.foreach { field => builder.field(field.name, field.schema) } partition.entries.toList.foreach { entry => builder.field(entry._1.value, Schema.STRING_SCHEMA) } val schema = builder.build() val struct = new Struct(schema) input.schema.fields.asScala.foreach { field => struct.put(field.name, input.get(field.name)) } partition.entries.toList.foreach { entry => struct.put(entry._1.value, entry._2) } struct } }
Example 81
Source File: MetastoreSchemaAlignMapper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.mapper import com.landoop.streamreactor.connect.hive.StructMapper import org.apache.kafka.connect.data.{Schema, Struct} import scala.util.Try class MetastoreSchemaAlignMapper(schema: Schema) extends StructMapper { import scala.collection.JavaConverters._ override def map(input: Struct): Struct = { //hive converts everything to lowercase val inputFieldsMapping = input.schema().fields().asScala.map { f => f.name().toLowerCase() -> f.name() }.toMap val struct = schema.fields.asScala.foldLeft(new Struct(schema)) { (struct, field) => Try(input.get(inputFieldsMapping(field.name))).toOption match { case Some(value) => struct.put(field.name, value) case None if field.schema.isOptional => struct.put(field.name, null) case None => sys.error(s"Cannot map struct to required schema; ${field.name} is missing, no default value has been supplied and null is not permitted") } } struct } }
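Because Hive lowercases column names, the mapper matches metastore fields against a lowercased view of the incoming struct's fields. A hedged usage sketch that assumes the MetastoreSchemaAlignMapper class above is on the classpath (field names and values are illustrative):

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object MetastoreAlignSketch extends App {
  // The incoming record uses camelCase field names
  val inputSchema = SchemaBuilder.struct()
    .field("userName", Schema.STRING_SCHEMA)
    .field("loginCount", Schema.INT32_SCHEMA)
    .build()
  val input = new Struct(inputSchema).put("userName", "alice").put("loginCount", 3)

  // The metastore view of the same table is all lowercase
  val metastoreSchema = SchemaBuilder.struct()
    .field("username", Schema.STRING_SCHEMA)
    .field("logincount", Schema.INT32_SCHEMA)
    .build()

  val aligned = new MetastoreSchemaAlignMapper(metastoreSchema).map(input)
  println(aligned.get("username"))   // alice
  println(aligned.get("logincount")) // 3
}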
Example 82
Source File: HiveWriterManager.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.streamreactor.connect.hive.{Offset, TopicPartition, TopicPartitionOffset} import com.landoop.streamreactor.connect.hive.formats.{HiveFormat, HiveWriter} import com.landoop.streamreactor.connect.hive.sink.staging.StageManager import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.Schema def flush(offsets: Map[TopicPartition, Offset]): Unit = { logger.info(s"Flushing offsets $offsets") // we may not have an offset for a given topic/partition if no data was written to that TP writers.foreach { case (key, writer) => writer.close() offsets.get(key.tp).foreach { offset => stageManager.commit(writer.file, key.tp.withOffset(offset)) } writers.remove(key) } } def getWriters: Seq[OpenWriter] = writers.map { case (key, writer) => OpenWriter(key.tp, key.dir, writer) }.toList }
Example 83
Source File: ValueConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.sink.SinkRecord import scala.collection.JavaConverters._ object ValueConverter { def apply(record: SinkRecord): Struct = record.value match { case struct: Struct => StructValueConverter.convert(struct) case map: Map[_, _] => MapValueConverter.convert(map) case map: java.util.Map[_, _] => MapValueConverter.convert(map.asScala.toMap) case string: String => StringValueConverter.convert(string) case other => sys.error(s"Unsupported record $other:${other.getClass.getCanonicalName}") } } trait ValueConverter[T] { def convert(value: T): Struct } object StructValueConverter extends ValueConverter[Struct] { override def convert(struct: Struct): Struct = struct } object MapValueConverter extends ValueConverter[Map[_, _]] { def convertValue(value: Any, key: String, builder: SchemaBuilder): Any = { value match { case s: String => builder.field(key, Schema.OPTIONAL_STRING_SCHEMA) s case l: Long => builder.field(key, Schema.OPTIONAL_INT64_SCHEMA) l case i: Int => builder.field(key, Schema.OPTIONAL_INT64_SCHEMA) i.toLong case b: Boolean => builder.field(key, Schema.OPTIONAL_BOOLEAN_SCHEMA) b case f: Float => builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA) f.toDouble case d: Double => builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA) d case innerMap: java.util.Map[_, _] => val innerStruct = convert(innerMap.asScala.toMap, true) builder.field(key, innerStruct.schema()) innerStruct case innerMap: Map[_, _] => val innerStruct = convert(innerMap, true) builder.field(key, innerStruct.schema()) innerStruct } } def convert(map: Map[_, _], optional: Boolean) = { val builder = SchemaBuilder.struct() val values = map.map { case (k, v) => val key = k.toString val value = convertValue(v, key, builder) key -> value }.toList if (optional) builder.optional() val schema = builder.build val struct = new Struct(schema) values.foreach { case (key, value) => struct.put(key.toString, value) } struct } override def convert(map: Map[_, _]): Struct = convert(map, false) } object StringValueConverter extends ValueConverter[String] { override def convert(string: String): Struct = { val schema = SchemaBuilder.struct().field("a", Schema.OPTIONAL_STRING_SCHEMA).name("struct").build() new Struct(schema).put("a", string) } }
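A short usage sketch of the ValueConverter object defined above: whatever the SinkRecord carries (Struct, map or raw string), the result is a Struct a sink can write. It assumes the code above is on the classpath; the topic and payloads are illustrative:

import org.apache.kafka.connect.data.Schema
import org.apache.kafka.connect.sink.SinkRecord
import scala.collection.JavaConverters._

object ValueConverterUsageSketch extends App {
  // A schemaless map payload, e.g. from the JSON converter with schemas disabled
  val mapValue: java.util.Map[String, Any] =
    Map[String, Any]("id" -> 42, "name" -> "widget", "active" -> true).asJava
  val mapRecord = new SinkRecord("orders", 0, null, null, null, mapValue, 0L)
  val structFromMap = ValueConverter(mapRecord)
  println(structFromMap.schema().fields())

  // A plain string payload ends up wrapped in a single-field struct named "a"
  val stringRecord = new SinkRecord("orders", 0, null, null, Schema.STRING_SCHEMA, "hello", 1L)
  println(ValueConverter(stringRecord).getString("a"))
}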
Example 84
Source File: AddEvolutionPolicy.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.evolution import com.landoop.streamreactor.connect.hive.{DatabaseName, HiveSchemas, TableName} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.kafka.connect.data.Schema import scala.collection.JavaConverters._ import scala.util.Try object AddEvolutionPolicy extends EvolutionPolicy { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def evolve(dbName: DatabaseName, tableName: TableName, metastoreSchema: Schema, inputSchema: Schema) (implicit client: IMetaStoreClient): Try[Schema] = Try { val missing = inputSchema.fields.asScala .filter(f => metastoreSchema.field(f.name) == null) .map(HiveSchemas.toFieldSchema) if (missing.nonEmpty) { logger.info(s"Evolving hive metastore to add: ${missing.mkString(",")}") val table = client.getTable(dbName.value, tableName.value) val cols = table.getSd.getCols missing.foreach(field => cols.add(field)) table.getSd.setCols(cols) client.alter_table(dbName.value, tableName.value, table) HiveSchemas.toKafka(client.getTable(dbName.value, tableName.value)) } else { metastoreSchema } } }
Example 85
Source File: ValidatorTask.scala From ohara with Apache License 2.0 | 5 votes |
package oharastream.ohara.connector.validation import java.util import java.util.concurrent.TimeUnit import oharastream.ohara.client.configurator.InspectApi.{RdbInfo, RdbQuery} import oharastream.ohara.client.configurator.{ErrorApi, InspectApi} import oharastream.ohara.client.database.DatabaseClient import oharastream.ohara.common.data.Serializer import oharastream.ohara.common.util.VersionUtils import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.source.{SourceRecord, SourceTask} import spray.json.{JsObject, _} import scala.jdk.CollectionConverters._ class ValidatorTask extends SourceTask { private[this] var done = false private[this] var props: Map[String, String] = _ private[this] val topic: String = InspectApi.INTERNAL_TOPIC_KEY.topicNameOnKafka private[this] var requestId: String = _ override def start(props: util.Map[String, String]): Unit = { this.props = props.asScala.toMap requestId = require(InspectApi.REQUEST_ID) } override def poll(): util.List[SourceRecord] = if (done) { // just wait the configurator to close this connector TimeUnit.SECONDS.sleep(2) null } else try information match { case query: RdbQuery => toSourceRecord(validate(query)) } catch { case e: Throwable => toSourceRecord(ErrorApi.of(e)) } finally done = true override def stop(): Unit = { // do nothing } override def version(): String = VersionUtils.VERSION private[this] def validate(query: RdbQuery): RdbInfo = { val client = DatabaseClient.builder.url(query.url).user(query.user).password(query.password).build try RdbInfo( name = client.databaseType, tables = client.tableQuery .catalog(query.catalogPattern.orNull) .schema(query.schemaPattern.orNull) .tableName(query.tableName.orNull) .execute() ) finally client.close() } private[this] def toJsObject: JsObject = props(InspectApi.SETTINGS_KEY).parseJson.asJsObject private[this] def information = require(InspectApi.TARGET_KEY) match { case InspectApi.RDB_KIND => InspectApi.RDB_QUERY_FORMAT.read(toJsObject) case other: String => throw new IllegalArgumentException( s"valid targets are ${InspectApi.RDB_KIND}. current is $other" ) } private[this] def toSourceRecord(data: Object): util.List[SourceRecord] = util.Arrays.asList( new SourceRecord( null, null, topic, Schema.BYTES_SCHEMA, Serializer.STRING.to(requestId), Schema.BYTES_SCHEMA, Serializer.OBJECT.to(data) ) ) private[this] def require(key: String): String = props.getOrElse(key, throw new IllegalArgumentException(s"the $key is required")) }
Example 86
Source File: StrictEvolutionPolicy.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.evolution import com.landoop.streamreactor.connect.hive.{DatabaseName, HiveSchemas, TableName} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.kafka.connect.data.Schema import scala.collection.JavaConverters._ import scala.util.Try object StrictEvolutionPolicy extends EvolutionPolicy { override def evolve(dbName: DatabaseName, tableName: TableName, metastoreSchema: Schema, inputSchema: Schema) (implicit client: IMetaStoreClient): Try[Schema] = Try { val schema = HiveSchemas.toKafka(client.getTable(dbName.value, tableName.value)) schema }.map { schema => //Hive keeps the fields in lowercase val inputFields = inputSchema.fields().asScala.map { f => f.name().toLowerCase() }.toSet schema.fields().asScala.foreach { field => val exists = inputFields.contains(field.name) val optional = field.schema().isOptional val default = field.schema().defaultValue() val compatible = exists || optional || default != null if (!compatible) { sys.error(s"Input Schema is not compatible with the metastore for field [${field.name()}]") } } schema } }
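The compatibility rule above is: every metastore field must be present in the input schema, or be optional, or carry a default value. The same check extracted into a small standalone function over two Connect schemas (a sketch, not part of the connector's API):

import org.apache.kafka.connect.data.Schema
import scala.collection.JavaConverters._

object SchemaCompatibilitySketch {
  // True when every field of `target` can be satisfied by `input`:
  // it exists in the input (case-insensitive, as Hive lowercases names),
  // or is optional, or has a default value.
  def compatible(target: Schema, input: Schema): Boolean = {
    val inputFields = input.fields().asScala.map(_.name().toLowerCase).toSet
    target.fields().asScala.forall { field =>
      inputFields.contains(field.name().toLowerCase) ||
        field.schema().isOptional ||
        field.schema().defaultValue() != null
    }
  }
}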
Example 87
Source File: HiveSinkState.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.streamreactor.connect.hive import com.landoop.streamreactor.connect.hive._ import com.landoop.streamreactor.connect.hive.sink.config.TableOptions import com.landoop.streamreactor.connect.hive.sink.mapper.{DropPartitionValuesMapper, MetastoreSchemaAlignMapper, ProjectionMapper} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.hadoop.hive.metastore.api.Table import org.apache.kafka.connect.data.{Schema, Struct} case class HiveSinkState(offsets: Map[TopicPartition, Offset], committedOffsets: Map[TopicPartition, Offset], table: Table, tableLocation: Path, plan: Option[PartitionPlan], metastoreSchema: Schema, mapper: Struct => Struct, lastSchema: Schema) { def withTopicPartitionOffset(tpo: TopicPartitionOffset): HiveSinkState = { copy(offsets = offsets + (tpo.toTopicPartition -> tpo.offset)) } def withTopicPartitionOffset(tp: TopicPartition, offset: Offset): HiveSinkState = { copy(offsets = offsets + (tp -> offset)) } def withCommittedOffset(offsets: Map[TopicPartition, Offset]): HiveSinkState = { copy(committedOffsets = committedOffsets ++ offsets) } def withCommittedOffset(tp: TopicPartition, offset: Offset): HiveSinkState = { copy(committedOffsets = committedOffsets + (tp -> offset)) } def withLastSchema(schema: Schema): HiveSinkState = copy(lastSchema = schema) } object HiveSinkState { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def from(schema: Schema, table: TableOptions, dbName: DatabaseName)(implicit client: IMetaStoreClient, fs: FileSystem) = { logger.info(s"Init sink for schema $schema") val hiveTable = getOrCreateTable(table, dbName, schema) val tableLocation = new Path(hiveTable.getSd.getLocation) val plan = hive.partitionPlan(hiveTable) val metastoreSchema = table.evolutionPolicy .evolve(dbName, table.tableName, HiveSchemas.toKafka(hiveTable), schema) .getOrElse(sys.error(s"Unable to retrieve or evolve schema for $schema")) val mapperFns: Seq[Struct => Struct] = Seq( table.projection.map(new ProjectionMapper(_)), Some(new MetastoreSchemaAlignMapper(metastoreSchema)), plan.map(new DropPartitionValuesMapper(_)) ).flatten.map(mapper => mapper.map _) val mapper = Function.chain(mapperFns) HiveSinkState(Map.empty, Map.empty, hiveTable, tableLocation, plan, metastoreSchema, mapper, schema) } def getOrCreateTable(table: TableOptions, dbName: DatabaseName, schema: Schema) (implicit client: IMetaStoreClient, fs: FileSystem): Table = { def create: Table = { val partstring = if (table.partitions.isEmpty) "<no-partitions>" else table.partitions.mkString(",") logger.info(s"Creating table in hive [${dbName.value}.${table.tableName.value}, partitions=$partstring]") hive.createTable(dbName, table.tableName, schema, table.partitions, table.location, table.format) } logger.debug(s"Fetching or creating table ${dbName.value}.${table.tableName.value}") client.tableExists(dbName.value, table.tableName.value) match { case true if table.overwriteTable => hive.dropTable(dbName, table.tableName, true) create case true => client.getTable(dbName.value, table.tableName.value) case false if table.createTable => create case false => throw new RuntimeException(s"Table ${dbName.value}.${table.tableName.value} does not exist") } } }
Example 88
Source File: domain.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive import cats.Show import cats.data.NonEmptyList import org.apache.hadoop.fs.Path import org.apache.kafka.common.{TopicPartition => KafkaTopicPartition} import org.apache.kafka.connect.data.Schema case class Topic(value: String) { require(value != null && value.trim.nonEmpty) } case class Offset(value: Long) { require(value >= 0) } case class TopicPartition(topic: Topic, partition: Int) { def withOffset(offset: Offset): TopicPartitionOffset = TopicPartitionOffset(topic, partition, offset) def toKafka = new KafkaTopicPartition(topic.value, partition) } case class TopicPartitionOffset(topic: Topic, partition: Int, offset: Offset) { def toTopicPartition = TopicPartition(topic, partition) } case class DatabaseName(value: String) { require(value != null && value.trim.nonEmpty) } case class TableName(value: String) { require(value != null && value.trim.nonEmpty) } // contains all the partition keys for a particular table case class PartitionPlan(tableName: TableName, keys: NonEmptyList[PartitionKey]) // contains a partition key, which you can think of as like a partition column name case class PartitionKey(value: String) // defines a partition key field case class PartitionField(name: String, schema: Schema = Schema.STRING_SCHEMA, comment: Option[String] = None) { require(name != null && name.trim.nonEmpty) } // contains a single partition in a table, that is one set of unique values, one per partition key case class Partition(entries: NonEmptyList[(PartitionKey, String)], location: Option[Path]) case class Serde(serializationLib: String, inputFormat: String, outputFormat: String, params: Map[String, String]) // generates the default hive metatstore location string for a partition object DefaultPartitionLocation extends Show[Partition] { override def show(t: Partition): String = { t.entries.map { case (key, value) => key.value + "=" + value }.toList.mkString("/") } }
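DefaultPartitionLocation above renders a partition as the usual Hive directory fragment. A quick usage sketch of the types defined in this file, assuming they and cats are on the classpath (keys and values are illustrative):

import cats.data.NonEmptyList

object PartitionLocationSketch extends App {
  val partition = Partition(
    NonEmptyList.of(
      PartitionKey("year") -> "2020",
      PartitionKey("month") -> "01"
    ),
    location = None
  )

  // Renders the standard Hive layout for partition directories
  println(DefaultPartitionLocation.show(partition)) // year=2020/month=01
}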
Example 89
Source File: ParquetHiveFormat.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.formats import com.landoop.streamreactor.connect.hive.Serde import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.permission.FsPermission import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.ParquetWriter import scala.util.Try object ParquetHiveFormat extends HiveFormat { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def serde = Serde( "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", Map("serialization.format" -> "1") ) override def writer(path: Path, schema: Schema) (implicit fs: FileSystem): HiveWriter = new HiveWriter { logger.debug(s"Creating parquet writer at $path") val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx"))) val createdTime: Long = System.currentTimeMillis() var lastKnownFileSize: Long = fs.getFileStatus(path).getLen var readFileSize = false var count = 0 override def write(struct: Struct): Long = { writer.write(struct) count = count + 1 readFileSize = true count } override def close(): Unit = { logger.debug(s"Closing writer at path $path") writer.close() } override def currentCount: Long = count override def file: Path = path override def fileSize: Long = { if (readFileSize) { lastKnownFileSize = fs.getFileStatus(path).getLen readFileSize = false } lastKnownFileSize } } override def reader(path: Path, startAt: Int, schema: Schema) (implicit fs: FileSystem): HiveReader = new HiveReader { logger.debug(s"Creating parquet reader for $path with offset $startAt") val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path) var offset = startAt override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct => val record = Record(struct, path, offset) offset = offset + 1 record } override def close(): Unit = reader.close() } }
Example 90
Source File: SinkRecordParser.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.influx.converters import com.datamountaineer.streamreactor.connect.influx.helpers.Util import com.datamountaineer.streamreactor.connect.influx.writers.KcqlDetails.Path import com.datamountaineer.streamreactor.connect.influx.writers.ValuesExtractor import com.fasterxml.jackson.databind.JsonNode import com.landoop.json.sql.JacksonJson import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.sink.SinkRecord import scala.util.Try object SinkRecordParser { type Field = String trait ParsedSinkRecord { def valueFields(ignored: Set[Path]): Seq[(String, Any)] def field(path: Path): Option[Any] } trait ParsedKeyValueSinkRecord extends ParsedSinkRecord { def keyFields(ignored: Set[Path]): Seq[(String, Any)] } private case class JsonSinkRecord(json: JsonNode) extends ParsedSinkRecord { override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = ValuesExtractor.extractAllFields(json, ignored.map(_.value.last)) override def field(path: Path): Option[Any] = Option(ValuesExtractor.extract(json, path.value)) } private case class StructSinkRecord(struct: Struct) extends ParsedSinkRecord { override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = ValuesExtractor.extractAllFields(struct, ignored.map(_.value.last)) override def field(path: Path): Option[Any] = Option(ValuesExtractor.extract(struct, path.value)) } private case class MapSinkRecord(map: java.util.Map[String, Any]) extends ParsedSinkRecord { override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = ValuesExtractor.extractAllFields(map, ignored.map(_.value.last)) override def field(path: Path): Option[Any] = Option(ValuesExtractor.extract(map, path.value)) } private case class KeyValueRecord(key: ParsedSinkRecord, value: ParsedSinkRecord) extends ParsedKeyValueSinkRecord { override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = value.valueFields(ignored) override def field(path: Path): Option[Any] = path.value.headOption match { case Some(fieldName) if Util.caseInsensitiveComparison(fieldName, Util.KEY_CONSTANT) => key.field(Path(path.value.tail)) case Some(_) => value.field(path) case None => throw new IllegalArgumentException("Unreachable situation detected. Path should never be empty") } override def keyFields(ignored: Set[Path]): Seq[(String, Any)] = key.valueFields(ignored) } def build(record: SinkRecord): Try[ParsedKeyValueSinkRecord] = { val key = Option(record.keySchema()).map(_.`type`()) match { case Some(Schema.Type.STRING) => Try(JsonSinkRecord(JacksonJson.asJson(record.key().asInstanceOf[String]))) case Some(Schema.Type.STRUCT) => Try(StructSinkRecord(record.key().asInstanceOf[Struct])) case None => Try(MapSinkRecord(record.key().asInstanceOf[java.util.Map[String, Any]])) } val value = Option(record.valueSchema()).map(_.`type`()) match { case Some(Schema.Type.STRING) => Try(require(record.value() != null && record.value().getClass == classOf[String], "The SinkRecord payload should be of type String")).flatMap(_ => Try(JsonSinkRecord(JacksonJson.asJson(record.value().asInstanceOf[String])))) case Some(Schema.Type.STRUCT) => Try(require(record.value() != null && record.value().getClass == classOf[Struct], "The SinkRecord payload should be of type Struct")).flatMap(_ => Try(StructSinkRecord(record.value().asInstanceOf[Struct]))) case None => Try(require(record.value() != null && record.value().isInstanceOf[java.util.Map[_, _]], "The SinkRecord payload should be of type java.util.Map[String, Any]")).flatMap(_ => Try(MapSinkRecord(record.value().asInstanceOf[java.util.Map[String, Any]]))) } key .flatMap(key => value.map(key -> _)) .map { case (k, v) => KeyValueRecord(k, v) } } }
Example 91
Source File: SinkRecordKeyRowKeyBuilderTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._ import org.apache.kafka.connect.data.Schema import org.apache.kafka.connect.sink.SinkRecord import org.mockito.MockitoSugar import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class SinkRecordKeyRowKeyBuilderTest extends AnyWordSpec with Matchers with MockitoSugar { val keyRowKeyBuilder = new SinkRecordKeyRowKeyBuilderBytes() "SinkRecordKeyRowKeyBuilder" should { "create the right key from the Schema key value - Byte" in { val b = 123.toByte val sinkRecord = new SinkRecord("", 1, Schema.INT8_SCHEMA, b, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord, "Should not matter") shouldBe Array(b) } "create the right key from the Schema key value - String" in { val s = "somekey" val sinkRecord = new SinkRecord("", 1, Schema.STRING_SCHEMA, s, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord, Nil) shouldBe s.fromString() } "create the right key from the Schema key value - Bytes" in { val bArray = Array(23.toByte, 24.toByte, 242.toByte) val sinkRecord = new SinkRecord("", 1, Schema.BYTES_SCHEMA, bArray, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord, Nil) shouldBe bArray } "create the right key from the Schema key value - Boolean" in { val bool = true val sinkRecord = new SinkRecord("", 1, Schema.BOOLEAN_SCHEMA, bool, Schema.FLOAT64_SCHEMA, Nil, 0) keyRowKeyBuilder.build(sinkRecord, Nil) shouldBe bool.fromBoolean() } } }
Example 92
Source File: StructFieldsRowKeyBuilderTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class StructFieldsRowKeyBuilderTest extends AnyWordSpec with Matchers {
  "StructFieldsRowKeyBuilder" should {
    "raise an exception if the field is not present in the struct" in {
      intercept[IllegalArgumentException] {
        val schema = SchemaBuilder.struct().name("com.example.Person")
          .field("firstName", Schema.STRING_SCHEMA)
          .field("age", Schema.INT32_SCHEMA)
          .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

        val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

        val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
        //val field = Field("threshold", "threshold", false)
        StructFieldsRowKeyBuilderBytes(List("threshold")).build(sinkRecord, null)
      }
    }

    "create the row key based on one single field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      //val field = Field("firstName", "firstName", true)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StructFieldsRowKeyBuilderBytes(List("firstName")).build(sinkRecord, null) shouldBe "Alex".fromString
    }

    "create the row key based on more than one field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      //val field = Field("firstName", "firstName", true)
      //val field2 = Field("age", "age", true)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StructFieldsRowKeyBuilderBytes(List("firstName", "age")).build(sinkRecord, null) shouldBe
        Bytes.add("Alex".fromString(), "\n".fromString(), 30.fromInt())
    }
  }
}
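The two-field expectation above is the selected values concatenated with a single newline between them. Written out by hand, assuming BytesHelper's fromString/fromInt delegate to HBase's Bytes utilities (suggested, but not proven, by the test itself):

import org.apache.hadoop.hbase.util.Bytes

// Illustration of the expected key: UTF-8 bytes of "Alex", a '\n' separator, then the 4-byte int 30.
val expected: Array[Byte] = Bytes.add(Bytes.toBytes("Alex"), Bytes.toBytes("\n"), Bytes.toBytes(30))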
Example 93
Source File: GenericRowKeyBuilderTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.connect.data.Schema
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class GenericRowKeyBuilderTest extends AnyWordSpec with Matchers {
  "GenericRowKeyBuilder" should {
    "use the topic, partition and offset to make the key" in {
      val topic = "sometopic"
      val partition = 2
      val offset = 1243L
      val sinkRecord = new SinkRecord(topic, partition, Schema.INT32_SCHEMA, 345, Schema.STRING_SCHEMA, "", offset)

      val keyBuilder = new GenericRowKeyBuilderBytes()
      val expected = Bytes.add(Array(topic.fromString(), keyBuilder.delimiterBytes, partition.fromString(),
        keyBuilder.delimiterBytes, offset.fromString()))

      keyBuilder.build(sinkRecord, Nil) shouldBe expected
    }
  }
}
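In other words, the generic row key is just topic, partition and offset rendered as strings and joined by the builder's delimiter bytes. A rough hand-rolled equivalent as a sketch (illustration only; the concrete delimiter comes from GenericRowKeyBuilderBytes.delimiterBytes and is not shown here):

import org.apache.hadoop.hbase.util.Bytes

// Compose topic <delim> partition <delim> offset by hand with a caller-supplied delimiter.
def rowKey(topic: String, partition: Int, offset: Long, delimiter: Array[Byte]): Array[Byte] =
  Bytes.add(Array(Bytes.toBytes(topic), delimiter, Bytes.toBytes(partition.toString),
    delimiter, Bytes.toBytes(offset.toString)))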
Example 94
Source File: ObjectMessageConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.jms.sink.converters

import com.datamountaineer.streamreactor.connect.jms.config.JMSSetting
import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import javax.jms.{ObjectMessage, Session}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

class ObjectMessageConverter extends JMSMessageConverter with ConverterUtil {
  override def convert(record: SinkRecord, session: Session, setting: JMSSetting): (String, ObjectMessage) = {
    val converted = super[ConverterUtil].convert(record, setting.fields, setting.ignoreField)
    val msg = session.createObjectMessage()
    val value = converted.value()
    val schema = converted.valueSchema()
    schema.`type`() match {
      case Schema.Type.STRUCT =>
        val struct = value.asInstanceOf[Struct]
        struct.schema().fields().asScala.foreach { f =>
          ObjectMessageConverterFn(f.name(), struct.get(f), f.schema(), msg, session)
        }
      case _ => ObjectMessageConverterFn("field", value, schema, msg, session)
    }
    (setting.source, msg)
  }
}

object ObjectMessageConverterFn {
  def apply(fieldName: String, value: AnyRef, schema: Schema, msg: ObjectMessage, session: Session): Unit = {
    schema.`type`() match {
      case Schema.Type.BYTES => msg.setObjectProperty(fieldName, value.asInstanceOf[Array[Byte]].toList.asJava)
      case Schema.Type.BOOLEAN => msg.setBooleanProperty(fieldName, value.asInstanceOf[Boolean])
      case Schema.Type.FLOAT32 => msg.setFloatProperty(fieldName, value.asInstanceOf[Float])
      case Schema.Type.FLOAT64 => msg.setDoubleProperty(fieldName, value.asInstanceOf[Double])
      case Schema.Type.INT8 => msg.setByteProperty(fieldName, value.asInstanceOf[Byte])
      case Schema.Type.INT16 => msg.setShortProperty(fieldName, value.asInstanceOf[Short])
      case Schema.Type.INT32 => msg.setIntProperty(fieldName, value.asInstanceOf[Int])
      case Schema.Type.INT64 => msg.setLongProperty(fieldName, value.asInstanceOf[Long])
      case Schema.Type.STRING => msg.setStringProperty(fieldName, value.asInstanceOf[String])
      case Schema.Type.MAP => msg.setObjectProperty(fieldName, value)
      case Schema.Type.ARRAY => msg.setObjectProperty(fieldName, value)
      case Schema.Type.STRUCT =>
        val nestedMsg = session.createObjectMessage()
        val struct = value.asInstanceOf[Struct]
        struct.schema().fields().asScala.foreach { f =>
          ObjectMessageConverterFn(f.name(), struct.get(f), f.schema(), nestedMsg, session)
        }
        msg.setObjectProperty(fieldName, nestedMsg)
    }
  }
}
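To see what this converter is given, here is a sketch of a nested payload built with the Connect Schema API (names and values are invented). Per the match above, the primitive fields become object properties and the nested struct becomes a nested ObjectMessage; the actual convert call needs a live javax.jms.Session and a JMSSetting from the connector configuration, both omitted here.

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord

val customerSchema = SchemaBuilder.struct().name("com.example.Customer")
  .field("name", Schema.STRING_SCHEMA)
  .field("vip", Schema.BOOLEAN_SCHEMA)
  .build()

val orderSchema = SchemaBuilder.struct().name("com.example.Order")
  .field("id", Schema.INT64_SCHEMA)
  .field("amount", Schema.FLOAT64_SCHEMA)
  .field("customer", customerSchema)
  .build()

val order = new Struct(orderSchema)
  .put("id", 42L)
  .put("amount", 19.99)
  .put("customer", new Struct(customerSchema).put("name", "Alice").put("vip", true))

val record = new SinkRecord("orders", 0, null, null, orderSchema, order, 7L)
// new ObjectMessageConverter().convert(record, session, setting) // requires a JMS session and setting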
Example 95
Source File: MapMessageConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.jms.sink.converters

import com.datamountaineer.streamreactor.connect.jms.config.JMSSetting
import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import javax.jms.{MapMessage, Session}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

class MapMessageConverter extends JMSMessageConverter with ConverterUtil {
  override def convert(record: SinkRecord, session: Session, setting: JMSSetting): (String, MapMessage) = {
    val converted = super[ConverterUtil].convert(record, setting.fields, setting.ignoreField)
    val msg = session.createMapMessage()
    val value = converted.value()
    val schema = converted.valueSchema()
    schema.`type`() match {
      case Schema.Type.STRUCT =>
        val struct = value.asInstanceOf[Struct]
        struct.schema().fields().asScala.foreach { f =>
          MapMessageBuilderFn(f.name(), struct.get(f), f.schema(), msg, session)
        }
      case _ => MapMessageBuilderFn("field", value, schema, msg, session)
    }
    (setting.source, msg)
  }
}

object MapMessageBuilderFn {
  def apply(fieldName: String, value: AnyRef, schema: Schema, msg: MapMessage, session: Session): Unit = {
    schema.`type`() match {
      case Schema.Type.BYTES => msg.setBytes(fieldName, value.asInstanceOf[Array[Byte]])
      case Schema.Type.BOOLEAN => msg.setBoolean(fieldName, value.asInstanceOf[Boolean])
      case Schema.Type.FLOAT32 => msg.setFloat(fieldName, value.asInstanceOf[Float])
      case Schema.Type.FLOAT64 => msg.setDouble(fieldName, value.asInstanceOf[Double])
      case Schema.Type.INT8 => msg.setByte(fieldName, value.asInstanceOf[Byte])
      case Schema.Type.INT16 => msg.setShort(fieldName, value.asInstanceOf[Short])
      case Schema.Type.INT32 => msg.setInt(fieldName, value.asInstanceOf[Int])
      case Schema.Type.INT64 => msg.setLong(fieldName, value.asInstanceOf[Long])
      case Schema.Type.STRING => msg.setString(fieldName, value.asInstanceOf[String])
      case Schema.Type.MAP => msg.setObject(fieldName, value)
      case Schema.Type.ARRAY => msg.setObject(fieldName, value)
      case Schema.Type.STRUCT =>
        val nestedMsg = session.createMapMessage()
        val struct = value.asInstanceOf[Struct]
        struct.schema().fields().asScala.foreach { f =>
          MapMessageBuilderFn(f.name(), struct.get(f), f.schema(), nestedMsg, session)
        }
        msg.setObject(fieldName, nestedMsg)
    }
  }
}
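And a flat counterpart for the MapMessage converter (again invented names; the convert call itself needs a JMS session and setting, omitted here). Per the match above, "name" goes through setString, "enabled" through setBoolean, "retries" through setInt and "ratio" through setDouble.

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord

val settingsSchema = SchemaBuilder.struct().name("com.example.Settings")
  .field("name", Schema.STRING_SCHEMA)
  .field("enabled", Schema.BOOLEAN_SCHEMA)
  .field("retries", Schema.INT32_SCHEMA)
  .field("ratio", Schema.FLOAT64_SCHEMA)
  .build()

val settings = new Struct(settingsSchema)
  .put("name", "writer-1")
  .put("enabled", true)
  .put("retries", 3)
  .put("ratio", 0.75)

val record = new SinkRecord("settings", 0, null, null, settingsSchema, settings, 0L)
// new MapMessageConverter().convert(record, session, setting) // requires a JMS session and setting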