org.apache.kafka.connect.data.Struct Scala Examples
The following examples show how to use org.apache.kafka.connect.data.Struct.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
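Before the project examples, here is a minimal, self-contained sketch of the core Struct API that all of them rely on: define a Schema with SchemaBuilder, populate a Struct against it, and read the fields back. This snippet is not taken from any of the projects below; the com.example.User name and its fields are illustrative only.

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object StructBasics extends App {
  // Build a schema with one required and one optional field.
  val schema: Schema = SchemaBuilder.struct().name("com.example.User")
    .field("name", Schema.STRING_SCHEMA)
    .field("age", Schema.OPTIONAL_INT32_SCHEMA)
    .build()

  // Populate a Struct against that schema; validate() throws a DataException
  // if a required field is missing or a value does not match its schema.
  val user = new Struct(schema)
    .put("name", "Alex")
    .put("age", 30)
  user.validate()

  // Read values back with the typed accessors.
  val name: String = user.getString("name")
  val age: Integer = user.getInt32("age")
  println(s"$name is $age")
}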
Example 1
Source File: StructVectorReader.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.orc.vectors

import com.landoop.streamreactor.connect.hive.orc.OrcSchemas
import org.apache.hadoop.hive.ql.exec.vector.{ColumnVector, StructColumnVector}
import org.apache.kafka.connect.data.Struct
import org.apache.orc.TypeDescription

import scala.collection.JavaConverters._

class StructVectorReader(readers: IndexedSeq[OrcVectorReader[_, _]],
                         typeDescription: TypeDescription) extends OrcVectorReader[StructColumnVector, Struct] {

  val schema = OrcSchemas.toKafka(typeDescription)

  override def read(offset: Int, vector: StructColumnVector): Option[Struct] = {
    val struct = new Struct(schema)
    val y = if (vector.isRepeating) 0 else offset
    typeDescription.getFieldNames.asScala.zipWithIndex.foreach { case (name, k) =>
      val fieldReader = readers(k).asInstanceOf[OrcVectorReader[ColumnVector, Any]]
      val fieldVector = vector.fields(k)
      val value = fieldReader.read(y, fieldVector)
      struct.put(name, value.orNull)
    }
    Some(struct)
  }
}
Example 2
Source File: ProjectionMapper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.source.mapper

import cats.data.NonEmptyList
import com.landoop.streamreactor.connect.hive.StructMapper
import com.landoop.streamreactor.connect.hive.source.config.ProjectionField
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}

class ProjectionMapper(projection: NonEmptyList[ProjectionField]) extends StructMapper {

  override def map(input: Struct): Struct = {
    val builder = projection.foldLeft(SchemaBuilder.struct) { (builder, projectionField) =>
      Option(input.schema.field(projectionField.name))
        .fold(sys.error(s"Projection field ${projectionField.name} cannot be found in input")) { field =>
          builder.field(projectionField.alias, field.schema)
        }
    }
    val schema = builder.build()
    projection.foldLeft(new Struct(schema)) { (struct, field) =>
      struct.put(field.alias, input.get(field.name))
    }
  }
}
Example 3
Source File: PartitionValueMapper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.source.mapper

import com.landoop.streamreactor.connect.hive.{Partition, StructMapper}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

import scala.collection.JavaConverters._

class PartitionValueMapper(partition: Partition) extends StructMapper {

  override def map(input: Struct): Struct = {
    val builder = SchemaBuilder.struct()
    input.schema.fields.asScala.foreach { field =>
      builder.field(field.name, field.schema)
    }
    partition.entries.toList.foreach { entry =>
      builder.field(entry._1.value, Schema.STRING_SCHEMA)
    }
    val schema = builder.build()

    val struct = new Struct(schema)
    input.schema.fields.asScala.foreach { field =>
      struct.put(field.name, input.get(field.name))
    }
    partition.entries.toList.foreach { entry =>
      struct.put(entry._1.value, entry._2)
    }
    struct
  }
}
Example 4
Source File: HiveSource.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.source

import com.landoop.streamreactor.connect.hive
import com.landoop.streamreactor.connect.hive._
import com.landoop.streamreactor.connect.hive.formats.{HiveFormat, HiveReader, Record}
import com.landoop.streamreactor.connect.hive.source.config.HiveSourceConfig
import com.landoop.streamreactor.connect.hive.source.mapper.{PartitionValueMapper, ProjectionMapper}
import com.landoop.streamreactor.connect.hive.source.offset.HiveSourceOffsetStorageReader
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.kafka.connect.data.Struct
import org.apache.kafka.connect.source.SourceRecord

import scala.collection.JavaConverters._

class HiveSource(db: DatabaseName,
                 tableName: TableName,
                 topic: Topic,
                 offsetReader: HiveSourceOffsetStorageReader,
                 config: HiveSourceConfig)
                (implicit client: IMetaStoreClient, fs: FileSystem) extends Iterator[SourceRecord] {

  val tableConfig = config.tableOptions.filter(_.tableName == tableName).find(_.topic == topic)
    .getOrElse(sys.error(s"Cannot find table configuration for ${db.value}.${tableName.value} => ${topic.value}"))

  private val table = client.getTable(db.value, tableName.value)
  private val format = HiveFormat(hive.serde(table))
  private val metastoreSchema = HiveSchemas.toKafka(table)
  private val parts = TableFileScanner.scan(db, tableName)

  private val readers = parts.map { case (path, partition) =>

    val fns: Seq[Struct => Struct] = Seq(
      partition.map(new PartitionValueMapper(_).map _),
      tableConfig.projection.map(new ProjectionMapper(_).map _)
    ).flatten
    val mapper: Struct => Struct = Function.chain(fns)

    val sourceOffset = offsetReader.offset(SourcePartition(db, tableName, topic, path)).getOrElse(SourceOffset(0))

    new HiveReader {
      lazy val reader = format.reader(path, sourceOffset.rowNumber, metastoreSchema)

      override def iterator: Iterator[Record] = reader.iterator.map { record =>
        Record(mapper(record.struct), record.path, record.offset)
      }

      override def close(): Unit = reader.close()
    }
  }

  private val iterator: Iterator[Record] = readers.map(_.iterator).reduce(_ ++ _).take(tableConfig.limit)

  override def hasNext: Boolean = iterator.hasNext

  override def next(): SourceRecord = {
    val record = iterator.next
    val sourcePartition = SourcePartition(db, tableName, topic, record.path)
    val offset = SourceOffset(record.offset)

    new SourceRecord(
      fromSourcePartition(sourcePartition).asJava,
      fromSourceOffset(offset).asJava,
      topic.value,
      record.struct.schema,
      record.struct
    )
  }

  def close(): Unit = {
    readers.foreach(_.close())
  }
}
Example 5
Source File: ProjectionMapper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.mapper

import cats.data.NonEmptyList
import com.datamountaineer.kcql.Field
import com.landoop.streamreactor.connect.hive.StructMapper
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}

class ProjectionMapper(projection: NonEmptyList[Field]) extends StructMapper {

  override def map(input: Struct): Struct = {
    // the compatible output schema built from projected fields with aliases applied
    val builder = projection.foldLeft(SchemaBuilder.struct) { (builder, kcqlField) =>
      Option(input.schema.field(kcqlField.getName)).fold(sys.error(s"Missing field $kcqlField")) { field =>
        builder.field(kcqlField.getAlias, field.schema)
      }
    }
    val schema = builder.build()
    projection.foldLeft(new Struct(schema)) { (struct, field) =>
      struct.put(field.getAlias, input.get(field.getName))
    }
  }
}
Example 6
Source File: MetastoreSchemaAlignMapper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.mapper

import com.landoop.streamreactor.connect.hive.StructMapper
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.util.Try

class MetastoreSchemaAlignMapper(schema: Schema) extends StructMapper {

  import scala.collection.JavaConverters._

  override def map(input: Struct): Struct = {
    //hive converts everything to lowercase
    val inputFieldsMapping = input.schema().fields().asScala.map { f =>
      f.name().toLowerCase() -> f.name()
    }.toMap

    val struct = schema.fields.asScala.foldLeft(new Struct(schema)) { (struct, field) =>
      Try(input.get(inputFieldsMapping(field.name))).toOption match {
        case Some(value) => struct.put(field.name, value)
        case None if field.schema.isOptional => struct.put(field.name, null)
        case None => sys.error(s"Cannot map struct to required schema; ${field.name} is missing, no default value has been supplied and null is not permitted")
      }
    }
    struct
  }
}
Example 7
Source File: DropPartitionValuesMapper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.mapper

import com.landoop.streamreactor.connect.hive.{PartitionPlan, StructMapper}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}

class DropPartitionValuesMapper(plan: PartitionPlan) extends StructMapper {

  import scala.collection.JavaConverters._

  override def map(input: Struct): Struct = {
    val partitionKeys = plan.keys.map(_.value).toList
    val dataFields = input.schema.fields().asScala.filterNot(field => partitionKeys.contains(field.name))

    val builder = dataFields.foldLeft(SchemaBuilder.struct) { (builder, field) =>
      builder.field(field.name, field.schema)
    }
    val schema = builder.build()
    dataFields.foldLeft(new Struct(schema)) { (struct, field) =>
      struct.put(field.name, input.get(field.name))
    }
  }
}
Example 8
Source File: ValueConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

object ValueConverter {
  def apply(record: SinkRecord): Struct = record.value match {
    case struct: Struct => StructValueConverter.convert(struct)
    case map: Map[_, _] => MapValueConverter.convert(map)
    case map: java.util.Map[_, _] => MapValueConverter.convert(map.asScala.toMap)
    case string: String => StringValueConverter.convert(string)
    case other => sys.error(s"Unsupported record $other:${other.getClass.getCanonicalName}")
  }
}

trait ValueConverter[T] {
  def convert(value: T): Struct
}

object StructValueConverter extends ValueConverter[Struct] {
  override def convert(struct: Struct): Struct = struct
}

object MapValueConverter extends ValueConverter[Map[_, _]] {
  def convertValue(value: Any, key: String, builder: SchemaBuilder): Any = {
    value match {
      case s: String =>
        builder.field(key, Schema.OPTIONAL_STRING_SCHEMA)
        s
      case l: Long =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        l
      case i: Int =>
        builder.field(key, Schema.OPTIONAL_INT64_SCHEMA)
        i.toLong
      case b: Boolean =>
        builder.field(key, Schema.OPTIONAL_BOOLEAN_SCHEMA)
        b
      case f: Float =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        f.toDouble
      case d: Double =>
        builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA)
        d
      case innerMap: java.util.Map[_, _] =>
        val innerStruct = convert(innerMap.asScala.toMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct
      case innerMap: Map[_, _] =>
        val innerStruct = convert(innerMap, true)
        builder.field(key, innerStruct.schema())
        innerStruct
    }
  }

  def convert(map: Map[_, _], optional: Boolean) = {
    val builder = SchemaBuilder.struct()
    val values = map.map { case (k, v) =>
      val key = k.toString
      val value = convertValue(v, key, builder)
      key -> value
    }.toList

    if (optional) builder.optional()
    val schema = builder.build
    val struct = new Struct(schema)
    values.foreach { case (key, value) =>
      struct.put(key.toString, value)
    }
    struct
  }

  override def convert(map: Map[_, _]): Struct = convert(map, false)
}

object StringValueConverter extends ValueConverter[String] {
  override def convert(string: String): Struct = {
    val schema = SchemaBuilder.struct().field("a", Schema.OPTIONAL_STRING_SCHEMA).name("struct").build()
    new Struct(schema).put("a", string)
  }
}
Example 9
Source File: HiveSinkState.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink

import com.landoop.streamreactor.connect.hive
import com.landoop.streamreactor.connect.hive._
import com.landoop.streamreactor.connect.hive.sink.config.TableOptions
import com.landoop.streamreactor.connect.hive.sink.mapper.{DropPartitionValuesMapper, MetastoreSchemaAlignMapper, ProjectionMapper}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.hadoop.hive.metastore.api.Table
import org.apache.kafka.connect.data.{Schema, Struct}

case class HiveSinkState(offsets: Map[TopicPartition, Offset],
                         committedOffsets: Map[TopicPartition, Offset],
                         table: Table,
                         tableLocation: Path,
                         plan: Option[PartitionPlan],
                         metastoreSchema: Schema,
                         mapper: Struct => Struct,
                         lastSchema: Schema) {

  def withTopicPartitionOffset(tpo: TopicPartitionOffset): HiveSinkState = {
    copy(offsets = offsets + (tpo.toTopicPartition -> tpo.offset))
  }

  def withTopicPartitionOffset(tp: TopicPartition, offset: Offset): HiveSinkState = {
    copy(offsets = offsets + (tp -> offset))
  }

  def withCommittedOffset(offsets: Map[TopicPartition, Offset]): HiveSinkState = {
    copy(committedOffsets = committedOffsets ++ offsets)
  }

  def withCommittedOffset(tp: TopicPartition, offset: Offset): HiveSinkState = {
    copy(committedOffsets = committedOffsets + (tp -> offset))
  }

  def withLastSchema(schema: Schema): HiveSinkState = copy(lastSchema = schema)
}

object HiveSinkState {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  def from(schema: Schema,
           table: TableOptions,
           dbName: DatabaseName)(implicit client: IMetaStoreClient, fs: FileSystem) = {
    logger.info(s"Init sink for schema $schema")

    val hiveTable = getOrCreateTable(table, dbName, schema)
    val tableLocation = new Path(hiveTable.getSd.getLocation)
    val plan = hive.partitionPlan(hiveTable)

    val metastoreSchema = table.evolutionPolicy
      .evolve(dbName, table.tableName, HiveSchemas.toKafka(hiveTable), schema)
      .getOrElse(sys.error(s"Unable to retrieve or evolve schema for $schema"))

    val mapperFns: Seq[Struct => Struct] = Seq(
      table.projection.map(new ProjectionMapper(_)),
      Some(new MetastoreSchemaAlignMapper(metastoreSchema)),
      plan.map(new DropPartitionValuesMapper(_))
    ).flatten.map(mapper => mapper.map _)

    val mapper = Function.chain(mapperFns)

    HiveSinkState(Map.empty, Map.empty, hiveTable, tableLocation, plan, metastoreSchema, mapper, schema)
  }

  def getOrCreateTable(table: TableOptions, dbName: DatabaseName, schema: Schema)
                      (implicit client: IMetaStoreClient, fs: FileSystem): Table = {

    def create: Table = {
      val partstring = if (table.partitions.isEmpty) "<no-partitions>" else table.partitions.mkString(",")
      logger.info(s"Creating table in hive [${dbName.value}.${table.tableName.value}, partitions=$partstring]")
      hive.createTable(dbName, table.tableName, schema, table.partitions, table.location, table.format)
    }

    logger.debug(s"Fetching or creating table ${dbName.value}.${table.tableName.value}")
    client.tableExists(dbName.value, table.tableName.value) match {
      case true if table.overwriteTable =>
        hive.dropTable(dbName, table.tableName, true)
        create
      case true => client.getTable(dbName.value, table.tableName.value)
      case false if table.createTable => create
      case false => throw new RuntimeException(s"Table ${dbName.value}.${table.tableName.value} does not exist")
    }
  }
}
Example 10
Source File: OrcSink.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.orc

import com.landoop.streamreactor.connect.hive.orc.vectors.{OrcVectorWriter, StructVectorWriter}
import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, StructUtils}
import com.typesafe.scalalogging.StrictLogging
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.collection.JavaConverters._

class OrcSink(path: Path,
              schema: Schema,
              config: OrcSinkConfig)(implicit fs: FileSystem) extends StrictLogging {

  private val typeDescription = OrcSchemas.toOrc(schema)
  private val structWriter = new StructVectorWriter(typeDescription.getChildren.asScala.map(OrcVectorWriter.fromSchema))
  private val batch = typeDescription.createRowBatch(config.batchSize)
  private val vector = new StructColumnVector(batch.numCols, batch.cols: _*)
  private val orcWriter = createOrcWriter(path, typeDescription, config)
  private var n = 0

  def flush(): Unit = {
    logger.debug(s"Writing orc batch [size=$n, path=$path]")
    batch.size = n
    orcWriter.addRowBatch(batch)
    orcWriter.writeIntermediateFooter
    batch.reset()
    n = 0
  }

  def write(struct: Struct): Unit = {
    structWriter.write(vector, n, Some(StructUtils.extractValues(struct)))
    n = n + 1
    if (n == config.batchSize) flush()
  }

  def close(): Unit = {
    if (n > 0) flush()
    orcWriter.close()
  }
}
Example 11
Source File: OrcSource.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.orc

import com.landoop.streamreactor.connect.hive.OrcSourceConfig
import com.landoop.streamreactor.connect.hive.orc.vectors.OrcVectorReader.fromSchema
import com.landoop.streamreactor.connect.hive.orc.vectors.StructVectorReader
import com.typesafe.scalalogging.StrictLogging
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.ql.exec.vector.{StructColumnVector, VectorizedRowBatch}
import org.apache.kafka.connect.data.Struct
import org.apache.orc.OrcFile.ReaderOptions
import org.apache.orc.{OrcFile, Reader}

import scala.collection.JavaConverters._

class OrcSource(path: Path, config: OrcSourceConfig)(implicit fs: FileSystem) extends StrictLogging {

  private val reader = OrcFile.createReader(path, new ReaderOptions(fs.getConf))

  private val typeDescription = reader.getSchema
  private val schema = OrcSchemas.toKafka(typeDescription)

  private val readers = typeDescription.getChildren.asScala.map(fromSchema)
  private val vectorReader = new StructVectorReader(readers.toIndexedSeq, typeDescription)

  private val batch = typeDescription.createRowBatch()
  private val recordReader = reader.rows(new Reader.Options())

  def close(): Unit = {
    recordReader.close()
  }

  def iterator: Iterator[Struct] = new Iterator[Struct] {
    var iter = new BatchIterator(batch)

    override def hasNext: Boolean = iter.hasNext || {
      batch.reset()
      recordReader.nextBatch(batch)
      iter = new BatchIterator(batch)
      !batch.endOfFile && batch.size > 0 && iter.hasNext
    }

    override def next(): Struct = iter.next()
  }

  // iterates over a batch, be careful not to mutate the batch while it is being iterated
  class BatchIterator(batch: VectorizedRowBatch) extends Iterator[Struct] {
    var offset = 0
    val vector = new StructColumnVector(batch.numCols, batch.cols: _*)

    override def hasNext: Boolean = offset < batch.size

    override def next(): Struct = {
      val struct = vectorReader.read(offset, vector)
      offset = offset + 1
      struct.orNull
    }
  }
}
Example 12
Source File: RootGroupConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet

import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.io.api.{Converter, GroupConverter}

import scala.collection.JavaConverters._

class RootGroupConverter(schema: Schema) extends GroupConverter with StrictLogging {
  require(schema.`type`() == Schema.Type.STRUCT)

  var struct: Struct = _
  private val builder = scala.collection.mutable.Map.empty[String, Any]
  private val converters = schema.fields.asScala.map(Converters.get(_, builder)).toIndexedSeq

  override def getConverter(k: Int): Converter = converters(k)

  override def start(): Unit = builder.clear()

  override def end(): Unit = struct = {
    val struct = new Struct(schema)
    schema.fields.asScala.map { field =>
      val value = builder.getOrElse(field.name, null)
      try {
        struct.put(field, value)
      } catch {
        case t: Exception => throw t
      }
    }
    struct
  }
}
Example 13
Source File: OrcHiveFormat.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, OrcSourceConfig, Serde}
import com.landoop.streamreactor.connect.hive.orc.OrcSink
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}

import scala.util.Try

object OrcHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.orc.OrcSerde",
    "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
    "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat",
    Map("org.apache.hadoop.hive.ql.io.orc.OrcSerde" -> "1")
  )

  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {

    logger.debug(s"Creating orc writer at $path")

    val sink: OrcSink = com.landoop.streamreactor.connect.hive.orc.sink(path, schema, OrcSinkConfig(overwrite = true))
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTimestamp: Long = System.currentTimeMillis()
    var lastKnownFileSize: Long = fs.getFileStatus(path).getLen
    var readFileSize = false
    var count = 0

    override def write(struct: Struct): Long = {
      sink.write(struct)
      count = count + 1
      readFileSize = true
      count
    }

    override def close(): Unit = {
      logger.debug(s"Closing orc writer at path $path")
      sink.close()
    }

    override def file: Path = path

    override def currentCount: Long = count

    override def createdTime: Long = createdTimestamp

    override def fileSize: Long = {
      if (readFileSize) {
        lastKnownFileSize = fs.getFileStatus(path).getLen
        readFileSize = false
      }
      lastKnownFileSize
    }
  }

  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {

    logger.debug(s"Creating orc reader for $path with offset $startAt")
    val reader = com.landoop.streamreactor.connect.hive.orc.source(path, OrcSourceConfig())
    var offset = startAt

    override def iterator: Iterator[Record] = reader.iterator.map { struct =>
      val record = Record(struct, path, offset)
      offset = offset + 1
      record
    }

    override def close(): Unit = reader.close()
  }
}
Example 14
Source File: ParquetHiveFormat.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.Serde
import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.ParquetWriter

import scala.util.Try

object ParquetHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
    Map("serialization.format" -> "1")
  )

  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {
    logger.debug(s"Creating parquet writer at $path")

    val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTimestamp: Long = System.currentTimeMillis()
    var lastKnownFileSize: Long = fs.getFileStatus(path).getLen
    var readFileSize = false
    var count = 0

    override def write(struct: Struct): Long = {
      writer.write(struct)
      count = count + 1
      readFileSize = true
      count
    }

    override def close(): Unit = {
      logger.debug(s"Closing writer at path $path")
      writer.close()
    }

    override def currentCount: Long = count

    override def file: Path = path

    override def createdTime: Long = createdTimestamp

    override def fileSize: Long = {
      if (readFileSize) {
        lastKnownFileSize = fs.getFileStatus(path).getLen
        readFileSize = false
      }
      lastKnownFileSize
    }
  }

  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {
    logger.debug(s"Creating parquet reader for $path with offset $startAt")
    val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path)
    var offset = startAt

    override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct =>
      val record = Record(struct, path, offset)
      offset = offset + 1
      record
    }

    override def close(): Unit = reader.close()
  }
}
Example 15
Source File: RedisStreamTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.redis.sink.writer

/*
 * Copyright 2017 Datamountaineer.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util

import com.datamountaineer.streamreactor.connect.redis.sink.RedisSinkTask
import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.{Jedis, StreamEntryID}

import scala.collection.JavaConverters._

class RedisStreamTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {
  //
  //  val redisServer = new RedisServer(6379)
  //
  //  override def beforeAll() = redisServer.start()
  //
  //  override def afterAll() = redisServer.stop()

  "Redis Stream writer" should {

    "write Kafka records to a Redis Stream" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"INSERT INTO stream1 SELECT * from $TOPIC STOREAS STREAM"
      println("Testing KCQL : " + KCQL)
      val props = Map(
        RedisConfigConstants.REDIS_HOST -> "localhost",
        RedisConfigConstants.REDIS_PORT -> "6379",
        RedisConfigConstants.KCQL_CONFIG -> KCQL,
        RedisConfigConstants.REDIS_PASSWORD -> ""
      ).asJava

      val config = RedisConfig(props)
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisStreams(settings)

      val schema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA).build()

      val struct1 = new Struct(schema).put("type", "Xeon").put("temperature", 60.4).put("voltage", 90.1).put("ts", 1482180657010L)

      val sinkRecord1 = new SinkRecord(TOPIC, 0, null, null, schema, struct1, 1)

      val jedis = mock[Jedis]
      writer.jedis = jedis

      val map = new util.HashMap[String, String]()
      map.put("type", "Xeon")
      map.put("temperature", "60.4")
      map.put("voltage", "90.1")
      map.put("ts", 1482180657010L.toString)

      when(jedis.auth("")).isLenient()
      when(jedis.xadd("stream1", null, map)).thenReturn(mock[StreamEntryID])

      writer.initialize(1, settings.errorPolicy)
      writer.write(Seq(sinkRecord1))
    }
  }
}
Example 16
Source File: RedisPubSubTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.{Jedis, JedisPubSub}
import redis.embedded.RedisServer

import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer

class RedisPubSubTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {

  val redisServer = new RedisServer(6379)

  override def beforeAll() = redisServer.start()

  override def afterAll() = redisServer.stop()

  "Redis PUBSUB writer" should {

    "write Kafka records to a Redis PubSub" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"SELECT * from $TOPIC STOREAS PubSub (channel=type)"
      println("Testing KCQL : " + KCQL)
      val props = Map(
        RedisConfigConstants.REDIS_HOST -> "localhost",
        RedisConfigConstants.REDIS_PORT -> "6379",
        RedisConfigConstants.KCQL_CONFIG -> KCQL
      ).asJava

      val config = RedisConfig(props)
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisPubSub(settings)
      writer.createClient(settings)

      val schema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA).build()

      val struct1 = new Struct(schema).put("type", "Xeon").put("temperature", 60.4).put("voltage", 90.1).put("ts", 1482180657010L)
      val struct2 = new Struct(schema).put("type", "i7").put("temperature", 62.1).put("voltage", 103.3).put("ts", 1482180657020L)
      val struct3 = new Struct(schema).put("type", "i7-i").put("temperature", 64.5).put("voltage", 101.1).put("ts", 1482180657030L)

      val sinkRecord1 = new SinkRecord(TOPIC, 0, null, null, schema, struct1, 1)
      val sinkRecord2 = new SinkRecord(TOPIC, 0, null, null, schema, struct2, 2)
      val sinkRecord3 = new SinkRecord(TOPIC, 0, null, null, schema, struct3, 3)

      val jedis = new Jedis(connectionInfo.host, connectionInfo.port)
      // Clean up in-memory jedis
      jedis.flushAll()

      val messagesMap = collection.mutable.Map[String, ListBuffer[String]]()

      val t = new Thread {
        private val pubsub = new JedisPubSub {
          override def onMessage(channel: String, message: String): Unit = {
            messagesMap.get(channel) match {
              case Some(msgs) => messagesMap.put(channel, msgs += message)
              case None => messagesMap.put(channel, ListBuffer(message))
            }
          }
        }

        override def run(): Unit = {
          jedis.subscribe(pubsub, "Xeon", "i7", "i7-i")
        }

        override def interrupt(): Unit = {
          pubsub.punsubscribe("*")
          super.interrupt()
        }
      }
      t.start()
      t.join(5000)
      if (t.isAlive) t.interrupt()

      writer.write(Seq(sinkRecord1))
      writer.write(Seq(sinkRecord2, sinkRecord3))

      messagesMap.size shouldBe 3
      messagesMap("Xeon").head shouldBe """{"type":"Xeon","temperature":60.4,"voltage":90.1,"ts":1482180657010}"""
      messagesMap("i7").head shouldBe """{"type":"i7","temperature":62.1,"voltage":103.3,"ts":1482180657020}"""
      messagesMap("i7-i").head shouldBe """{"type":"i7-i","temperature":64.5,"voltage":101.1,"ts":1482180657030}"""
    }
  }
}
Example 17
Source File: RedisInsertSortedSetTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisConfig, RedisConfigConstants, RedisConnectionInfo, RedisSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec
import redis.clients.jedis.Jedis
import redis.embedded.RedisServer

import scala.collection.JavaConverters._

class RedisInsertSortedSetTest extends AnyWordSpec with Matchers with BeforeAndAfterAll with MockitoSugar {

  val redisServer = new RedisServer(6379)

  override def beforeAll() = redisServer.start()

  override def afterAll() = redisServer.stop()

  "Redis INSERT into Sorted Set (SS) writer" should {

    "write Kafka records to a Redis Sorted Set" in {

      val TOPIC = "cpuTopic"
      val KCQL = s"INSERT INTO cpu_stats SELECT * from $TOPIC STOREAS SortedSet(score=ts)"
      println("Testing KCQL : " + KCQL)
      val props = Map(
        RedisConfigConstants.REDIS_HOST -> "localhost",
        RedisConfigConstants.REDIS_PORT -> "6379",
        RedisConfigConstants.KCQL_CONFIG -> KCQL
      ).asJava

      val config = RedisConfig(props)
      val connectionInfo = new RedisConnectionInfo("localhost", 6379, None)
      val settings = RedisSinkSettings(config)
      val writer = new RedisInsertSortedSet(settings)
      writer.createClient(settings)

      val schema = SchemaBuilder.struct().name("com.example.Cpu")
        .field("type", Schema.STRING_SCHEMA)
        .field("temperature", Schema.FLOAT64_SCHEMA)
        .field("voltage", Schema.FLOAT64_SCHEMA)
        .field("ts", Schema.INT64_SCHEMA).build()

      val struct1 = new Struct(schema).put("type", "Xeon").put("temperature", 60.4).put("voltage", 90.1).put("ts", 1482180657010L)
      val struct2 = new Struct(schema).put("type", "i7").put("temperature", 62.1).put("voltage", 103.3).put("ts", 1482180657020L)
      val struct3 = new Struct(schema).put("type", "i7-i").put("temperature", 64.5).put("voltage", 101.1).put("ts", 1482180657030L)

      val sinkRecord1 = new SinkRecord(TOPIC, 0, null, null, schema, struct1, 1)
      val sinkRecord2 = new SinkRecord(TOPIC, 0, null, null, schema, struct2, 2)
      val sinkRecord3 = new SinkRecord(TOPIC, 0, null, null, schema, struct3, 3)

      val jedis = new Jedis(connectionInfo.host, connectionInfo.port)
      // Clean up in-memory jedis
      jedis.flushAll()

      writer.write(Seq(sinkRecord1))
      writer.write(Seq(sinkRecord2, sinkRecord3))

      // Redis cardinality should now be 3
      jedis.zcard("cpu_stats") shouldBe 3

      val allSSrecords = jedis.zrange("cpu_stats", 0, 999999999999L)
      val results = allSSrecords.asScala.toList
      results.head shouldBe """{"type":"Xeon","temperature":60.4,"voltage":90.1,"ts":1482180657010}"""
      results(1) shouldBe """{"type":"i7","temperature":62.1,"voltage":103.3,"ts":1482180657020}"""
      results(2) shouldBe """{"type":"i7-i","temperature":64.5,"voltage":101.1,"ts":1482180657030}"""
    }
  }
}
Example 18
Source File: RedisFieldsKeyBuilder.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.streamreactor.connect.rowkeys.StringKeyBuilder
import org.apache.kafka.connect.data.{Field, Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.annotation.tailrec
import scala.collection.JavaConverters._

// Note: the enclosing class declaration was truncated in this excerpt; the signature below is a
// plausible reconstruction (the body requires `keys` and `pkDelimiter` to be in scope).
class RedisFieldsKeyBuilder(keys: Seq[String], pkDelimiter: String) extends StringKeyBuilder {

  override def build(record: SinkRecord): String = {
    val struct: Struct = record.value.asInstanceOf[Struct]
    val schema: Schema = struct.schema

    def extractAvailableFieldNames(schema: Schema): Seq[String] = {
      if (schema.`type` == Schema.Type.STRUCT) {
        val fields = schema.fields
        fields.asScala.map(_.name) ++ fields.asScala.flatMap { f =>
          extractAvailableFieldNames(f.schema).map(name => f.name + "." + name)
        }
      } else Seq.empty
    }

    val availableFields = extractAvailableFieldNames(schema)
    val missingKeys = keys.filterNot(availableFields.contains)
    require(
      missingKeys.isEmpty,
      s"${missingKeys.mkString(",")} keys are not present in the SinkRecord payload: ${availableFields.mkString(", ")}"
    )

    def getValue(key: String): AnyRef = {
      @tailrec
      def findValue(keyParts: List[String], obj: AnyRef): Option[AnyRef] = (obj, keyParts) match {
        case (f: Field, k :: tail) => findValue(tail, f.schema.field(k))
        case (s: Struct, k :: tail) => findValue(tail, s.get(k))
        case (v, _) => Option(v)
      }

      findValue(key.split('.').toList, struct).getOrElse {
        throw new IllegalArgumentException(
          s"$key field value is null. Non null value is required for the fields creating the row key"
        )
      }
    }

    keys.map(getValue).mkString(pkDelimiter)
  }
}
Example 19
Source File: RedisGeoAdd.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.redis.sink.writer

import com.datamountaineer.kcql.Kcql
import com.datamountaineer.streamreactor.connect.redis.sink.config.{RedisKCQLSetting, RedisSinkSettings}
import com.datamountaineer.streamreactor.connect.schemas.StructFieldsExtractor
import org.apache.kafka.connect.data.Struct
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._
import scala.util.Try
import scala.util.control.Exception.allCatch

class RedisGeoAdd(sinkSettings: RedisSinkSettings) extends RedisWriter with GeoAddSupport {

  val configs: Set[Kcql] = sinkSettings.kcqlSettings.map(_.kcqlConfig)
  configs.foreach { c =>
    assert(c.getSource.trim.length > 0, "You need to supply a valid source kafka topic to fetch records from. Review your KCQL syntax")
    assert(c.getPrimaryKeys.asScala.length >= 1, "The Redis GeoAdd mode requires at least 1 PK (Primary Key) to be defined")
    assert(c.getStoredAs.equalsIgnoreCase("GeoAdd"), "The Redis GeoAdd mode requires the KCQL syntax: STOREAS GeoAdd")
  }

  // Write a sequence of SinkRecords to Redis
  override def write(records: Seq[SinkRecord]): Unit = {
    if (records.isEmpty)
      logger.debug("No records received on 'GeoAdd' Redis writer")
    else {
      logger.debug(s"'GeoAdd' Redis writer received ${records.size} records")
      insert(records.groupBy(_.topic))
    }
  }

  // Insert a batch of sink records
  def insert(records: Map[String, Seq[SinkRecord]]): Unit = {
    records.foreach {
      case (topic, sinkRecords: Seq[SinkRecord]) => {
        val topicSettings: Set[RedisKCQLSetting] = sinkSettings.kcqlSettings.filter(_.kcqlConfig.getSource == topic)
        if (topicSettings.isEmpty)
          logger.warn(s"Received a batch for topic $topic - but no KCQL supports it")

        //pass try to error handler and try
        val t = Try {
          sinkRecords.foreach { record =>
            topicSettings.map { KCQL =>
              val extractor = StructFieldsExtractor(includeAllFields = false, KCQL.kcqlConfig.getPrimaryKeys.asScala.map(f => f.getName -> f.getName).toMap)
              val fieldsAndValues = extractor.get(record.value.asInstanceOf[Struct]).toMap
              val pkValue = KCQL.kcqlConfig.getPrimaryKeys.asScala.map(pk => fieldsAndValues(pk.getName).toString).mkString(":")

              // Use the target (and optionally the prefix) to name the GeoAdd key
              val optionalPrefix = if (Option(KCQL.kcqlConfig.getTarget).isEmpty) "" else KCQL.kcqlConfig.getTarget.trim
              val key = optionalPrefix + pkValue

              val recordToSink = convert(record, fields = KCQL.fieldsAndAliases, ignoreFields = KCQL.ignoredFields)
              val payload = convertValueToJson(recordToSink)

              val longitudeField = getLongitudeField(KCQL.kcqlConfig)
              val latitudeField = getLatitudeField(KCQL.kcqlConfig)
              val longitude = getFieldValue(record, longitudeField)
              val latitude = getFieldValue(record, latitudeField)

              if (isDoubleNumber(longitude) && isDoubleNumber(latitude)) {
                logger.debug(s"GEOADD $key longitude=$longitude latitude=$latitude payload = ${payload.toString}")
                val response = jedis.geoadd(key, longitude.toDouble, latitude.toDouble, payload.toString)

                if (response == 1) {
                  logger.debug("New element added")
                } else if (response == 0)
                  logger.debug("The element was already a member of the sorted set and the score was updated")
                response
              } else {
                logger.warn(s"GeoAdd record contains invalid longitude=$longitude and latitude=$latitude values, " +
                  s"Record with key ${record.key} is skipped")
                None
              }
            }
          }
        }
        handleTry(t)
      }
      logger.debug(s"Wrote ${sinkRecords.size} rows for topic $topic")
    }
  }

  def getFieldValue(record: SinkRecord, fieldName: String): String = {
    val struct = record.value().asInstanceOf[Struct]
    struct.getString(fieldName)
  }

  def isDoubleNumber(s: String): Boolean = (allCatch opt s.toDouble).isDefined
}
Example 20
Source File: PulsarWriterTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.pulsar.sink

import com.datamountaineer.streamreactor.connect.pulsar.ProducerConfigFactory
import com.datamountaineer.streamreactor.connect.pulsar.config.{PulsarConfigConstants, PulsarSinkConfig, PulsarSinkSettings}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.pulsar.client.api.{Message, MessageId, Producer, PulsarClient}
import org.mockito.ArgumentMatchers.any
import org.mockito.MockitoSugar
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._

class PulsarWriterTest extends AnyWordSpec with MockitoSugar with Matchers {

  val pulsarTopic = "persistent://landoop/standalone/connect/kafka-topic"

  def getSchema: Schema = {
    SchemaBuilder.struct
      .field("int8", SchemaBuilder.int8().defaultValue(2.toByte).doc("int8 field").build())
      .field("int16", Schema.INT16_SCHEMA)
      .field("int32", Schema.INT32_SCHEMA)
      .field("int64", Schema.INT64_SCHEMA)
      .field("float32", Schema.FLOAT32_SCHEMA)
      .field("float64", Schema.FLOAT64_SCHEMA)
      .field("boolean", Schema.BOOLEAN_SCHEMA)
      .field("string", Schema.STRING_SCHEMA)
      .build()
  }

  def getStruct(schema: Schema): Struct = {
    new Struct(schema)
      .put("int8", 12.toByte)
      .put("int16", 12.toShort)
      .put("int32", 12)
      .put("int64", 12L)
      .put("float32", 12.2f)
      .put("float64", 12.2)
      .put("boolean", true)
      .put("string", "foo")
  }

  "should write messages" in {

    val config = PulsarSinkConfig(Map(
      PulsarConfigConstants.HOSTS_CONFIG -> "pulsar://localhost:6650",
      PulsarConfigConstants.KCQL_CONFIG -> s"INSERT INTO $pulsarTopic SELECT * FROM kafka_topic BATCH = 10 WITHPARTITIONER = SinglePartition WITHCOMPRESSION = ZLIB WITHDELAY = 1000"
    ).asJava)

    val schema = getSchema
    val struct = getStruct(schema)
    val record1 = new SinkRecord("kafka_topic", 0, null, null, schema, struct, 1)

    val settings = PulsarSinkSettings(config)
    val producerConfig = ProducerConfigFactory("test", settings.kcql)

    val client = mock[PulsarClient]
    val producer = mock[Producer]
    val messageId = mock[MessageId]

    when(client.createProducer(pulsarTopic, producerConfig(pulsarTopic))).thenReturn(producer)
    when(producer.send(any[Message])).thenReturn(messageId)

    val writer = PulsarWriter(client, "test", settings)
    writer.write(List(record1))
  }
}
Example 21
Source File: ChangeFeedStructBuilder.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.rethink.source

import com.fasterxml.jackson.databind.ObjectMapper
import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object ChangeFeedStructBuilder extends StrictLogging {

  val mapper = new ObjectMapper()

  val oldVal = "old_val"
  val newVal = "new_val"
  val state = "state"
  val `type` = "type"

  val schema: Schema = SchemaBuilder.struct.name("ReThinkChangeFeed")
    .version(1)
    .field(state, Schema.OPTIONAL_STRING_SCHEMA)
    .field(oldVal, Schema.OPTIONAL_STRING_SCHEMA)
    .field(newVal, Schema.OPTIONAL_STRING_SCHEMA)
    .field(`type`, Schema.OPTIONAL_STRING_SCHEMA)
    .build

  def apply(hm: Map[String, Object]): Struct = {
    val struct = new Struct(schema)
    hm.foreach({ case (k, v) => if (v != null) struct.put(k, v.toString) })
    struct
  }
}
Example 22
Source File: ParquetWriterTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet

import com.landoop.streamreactor.connect.hive.StructUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class ParquetWriterTest extends AnyWordSpec with Matchers {

  implicit val conf = new Configuration()
  implicit val fs = FileSystem.getLocal(conf)

  "ParquetWriter" should {
    "write parquet files" in {

      val schema = SchemaBuilder.struct()
        .field("name", SchemaBuilder.string().required().build())
        .field("title", SchemaBuilder.string().optional().build())
        .field("salary", SchemaBuilder.float64().optional().build())
        .build()

      val users = List(
        new Struct(schema).put("name", "sam").put("title", "mr").put("salary", 100.43),
        new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06)
      )

      val path = new Path("sinktest.parquet")

      val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
      users.foreach(writer.write)
      writer.close()

      val reader = parquetReader(path)
      val actual = Iterator.continually(reader.read).takeWhile(_ != null).toList
      reader.close()

      actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues)

      fs.delete(path, false)
    }

    "support writing nulls" in {

      val schema = SchemaBuilder.struct()
        .field("name", SchemaBuilder.string().required().build())
        .field("title", SchemaBuilder.string().optional().build())
        .field("salary", SchemaBuilder.float64().optional().build())
        .build()

      val users = List(
        new Struct(schema).put("name", "sam").put("title", null).put("salary", 100.43),
        new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06)
      )

      val path = new Path("sinktest.parquet")

      val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
      users.foreach(writer.write)
      writer.close()

      val reader = parquetReader(path)
      val actual = Iterator.continually(reader.read).takeWhile(_ != null).toList
      reader.close()

      actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues)

      fs.delete(path, false)
    }
  }
}
Example 23
Source File: VoltDbWriter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.voltdb.writers

import com.datamountaineer.streamreactor.connect.errors.ErrorHandler
import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import com.datamountaineer.streamreactor.connect.sink.DbWriter
import com.datamountaineer.streamreactor.connect.voltdb.config.VoltSettings
import com.typesafe.scalalogging.StrictLogging
import org.apache.kafka.connect.data.Struct
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException
import org.voltdb.client.{ClientConfig, ClientFactory}

import scala.util.Try

class VoltDbWriter(settings: VoltSettings) extends DbWriter with StrictLogging with ConverterUtil with ErrorHandler {

  //ValidateStringParameterFn(settings.servers, "settings")
  //ValidateStringParameterFn(settings.user, "settings")

  //initialize error tracker
  initialize(settings.maxRetries, settings.errorPolicy)

  private val voltConfig = new ClientConfig(settings.user, settings.password)
  private val client = ClientFactory.createClient(voltConfig)
  VoltConnectionConnectFn(client, settings)

  private val proceduresMap = settings.fieldsExtractorMap.values.map { extract =>
    val procName = s"${extract.targetTable}.${if (extract.isUpsert) "upsert" else "insert"}"
    logger.info(s"Retrieving the metadata for $procName ...")
    val fields = VoltDbMetadataReader.getProcedureParameters(client, extract.targetTable).map(_.toUpperCase)
    logger.info(s"$procName expected arguments are: ${fields.mkString(",")}")
    extract.targetTable -> ProcAndFields(procName, fields)
  }.toMap

  override def write(records: Seq[SinkRecord]): Unit = {
    if (records.isEmpty) {
      logger.debug("No records received.")
    } else {
      val t = Try(records.withFilter(_.value() != null).foreach(insert))
      t.foreach(_ => logger.info("Writing complete"))
      handleTry(t)
    }
  }

  private def insert(record: SinkRecord) = {
    require(record.value().getClass == classOf[Struct], "Only Struct payloads are handled")
    val extractor = settings.fieldsExtractorMap.getOrElse(
      record.topic(),
      throw new ConfigException(s"${record.topic()} is not handled by the configuration:${settings.fieldsExtractorMap.keys.mkString(",")}"))

    val fieldsAndValuesMap = extractor.get(record.value().asInstanceOf[Struct]).map { case (k, v) => (k.toUpperCase, v) }
    logger.info(fieldsAndValuesMap.mkString(","))
    val procAndFields: ProcAndFields = proceduresMap(extractor.targetTable)
    //get the list of arguments to pass to the table insert/upsert procedure. if the procedure expects a field and is
    //not present in the incoming SinkRecord it would use null
    //No table evolution is supported yet
    val arguments: Array[String] = PrepareProcedureFieldsFn(procAndFields.fields, fieldsAndValuesMap).toArray
    logger.info(s"Calling procedure:${procAndFields.procName} with parameters:${procAndFields.fields.mkString(",")} with arguments:${arguments.mkString(",")}")

    client.callProcedure(procAndFields.procName, arguments: _*)
  }

  override def close(): Unit = client.close()

  private case class ProcAndFields(procName: String, fields: Seq[String])
}
Example 24
Source File: StructFieldExtractorTest.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.schemas

import org.apache.kafka.connect.data.{Date, Schema, SchemaBuilder, Struct}
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class StructFieldExtractorTest extends AnyWordSpec with Matchers {
  "StructFieldExtractor" should {
    "return all the fields and their bytes value" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = new StructFieldsExtractor(true, Map.empty).get(struct).toMap

      map.get("firstName").get shouldBe "Alex"
      map.get("lastName").get shouldBe "Smith"
      map.get("age").get shouldBe 30
    }

    "return all fields and apply the mapping" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = new StructFieldsExtractor(true, Map("lastName" -> "Name", "age" -> "a")).get(struct).toMap

      map.get("firstName").get shouldBe "Alex"
      map.get("Name").get shouldBe "Smith"
      map.get("a").get shouldBe 30
    }

    "return only the specified fields" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("lastName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema)
        .put("firstName", "Alex")
        .put("lastName", "Smith")
        .put("age", 30)

      val map = new StructFieldsExtractor(false, Map("lastName" -> "Name", "age" -> "age")).get(struct).toMap

      map.get("Name").get shouldBe "Smith"
      map.get("age").get shouldBe 30
      map.size shouldBe 2
    }
  }

  "handle Date fields" in {
    val dateSchema = Date.builder().build()
    val schema = SchemaBuilder.struct().name("com.example.Person")
      .field("firstName", Schema.STRING_SCHEMA)
      .field("lastName", Schema.STRING_SCHEMA)
      .field("age", Schema.INT32_SCHEMA)
      .field("date", dateSchema).build()

    val date = java.sql.Date.valueOf("2017-04-25")
    val struct = new Struct(schema)
      .put("firstName", "Alex")
      .put("lastName", "Smith")
      .put("age", 30)
      .put("date", date)

    val map1 = new StructFieldsExtractor(false, Map("date" -> "date")).get(struct).toMap
    map1.get("date").get shouldBe date
    map1.size shouldBe 1

    val d = Date.toLogical(dateSchema, 10000)
    struct.put("date", d)
    val map2 = new StructFieldsExtractor(false, Map("date" -> "date")).get(struct).toMap
    map2.get("date").get shouldBe d
    map2.size shouldBe 1
  }
}
Example 25
Source File: TestUtilsBase.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect

import java.util
import java.util.Collections

import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.kafka.connect.source.SourceTaskContext
import org.apache.kafka.connect.storage.OffsetStorageReader
import org.mockito.Mockito._
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfter
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._

// Note: this excerpt is truncated; the enclosing trait and method signature below are a plausible
// reconstruction inferred from the parameters and return value used in the body.
trait TestUtilsBase extends AnyWordSpec with Matchers with BeforeAndAfter with MockitoSugar {

  def getSourceTaskContext(lookupPartitionKey: String, offsetValue: String, offsetColumn: String, table: String): SourceTaskContext = {
    //set up partition
    val partition: util.Map[String, String] = Collections.singletonMap(lookupPartitionKey, table)
    //as a list to search for
    val partitionList: util.List[util.Map[String, String]] = List(partition).asJava
    //set up the offset
    val offset: util.Map[String, Object] = Collections.singletonMap(offsetColumn, offsetValue)
    //create offsets to initialize from
    val offsets: util.Map[util.Map[String, String], util.Map[String, Object]] = Map(partition -> offset).asJava

    //mock out reader and task context
    val taskContext = mock[SourceTaskContext]
    val reader = mock[OffsetStorageReader]
    when(reader.offsets(partitionList)).thenReturn(offsets)
    when(taskContext.offsetStorageReader()).thenReturn(reader)

    taskContext
  }
}
Example 26
Source File: JsonConverterWithSchemaEvolutionTest.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.converters.source

import java.util.Collections

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import com.sksamuel.avro4s.{RecordFormat, SchemaFor}
import io.confluent.connect.avro.AvroData
import org.apache.avro.Schema
import org.apache.kafka.connect.data.Struct
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class JsonConverterWithSchemaEvolutionTest extends AnyWordSpec with Matchers {

  val topic = "the_real_topic"
  val sourceTopic = "source_topic"
  val avroData = new AvroData(4)

  "JsonConverter" should {
    "throw IllegalArgumentException if payload is null" in {
      intercept[IllegalArgumentException] {
        val converter = new JsonConverterWithSchemaEvolution
        val record = converter.convert("topic", "somesource", "1000", null)
      }
    }

    "handle a simple json" in {
      val json = JacksonJson.toJson(Car("LaFerrari", "Ferrari", 2015, 963, 0.0001))
      val converter = new JsonConverterWithSchemaEvolution
      val record = converter.convert(topic, sourceTopic, "100", json.getBytes)

      record.keySchema() shouldBe MsgKey.schema
      record.key().asInstanceOf[Struct].getString("topic") shouldBe sourceTopic
      record.key().asInstanceOf[Struct].getString("id") shouldBe "100"

      val schema = new Schema.Parser().parse(
        SchemaFor[CarOptional]().toString
          .replace("\"name\":\"CarOptional\"", s"""\"name\":\"$sourceTopic\"""")
          .replace(s""",\"namespace\":\"${getClass.getCanonicalName.dropRight(getClass.getSimpleName.length + 1)}\"""", "")
      )
      val format = RecordFormat[CarOptional]
      val carOptional = format.to(CarOptional(Option("LaFerrari"), Option("Ferrari"), Option(2015), Option(963), Option(0.0001)))

      record.valueSchema() shouldBe avroData.toConnectSchema(schema)

      record.value() shouldBe avroData.toConnectData(schema, carOptional).value()

      record.sourcePartition() shouldBe null
      record.sourceOffset() shouldBe Collections.singletonMap(JsonConverterWithSchemaEvolution.ConfigKey, avroData.fromConnectSchema(avroData.toConnectSchema(schema)).toString())
    }
  }
}

case class Car(name: String, manufacturer: String, model: Long, bhp: Long, price: Double)

case class CarOptional(name: Option[String], manufacturer: Option[String], model: Option[Long], bhp: Option[Long], price: Option[Double])
Example 27
Source File: StringStructFieldsStringKeyBuilderTest.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.sink

import com.datamountaineer.streamreactor.connect.rowkeys.StringStructFieldsStringKeyBuilder
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class StringStructFieldsStringKeyBuilderTest extends AnyWordSpec with Matchers {
  "StructFieldsStringKeyBuilder" should {
    "raise an exception if the field is not present in the struct" in {
      intercept[IllegalArgumentException] {
        val schema = SchemaBuilder.struct().name("com.example.Person")
          .field("firstName", Schema.STRING_SCHEMA)
          .field("age", Schema.INT32_SCHEMA)
          .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

        val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

        val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
        StringStructFieldsStringKeyBuilder(Seq("threshold")).build(sinkRecord)
      }
    }

    "create the row key based on one single field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StringStructFieldsStringKeyBuilder(Seq("firstName")).build(sinkRecord) shouldBe "Alex"
    }

    "create the row key based on one single field with doc in the struct" in {
      val firstNameSchema = SchemaBuilder.`type`(Schema.Type.STRING).doc("first name")
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", firstNameSchema)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StringStructFieldsStringKeyBuilder(Seq("firstName")).build(sinkRecord) shouldBe "Alex"
    }

    "create the row key based on more than one field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StringStructFieldsStringKeyBuilder(Seq("firstName", "age")).build(sinkRecord) shouldBe "Alex.30"
    }
  }
}
Example 28
Source File: AvroConverter.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.converters.source import java.io.File import java.util.Collections import com.datamountaineer.streamreactor.connect.converters.MsgKey import io.confluent.connect.avro.AvroData import org.apache.avro.generic.{GenericDatumReader, GenericRecord} import org.apache.avro.io.DecoderFactory import org.apache.avro.{Schema => AvroSchema} import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.source.SourceRecord import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException class AvroConverter extends Converter { private val avroData = new AvroData(8) private var sourceToSchemaMap: Map[String, AvroSchema] = Map.empty private var avroReadersMap: Map[String, GenericDatumReader[GenericRecord]] = Map.empty override def convert(kafkaTopic: String, sourceTopic: String, messageId: String, bytes: Array[Byte], keys: Seq[String] = Seq.empty, keyDelimiter: String = "."): SourceRecord = { Option(bytes) match { case None => new SourceRecord(Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, avroData.toConnectSchema(sourceToSchemaMap(sourceTopic)), null) case Some(_) => val reader = avroReadersMap.getOrElse(sourceTopic.toLowerCase, throw new ConfigException(s"Invalid ${AvroConverter.SCHEMA_CONFIG} is not configured for $sourceTopic")) val decoder = DecoderFactory.get().binaryDecoder(bytes, null) val record = reader.read(null, decoder) val schemaAndValue = avroData.toConnectData(sourceToSchemaMap(sourceTopic.toLowerCase), record) val value = schemaAndValue.value() value match { case s: Struct if keys.nonEmpty => val keysValue = keys.flatMap { key => Option(KeyExtractor.extract(s, key.split('.').toVector)).map(_.toString) }.mkString(keyDelimiter) new SourceRecord( Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, Schema.STRING_SCHEMA, keysValue, schemaAndValue.schema(), schemaAndValue.value()) case _ => new SourceRecord( Collections.singletonMap(Converter.TopicKey, sourceTopic), null, kafkaTopic, MsgKey.schema, MsgKey.getStruct(sourceTopic, messageId), schemaAndValue.schema(), schemaAndValue.value()) } } } override def initialize(config: Map[String, String]): Unit = { sourceToSchemaMap = AvroConverter.getSchemas(config) avroReadersMap = sourceToSchemaMap.map { case (key, schema) => key -> new GenericDatumReader[GenericRecord](schema) } } } object AvroConverter { val SCHEMA_CONFIG = "connect.source.converter.avro.schemas" def getSchemas(config: Map[String, String]): Map[String, AvroSchema] = { config.getOrElse(SCHEMA_CONFIG, throw new ConfigException(s"$SCHEMA_CONFIG is not provided")) .toString .split(';') .filter(_.trim.nonEmpty) .map(_.split("=")) .map { case Array(source, path) => val file = new File(path) if (!file.exists()) { throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The file $path doesn't exist!") } val s = source.trim.toLowerCase() if (s.isEmpty) { throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The topic is not valid for entry containing $path") } s -> new AvroSchema.Parser().parse(file) case other => throw new ConfigException(s"$SCHEMA_CONFIG is not properly set. The format is Mqtt_Source->AVRO_FILE") }.toMap } }
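A quick usage sketch for the converter above; the topic names and file paths are illustrative, not taken from the project. Judging by getSchemas, the connect.source.converter.avro.schemas value is a ';'-separated list of source=schema-file pairs (the parser splits each entry on '=', even though the error message mentions the 'Mqtt_Source->AVRO_FILE' notation), and source names are lower-cased internally. Assuming the classes above are in scope:

import org.apache.kafka.connect.source.SourceRecord

val converter = new AvroConverter()
converter.initialize(Map(
  AvroConverter.SCHEMA_CONFIG -> "sensors=/tmp/sensors.avsc;alerts=/tmp/alerts.avsc"
))

// bytes must be a binary Avro payload written with the schema registered for "sensors"
val bytes: Array[Byte] = ???
val record: SourceRecord = converter.convert("kafka_sensors", "sensors", "msg-1", bytes)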
Example 29
Source File: RowKeyBuilderString.scala From kafka-connect-common with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.rowkeys import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.sink.SinkRecord import scala.collection.JavaConverters._ /* Note: this excerpt was truncated and lost the enclosing class declaration; the header below (the constructor, the default "." delimiter, the set of supported key schema types and the StringKeyBuilder parent) is reconstructed from the usage shown in Example 27 and is an approximation rather than the verbatim upstream code. */ case class StringStructFieldsStringKeyBuilder(keys: Seq[String], keyDelimiter: String = ".") extends StringKeyBuilder { private val availableSchemaTypes = Set(Schema.Type.BOOLEAN, Schema.Type.INT8, Schema.Type.INT16, Schema.Type.INT32, Schema.Type.INT64, Schema.Type.FLOAT32, Schema.Type.FLOAT64, Schema.Type.STRING, Schema.Type.BYTES) override def build(record: SinkRecord): String = { val struct = record.value().asInstanceOf[Struct] val schema = struct.schema val availableFields = schema.fields().asScala.map(_.name).toSet val missingKeys = keys.filterNot(availableFields.contains) require(missingKeys.isEmpty, s"${missingKeys.mkString(",")} keys are not present in the SinkRecord payload:${availableFields.mkString(",")}") keys.flatMap { case key => val field = schema.field(key) val value = struct.get(field) require(value != null, s"$key field value is null. A non-null value is required for the fields creating the HBase row key") if (availableSchemaTypes.contains(field.schema().`type`())) Some(value.toString) else None }.mkString(keyDelimiter) } }
Example 30
Source File: SourceRecordProducers.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.ftp.source import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.source.SourceRecord object SourceRecordProducers { type SourceRecordProducer = (ConnectFileMetaDataStore, String, FileMetaData, FileBody) => SourceRecord val fileInfoSchema = SchemaBuilder.struct() .field("name", Schema.STRING_SCHEMA) .field("offset", Schema.INT64_SCHEMA) .build() def stringKeyRecord(store: ConnectFileMetaDataStore, topic: String, meta: FileMetaData, body: FileBody): SourceRecord = new SourceRecord( store.fileMetasToConnectPartition(meta), // source part store.fileMetasToConnectOffset(meta), // source off topic, //topic Schema.STRING_SCHEMA, // key sch meta.attribs.path, // key Schema.BYTES_SCHEMA, // val sch body.bytes // val ) def structKeyRecord(store: ConnectFileMetaDataStore, topic: String, meta: FileMetaData, body: FileBody): SourceRecord = { new SourceRecord( store.fileMetasToConnectPartition(meta), // source part store.fileMetasToConnectOffset(meta), // source off topic, //topic fileInfoSchema, // key sch new Struct(fileInfoSchema) .put("name",meta.attribs.path) .put("offset",body.offset), Schema.BYTES_SCHEMA, // val sch body.bytes // val ) } }
Example 31
Source File: TestCoapMessageConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.coap.domain import com.datamountaineer.streamreactor.connect.coap.TestBase import org.apache.kafka.connect.data.Struct import org.scalatest.wordspec.AnyWordSpec class TestCoapMessageConverter extends AnyWordSpec with TestBase { "should convert a CoapResponse to a Struct " in { val response = getCoapResponse val converter = new CoapMessageConverter val record = converter.convert(RESOURCE_INSECURE ,TOPIC, response) val struct = record.value().asInstanceOf[Struct] struct.getString("payload") shouldBe response.getPayloadString struct.getInt32("raw_code") shouldBe response.getRawCode struct.getBoolean("is_last") shouldBe response.isLast struct.getInt32("content_format") shouldBe response.getOptions.getContentFormat } }
Example 32
Source File: StructFieldsExtractorTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.voltdb import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class StructFieldsExtractorTest extends AnyWordSpec with Matchers { "StructFieldsExtractor" should { "return all the fields and their bytes value" in { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("lastName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema) .put("firstName", "Alex") .put("lastName", "Smith") .put("age", 30) val min = System.currentTimeMillis() val record = StructFieldsExtractor("table", true, Map.empty).get(struct) val map = record map("firstName") shouldBe "Alex" map("lastName") shouldBe "Smith" map("age") shouldBe 30 } "return all fields and apply the mapping" in { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("lastName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema) .put("firstName", "Alex") .put("lastName", "Smith") .put("age", 30) val map = StructFieldsExtractor("table", includeAllFields = true, Map("lastName" -> "Name", "age" -> "a")).get(struct) map("firstName") shouldBe "Alex" map("Name") shouldBe "Smith" map("a") shouldBe 30 } "return only the specified fields" in { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("lastName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema) .put("firstName", "Alex") .put("lastName", "Smith") .put("age", 30) val map = StructFieldsExtractor("table", includeAllFields = false, Map("lastName" -> "Name", "age" -> "age")).get(struct) map("Name") shouldBe "Smith" map("age") shouldBe 30 map.size shouldBe 2 } } }
Example 33
Source File: StructFieldsExtractor.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.voltdb import java.text.SimpleDateFormat import java.util.TimeZone import com.typesafe.scalalogging.StrictLogging import org.apache.kafka.connect.data.{Field, Struct, _} import scala.collection.JavaConverters._ trait FieldsValuesExtractor { def get(struct: Struct): Map[String, Any] } case class StructFieldsExtractor(targetTable: String, includeAllFields: Boolean, fieldsAliasMap: Map[String, String], isUpsert: Boolean = false) extends FieldsValuesExtractor with StrictLogging { require(targetTable != null && targetTable.trim.length > 0) def get(struct: Struct): Map[String, Any] = { val schema = struct.schema() val fields: Seq[Field] = { if (includeAllFields) { schema.fields().asScala } else { val selectedFields = schema.fields().asScala.filter(f => fieldsAliasMap.contains(f.name())) val diffSet = fieldsAliasMap.keySet.diff(selectedFields.map(_.name()).toSet) if (diffSet.nonEmpty) { val errMsg = s"Following columns ${diffSet.mkString(",")} have not been found. Available columns:${fieldsAliasMap.keys.mkString(",")}" logger.error(errMsg) sys.error(errMsg) } selectedFields } } //need to select all fields including null. the stored proc needs a fixed set of params fields.map { field => val schema = field.schema() val value = Option(struct.get(field)) .map { value => //handle specific schema schema.name() match { case Decimal.LOGICAL_NAME => value.asInstanceOf[Any] match { case _:java.math.BigDecimal => value case arr: Array[Byte] => Decimal.toLogical(schema, arr) case _ => throw new IllegalArgumentException(s"${field.name()} is not handled for value:$value") } case Time.LOGICAL_NAME => value.asInstanceOf[Any] match { case i: Int => StructFieldsExtractor.TimeFormat.format(Time.toLogical(schema, i)) case d:java.util.Date => StructFieldsExtractor.TimeFormat.format(d) case _ => throw new IllegalArgumentException(s"${field.name()} is not handled for value:$value") } case Timestamp.LOGICAL_NAME => value.asInstanceOf[Any] match { case d:java.util.Date => StructFieldsExtractor.DateFormat.format(d) case l: Long => StructFieldsExtractor.DateFormat.format(Timestamp.toLogical(schema, l)) case _ => throw new IllegalArgumentException(s"${field.name()} is not handled for value:$value") } case _ => value } }.orNull fieldsAliasMap.getOrElse(field.name(), field.name()) -> value }.toMap } } object StructFieldsExtractor { val DateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") val TimeFormat: SimpleDateFormat = new SimpleDateFormat("HH:mm:ss.SSSZ") DateFormat.setTimeZone(TimeZone.getTimeZone("UTC")) }
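To make the logical-type branches above concrete, here is a minimal sketch, assuming the class above is in scope; the schema and values are made up. A Connect Timestamp field is rendered through StructFieldsExtractor.DateFormat while plain fields pass through untouched:

import java.util.Date
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct, Timestamp}

val schema = SchemaBuilder.struct()
  .field("id", Schema.INT64_SCHEMA)
  .field("created_at", Timestamp.SCHEMA)
  .build()

val struct = new Struct(schema)
  .put("id", 1L)
  .put("created_at", new Date(0L))

// id is returned as-is; created_at is formatted as "1970-01-01T00:00:00.000Z"
val row = StructFieldsExtractor("orders", includeAllFields = true, Map.empty).get(struct)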
Example 34
Source File: MetastoreSchemaAlignMapperTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.mapper import org.apache.kafka.connect.data.{SchemaBuilder, Struct} import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers import scala.collection.JavaConverters._ class MetastoreSchemaAlignMapperTest extends AnyFunSuite with Matchers { test("pad optional missing fields with null") { val recordSchema = SchemaBuilder.struct() .field("a", SchemaBuilder.string().required().build()) .field("b", SchemaBuilder.string().required().build()) .field("c", SchemaBuilder.string().required().build()) .build() val struct = new Struct(recordSchema).put("a", "a").put("b", "b").put("c", "c") val metastoreSchema = SchemaBuilder.struct() .field("a", SchemaBuilder.string().required().build()) .field("b", SchemaBuilder.string().required().build()) .field("c", SchemaBuilder.string().required().build()) .field("z", SchemaBuilder.string().optional().build()) .build() val output = new MetastoreSchemaAlignMapper(metastoreSchema).map(struct) output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "b", "c", "z") } test("drop fields not specified in metastore") { val recordSchema = SchemaBuilder.struct() .field("a", SchemaBuilder.string().required().build()) .field("b", SchemaBuilder.string().required().build()) .field("c", SchemaBuilder.string().required().build()) .build() val struct = new Struct(recordSchema).put("a", "a").put("b", "b").put("c", "c") val metastoreSchema = SchemaBuilder.struct() .field("a", SchemaBuilder.string().required().build()) .field("b", SchemaBuilder.string().required().build()) .build() val output = new MetastoreSchemaAlignMapper(metastoreSchema).map(struct) output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "b") } }
Example 35
Source File: ParquetWriterTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.landoop.streamreactor.connect.hive.StructUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.{SchemaBuilder, Struct} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class ParquetWriterTest extends AnyWordSpec with Matchers { implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(conf) "ParquetWriter" should { "write parquet files" in { val schema = SchemaBuilder.struct() .field("name", SchemaBuilder.string().required().build()) .field("title", SchemaBuilder.string().optional().build()) .field("salary", SchemaBuilder.float64().optional().build()) .build() val users = List( new Struct(schema).put("name", "sam").put("title", "mr").put("salary", 100.43), new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06) ) val path = new Path("sinktest.parquet") val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) users.foreach(writer.write) writer.close() val reader = parquetReader(path) val actual = Iterator.continually(reader.read).takeWhile(_ != null).toList reader.close() actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues) fs.delete(path, false) } "support writing nulls" in { val schema = SchemaBuilder.struct() .field("name", SchemaBuilder.string().required().build()) .field("title", SchemaBuilder.string().optional().build()) .field("salary", SchemaBuilder.float64().optional().build()) .build() val users = List( new Struct(schema).put("name", "sam").put("title", null).put("salary", 100.43), new Struct(schema).put("name", "laura").put("title", "ms").put("salary", 429.06) ) val path = new Path("sinktest.parquet") val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) users.foreach(writer.write) writer.close() val reader = parquetReader(path) val actual = Iterator.continually(reader.read).takeWhile(_ != null).toList reader.close() actual.map(StructUtils.extractValues) shouldBe users.map(StructUtils.extractValues) fs.delete(path, false) } } }
Example 36
Source File: DropPartitionValuesMapperTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.mapper import cats.data.NonEmptyList import com.landoop.streamreactor.connect.hive.{PartitionKey, PartitionPlan, TableName} import org.apache.kafka.connect.data.{SchemaBuilder, Struct} import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers import scala.collection.JavaConverters._ class DropPartitionValuesMapperTest extends AnyFunSuite with Matchers { test("strip partition values") { val schema = SchemaBuilder.struct() .field("a", SchemaBuilder.string().required().build()) .field("p", SchemaBuilder.string().required().build()) .field("q", SchemaBuilder.string().required().build()) .field("z", SchemaBuilder.string().required().build()) .build() val plan = PartitionPlan(TableName("foo"), NonEmptyList.of(PartitionKey("p"), PartitionKey("q"))) val struct = new Struct(schema).put("a", "a").put("p", "p").put("q", "q").put("z", "z") val output = new DropPartitionValuesMapper(plan).map(struct) output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "z") } test("handle partition field is missing in input") { val schema = SchemaBuilder.struct() .field("a", SchemaBuilder.string().required().build()) .field("q", SchemaBuilder.string().required().build()) .field("z", SchemaBuilder.string().required().build()) .build() val plan = PartitionPlan(TableName("foo"), NonEmptyList.of(PartitionKey("p"), PartitionKey("q"))) val struct = new Struct(schema).put("a", "a").put("q", "q").put("z", "z") val output = new DropPartitionValuesMapper(plan).map(struct) output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "z") } }
Example 37
Source File: DefaultCommitPolicyTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartitionOffset} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec import scala.concurrent.duration._ class DefaultCommitPolicyTest extends AnyWordSpec with Matchers { val schema: Schema = SchemaBuilder.struct() .field("name", SchemaBuilder.string().required().build()) .build() val struct = new Struct(schema) implicit val conf: Configuration = new Configuration() implicit val fs: LocalFileSystem = FileSystem.getLocal(conf) val tpo = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100)) private def shouldFlush(policy: CommitPolicy, path: Path, count: Long) = { val status = fs.getFileStatus(path) policy.shouldFlush(CommitContext(tpo, path, count, status.getLen, status.getModificationTime)) } "DefaultCommitPolicy" should { "roll over after interval" in { val policy = DefaultCommitPolicy(None, Option(2.seconds), None) val path = new Path("foo") fs.create(path) shouldFlush(policy, path, 10) shouldBe false Thread.sleep(2000) shouldFlush(policy, path, 10) shouldBe true fs.delete(path, false) } "roll over after file count" in { val policy = DefaultCommitPolicy(None, None, Some(9)) val path = new Path("foo") fs.create(path) shouldFlush(policy, path, 7) shouldBe false shouldFlush(policy, path, 8) shouldBe false shouldFlush(policy, path, 9) shouldBe true shouldFlush(policy, path, 10) shouldBe true fs.delete(path, false) } "roll over after file size" in { val policy = DefaultCommitPolicy(Some(10), None, None) val path = new Path("foo") val out = fs.create(path) shouldFlush(policy, path, 7) shouldBe false out.writeBytes("wibble wobble wabble wubble") out.close() shouldFlush(policy, path, 9) shouldBe true fs.delete(path, false) } } }
Example 38
Source File: MapValueConverterTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.json.sql.JacksonJson import org.apache.kafka.connect.data.{Schema, Struct} import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers import scala.collection.JavaConverters._ class MapValueConverterTest extends AnyFunSuite with Matchers { test("converts nested payload") { val json = """ |{ | "idType": 3, | "colorDepth": "", | "threshold" : 45.77, | "evars": { | "evars": { | "eVar1": "Tue Aug 27 2019 12:08:10", | "eVar2": 156692207943934897 | } | }, | "exclude": { | "id": 0, | "value": false | } |} |""".stripMargin val map = JacksonJson.toMap[Any](json) val struct = MapValueConverter.convert(map) //Jackson transforming the json to Map the fields order is not retained struct.schema().fields().asScala.map(_.name()).sorted shouldBe List("idType", "colorDepth", "threshold", "evars", "exclude").sorted struct.schema().field("idType").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA struct.schema().field("colorDepth").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA struct.schema().field("threshold").schema() shouldBe Schema.OPTIONAL_FLOAT64_SCHEMA struct.schema().field("exclude").schema().`type`() shouldBe Schema.Type.STRUCT struct.schema().field("exclude").schema().isOptional shouldBe true struct.schema().field("evars").schema().`type`() shouldBe Schema.Type.STRUCT struct.schema().field("evars").schema().isOptional shouldBe true struct.schema().field("evars").schema().fields().asScala.map(_.name()) shouldBe List("evars") val evarsInner = struct.schema().field("evars").schema().field("evars") evarsInner.schema().`type`() shouldBe Schema.Type.STRUCT evarsInner.schema().isOptional shouldBe true evarsInner.schema().fields().asScala.map(_.name()).sorted shouldBe List("eVar1", "eVar2").sorted evarsInner.schema().field("eVar1").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA evarsInner.schema().field("eVar2").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA val exclude = struct.schema().field("exclude").schema() exclude.schema().`type`() shouldBe Schema.Type.STRUCT exclude.schema().isOptional shouldBe true exclude.schema().fields().asScala.map(_.name()).sorted shouldBe List("id", "value").sorted exclude.schema().field("id").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA exclude.schema().field("value").schema() shouldBe Schema.OPTIONAL_BOOLEAN_SCHEMA struct.get("idType") shouldBe 3L struct.get("colorDepth") shouldBe "" struct.get("threshold") shouldBe 45.77D val evarsStruct = struct.get("evars").asInstanceOf[Struct].get("evars").asInstanceOf[Struct] evarsStruct.get("eVar1") shouldBe "Tue Aug 27 2019 12:08:10" evarsStruct.get("eVar2") shouldBe 156692207943934897L val excludeStruct = struct.get("exclude").asInstanceOf[Struct] excludeStruct.get("id") shouldBe 0L excludeStruct.get("value") shouldBe false } }
Example 39
Source File: OrcTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.orc import com.landoop.streamreactor.connect.hive.{OrcSinkConfig, OrcSourceConfig, StructUtils, orc} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.{SchemaBuilder, Struct} import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class OrcTest extends AnyFlatSpec with Matchers { implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(conf) "Orc" should "read and write orc files" in { val schema = SchemaBuilder.struct() .field("name", SchemaBuilder.string().optional().build()) .field("age", SchemaBuilder.int32().optional().build()) .field("salary", SchemaBuilder.float64().optional().build()) .name("from_orc") .build() val users = Seq( new Struct(schema).put("name", "sammy").put("age", 38).put("salary", 54.67), new Struct(schema).put("name", "laura").put("age", 37).put("salary", 91.84) ) val path = new Path("orctest.orc") val sink = orc.sink(path, schema, OrcSinkConfig(overwrite = true)) users.foreach(sink.write) sink.close() val source = orc.source(path, OrcSourceConfig()) val actual = source.iterator.toList actual.head.schema shouldBe schema actual.map(StructUtils.extractValues) shouldBe List(Vector("sammy", 38, 54.67), Vector("laura", 37, 91.84)) fs.delete(path, false) } }
Example 40
Source File: package.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.column.ParquetProperties import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter} package object parquet { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def listFiles(path: Path)(implicit fs: FileSystem): List[Path] = { if (fs.isDirectory(path)) { logger.debug(s"$path is a directory, reading constituent files") val remote = fs.listFiles(path, false) new Iterator[Path] { override def hasNext: Boolean = remote.hasNext override def next(): Path = remote.next().getPath }.toList } else { logger.debug(s"Reading $path as a single file") List(path) } } def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = { ParquetReader.builder(new StructReadSupport, file) .withConf(fs.getConf) .build() } def parquetWriter(path: Path, schema: Schema, config: ParquetSinkConfig): ParquetWriter[Struct] = { new StructParquetWriterBuilder(path, schema) .withCompressionCodec(config.compressionCodec) .withDictionaryEncoding(config.enableDictionary) .withValidation(config.validation) .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) .withWriteMode(if (config.overwrite) { ParquetFileWriter.Mode.OVERWRITE } else { ParquetFileWriter.Mode.CREATE }).build() } }
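A small usage sketch of the helpers above, assuming this package object is in scope; the directory path is hypothetical. It streams every row of every parquet file under a directory back into Connect Structs:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.Struct

implicit val conf: Configuration = new Configuration()
implicit val fs: FileSystem = FileSystem.getLocal(conf)

val structs: List[Struct] =
  listFiles(new Path("/tmp/hive-sink-output")).flatMap { file =>
    val reader = parquetReader(file)
    // read until the reader returns null, then release the file handle
    try Iterator.continually(reader.read()).takeWhile(_ != null).toList
    finally reader.close()
  }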
Example 41
Source File: StructWriteSupport.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.landoop.streamreactor.connect.hive._ import org.apache.hadoop.conf.Configuration import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext import org.apache.parquet.io.api.{Binary, RecordConsumer} import org.apache.parquet.schema.MessageType import scala.collection.JavaConverters._ // derived from Apache Spark's parquet write support, archive and license here: // https://github.com/apache/spark/blob/21a7bfd5c324e6c82152229f1394f26afeae771c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala class StructWriteSupport(schema: Schema) extends WriteSupport[Struct] { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) private val schemaName = if (schema.name() == null) "schema" else schema.name() private val parquetSchema: MessageType = ParquetSchemas.toParquetMessage(schema, schemaName) private val metadata = new java.util.HashMap[String, String]() metadata.put("written_by", "streamreactor") // The Parquet `RecordConsumer` to which all structs are written private var consumer: RecordConsumer = _ type ValueWriter = (Any) => Unit override def init(conf: Configuration): WriteSupport.WriteContext = new WriteSupport.WriteContext(parquetSchema, new java.util.HashMap[String, String]) override def finalizeWrite(): WriteSupport.FinalizedWriteContext = new FinalizedWriteContext(metadata) override def prepareForWrite(consumer: RecordConsumer): Unit = this.consumer = consumer override def write(struct: Struct): Unit = { writeMessage { writeStructFields(struct) } } private def writeStructFields(struct: Struct): Unit = { for ((field, index) <- struct.schema.fields.asScala.zipWithIndex) { val value = struct.get(field) if (value != null) { val writer = valueWriter(field.schema()) writeField(field.name, index) { writer(value) } } } } def valueWriter(schema: Schema): ValueWriter = { // todo perhaps introduce something like spark's SpecializedGetters schema.`type`() match { case Schema.Type.BOOLEAN => value => consumer.addBoolean(value.asInstanceOf[Boolean]) case Schema.Type.INT8 | Schema.Type.INT16 | Schema.Type.INT32 => value => consumer.addInteger(value.toString.toInt) case Schema.Type.INT64 => value => consumer.addLong(value.toString.toLong) case Schema.Type.STRING => value => consumer.addBinary(Binary.fromReusedByteArray(value.toString.getBytes)) case Schema.Type.FLOAT32 => value => consumer.addFloat(value.toString.toFloat) case Schema.Type.FLOAT64 => value => consumer.addDouble(value.toString.toDouble) case Schema.Type.STRUCT => value => { logger.debug(s"Writing nested struct") val struct = value.asInstanceOf[Struct] writeGroup { schema.fields.asScala .map { field => field -> struct.get(field) } .zipWithIndex.foreach { case ((field, v), k) => writeField(field.name, k) { valueWriter(field.schema)(v) } } } } case _ => throw UnsupportedSchemaType(schema.`type`.toString) } } private def writeMessage(f: => Unit): Unit = { consumer.startMessage() f consumer.endMessage() } private def writeGroup(f: => Unit): Unit = { consumer.startGroup() // consumer.startMessage() f //consumer.endMessage() consumer.endGroup() } private def writeField(name: String, k: Int)(f: => Unit): Unit = { consumer.startField(name, k) f consumer.endField(name, k) } }
Example 42
Source File: StructReadSupport.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import java.util import org.apache.hadoop.conf.Configuration import org.apache.kafka.connect.data.Struct import org.apache.parquet.hadoop.api.{InitContext, ReadSupport} import org.apache.parquet.io.api.RecordMaterializer import org.apache.parquet.schema.MessageType class StructReadSupport extends ReadSupport[Struct] { override def prepareForRead(configuration: Configuration, metaData: util.Map[String, String], fileSchema: MessageType, context: ReadSupport.ReadContext): RecordMaterializer[Struct] = { // the file schema in here comes from the footer of the parquet file val schema = ParquetSchemas.toKafka(fileSchema) new StructMaterializer(schema) } override def init(context: InitContext): ReadSupport.ReadContext = { new ReadSupport.ReadContext(context.getFileSchema) } }
Example 43
Source File: Output.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.blockchain.data import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Output(addr_tag_link: Option[String], addr_tag: Option[String], spent: Boolean, tx_index: Long, `type`: Int, addr: Option[String], value: Long, n: Int, script: String) object Output { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.output") .doc("The output instance part of a transaction.") .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA) .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA) .field("spent", Schema.BOOLEAN_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("type", Schema.INT32_SCHEMA) .field("addr", Schema.OPTIONAL_STRING_SCHEMA) .field("value", Schema.INT64_SCHEMA) .field("n", Schema.INT32_SCHEMA) .field("script", Schema.STRING_SCHEMA) .build() implicit class OutputToStructConverter(val output: Output) extends AnyVal { def toStruct(): Struct = { val struct = new Struct(ConnectSchema) .put("spent", output.spent) .put("tx_index", output.tx_index) .put("type", output.`type`) .put("value", output.value) .put("n", output.n) .put("script", output.script) output.addr.foreach(struct.put("addr", _)) output.addr_tag.foreach(struct.put("addr_tag", _)) output.addr_tag_link.foreach(struct.put("addr_tag_link", _)) struct } } }
Example 44
Source File: Input.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.cassandra.sink import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Input(sequence: Long, prev_out: Option[Output], script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() map.put("sequence", sequence) prev_out.foreach(p => map.put("prev_out", p.toHashMap)) map.put("script", script) map } } object Input { val ConnectSchema = SchemaBuilder.struct .name("datamountaineer.blockchain.input") .doc("The input instance part of a transaction.") .field("sequence", Schema.INT64_SCHEMA) .field("prev_out", Output.ConnectSchema) .field("script", Schema.STRING_SCHEMA) .build() implicit class InputToStructConverter(val input: Input) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("sequence", input.sequence) .put("script", input.script) input.prev_out.foreach(po => struct.put("prev_out", po.toStruct())) struct } } }
Example 45
Source File: KeyUtils.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.cassandra.utils import com.jayway.jsonpath.{Configuration, JsonPath} import org.apache.kafka.connect.data.{Schema, Struct} object KeyUtils { def keysFromStruct(struct: Struct, schema: Schema, fieldNames: Seq[String]): Seq[Object] = fieldNames.map(getKeyFromStruct(struct, _)) private def getKeyFromStruct(struct: Struct, fieldName: String): Object = { if (fieldName.contains(".")) { val Array(nestedObject, nestedField) = fieldName.split("\\.", 2) getKeyFromStruct(struct.get(nestedObject).asInstanceOf[Struct], nestedField) } else { struct.get(fieldName) } } }
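A short sketch of the dot-path handling above, with a made-up schema; "user.id" is resolved by recursing into the nested struct, while plain names are read directly:

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

val userSchema = SchemaBuilder.struct().field("id", Schema.INT64_SCHEMA).build()
val schema = SchemaBuilder.struct()
  .field("user", userSchema)
  .field("ts", Schema.INT64_SCHEMA)
  .build()

val struct = new Struct(schema)
  .put("user", new Struct(userSchema).put("id", 42L))
  .put("ts", 1000L)

// returns Seq(42, 1000) as boxed java.lang.Long values
val keys = KeyUtils.keysFromStruct(struct, schema, Seq("user.id", "ts"))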
Example 46
Source File: Output.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.mongodb import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Output(addr_tag_link: Option[String], addr_tag: Option[String], spent: Boolean, tx_index: Long, `type`: Int, addr: Option[String], value: Long, n: Int, script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() addr_tag_link.foreach(map.put("addr_tag_link", _)) addr_tag.foreach(map.put("addr_tag", _)) map.put("spent", spent) map.put("tx_index", tx_index) map.put("type", `type`) addr.foreach(map.put("addr", _)) map.put("value", value) map.put("n", n) map.put("script", script) map } } object Output { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.output") .doc("The output instance part of a transaction.") .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA) .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA) .field("spent", Schema.BOOLEAN_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("type", Schema.OPTIONAL_INT32_SCHEMA) .field("addr", Schema.OPTIONAL_STRING_SCHEMA) .field("value", Schema.INT64_SCHEMA) .field("n", Schema.INT32_SCHEMA) .field("script", Schema.STRING_SCHEMA) .build() implicit class OutputToStructConverter(val output: Output) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("spent", output.spent) .put("tx_index", output.tx_index) .put("type", output.`type`) .put("value", output.value) .put("n", output.n) .put("script", output.script) output.addr.foreach(struct.put("addr", _)) output.addr_tag.foreach(struct.put("addr_tag", _)) output.addr_tag_link.foreach(struct.put("addr_tag_link", _)) struct } } }
Example 47
Source File: Input.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.mongodb import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Input(sequence: Long, prev_out: Option[Output], script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() map.put("sequence", sequence) prev_out.foreach(p => map.put("prev_out", p.toHashMap)) map.put("script", script) map } } object Input { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.input") .doc("The input instance part of a transaction.") .field("sequence", Schema.INT64_SCHEMA) .field("prev_out", Output.ConnectSchema) .field("script", Schema.STRING_SCHEMA) .build() implicit class InputToStructConverter(val input: Input) extends AnyVal { def toStruct(): Struct = { val struct = new Struct(ConnectSchema) .put("sequence", input.sequence) .put("script", input.script) input.prev_out.foreach(po => struct.put("prev_out", po.toStruct())) struct } } }
Example 48
Source File: SinkRecordToDocument.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.mongodb.sink import com.datamountaineer.streamreactor.connect.mongodb.config.MongoSettings import com.datamountaineer.streamreactor.connect.mongodb.converters.SinkRecordConverter import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.sink.SinkRecord import org.bson.Document object SinkRecordToDocument extends ConverterUtil { def apply(record: SinkRecord, keys: Set[String] = Set.empty)(implicit settings: MongoSettings): (Document, Iterable[(String, Any)]) = { val schema = record.valueSchema() val value = record.value() val fields = settings.fields.getOrElse(record.topic(), Map.empty) val allFields = if (fields.size == 1 && fields.head._1 == "*") true else false if (schema == null) { //try to take it as string value match { case _: java.util.Map[_, _] => val extracted = convertSchemalessJson( record, fields, settings.ignoredField.getOrElse(record.topic(), Set.empty) ) //not ideal; but the compile is hashmap anyway SinkRecordConverter.fromMap(extracted.asInstanceOf[java.util.Map[String, AnyRef]]) -> keys.headOption.map(_ => KeysExtractor.fromMap(extracted, keys)).getOrElse(Iterable.empty) case _ => sys.error("For schemaless record only String and Map types are supported") } } else { schema.`type`() match { case Schema.Type.STRING => val extracted = convertStringSchemaAndJson( record, fields, settings.ignoredField.getOrElse(record.topic(), Set.empty), includeAllFields = allFields) SinkRecordConverter.fromJson(extracted) -> keys.headOption.map(_ => KeysExtractor.fromJson(extracted, keys)).getOrElse(Iterable.empty) case Schema.Type.STRUCT => val extracted = convert( record, fields, settings.ignoredField.getOrElse(record.topic(), Set.empty) ) SinkRecordConverter.fromStruct(extracted) -> keys.headOption.map(_ => KeysExtractor.fromStruct(extracted.value().asInstanceOf[Struct], keys)).getOrElse(Iterable.empty) case other => sys.error(s"$other schema is not supported") } } } }
Example 49
Source File: KeysExtractorTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink import java.util import com.datamountaineer.streamreactor.connect.azure.documentdb.Json import com.sksamuel.avro4s.RecordFormat import io.confluent.connect.avro.AvroData import org.apache.kafka.common.config.ConfigException import org.apache.kafka.connect.data.Struct import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec import scala.collection.JavaConverters._ class KeysExtractorTest extends AnyWordSpec with Matchers { private val avroData = new AvroData(4) case class WithNested(id: Int, nested: SomeTest) case class SomeTest(name: String, value: Double, flags: Seq[Int], map: Map[String, String]) "KeysExtractor" should { "extract keys from JSON" in { val json = scala.io.Source.fromFile(getClass.getResource(s"/transaction1.json").toURI.getPath).mkString val jvalue = Json.parseJson(json) val actual = KeysExtractor.fromJson(jvalue, Set("lock_time", "rbf")) actual shouldBe List("lock_time" -> 9223372036854775807L, "rbf" -> true) } "throw exception when extracting the keys from JSON" in { val json = scala.io.Source.fromFile(getClass.getResource(s"/transaction1.json").toURI.getPath).mkString val jvalue = Json.parseJson(json) intercept[ConfigException] { val actual = KeysExtractor.fromJson(jvalue, Set("inputs")) } } "extract keys from a Map" in { val actual = KeysExtractor.fromMap(Map("key1" -> 12, "key2" -> 10L, "key3" -> "tripple").asJava, Set("key1", "key3")) actual shouldBe Set("key1" -> 12, "key3" -> "tripple") } "extract keys from a Map should throw an exception if the key is another map" in { intercept[ConfigException] { KeysExtractor.fromMap(Map("key1" -> 12, "key2" -> 10L, "key3" -> Map.empty[String, String]).asJava, Set("key1", "key3")) } } "extract keys from a Map should throw an exception if the key is an array" in { intercept[ConfigException] { KeysExtractor.fromMap(Map("key1" -> 12, "key2" -> 10L, "key3" -> new util.ArrayList[String]).asJava, Set("key1", "key3")) } } "extract from a struct" in { val format = RecordFormat[SomeTest] val avro = format.to(SomeTest("abc", 12.5, Seq.empty, Map.empty)) val struct = avroData.toConnectData(avro.getSchema, avro) KeysExtractor.fromStruct(struct.value().asInstanceOf[Struct], Set("name")) shouldBe Set("name" -> "abc") } "extract from a struct should throw an exception if a key is an array" in { val format = RecordFormat[SomeTest] val avro = format.to(SomeTest("abc", 12.5, Seq.empty, Map.empty)) intercept[ConfigException] { val struct = avroData.toConnectData(avro.getSchema, avro) KeysExtractor.fromStruct(struct.value().asInstanceOf[Struct], Set("flags")) } } "extract from a struct should throw an exception if a key is a map" in { val format = RecordFormat[SomeTest] val avro = format.to(SomeTest("abc", 12.5, Seq.empty, Map.empty)) intercept[ConfigException] { val struct = avroData.toConnectData(avro.getSchema, avro) KeysExtractor.fromStruct(struct.value().asInstanceOf[Struct], Set("map")) } } "extract from a struct should throw an exception if a key is a struct" in { val format = RecordFormat[WithNested] val avro = format.to(WithNested(1, SomeTest("abc", 12.5, Seq.empty, Map.empty))) intercept[ConfigException] { val struct = avroData.toConnectData(avro.getSchema, avro) KeysExtractor.fromStruct(struct.value().asInstanceOf[Struct], Set("nested")) } } } }
Example 50
Source File: Output.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Output(addr_tag_link: Option[String], addr_tag: Option[String], spent: Boolean, tx_index: Long, `type`: Int, addr: Option[String], value: Long, n: Int, script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() addr_tag_link.foreach(map.put("addr_tag_link", _)) addr_tag.foreach(map.put("addr_tag", _)) map.put("spent", spent) map.put("tx_index", tx_index) map.put("type", `type`) addr.foreach(map.put("addr", _)) map.put("value", value) map.put("n", n) map.put("script", script) map } } object Output { val ConnectSchema: Schema = SchemaBuilder.struct .name("output") .doc("The output instance part of a transaction.") .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA) .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA) .field("spent", Schema.BOOLEAN_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("type", Schema.OPTIONAL_INT32_SCHEMA) .field("addr", Schema.OPTIONAL_STRING_SCHEMA) .field("value", Schema.INT64_SCHEMA) .field("n", Schema.INT32_SCHEMA) .field("script", Schema.STRING_SCHEMA) .build() implicit class OutputToStructConverter(val output: Output) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("spent", output.spent) .put("tx_index", output.tx_index) .put("type", output.`type`) .put("value", output.value) .put("n", output.n) .put("script", output.script) output.addr.foreach(struct.put("addr", _)) output.addr_tag.foreach(struct.put("addr_tag", _)) output.addr_tag_link.foreach(struct.put("addr_tag_link", _)) struct } } }
Example 51
Source File: Input.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Input(sequence: Long, prev_out: Option[Output], script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() map.put("sequence", sequence) prev_out.foreach(p => map.put("prev_out", p.toHashMap)) map.put("script", script) map } } object Input { val ConnectSchema = SchemaBuilder.struct .name("input") .doc("The input instance part of a transaction.") .field("sequence", Schema.INT64_SCHEMA) .field("prev_out", Output.ConnectSchema) .field("script", Schema.STRING_SCHEMA) .build() implicit class InputToStructConverter(val input: Input) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("sequence", input.sequence) .put("script", input.script) input.prev_out.foreach(po => struct.put("prev_out", po.toStruct())) struct } } }
Example 52
Source File: SinkRecordToDocument.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.azure.documentdb.sink import com.datamountaineer.streamreactor.connect.azure.documentdb.config.DocumentDbSinkSettings import com.datamountaineer.streamreactor.connect.azure.documentdb.converters.SinkRecordConverter import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil import com.microsoft.azure.documentdb.Document import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.sink.SinkRecord object SinkRecordToDocument extends ConverterUtil { def apply(record: SinkRecord, keys: Set[String] = Set.empty)(implicit settings: DocumentDbSinkSettings): (Document, Iterable[(String, Any)]) = { val schema = record.valueSchema() val value = record.value() if (schema == null) { //try to take it as string value match { case _: java.util.Map[_, _] => val fields = settings.fields(record.topic()) val extracted = convertSchemalessJson(record, settings.fields(record.topic()), settings.ignoredField(record.topic())) //not ideal; but the compile is hashmap anyway SinkRecordConverter.fromMap(extracted.asInstanceOf[java.util.Map[String, AnyRef]]) -> keys.headOption.map(_ => KeysExtractor.fromMap(extracted, keys)).getOrElse(Iterable.empty) case _: String => val extracted = convertStringSchemaAndJson(record, settings.fields(record.topic()), settings.ignoredField(record.topic())) SinkRecordConverter.fromJson(extracted) -> keys.headOption.map(_ => KeysExtractor.fromJson(extracted, keys)).getOrElse(Iterable.empty) case _ => sys.error("For schemaless record only String and Map types are supported") } } else { schema.`type`() match { case Schema.Type.STRING => val extracted = convertStringSchemaAndJson(record, settings.fields(record.topic()), settings.ignoredField(record.topic())) SinkRecordConverter.fromJson(extracted) -> keys.headOption.map(_ => KeysExtractor.fromJson(extracted, keys)).getOrElse(Iterable.empty) case Schema.Type.STRUCT => val extracted = convert(record, settings.fields(record.topic()), settings.ignoredField(record.topic())) SinkRecordConverter.fromStruct(extracted) -> keys.headOption.map(_ => KeysExtractor.fromStruct(extracted.value().asInstanceOf[Struct], keys)).getOrElse(Iterable.empty) case other => sys.error(s"$other schema is not supported") } } } }
Example 53
Source File: Transaction.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.blockchain.data import java.util import com.datamountaineer.streamreactor.connect.blockchain.data.Input._ import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.source.SourceRecord case class Transaction(lock_time: Long, ver: Int, size: Long, inputs: Seq[Input], rbf: Option[Boolean], time: Long, tx_index: Long, vin_sz: Int, hash: String, vout_sz: Int, relayed_by: String, out: Seq[Output]) object Transaction { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.transaction") .field("lock_time", Schema.INT64_SCHEMA) .field("ver", Schema.INT32_SCHEMA) .field("size", Schema.INT64_SCHEMA) .field("inputs", SchemaBuilder.array(Input.ConnectSchema).optional().build()) .field("rbf", Schema.OPTIONAL_BOOLEAN_SCHEMA) .field("time", Schema.INT64_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("vin_sz", Schema.INT32_SCHEMA) .field("hash", Schema.STRING_SCHEMA) .field("vout_sz", Schema.INT32_SCHEMA) .field("relayed_by", Schema.STRING_SCHEMA) .field("out", SchemaBuilder.array(Output.ConnectSchema).optional().build()) .build() implicit class TransactionToSourceRecordConverter(val tx: Transaction) extends AnyVal { def toSourceRecord(topic: String, partition: Int, key: Option[String]): SourceRecord = { new SourceRecord( null, null, topic, partition, key.map(_ => Schema.STRING_SCHEMA).orNull, key.orNull, ConnectSchema, tx.toStruct() ) } //private def getOffset() = Collections.singletonMap("position", System.currentTimeMillis()) def toStruct(): Struct = { val struct = new Struct(ConnectSchema) .put("lock_time", tx.lock_time) .put("ver", tx.ver) .put("size", tx.size) .put("time", tx.time) .put("tx_index", tx.tx_index) .put("vin_sz", tx.vin_sz) .put("hash", tx.hash) .put("vout_sz", tx.vout_sz) .put("relayed_by", tx.relayed_by) tx.out.headOption.foreach { _ => import scala.collection.JavaConverters._ struct.put("out", tx.out.map(_.toStruct()).asJava) } tx.rbf.foreach(struct.put("rbf", _)) tx.inputs.headOption.foreach { _ => val inputs = new util.ArrayList[Struct] tx.inputs.foreach(i => inputs.add(i.toStruct())) struct.put("inputs", inputs) } tx.out.headOption.foreach { _ => val outputs = new util.ArrayList[Struct] tx.out.foreach(output => outputs.add(output.toStruct())) } struct } } }
Example 54
Source File: Output.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.cassandra.sink import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Output(addr_tag_link: Option[String], addr_tag: Option[String], spent: Boolean, tx_index: Long, `type`: Int, addr: Option[String], value: Long, n: Int, script: String) { def toHashMap: util.HashMap[String, Any] = { val map = new util.HashMap[String, Any]() addr_tag_link.foreach(map.put("addr_tag_link", _)) addr_tag.foreach(map.put("addr_tag", _)) map.put("spent", spent) map.put("tx_index", tx_index) map.put("type", `type`) addr.foreach(map.put("addr", _)) map.put("value", value) map.put("n", n) map.put("script", script) map } } object Output { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.output") .doc("The output instance part of a transaction.") .field("addr_tag_link", Schema.OPTIONAL_STRING_SCHEMA) .field("addr_tag", Schema.OPTIONAL_STRING_SCHEMA) .field("spent", Schema.BOOLEAN_SCHEMA) .field("tx_index", Schema.INT64_SCHEMA) .field("type", Schema.OPTIONAL_INT32_SCHEMA) .field("addr", Schema.OPTIONAL_STRING_SCHEMA) .field("value", Schema.INT64_SCHEMA) .field("n", Schema.INT32_SCHEMA) .field("script", Schema.STRING_SCHEMA) .build() implicit class OutputToStructConverter(val output: Output) extends AnyVal { def toStruct() = { val struct = new Struct(ConnectSchema) .put("spent", output.spent) .put("tx_index", output.tx_index) .put("type", output.`type`) .put("value", output.value) .put("n", output.n) .put("script", output.script) output.addr.foreach(struct.put("addr", _)) output.addr_tag.foreach(struct.put("addr_tag", _)) output.addr_tag_link.foreach(struct.put("addr_tag_link", _)) struct } } }
Example 55
Source File: Input.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.blockchain.data import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} case class Input(sequence: Long, prev_out: Option[Output], script: String) object Input { val ConnectSchema: Schema = SchemaBuilder.struct .name("datamountaineer.blockchain.input") .doc("The input instance part of a transaction.") .field("sequence", Schema.INT64_SCHEMA) .field("prev_out", Output.ConnectSchema) .field("script", Schema.STRING_SCHEMA) .build() implicit class InputToStructConverter(val input: Input) extends AnyVal { def toStruct(): Struct = { val struct = new Struct(ConnectSchema) .put("sequence", input.sequence) .put("script", input.script) input.prev_out.foreach(po=>struct.put("prev_out", po.toStruct())) struct } } }
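For completeness, a tiny sketch of the blockchain Input and Output converters above; all values are illustrative. A populated prev_out is carried through into the nested struct because the implicit converters live in the companion objects:

import com.datamountaineer.streamreactor.connect.blockchain.data.{Input, Output}

val out = Output(addr_tag_link = None, addr_tag = None, spent = false, tx_index = 12345L,
  `type` = 0, addr = Some("1ExampleAddr"), value = 5000L, n = 0, script = "76a914...")

val in = Input(sequence = 4294967295L, prev_out = Some(out), script = "47304402...")

// struct.getStruct("prev_out").getString("addr") == "1ExampleAddr"
val struct = in.toStruct()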
Example 56
Source File: ConnectMongoConverterSpec.scala From kafka-connect-mongodb with Apache License 2.0 | 5 votes |
package com.startapp.data import java.lang.Boolean import java.util import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.scalatest.{FlatSpec, Matchers} class ConnectMongoConverterSpec extends FlatSpec with Matchers{ private val FIELD1_NAME = "fieldInt" private val FIELD1_VALUE = new Integer(5) private val FIELD2_NAME = "fieldString" private val FIELD2_VALUE = "str" private val FIELD3_NAME = "fieldBoolean" private val FIELD3_VALUE = new Boolean(true) val schema = SchemaBuilder.struct().name("test schema") .field(FIELD1_NAME, Schema.INT32_SCHEMA) .field(FIELD2_NAME, Schema.STRING_SCHEMA) .field(FIELD3_NAME, Schema.BOOLEAN_SCHEMA) .build() "No Schema Connect Mongo Converter Bad Data" should "throw an exception" in { var exceptionThrown = false val badData = new Struct(schema) try{ checkJsonMap(NoSchemaConnectMongoConverter, badData) } catch { case _ : java.lang.ClassCastException => exceptionThrown = true } exceptionThrown should be(true) } "No Schema Connect Mongo Converter Good Data" should "return the same map" in { val jsonMap = new util.HashMap[String, Object]() jsonMap.put(FIELD1_NAME, FIELD1_VALUE) jsonMap.put(FIELD2_NAME, FIELD2_VALUE) jsonMap.put(FIELD3_NAME, FIELD3_VALUE) checkJsonMap(NoSchemaConnectMongoConverter, jsonMap) } "Schema Connect Mongo Converter Bad Data" should "throw an exception" in { var exceptionThrown = false val badData = new util.HashMap[String, Object]() badData.put(FIELD1_NAME, FIELD1_VALUE) try { checkJsonMap(SchemaConnectMongoConverter, badData) } catch { case _ : java.lang.ClassCastException => exceptionThrown = true } exceptionThrown should be(true) } "Schema Connect Mongo Converter Good Data" should "convert data to json map" in { val data = new Struct(schema) .put(FIELD1_NAME, FIELD1_VALUE) .put(FIELD2_NAME, FIELD2_VALUE) .put(FIELD3_NAME, FIELD3_VALUE) checkJsonMap(SchemaConnectMongoConverter, data) } private def checkJsonMap(converter : ConnectMongoConverter, value: Object): Unit ={ val newJsonMap = converter.toJsonMap(value).toMap newJsonMap(FIELD1_NAME) should be(FIELD1_VALUE) newJsonMap(FIELD2_NAME) should be(FIELD2_VALUE) newJsonMap(FIELD3_NAME) should be(FIELD3_VALUE) } }
Example 57
Source File: HANASourceTaskConversionTest.scala From kafka-connect-sap with Apache License 2.0 | 5 votes |
package com.sap.kafka.connect.source import com.sap.kafka.client.MetaSchema import org.apache.kafka.connect.data.Schema.Type import org.apache.kafka.connect.data.{Field, Schema, Struct} import org.apache.kafka.connect.source.SourceRecord import scala.collection.JavaConverters._ class HANASourceTaskConversionTest extends HANASourceTaskTestBase { override def beforeAll(): Unit = { super.beforeAll() task.start(singleTableConfig()) } override def afterAll(): Unit = { task.stop() super.afterAll() } test("boolean type") { typeConversion(Schema.BOOLEAN_SCHEMA, true, java.lang.Boolean.FALSE, Schema.BOOLEAN_SCHEMA, java.lang.Boolean.FALSE) } test("int type") { typeConversion(Schema.INT32_SCHEMA, true, new java.lang.Integer(1), Schema.INT32_SCHEMA, new Integer(1)) } test("long type") { typeConversion(Schema.INT64_SCHEMA, true, new java.lang.Long(1), Schema.INT64_SCHEMA, new java.lang.Long(1)) } test("double type") { typeConversion(Schema.FLOAT64_SCHEMA, true, new java.lang.Double(1.0), Schema.FLOAT64_SCHEMA, new java.lang.Double(1.0)) } test("string type") { typeConversion(Schema.STRING_SCHEMA, true, "'a'", Schema.STRING_SCHEMA, "a") } private def typeConversion(sqlType: Schema, nullable: Boolean, sqlValue: Object, convertedSchema: Schema, convertedValue: Object): Unit = { val fields = Seq(new Field("id", 1, sqlType)) jdbcClient.createTable(Some("TEST"), "EMPLOYEES_SOURCE", MetaSchema(null, fields), 3000) val connection = jdbcClient.getConnection val stmt = connection.createStatement() stmt.execute("insert into \"TEST\".\"EMPLOYEES_SOURCE\" values(" + sqlValue.toString + ")") val records = task.poll() validateRecords(records.asScala.toList, convertedSchema, convertedValue) stmt.execute("drop table \"TEST\".\"EMPLOYEES_SOURCE\"") } private def validateRecords(records: List[SourceRecord], expectedFieldSchema: Schema, expectedValue: Object): Unit = { assert(records.size === 1) val objValue = records.head.value() assert(objValue.isInstanceOf[Struct]) val value = objValue.asInstanceOf[Struct] val schema = value.schema() assert(Type.STRUCT === schema.`type`()) val fields = schema.fields() assert(fields.size() === 1) val fieldSchema = fields.get(0).schema() assert(expectedFieldSchema === fieldSchema) assert(expectedValue === value.get(fields.get(0))) } }
Example 59
Source File: TableQuerier.scala From kafka-connect-sap with Apache License 2.0 | 5 votes |
package com.sap.kafka.connect.source.querier import com.sap.kafka.client.hana.HANAJdbcClient import com.sap.kafka.connect.config.{BaseConfig, BaseConfigConstants} import com.sap.kafka.connect.config.hana.HANAConfig import com.sap.kafka.utils.hana.HANAJdbcTypeConverter import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.source.SourceRecord import org.slf4j.LoggerFactory import scala.util.Random abstract class TableQuerier(mode: String, tableOrQuery: String, topic: String, config: BaseConfig, var jdbcClient: Option[HANAJdbcClient]) extends Comparable[TableQuerier] { var tableName: String = if (mode.equals(BaseConfigConstants.QUERY_MODE_TABLE)) tableOrQuery else null var query: String = if (mode.equals(BaseConfigConstants.QUERY_MODE_SQL)) tableOrQuery else null var lastUpdate: Long = 0 var schema: Schema = _ var queryString: Option[String] = None var resultList: Option[List[Struct]] = None val log = LoggerFactory.getLogger(getClass) def getLastUpdate(): Long = lastUpdate def getOrCreateQueryString(): Option[String] = { createQueryString() queryString } def createQueryString(): Unit def querying(): Boolean = resultList.isDefined def maybeStartQuery(): Unit = { if (resultList.isEmpty) { schema = getSchema() queryString = getOrCreateQueryString() val batchMaxRows = config.batchMaxRows resultList = getOrCreateJdbcClient().get.executeQuery(schema, queryString.get, 0, batchMaxRows) log.info(resultList.size.toString) } } def extractRecords(): List[SourceRecord] def close(now: Long): Unit = { resultList = None schema = null lastUpdate = now } protected def getOrCreateJdbcClient(): Option[HANAJdbcClient] = { if (jdbcClient.isDefined) { return jdbcClient } config match { case hanaConfig: HANAConfig => Some(HANAJdbcClient(hanaConfig)) case _ => throw new RuntimeException("Cannot create Jdbc Client") } } private def getSchema(): Schema = { mode match { case BaseConfigConstants.QUERY_MODE_TABLE => if (getOrCreateJdbcClient().get.isInstanceOf[HANAJdbcClient]) { val metadata = getOrCreateJdbcClient().get.getMetaData(tableOrQuery, None) HANAJdbcTypeConverter.convertHANAMetadataToSchema(tableName, metadata) } else { throw new RuntimeException("Jdbc Client is not available") } case BaseConfigConstants.QUERY_MODE_SQL => if (getOrCreateJdbcClient().get.isInstanceOf[HANAJdbcClient]) { val metadata = getOrCreateJdbcClient().get.getMetadata(tableOrQuery) HANAJdbcTypeConverter.convertHANAMetadataToSchema("Query" + Random.nextInt, metadata) } else { throw new RuntimeException("Jdbc Client is not available") } case _ => throw new RuntimeException("Other Query modes are not supported") } } override def compareTo(other: TableQuerier): Int = { if (this.lastUpdate < other.lastUpdate) { -1 } else if (this.lastUpdate > other.lastUpdate) { 1 } else { this.tableName.compareTo(other.tableName) } } }
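Subclasses implementing extractRecords ultimately have to turn each result row into a Struct matching the schema returned by getSchema. A rough standalone sketch of that mapping step, with a plain Map standing in for a JDBC row (the rowToStruct helper is hypothetical, not part of the connector):

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import scala.collection.JavaConverters._

object RowToStructSketch {
  // Hypothetical: one query row as column-name -> value, already in Connect-compatible types.
  def rowToStruct(schema: Schema, row: Map[String, AnyRef]): Struct =
    schema.fields().asScala.foldLeft(new Struct(schema)) { (struct, field) =>
      struct.put(field.name(), row.getOrElse(field.name(), null))
    }

  def main(args: Array[String]): Unit = {
    val schema = SchemaBuilder.struct().name("EMPLOYEES")
      .field("ID", Schema.INT32_SCHEMA)
      .field("NAME", Schema.OPTIONAL_STRING_SCHEMA)
      .build()
    val struct = rowToStruct(schema, Map("ID" -> Int.box(1), "NAME" -> "ann"))
    println(struct) // Struct{ID=1,NAME=ann}
  }
}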
Example 60
Source File: FieldValueGetter.scala From kafka-connect-kcql-smt with Apache License 2.0 | 5 votes |
package com.landoop.connect.sql import org.apache.kafka.connect.data.{Schema, Struct} trait FieldValueGetter { def get(value: Any, schema: Schema, path: Seq[String]): Option[Any] = { path.headOption.map { parent => schema.`type`() match { case Schema.Type.STRUCT => if (Option(value).isEmpty) None else fromRecord(value, schema, path) case Schema.Type.MAP => if (Option(value).isEmpty) None else fromMap(value, schema, path) case _ => throw new IllegalArgumentException(s"Can't select $parent field from schema:$schema") } }.getOrElse { schema.`type`() match { case Schema.Type.BOOLEAN | Schema.Type.FLOAT64 | Schema.Type.FLOAT32 | Schema.Type.INT64 | Schema.Type.INT32 | Schema.Type.INT16 | Schema.Type.INT8 | Schema.Type.BYTES | Schema.Type.STRING => Option(value) case Schema.Type.ARRAY | Schema.Type.MAP | Schema.Type.STRUCT => throw new IllegalArgumentException(s"Can't select an element from an array(schema:$schema)") case other => throw new IllegalArgumentException(s"Invalid Avro schema type:$other") } } } private def fromRecord(value: Any, schema: Schema, path: Seq[String]) = { val field = Option(schema.field(path.head)) .getOrElse(throw new IllegalArgumentException(s"Can't find field:${path.head} in schema:$schema")) val v = value.asInstanceOf[Struct].get(path.head) get(v, field.schema(), path.tail) } private def fromMap(value: Any, schema: Schema, path: Seq[String]) = { val field = Option(schema.field(path.head)) .getOrElse(throw new IllegalArgumentException(s"Can't find field:${path.head} in schema:$schema")) val v = value.asInstanceOf[Struct].get(path.head) get(v, field.schema(), path.tail) } }
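When the value is known to be a Struct, the path traversal above reduces to a short recursion over Struct.get and Schema.field. A standalone sketch (the field names are illustrative):

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object StructPathSketch {
  // Walk a path through nested Structs, returning None if any segment is missing or null.
  def get(struct: Struct, path: List[String]): Option[Any] = path match {
    case Nil => Some(struct)
    case head :: tail =>
      Option(struct.schema().field(head)).flatMap { _ =>
        Option(struct.get(head)).flatMap {
          case nested: Struct if tail.nonEmpty => get(nested, tail)
          case other if tail.isEmpty           => Some(other)
          case _                               => None
        }
      }
  }

  def main(args: Array[String]): Unit = {
    val inner  = SchemaBuilder.struct().field("city", Schema.STRING_SCHEMA).build()
    val outer  = SchemaBuilder.struct().field("name", Schema.STRING_SCHEMA).field("address", inner).build()
    val struct = new Struct(outer)
      .put("name", "alice")
      .put("address", new Struct(inner).put("city", "Lisbon"))
    println(get(struct, List("address", "city"))) // Some(Lisbon)
    println(get(struct, List("address", "zip")))  // None
  }
}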
Example 61
Source File: IotHubSourceTaskTest.scala From toketi-kafka-connect-iothub with MIT License | 5 votes |
// Copyright (c) Microsoft. All rights reserved. package com.microsoft.azure.iot.kafka.connect.source import java.time.{Duration, Instant} import java.util import com.microsoft.azure.iot.kafka.connect.source.testhelpers.{DeviceTemperature, MockDataReceiver, TestConfig, TestIotHubSourceTask} import org.apache.kafka.connect.data.Struct import org.json4s.jackson.Serialization.read import org.scalatest.{FlatSpec, GivenWhenThen} class IotHubSourceTaskTest extends FlatSpec with GivenWhenThen with JsonSerialization { "IotHubSourceTask poll" should "return a list of SourceRecords with the right format" in { Given("IotHubSourceTask instance") val iotHubSourceTask = new TestIotHubSourceTask iotHubSourceTask.start(TestConfig.sourceTaskTestProps) When("IotHubSourceTask.poll is called") val sourceRecords = iotHubSourceTask.poll() Then("It returns a list of SourceRecords") assert(sourceRecords != null) assert(sourceRecords.size() == 15) for (i <- 0 until 15) { val record = sourceRecords.get(i) assert(record.topic() == TestConfig.sourceTaskTestProps.get(IotHubSourceConfig.KafkaTopic)) assert(record.valueSchema() == IotMessageConverter.schema) val messageStruct = record.value().asInstanceOf[Struct] assert(messageStruct.getString("deviceId").startsWith("device")) assert(messageStruct.getString("contentType") == "temperature") val enqueuedTime = Instant.parse(messageStruct.getString("enqueuedTime")) assert(enqueuedTime.isAfter(Instant.parse("2016-11-20T00:00:00Z"))) val systemProperties = messageStruct.getMap[String, String]("systemProperties") assert(systemProperties != null) assert(systemProperties.get("sequenceNumber") != "") assert(systemProperties.get("correlationId") != "") val properties = messageStruct.getMap[String, String]("properties") assert(properties != null) assert(properties.get("timestamp") != "") val deviceTemperature = read[DeviceTemperature](messageStruct.get("content").asInstanceOf[String]) assert(deviceTemperature != null) assert(deviceTemperature.unit == "F") assert(deviceTemperature.value != 0) } } "IotHubSourceTask start" should "initialize all properties" in { Given("A list of properties for IotHubSourceTask") val props: util.Map[String, String] = TestConfig.sourceTaskTestProps When("IotHubSourceTask is started") val task = new TestIotHubSourceTask task.start(props) Then("Data receiver should be properly initialized") assert(task.partitionSources.length == 3) assert(!task.partitionSources.exists(s => s.dataReceiver == null)) for (ps ← task.partitionSources) { val dataReceiver = ps.dataReceiver.asInstanceOf[MockDataReceiver] assert(dataReceiver.offset.isDefined) assert(dataReceiver.startTime.isEmpty) assert(dataReceiver.connectionString != "") assert(dataReceiver.receiverConsumerGroup != "") assert(dataReceiver.receiveTimeout == Duration.ofSeconds(5)) } } it should "initialize start time correctly on the data receiver when it is passed in the config" in { Given("A list of properties with StartTime for IotHubSourceTask") val props: util.Map[String, String] = TestConfig.sourceTaskTestPropsStartTime When("IotHubSourceTask is started") val task = new TestIotHubSourceTask task.start(props) Then("Data receiver should be properly initialized, with StartTime, while Offsets value should be ignored") assert(task.partitionSources.length == 3) assert(!task.partitionSources.exists(s => s.dataReceiver == null)) for (ps ← task.partitionSources) { val dataReceiver = ps.dataReceiver.asInstanceOf[MockDataReceiver] assert(dataReceiver.offset.isEmpty) assert(dataReceiver.startTime.isDefined) 
assert(dataReceiver.startTime.get == Instant.parse("2016-12-10T00:00:00Z")) assert(dataReceiver.connectionString != "") assert(dataReceiver.receiverConsumerGroup != "") } } }
Example 62
Source File: IotMessageConverterTest.scala From toketi-kafka-connect-iothub with MIT License | 5 votes |
// Copyright (c) Microsoft. All rights reserved. package com.microsoft.azure.iot.kafka.connect.source import java.text.SimpleDateFormat import java.time.Instant import com.microsoft.azure.eventhubs.impl.AmqpConstants import com.microsoft.azure.iot.kafka.connect.source.testhelpers.DeviceTemperature import org.apache.kafka.connect.data.Struct import org.json4s.jackson.Serialization._ import org.scalatest.{FlatSpec, GivenWhenThen} import scala.collection.mutable import scala.util.Random class IotMessageConverterTest extends FlatSpec with GivenWhenThen with JsonSerialization { private val random: Random = new Random "IotMessage Converter" should "populate right values for kafka message struct fields" in { Given("IotMessage object") val deviceTemp = DeviceTemperature(100.01, "F") val deviceTempStr = write(deviceTemp) val sequenceNumber = random.nextLong() val correlationId = random.nextString(10) val offset = random.nextString(10) val enqueuedDate = new SimpleDateFormat("MM/dd/yyyy").parse("12/01/2016") val systemProperties = mutable.Map[String, Object]( "iothub-connection-device-id" → "device10", AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME → sequenceNumber.asInstanceOf[Object], AmqpConstants.AMQP_PROPERTY_CORRELATION_ID → correlationId, AmqpConstants.OFFSET_ANNOTATION_NAME → offset, AmqpConstants.ENQUEUED_TIME_UTC_ANNOTATION_NAME → enqueuedDate) val timestamp = Instant.now().toString val messageProperties = mutable.Map[String, Object]( "timestamp" → timestamp, "contentType" → "temperature" ) val iotMessage = IotMessage(deviceTempStr, systemProperties, messageProperties) When("getIotMessageStruct is called with IotMessage object") val kafkaMessageStruct: Struct = IotMessageConverter.getIotMessageStruct(iotMessage) Then("The struct has all the expected properties") assert(kafkaMessageStruct.getString("deviceId") == "device10") assert(kafkaMessageStruct.getString("offset") == offset) assert(kafkaMessageStruct.getString("contentType") == "temperature") assert(kafkaMessageStruct.getString("enqueuedTime") == enqueuedDate.toInstant.toString) assert(kafkaMessageStruct.getInt64("sequenceNumber") == sequenceNumber) assert(kafkaMessageStruct.getString("content") == deviceTempStr) val structSystemProperties = kafkaMessageStruct.getMap[String, String]("systemProperties") assert(structSystemProperties != null) assert(structSystemProperties.size == 1) assert(structSystemProperties.get(AmqpConstants.AMQP_PROPERTY_CORRELATION_ID) == correlationId) val structProperties = kafkaMessageStruct.getMap[String, String]("properties") assert(structProperties != null) assert(structProperties.size == 1) assert(structProperties.get("timestamp") == timestamp) } it should "use default values for missing properties" in { val deviceTemp = DeviceTemperature(100.01, "F") val deviceTempStr = write(deviceTemp) val systemProperties = mutable.Map.empty[String, Object] val messageProperties = mutable.Map.empty[String, Object] val iotMessage = IotMessage(deviceTempStr, systemProperties, messageProperties) When("getIotMessageStruct is called with IotMessage object") val kafkaMessageStruct: Struct = IotMessageConverter.getIotMessageStruct(iotMessage) Then("The struct has all the expected properties") assert(kafkaMessageStruct.getString("deviceId") == "") assert(kafkaMessageStruct.getString("offset") == "") assert(kafkaMessageStruct.getString("contentType") == "") assert(kafkaMessageStruct.getString("enqueuedTime") == "") assert(kafkaMessageStruct.getInt64("sequenceNumber") == 0) assert(kafkaMessageStruct.getString("content") == 
deviceTempStr) val structSystemProperties = kafkaMessageStruct.getMap[String, String]("systemProperties") assert(structSystemProperties != null) assert(structSystemProperties.size == 0) val structProperties = kafkaMessageStruct.getMap[String, String]("properties") assert(structProperties != null) assert(structProperties.size == 0) } }
Example 63
Source File: IotHubPartitionSource.scala From toketi-kafka-connect-iothub with MIT License | 5 votes |
// Copyright (c) Microsoft. All rights reserved. package com.microsoft.azure.iot.kafka.connect.source import java.util.{Collections, Map} import com.typesafe.scalalogging.LazyLogging import org.apache.kafka.connect.data.Struct import org.apache.kafka.connect.errors.ConnectException import org.apache.kafka.connect.source.SourceRecord import scala.collection.mutable.ListBuffer import scala.util.control.NonFatal class IotHubPartitionSource(val dataReceiver: DataReceiver, val partition: String, val topic: String, val batchSize: Int, val eventHubName: String, val sourcePartition: Map[String, String]) extends LazyLogging with JsonSerialization { def getRecords: List[SourceRecord] = { logger.debug(s"Polling for data from eventHub $eventHubName partition $partition") val list = ListBuffer.empty[SourceRecord] try { val messages: Iterable[IotMessage] = this.dataReceiver.receiveData(batchSize) if (messages.isEmpty) { logger.debug(s"Finished processing all messages from eventHub $eventHubName " + s"partition ${this.partition}") } else { logger.debug(s"Received ${messages.size} messages from eventHub $eventHubName " + s"partition ${this.partition} (requested $batchSize batch)") for (msg: IotMessage <- messages) { val kafkaMessage: Struct = IotMessageConverter.getIotMessageStruct(msg) val sourceOffset = Collections.singletonMap("EventHubOffset", kafkaMessage.getString(IotMessageConverter.offsetKey)) val sourceRecord = new SourceRecord(sourcePartition, sourceOffset, this.topic, kafkaMessage.schema(), kafkaMessage) list += sourceRecord } } } catch { case NonFatal(e) => val errorMsg = s"Error while getting SourceRecords for eventHub $eventHubName " + s"partition $partition. Exception - ${e.toString} Stack trace - ${e.printStackTrace()}" logger.error(errorMsg) throw new ConnectException(errorMsg, e) } logger.debug(s"Obtained ${list.length} SourceRecords from IotHub") list.toList } }
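The essential step above is wrapping each converted Struct in a SourceRecord together with a source-partition map and an offset map. A minimal sketch of that wrapping; the partition and offset keys used here are placeholders, not the connector's actual constants:

import java.util.Collections
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.source.SourceRecord

object StructToSourceRecordSketch {
  def main(args: Array[String]): Unit = {
    val schema = SchemaBuilder.struct().name("message")
      .field("deviceId", Schema.STRING_SCHEMA)
      .field("offset", Schema.STRING_SCHEMA)
      .build()
    val value = new Struct(schema).put("deviceId", "device1").put("offset", "42")

    // Placeholder keys; real connectors define their own partition/offset naming.
    val sourcePartition = Collections.singletonMap("EventHubPartition", "0")
    val sourceOffset    = Collections.singletonMap("EventHubOffset", value.getString("offset"))

    val record = new SourceRecord(sourcePartition, sourceOffset, "my-topic", value.schema(), value)
    println(record.topic() + " -> " + record.value())
  }
}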
Example 64
Source File: IotMessageConverter.scala From toketi-kafka-connect-iothub with MIT License | 5 votes |
// Copyright (c) Microsoft. All rights reserved. package com.microsoft.azure.iot.kafka.connect.source import java.time.Instant import java.util.Date import com.microsoft.azure.eventhubs.impl.AmqpConstants import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import scala.collection.JavaConverters._ import scala.reflect.ClassTag object IotMessageConverter { val offsetKey = "offset" private val schemaName = "iothub.kafka.connect" private val schemaVersion = 1 private val deviceIdKey = "deviceId" private val contentTypeKey = "contentType" private val sequenceNumberKey = "sequenceNumber" private val enqueuedTimeKey = "enqueuedTime" private val contentKey = "content" private val systemPropertiesKey = "systemProperties" private val propertiesKey = "properties" private val deviceIdIotHubKey = "iothub-connection-device-id" // Public for testing purposes lazy val schema: Schema = SchemaBuilder.struct() .name(schemaName) .version(schemaVersion) .field(deviceIdKey, Schema.STRING_SCHEMA) .field(offsetKey, Schema.STRING_SCHEMA) .field(contentTypeKey, Schema.OPTIONAL_STRING_SCHEMA) .field(enqueuedTimeKey, Schema.STRING_SCHEMA) .field(sequenceNumberKey, Schema.INT64_SCHEMA) .field(contentKey, Schema.STRING_SCHEMA) .field(systemPropertiesKey, propertiesMapSchema) .field(propertiesKey, propertiesMapSchema) private lazy val propertiesMapSchema: Schema = SchemaBuilder.map(Schema.STRING_SCHEMA, Schema.STRING_SCHEMA) def getIotMessageStruct(iotMessage: IotMessage): Struct = { val systemProperties = iotMessage.systemProperties val deviceId: String = getOrDefaultAndRemove(systemProperties, deviceIdIotHubKey, "") val offset: String = getOrDefaultAndRemove(systemProperties, AmqpConstants.OFFSET_ANNOTATION_NAME, "") val sequenceNumber: Long = getOrDefaultAndRemove(systemProperties, AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME, 0) val enqueuedTime: Option[Instant] = getEnqueuedTime(systemProperties) val enqueuedTimeStr = if(enqueuedTime.isDefined) enqueuedTime.get.toString else "" val properties = iotMessage.properties val contentType: String = getOrDefaultAndRemove(properties, contentTypeKey, "") val systemPropertiesMap = systemProperties.map(i => (i._1, i._2.toString)) new Struct(schema) .put(deviceIdKey, deviceId) .put(offsetKey, offset) .put(contentTypeKey, contentType) .put(enqueuedTimeKey, enqueuedTimeStr) .put(sequenceNumberKey, sequenceNumber) .put(contentKey, iotMessage.content) .put(systemPropertiesKey, systemPropertiesMap.asJava) .put(propertiesKey, properties.asJava) } private def getEnqueuedTime(map: scala.collection.mutable.Map[String, Object]): Option[Instant] = { val enqueuedTimeValue: Date = getOrDefaultAndRemove(map, AmqpConstants.ENQUEUED_TIME_UTC_ANNOTATION_NAME, null) if (enqueuedTimeValue != null) Some(enqueuedTimeValue.toInstant) else None } private def getOrDefaultAndRemove[T: ClassTag, S: ClassTag](map: scala.collection.mutable.Map[String, S], key: String, defaultVal: T): T = { if (map.contains(key)) { val retVal: T = map(key).asInstanceOf[T] map.remove(key) retVal } else { defaultVal } } }
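Downstream code reads the struct built above back with the typed getters on Struct. A short sketch against a cut-down version of the same schema (the field subset is chosen for brevity):

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import scala.collection.JavaConverters._

object TypedGetterSketch {
  def main(args: Array[String]): Unit = {
    val propsSchema = SchemaBuilder.map(Schema.STRING_SCHEMA, Schema.STRING_SCHEMA).build()
    val schema = SchemaBuilder.struct().name("iothub.message.slim")
      .field("deviceId", Schema.STRING_SCHEMA)
      .field("sequenceNumber", Schema.INT64_SCHEMA)
      .field("properties", propsSchema)
      .build()

    val struct = new Struct(schema)
      .put("deviceId", "device10")
      .put("sequenceNumber", 7L)
      .put("properties", Map("contentType" -> "temperature").asJava)

    val deviceId: String = struct.getString("deviceId")
    val seqNo: java.lang.Long = struct.getInt64("sequenceNumber")
    val props: java.util.Map[String, String] = struct.getMap[String, String]("properties")
    println(s"$deviceId #$seqNo ${props.get("contentType")}")
  }
}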
Example 65
Source File: SchemaSpec.scala From kafka-connect-cassandra with Apache License 2.0 | 5 votes |
package com.tuplejump.kafka.connect.cassandra import com.datastax.driver.core.{DataType, TestUtil} import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.sink.SinkRecord class SchemaSpec extends AbstractFlatSpec { it should "convert a struct schema with single field" in { val topic = "topicx" val sc = sinkConfig(topic, "keyspacex", "tablex", List("id")) sc.options.consistency should be (TaskConfig.DefaultSinkConsistency) sc.schema.columnNames should === (List("id")) sc.query.cql should be ("INSERT INTO keyspacex.tablex(id) VALUES(?)") val schema = SchemaBuilder.struct.name("record").version(1).field("id", Schema.INT32_SCHEMA).build val value = new Struct(schema).put("id", 1) val record = new SinkRecord(topic, 1, SchemaBuilder.struct.build, "key", schema, value, 0) sc.schema.route.topic should be (record.topic) sc.schema.route.keyspace should be ("keyspacex") sc.schema.route.table should be ("tablex") sc.schema is record should be (true) val query = record.as(sc.schema.namespace) query.cql should be("INSERT INTO keyspacex.tablex(id) VALUES(1)") } it should "convert a struct schema with multiple fields" in { val topic = "test_kfk" val sc = sinkConfig(topic, "keyspacex", "tablex", List("available", "name", "age")) val schema = SchemaBuilder.struct.name("record").version(1) .field("available", Schema.BOOLEAN_SCHEMA) .field("name", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA).build val value = new Struct(schema).put("name", "user").put("available", false).put("age", 15) val record = new SinkRecord("test_kfk", 1, SchemaBuilder.struct.build, "key", schema, value, 0) schema.asColumnNames should be (sc.schema.columnNames) sc.schema.route.topic should be (record.topic) sc.schema is record should be (true) sc.query.cql should be ("INSERT INTO keyspacex.tablex(available,name,age) VALUES(?,?,?)") val query = record.as(sc.schema.namespace) query.cql should be("INSERT INTO keyspacex.tablex(available,name,age) VALUES(false,'user',15)") } it should "convert cassandra column defs to a source schema" in { val colDef = Map( "id" -> DataType.cint(), "name" -> DataType.varchar()) val columns = TestUtil.getColumnDef(colDef) val expectedSchema = SchemaBuilder.struct() .field("id", Schema.INT32_SCHEMA) .field("name", Schema.STRING_SCHEMA).build() columns.asSchema should be(expectedSchema) } it should "convert kafka schema and struct to cassandra columns and schema mapping" in { import scala.collection.JavaConverters._ val topic = "a" val route = InternalConfig.Route(TaskConfig.SinkRoute + topic, "ks1.t1").get val schemaMap = new InternalConfig.Schema(route, Nil, Nil, Nil, List("available","name","age"), "") val schema = SchemaBuilder.struct.name("record").version(1) .field("available", Schema.BOOLEAN_SCHEMA) .field("name", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA).build val struct = new Struct(schema).put("name", "user").put("available", false).put("age", 15) val record = new SinkRecord(topic, 1, SchemaBuilder.struct.build, "key", schema, struct, 0) schema.asColumnNames should ===(schemaMap.columnNames) schemaMap.columnNames should ===(schema.fields.asScala.map(_.name).toList) schemaMap is record should be (true) } }
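The CQL strings asserted in the test are derived from the record schema's field list. A standalone sketch of that derivation using only the Connect API; the insertCql helper is illustrative, not the connector's actual query builder:

import org.apache.kafka.connect.data.{Schema, SchemaBuilder}
import scala.collection.JavaConverters._

object ColumnNamesSketch {
  // Illustrative only: derive column names and a parameterised CQL insert from a Connect schema.
  def insertCql(keyspace: String, table: String, schema: Schema): String = {
    val columns = schema.fields().asScala.map(_.name())
    s"INSERT INTO $keyspace.$table(${columns.mkString(",")}) VALUES(${columns.map(_ => "?").mkString(",")})"
  }

  def main(args: Array[String]): Unit = {
    val schema = SchemaBuilder.struct.name("record").version(1)
      .field("available", Schema.BOOLEAN_SCHEMA)
      .field("name", Schema.STRING_SCHEMA)
      .field("age", Schema.INT32_SCHEMA)
      .build
    println(insertCql("keyspacex", "tablex", schema))
    // INSERT INTO keyspacex.tablex(available,name,age) VALUES(?,?,?)
  }
}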
Example 66
Source File: ProjectionMapper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.mapper import cats.data.NonEmptyList import com.datamountaineer.kcql.Field import com.landoop.streamreactor.connect.hive.StructMapper import org.apache.kafka.connect.data.{SchemaBuilder, Struct} class ProjectionMapper(projection: NonEmptyList[Field]) extends StructMapper { override def map(input: Struct): Struct = { // the compatible output schema built from projected fields with aliases applied val builder = projection.foldLeft(SchemaBuilder.struct) { (builder, kcqlField) => Option(input.schema.field(kcqlField.getName)).fold(sys.error(s"Missing field $kcqlField")) { field => builder.field(kcqlField.getAlias, field.schema) } } val schema = builder.build() projection.foldLeft(new Struct(schema)) { (struct, field) => struct.put(field.getAlias, input.get(field.getName)) } } }
Example 67
Source File: DropPartitionValuesMapperTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.mapper import cats.data.NonEmptyList import com.landoop.streamreactor.connect.hive.{PartitionKey, PartitionPlan, TableName} import org.apache.kafka.connect.data.{SchemaBuilder, Struct} import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers import scala.collection.JavaConverters._ class DropPartitionValuesMapperTest extends AnyFunSuite with Matchers { test("strip partition values") { val schema = SchemaBuilder.struct() .field("a", SchemaBuilder.string().required().build()) .field("p", SchemaBuilder.string().required().build()) .field("q", SchemaBuilder.string().required().build()) .field("z", SchemaBuilder.string().required().build()) .build() val plan = PartitionPlan(TableName("foo"), NonEmptyList.of(PartitionKey("p"), PartitionKey("q"))) val struct = new Struct(schema).put("a", "a").put("p", "p").put("q", "q").put("z", "z") val output = new DropPartitionValuesMapper(plan).map(struct) output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "z") } test("handle partition field is missing in input") { val schema = SchemaBuilder.struct() .field("a", SchemaBuilder.string().required().build()) .field("q", SchemaBuilder.string().required().build()) .field("z", SchemaBuilder.string().required().build()) .build() val plan = PartitionPlan(TableName("foo"), NonEmptyList.of(PartitionKey("p"), PartitionKey("q"))) val struct = new Struct(schema).put("a", "a").put("q", "q").put("z", "z") val output = new DropPartitionValuesMapper(plan).map(struct) output.schema().fields().asScala.map(_.name) shouldBe Seq("a", "z") } }
Example 68
Source File: DefaultCommitPolicyTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.staging import com.landoop.streamreactor.connect.hive.{Offset, Topic, TopicPartitionOffset} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec import scala.concurrent.duration._ class DefaultCommitPolicyTest extends AnyWordSpec with Matchers { val schema: Schema = SchemaBuilder.struct() .field("name", SchemaBuilder.string().required().build()) .build() val struct = new Struct(schema) implicit val conf: Configuration = new Configuration() implicit val fs: LocalFileSystem = FileSystem.getLocal(conf) val tpo = TopicPartitionOffset(Topic("mytopic"), 1, Offset(100)) private def shouldFlush(policy: CommitPolicy, path: Path, count: Long) = { val status = fs.getFileStatus(path) policy.shouldFlush(CommitContext(tpo, path, count, status.getLen, status.getModificationTime)) } "DefaultCommitPolicy" should { "roll over after interval" in { val policy = DefaultCommitPolicy(None, Option(2.seconds), None) val path = new Path("foo") fs.create(path) shouldFlush(policy, path, 10) shouldBe false Thread.sleep(2000) shouldFlush(policy, path, 10) shouldBe true fs.delete(path, false) } "roll over after file count" in { val policy = DefaultCommitPolicy(None, None, Some(9)) val path = new Path("foo") fs.create(path) shouldFlush(policy, path, 7) shouldBe false shouldFlush(policy, path, 8) shouldBe false shouldFlush(policy, path, 9) shouldBe true shouldFlush(policy, path, 10) shouldBe true fs.delete(path, false) } "roll over after file size" in { val policy = DefaultCommitPolicy(Some(10), None, None) val path = new Path("foo") val out = fs.create(path) shouldFlush(policy, path, 7) shouldBe false out.writeBytes("wibble wobble wabble wubble") out.close() shouldFlush(policy, path, 9) shouldBe true fs.delete(path, false) } } }
Example 69
Source File: MapValueConverterTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.json.sql.JacksonJson import org.apache.kafka.connect.data.{Schema, Struct} import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers import scala.collection.JavaConverters._ class MapValueConverterTest extends AnyFunSuite with Matchers { test("converts nested payload") { val json = """ |{ | "idType": 3, | "colorDepth": "", | "threshold" : 45.77, | "evars": { | "evars": { | "eVar1": "Tue Aug 27 2019 12:08:10", | "eVar2": 156692207943934897 | } | }, | "exclude": { | "id": 0, | "value": false | } |} |""".stripMargin val map = JacksonJson.toMap[Any](json) val struct = MapValueConverter.convert(map) //Jackson transforming the json to Map the fields order is not retained struct.schema().fields().asScala.map(_.name()).sorted shouldBe List("idType", "colorDepth", "threshold", "evars", "exclude").sorted struct.schema().field("idType").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA struct.schema().field("colorDepth").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA struct.schema().field("threshold").schema() shouldBe Schema.OPTIONAL_FLOAT64_SCHEMA struct.schema().field("exclude").schema().`type`() shouldBe Schema.Type.STRUCT struct.schema().field("exclude").schema().isOptional shouldBe true struct.schema().field("evars").schema().`type`() shouldBe Schema.Type.STRUCT struct.schema().field("evars").schema().isOptional shouldBe true struct.schema().field("evars").schema().fields().asScala.map(_.name()) shouldBe List("evars") val evarsInner = struct.schema().field("evars").schema().field("evars") evarsInner.schema().`type`() shouldBe Schema.Type.STRUCT evarsInner.schema().isOptional shouldBe true evarsInner.schema().fields().asScala.map(_.name()).sorted shouldBe List("eVar1", "eVar2").sorted evarsInner.schema().field("eVar1").schema() shouldBe Schema.OPTIONAL_STRING_SCHEMA evarsInner.schema().field("eVar2").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA val exclude = struct.schema().field("exclude").schema() exclude.schema().`type`() shouldBe Schema.Type.STRUCT exclude.schema().isOptional shouldBe true exclude.schema().fields().asScala.map(_.name()).sorted shouldBe List("id", "value").sorted exclude.schema().field("id").schema() shouldBe Schema.OPTIONAL_INT64_SCHEMA exclude.schema().field("value").schema() shouldBe Schema.OPTIONAL_BOOLEAN_SCHEMA struct.get("idType") shouldBe 3L struct.get("colorDepth") shouldBe "" struct.get("threshold") shouldBe 45.77D val evarsStruct = struct.get("evars").asInstanceOf[Struct].get("evars").asInstanceOf[Struct] evarsStruct.get("eVar1") shouldBe "Tue Aug 27 2019 12:08:10" evarsStruct.get("eVar2") shouldBe 156692207943934897L val excludeStruct = struct.get("exclude").asInstanceOf[Struct] excludeStruct.get("id") shouldBe 0L excludeStruct.get("value") shouldBe false } }
Example 70
Source File: package.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.column.ParquetProperties import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter} package object parquet { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def listFiles(path: Path)(implicit fs: FileSystem): List[Path] = { if (fs.isDirectory(path)) { logger.debug(s"$path is a directory, reading constituent files") val remote = fs.listFiles(path, false) new Iterator[Path] { override def hasNext: Boolean = remote.hasNext override def next(): Path = remote.next().getPath }.toList } else { logger.debug(s"Reading $path as a single file") List(path) } } def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = { ParquetReader.builder(new StructReadSupport, file) .withConf(fs.getConf) .build() } def parquetWriter(path: Path, schema: Schema, config: ParquetSinkConfig): ParquetWriter[Struct] = { new StructParquetWriterBuilder(path, schema) .withCompressionCodec(config.compressionCodec) .withDictionaryEncoding(config.enableDictionary) .withValidation(config.validation) .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) .withWriteMode(if (config.overwrite) { ParquetFileWriter.Mode.OVERWRITE } else { ParquetFileWriter.Mode.CREATE }).build() } }
Example 71
Source File: StructWriteSupport.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.landoop.streamreactor.connect.hive._ import org.apache.hadoop.conf.Configuration import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext import org.apache.parquet.io.api.{Binary, RecordConsumer} import org.apache.parquet.schema.MessageType import scala.collection.JavaConverters._ // derived from Apache Spark's parquet write support, archive and license here: // https://github.com/apache/spark/blob/21a7bfd5c324e6c82152229f1394f26afeae771c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala class StructWriteSupport(schema: Schema) extends WriteSupport[Struct] { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) private val schemaName = if (schema.name() == null) "schema" else schema.name() private val parquetSchema: MessageType = ParquetSchemas.toParquetMessage(schema, schemaName) private val metadata = new java.util.HashMap[String, String]() metadata.put("written_by", "streamreactor") // The Parquet `RecordConsumer` to which all structs are written private var consumer: RecordConsumer = _ type ValueWriter = (Any) => Unit override def init(conf: Configuration): WriteSupport.WriteContext = new WriteSupport.WriteContext(parquetSchema, new java.util.HashMap[String, String]) override def finalizeWrite(): WriteSupport.FinalizedWriteContext = new FinalizedWriteContext(metadata) override def prepareForWrite(consumer: RecordConsumer): Unit = this.consumer = consumer override def write(struct: Struct): Unit = { writeMessage { writeStructFields(struct) } } private def writeStructFields(struct: Struct): Unit = { for ((field, index) <- struct.schema.fields.asScala.zipWithIndex) { val value = struct.get(field) if (value != null) { val writer = valueWriter(field.schema()) writeField(field.name, index) { writer(value) } } } } def valueWriter(schema: Schema): ValueWriter = { // todo perhaps introduce something like spark's SpecializedGetters schema.`type`() match { case Schema.Type.BOOLEAN => value => consumer.addBoolean(value.asInstanceOf[Boolean]) case Schema.Type.INT8 | Schema.Type.INT16 | Schema.Type.INT32 => value => consumer.addInteger(value.toString.toInt) case Schema.Type.INT64 => value => consumer.addLong(value.toString.toLong) case Schema.Type.STRING => value => consumer.addBinary(Binary.fromReusedByteArray(value.toString.getBytes)) case Schema.Type.FLOAT32 => value => consumer.addFloat(value.toString.toFloat) case Schema.Type.FLOAT64 => value => consumer.addDouble(value.toString.toDouble) case Schema.Type.STRUCT => value => { logger.debug(s"Writing nested struct") val struct = value.asInstanceOf[Struct] writeGroup { schema.fields.asScala .map { field => field -> struct.get(field) } .zipWithIndex.foreach { case ((field, v), k) => writeField(field.name, k) { valueWriter(field.schema)(v) } } } } case _ => throw UnsupportedSchemaType(schema.`type`.toString) } } private def writeMessage(f: => Unit): Unit = { consumer.startMessage() f consumer.endMessage() } private def writeGroup(f: => Unit): Unit = { consumer.startGroup() // consumer.startMessage() f //consumer.endMessage() consumer.endGroup() } private def writeField(name: String, k: Int)(f: => Unit): Unit = { consumer.startField(name, k) f consumer.endField(name, k) } }
Example 72
Source File: StructReadSupport.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import java.util import org.apache.hadoop.conf.Configuration import org.apache.kafka.connect.data.Struct import org.apache.parquet.hadoop.api.{InitContext, ReadSupport} import org.apache.parquet.io.api.RecordMaterializer import org.apache.parquet.schema.MessageType class StructReadSupport extends ReadSupport[Struct] { override def prepareForRead(configuration: Configuration, metaData: util.Map[String, String], fileSchema: MessageType, context: ReadSupport.ReadContext): RecordMaterializer[Struct] = { // the file schema in here comes from the footer of the parquet file val schema = ParquetSchemas.toKafka(fileSchema) new StructMaterializer(schema) } override def init(context: InitContext): ReadSupport.ReadContext = { new ReadSupport.ReadContext(context.getFileSchema) } }
Example 73
Source File: RootGroupConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.parquet import com.typesafe.scalalogging.StrictLogging import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.io.api.{Converter, GroupConverter} import scala.collection.JavaConverters._ class RootGroupConverter(schema: Schema) extends GroupConverter with StrictLogging { require(schema.`type`() == Schema.Type.STRUCT) var struct: Struct = _ private val builder = scala.collection.mutable.Map.empty[String, Any] private val converters = schema.fields.asScala.map(Converters.get(_, builder)).toIndexedSeq override def getConverter(k: Int): Converter = converters(k) override def start(): Unit = builder.clear() override def end(): Unit = struct = { val struct = new Struct(schema) schema.fields.asScala.map { field => val value = builder.getOrElse(field.name, null) try { struct.put(field, value) } catch { case t: Exception => throw t } } struct } }
Example 77
Source File: CassandraSinkTaskSpec.scala From kafka-connect-cassandra with Apache License 2.0 | 5 votes |
package com.tuplejump.kafka.connect.cassandra import scala.collection.JavaConverters._ import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.sink.{SinkRecord, SinkTaskContext} class CassandraSinkTaskSpec extends AbstractFlatSpec { val topicName = "test_kv_topic" val tableName = "test.kv" val config = sinkProperties(Map(topicName -> tableName)) it should "start sink task" in { val sinkTask = new CassandraSinkTask() val mockContext = mock[SinkTaskContext] sinkTask.initialize(mockContext) sinkTask.start(config.asJava) sinkTask.stop() } it should "save records in cassandra" in { val sinkTask = new CassandraSinkTask() val mockContext = mock[SinkTaskContext] sinkTask.initialize(mockContext) sinkTask.start(config.asJava) val valueSchema = SchemaBuilder.struct.name("record").version(1) .field("key", Schema.STRING_SCHEMA) .field("value", Schema.INT32_SCHEMA).build val value1 = new Struct(valueSchema).put("key", "pqr").put("value", 15) val value2 = new Struct(valueSchema).put("key", "abc").put("value", 17) val record1 = new SinkRecord(topicName, 1, SchemaBuilder.struct.build, "key", valueSchema, value1, 0) val record2 = new SinkRecord(topicName, 1, SchemaBuilder.struct.build, "key", valueSchema, value2, 0) sinkTask.put(List(record1, record2).asJavaCollection) sinkTask.stop() val cc = CassandraCluster.local val session = cc.session val result = session.execute(s"select count(1) from $tableName").one() val rowCount = result.getLong(0) rowCount should be(2) cc.shutdown() } }
Example 78
Source File: MetastoreSchemaAlignMapper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.mapper import com.landoop.streamreactor.connect.hive.StructMapper import org.apache.kafka.connect.data.{Schema, Struct} import scala.util.Try class MetastoreSchemaAlignMapper(schema: Schema) extends StructMapper { import scala.collection.JavaConverters._ override def map(input: Struct): Struct = { //hive converts everything to lowercase val inputFieldsMapping = input.schema().fields().asScala.map { f => f.name().toLowerCase() -> f.name() }.toMap val struct = schema.fields.asScala.foldLeft(new Struct(schema)) { (struct, field) => Try(input.get(inputFieldsMapping(field.name))).toOption match { case Some(value) => struct.put(field.name, value) case None if field.schema.isOptional => struct.put(field.name, null) case None => sys.error(s"Cannot map struct to required schema; ${field.name} is missing, no default value has been supplied and null is not permitted") } } struct } }
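The same realignment can be written against any target schema: copy the fields the input has, null out optional fields it lacks, and fail on missing required ones. A standalone sketch without the lower-casing step (the align helper is illustrative):

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import scala.collection.JavaConverters._

object SchemaAlignSketch {
  def align(input: Struct, target: Schema): Struct =
    target.fields().asScala.foldLeft(new Struct(target)) { (struct, field) =>
      Option(input.schema().field(field.name())) match {
        case Some(_)                           => struct.put(field.name(), input.get(field.name()))
        case None if field.schema().isOptional => struct.put(field.name(), null)
        case None                              => sys.error(s"Required field ${field.name()} missing from input")
      }
    }

  def main(args: Array[String]): Unit = {
    val inputSchema  = SchemaBuilder.struct().field("name", Schema.STRING_SCHEMA).build()
    val targetSchema = SchemaBuilder.struct()
      .field("name", Schema.STRING_SCHEMA)
      .field("city", Schema.OPTIONAL_STRING_SCHEMA)
      .build()
    val aligned = align(new Struct(inputSchema).put("name", "bob"), targetSchema)
    println(aligned) // Struct{name=bob} -- the optional city field stays null
  }
}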
Example 79
Source File: DropPartitionValuesMapper.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.mapper import com.landoop.streamreactor.connect.hive.{PartitionPlan, StructMapper} import org.apache.kafka.connect.data.{SchemaBuilder, Struct} class DropPartitionValuesMapper(plan: PartitionPlan) extends StructMapper { import scala.collection.JavaConverters._ override def map(input: Struct): Struct = { val partitionKeys = plan.keys.map(_.value).toList val dataFields = input.schema.fields().asScala.filterNot(field => partitionKeys.contains(field.name)) val builder = dataFields.foldLeft(SchemaBuilder.struct) { (builder, field) => builder.field(field.name, field.schema) } val schema = builder.build() dataFields.foldLeft(new Struct(schema)) { (struct, field) => struct.put(field.name, input.get(field.name)) } } }
Example 80
Source File: ValueConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.sink.SinkRecord import scala.collection.JavaConverters._ object ValueConverter { def apply(record: SinkRecord): Struct = record.value match { case struct: Struct => StructValueConverter.convert(struct) case map: Map[_, _] => MapValueConverter.convert(map) case map: java.util.Map[_, _] => MapValueConverter.convert(map.asScala.toMap) case string: String => StringValueConverter.convert(string) case other => sys.error(s"Unsupported record $other:${other.getClass.getCanonicalName}") } } trait ValueConverter[T] { def convert(value: T): Struct } object StructValueConverter extends ValueConverter[Struct] { override def convert(struct: Struct): Struct = struct } object MapValueConverter extends ValueConverter[Map[_, _]] { def convertValue(value: Any, key: String, builder: SchemaBuilder): Any = { value match { case s: String => builder.field(key, Schema.OPTIONAL_STRING_SCHEMA) s case l: Long => builder.field(key, Schema.OPTIONAL_INT64_SCHEMA) l case i: Int => builder.field(key, Schema.OPTIONAL_INT64_SCHEMA) i.toLong case b: Boolean => builder.field(key, Schema.OPTIONAL_BOOLEAN_SCHEMA) b case f: Float => builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA) f.toDouble case d: Double => builder.field(key, Schema.OPTIONAL_FLOAT64_SCHEMA) d case innerMap: java.util.Map[_, _] => val innerStruct = convert(innerMap.asScala.toMap, true) builder.field(key, innerStruct.schema()) innerStruct case innerMap: Map[_, _] => val innerStruct = convert(innerMap, true) builder.field(key, innerStruct.schema()) innerStruct } } def convert(map: Map[_, _], optional: Boolean) = { val builder = SchemaBuilder.struct() val values = map.map { case (k, v) => val key = k.toString val value = convertValue(v, key, builder) key -> value }.toList if (optional) builder.optional() val schema = builder.build val struct = new Struct(schema) values.foreach { case (key, value) => struct.put(key.toString, value) } struct } override def convert(map: Map[_, _]): Struct = convert(map, false) } object StringValueConverter extends ValueConverter[String] { override def convert(string: String): Struct = { val schema = SchemaBuilder.struct().field("a", Schema.OPTIONAL_STRING_SCHEMA).name("struct").build() new Struct(schema).put("a", string) } }
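A possible usage sketch for the converter above, assuming the stream-reactor hive sink module is on the classpath: a schemaless payload arrives as a java.util.Map and comes back as a Struct with an inferred, optional schema:

import org.apache.kafka.connect.data.Struct
import org.apache.kafka.connect.sink.SinkRecord
// Assumes the class shown above is available on the classpath.
import com.landoop.streamreactor.connect.hive.sink.ValueConverter

object ValueConverterUsageSketch {
  def main(args: Array[String]): Unit = {
    // A schemaless payload arriving as a java.util.Map (no value schema on the record).
    val payload = new java.util.HashMap[String, Any]()
    payload.put("id", 1)
    payload.put("name", "widget")

    val record = new SinkRecord("my-topic", 0, null, null, null, payload, 0L)
    val struct: Struct = ValueConverter(record)
    println(struct.schema().fields()) // id (optional int64) and name (optional string)
  }
}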
Example 81
Source File: HiveSinkState.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.streamreactor.connect.hive import com.landoop.streamreactor.connect.hive._ import com.landoop.streamreactor.connect.hive.sink.config.TableOptions import com.landoop.streamreactor.connect.hive.sink.mapper.{DropPartitionValuesMapper, MetastoreSchemaAlignMapper, ProjectionMapper} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.hadoop.hive.metastore.api.Table import org.apache.kafka.connect.data.{Schema, Struct} case class HiveSinkState(offsets: Map[TopicPartition, Offset], committedOffsets: Map[TopicPartition, Offset], table: Table, tableLocation: Path, plan: Option[PartitionPlan], metastoreSchema: Schema, mapper: Struct => Struct, lastSchema: Schema) { def withTopicPartitionOffset(tpo: TopicPartitionOffset): HiveSinkState = { copy(offsets = offsets + (tpo.toTopicPartition -> tpo.offset)) } def withTopicPartitionOffset(tp: TopicPartition, offset: Offset): HiveSinkState = { copy(offsets = offsets + (tp -> offset)) } def withCommittedOffset(offsets: Map[TopicPartition, Offset]): HiveSinkState = { copy(committedOffsets = committedOffsets ++ offsets) } def withCommittedOffset(tp: TopicPartition, offset: Offset): HiveSinkState = { copy(committedOffsets = committedOffsets + (tp -> offset)) } def withLastSchema(schema: Schema): HiveSinkState = copy(lastSchema = schema) } object HiveSinkState { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def from(schema: Schema, table: TableOptions, dbName: DatabaseName)(implicit client: IMetaStoreClient, fs: FileSystem) = { logger.info(s"Init sink for schema $schema") val hiveTable = getOrCreateTable(table, dbName, schema) val tableLocation = new Path(hiveTable.getSd.getLocation) val plan = hive.partitionPlan(hiveTable) val metastoreSchema = table.evolutionPolicy .evolve(dbName, table.tableName, HiveSchemas.toKafka(hiveTable), schema) .getOrElse(sys.error(s"Unable to retrieve or evolve schema for $schema")) val mapperFns: Seq[Struct => Struct] = Seq( table.projection.map(new ProjectionMapper(_)), Some(new MetastoreSchemaAlignMapper(metastoreSchema)), plan.map(new DropPartitionValuesMapper(_)) ).flatten.map(mapper => mapper.map _) val mapper = Function.chain(mapperFns) HiveSinkState(Map.empty, Map.empty, hiveTable, tableLocation, plan, metastoreSchema, mapper, schema) } def getOrCreateTable(table: TableOptions, dbName: DatabaseName, schema: Schema) (implicit client: IMetaStoreClient, fs: FileSystem): Table = { def create: Table = { val partstring = if (table.partitions.isEmpty) "<no-partitions>" else table.partitions.mkString(",") logger.info(s"Creating table in hive [${dbName.value}.${table.tableName.value}, partitions=$partstring]") hive.createTable(dbName, table.tableName, schema, table.partitions, table.location, table.format) } logger.debug(s"Fetching or creating table ${dbName.value}.${table.tableName.value}") client.tableExists(dbName.value, table.tableName.value) match { case true if table.overwriteTable => hive.dropTable(dbName, table.tableName, true) create case true => client.getTable(dbName.value, table.tableName.value) case false if table.createTable => create case false => throw new RuntimeException(s"Table ${dbName.value}.${table.tableName.value} does not exist") } } }
Example 82
Source File: ParquetHiveFormat.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.formats import com.landoop.streamreactor.connect.hive.Serde import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.permission.FsPermission import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.ParquetWriter import scala.util.Try object ParquetHiveFormat extends HiveFormat { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def serde = Serde( "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", Map("serialization.format" -> "1") ) override def writer(path: Path, schema: Schema) (implicit fs: FileSystem): HiveWriter = new HiveWriter { logger.debug(s"Creating parquet writer at $path") val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx"))) val createdTime: Long = System.currentTimeMillis() var lastKnownFileSize: Long = fs.getFileStatus(path).getLen var readFileSize = false var count = 0 override def write(struct: Struct): Long = { writer.write(struct) count = count + 1 readFileSize = true count } override def close(): Unit = { logger.debug(s"Closing writer at path $path") writer.close() } override def currentCount: Long = count override def file: Path = path override def fileSize: Long = { if (readFileSize) { lastKnownFileSize = fs.getFileStatus(path).getLen readFileSize = false } lastKnownFileSize } } override def reader(path: Path, startAt: Int, schema: Schema) (implicit fs: FileSystem): HiveReader = new HiveReader { logger.debug(s"Creating parquet reader for $path with offset $startAt") val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path) var offset = startAt override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct => val record = Record(struct, path, offset) offset = offset + 1 record } override def close(): Unit = reader.close() } }
Example 83
Source File: SinkRecordParser.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.influx.converters import com.datamountaineer.streamreactor.connect.influx.helpers.Util import com.datamountaineer.streamreactor.connect.influx.writers.KcqlDetails.Path import com.datamountaineer.streamreactor.connect.influx.writers.ValuesExtractor import com.fasterxml.jackson.databind.JsonNode import com.landoop.json.sql.JacksonJson import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.kafka.connect.sink.SinkRecord import scala.util.Try object SinkRecordParser { type Field = String trait ParsedSinkRecord { def valueFields(ignored: Set[Path]): Seq[(String, Any)] def field(path: Path): Option[Any] } trait ParsedKeyValueSinkRecord extends ParsedSinkRecord { def keyFields(ignored: Set[Path]): Seq[(String, Any)] } private case class JsonSinkRecord(json: JsonNode) extends ParsedSinkRecord { override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = ValuesExtractor.extractAllFields(json, ignored.map(_.value.last)) override def field(path: Path): Option[Any] = Option(ValuesExtractor.extract(json, path.value)) } private case class StructSinkRecord(struct: Struct) extends ParsedSinkRecord { override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = ValuesExtractor.extractAllFields(struct, ignored.map(_.value.last)) override def field(path: Path): Option[Any] = Option(ValuesExtractor.extract(struct, path.value)) } private case class MapSinkRecord(map: java.util.Map[String, Any]) extends ParsedSinkRecord { override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = ValuesExtractor.extractAllFields(map, ignored.map(_.value.last)) override def field(path: Path): Option[Any] = Option(ValuesExtractor.extract(map, path.value)) } private case class KeyValueRecord(key: ParsedSinkRecord, value: ParsedSinkRecord) extends ParsedKeyValueSinkRecord { override def valueFields(ignored: Set[Path]): Seq[(String, Any)] = value.valueFields(ignored) override def field(path: Path): Option[Any] = path.value.headOption match { case Some(fieldName) if Util.caseInsensitiveComparison(fieldName, Util.KEY_CONSTANT) => key.field(Path(path.value.tail)) case Some(_) => value.field(path) case None => throw new IllegalArgumentException("Unreachable situation detected. 
Path should never be empty") } override def keyFields(ignored: Set[Path]): Seq[(String, Any)] = key.valueFields(ignored) } def build(record: SinkRecord): Try[ParsedKeyValueSinkRecord] = { val key = Option(record.keySchema()).map(_.`type`()) match { case Some(Schema.Type.STRING) => Try(JsonSinkRecord(JacksonJson.asJson(record.key().asInstanceOf[String]))) case Some(Schema.Type.STRUCT) => Try(StructSinkRecord(record.key().asInstanceOf[Struct])) case None => Try(MapSinkRecord(record.key().asInstanceOf[java.util.Map[String, Any]])) } val value = Option(record.valueSchema()).map(_.`type`()) match { case Some(Schema.Type.STRING) => Try(require(record.value() != null && record.value().getClass == classOf[String], "The SinkRecord payload should be of type String")).flatMap(_ => Try(JsonSinkRecord(JacksonJson.asJson(record.value().asInstanceOf[String])))) case Some(Schema.Type.STRUCT) => Try(require(record.value() != null && record.value().getClass == classOf[Struct], "The SinkRecord payload should be of type Struct")).flatMap(_ => Try(StructSinkRecord(record.value().asInstanceOf[Struct]))) case None => Try(require(record.value() != null && record.value().isInstanceOf[java.util.Map[_, _]], "The SinkRecord payload should be of type java.util.Map[String, Any]")).flatMap(_ => Try(MapSinkRecord(record.value().asInstanceOf[java.util.Map[String, Any]]))) } key .flatMap(key => value.map(key -> _)) .map { case (k, v) => KeyValueRecord(k, v) } } }
Example 84
Source File: StructFieldsRowKeyBuilderTest.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._ import org.apache.hadoop.hbase.util.Bytes import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct} import org.apache.kafka.connect.sink.SinkRecord import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec class StructFieldsRowKeyBuilderTest extends AnyWordSpec with Matchers { "StructFieldsRowKeyBuilder" should { "raise an exception if the field is not present in the struct" in { intercept[IllegalArgumentException] { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema).put("firstName", "Alex").put("age", 30) val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1) //val field = Field("threshold", "threshold", false) StructFieldsRowKeyBuilderBytes(List("threshold")).build(sinkRecord, null) } } "create the row key based on one single field in the struct" in { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema).put("firstName", "Alex").put("age", 30) //val field = Field("firstName", "firstName", true) val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1) StructFieldsRowKeyBuilderBytes(List("firstName")).build(sinkRecord, null) shouldBe "Alex".fromString } "create the row key based on more thant one field in the struct" in { val schema = SchemaBuilder.struct().name("com.example.Person") .field("firstName", Schema.STRING_SCHEMA) .field("age", Schema.INT32_SCHEMA) .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build() val struct = new Struct(schema).put("firstName", "Alex").put("age", 30) //val field = Field("firstName", "firstName", true) //val field2 = Field("age", "age", true) val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1) StructFieldsRowKeyBuilderBytes(List("firstName", "age")).build(sinkRecord, null) shouldBe Bytes.add("Alex".fromString(), "\n".fromString(), 30.fromInt()) } } }
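The builder under test concatenates the chosen field values, with a separator, to form the row key bytes. A simplified string-based sketch of the same idea (the rowKey helper is illustrative, not the connector's builder):

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

object RowKeySketch {
  // Illustrative: join selected field values with a delimiter to form a row key.
  def rowKey(struct: Struct, keyFields: Seq[String], delimiter: String = "\n"): String =
    keyFields.map { name =>
      require(struct.schema().field(name) != null, s"$name is not a field of ${struct.schema().name()}")
      String.valueOf(struct.get(name))
    }.mkString(delimiter)

  def main(args: Array[String]): Unit = {
    val schema = SchemaBuilder.struct().name("com.example.Person")
      .field("firstName", Schema.STRING_SCHEMA)
      .field("age", Schema.INT32_SCHEMA)
      .build()
    val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)
    println(rowKey(struct, Seq("firstName", "age"))) // prints "Alex" and "30" separated by the delimiter
  }
}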
Example 85
Source File: ObjectMessageConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.jms.sink.converters

import com.datamountaineer.streamreactor.connect.jms.config.JMSSetting
import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import javax.jms.{ObjectMessage, Session}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

class ObjectMessageConverter extends JMSMessageConverter with ConverterUtil {
  override def convert(record: SinkRecord, session: Session, setting: JMSSetting): (String, ObjectMessage) = {
    val converted = super[ConverterUtil].convert(record, setting.fields, setting.ignoreField)
    val msg = session.createObjectMessage()
    val value = converted.value()
    val schema = converted.valueSchema()
    schema.`type`() match {
      case Schema.Type.STRUCT =>
        val struct = value.asInstanceOf[Struct]
        struct.schema().fields().asScala.foreach { f =>
          ObjectMessageConverterFn(f.name(), struct.get(f), f.schema(), msg, session)
        }
      case _ =>
        ObjectMessageConverterFn("field", value, schema, msg, session)
    }
    (setting.source, msg)
  }
}

object ObjectMessageConverterFn {
  def apply(fieldName: String, value: AnyRef, schema: Schema, msg: ObjectMessage, session: Session): Unit = {
    schema.`type`() match {
      case Schema.Type.BYTES => msg.setObjectProperty(fieldName, value.asInstanceOf[Array[Byte]].toList.asJava)
      case Schema.Type.BOOLEAN => msg.setBooleanProperty(fieldName, value.asInstanceOf[Boolean])
      case Schema.Type.FLOAT32 => msg.setFloatProperty(fieldName, value.asInstanceOf[Float])
      case Schema.Type.FLOAT64 => msg.setDoubleProperty(fieldName, value.asInstanceOf[Double])
      case Schema.Type.INT8 => msg.setByteProperty(fieldName, value.asInstanceOf[Byte])
      case Schema.Type.INT16 => msg.setShortProperty(fieldName, value.asInstanceOf[Short])
      case Schema.Type.INT32 => msg.setIntProperty(fieldName, value.asInstanceOf[Int])
      case Schema.Type.INT64 => msg.setLongProperty(fieldName, value.asInstanceOf[Long])
      case Schema.Type.STRING => msg.setStringProperty(fieldName, value.asInstanceOf[String])
      case Schema.Type.MAP => msg.setObjectProperty(fieldName, value)
      case Schema.Type.ARRAY => msg.setObjectProperty(fieldName, value)
      case Schema.Type.STRUCT =>
        val nestedMsg = session.createObjectMessage()
        val struct = value.asInstanceOf[Struct]
        struct.schema().fields().asScala.foreach { f =>
          ObjectMessageConverterFn(f.name(), struct.get(f), f.schema(), nestedMsg, session)
        }
        msg.setObjectProperty(fieldName, nestedMsg)
    }
  }
}
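The converter's core move is walking the Struct's fields and turning each one into a named message property, recursing when a field is itself a STRUCT. The sketch below is an illustrative stand-in (not the connector code) that performs the same traversal without a JMS session, flattening a nested Struct into nested Scala Maps keyed by field name.

import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

import scala.collection.JavaConverters._

object StructFlattenSketch {
  // Nested Structs become nested Maps keyed by field name;
  // every other value is passed through unchanged.
  def toProps(struct: Struct): Map[String, Any] =
    struct.schema().fields().asScala.map { f =>
      f.name() -> (f.schema().`type`() match {
        case Schema.Type.STRUCT => toProps(struct.get(f).asInstanceOf[Struct])
        case _                  => struct.get(f)
      })
    }.toMap

  def main(args: Array[String]): Unit = {
    val inner  = SchemaBuilder.struct().field("city", Schema.STRING_SCHEMA).build()
    val schema = SchemaBuilder.struct()
      .field("name", Schema.STRING_SCHEMA)
      .field("address", inner)
      .build()
    val struct = new Struct(schema)
      .put("name", "Alex")
      .put("address", new Struct(inner).put("city", "London"))
    // Prints something like Map(name -> Alex, address -> Map(city -> London))
    println(toProps(struct))
  }
}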
Example 86
Source File: MapMessageConverter.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.jms.sink.converters

import com.datamountaineer.streamreactor.connect.jms.config.JMSSetting
import com.datamountaineer.streamreactor.connect.schemas.ConverterUtil
import javax.jms.{MapMessage, Session}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.sink.SinkRecord

import scala.collection.JavaConverters._

class MapMessageConverter extends JMSMessageConverter with ConverterUtil {
  override def convert(record: SinkRecord, session: Session, setting: JMSSetting): (String, MapMessage) = {
    val converted = super[ConverterUtil].convert(record, setting.fields, setting.ignoreField)
    val msg = session.createMapMessage()
    val value = converted.value()
    val schema = converted.valueSchema()
    schema.`type`() match {
      case Schema.Type.STRUCT =>
        val struct = value.asInstanceOf[Struct]
        struct.schema().fields().asScala.foreach { f =>
          MapMessageBuilderFn(f.name(), struct.get(f), f.schema(), msg, session)
        }
      case _ =>
        MapMessageBuilderFn("field", value, schema, msg, session)
    }
    (setting.source, msg)
  }
}

object MapMessageBuilderFn {
  def apply(fieldName: String, value: AnyRef, schema: Schema, msg: MapMessage, session: Session): Unit = {
    schema.`type`() match {
      case Schema.Type.BYTES => msg.setBytes(fieldName, value.asInstanceOf[Array[Byte]])
      case Schema.Type.BOOLEAN => msg.setBoolean(fieldName, value.asInstanceOf[Boolean])
      case Schema.Type.FLOAT32 => msg.setFloat(fieldName, value.asInstanceOf[Float])
      case Schema.Type.FLOAT64 => msg.setDouble(fieldName, value.asInstanceOf[Double])
      case Schema.Type.INT8 => msg.setByte(fieldName, value.asInstanceOf[Byte])
      case Schema.Type.INT16 => msg.setShort(fieldName, value.asInstanceOf[Short])
      case Schema.Type.INT32 => msg.setInt(fieldName, value.asInstanceOf[Int])
      case Schema.Type.INT64 => msg.setLong(fieldName, value.asInstanceOf[Long])
      case Schema.Type.STRING => msg.setString(fieldName, value.asInstanceOf[String])
      case Schema.Type.MAP => msg.setObject(fieldName, value)
      case Schema.Type.ARRAY => msg.setObject(fieldName, value)
      case Schema.Type.STRUCT =>
        val nestedMsg = session.createMapMessage()
        val struct = value.asInstanceOf[Struct]
        struct.schema().fields().asScala.foreach { f =>
          MapMessageBuilderFn(f.name(), struct.get(f), f.schema(), nestedMsg, session)
        }
        msg.setObject(fieldName, nestedMsg)
    }
  }
}
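As a hedged usage sketch only: how a sink task might drive this converter. The JMSSetting construction is connector-specific and omitted, and `producers` is a hypothetical map from the setting's source/destination name to an open MessageProducer; none of this is part of the connector code above.

import com.datamountaineer.streamreactor.connect.jms.config.JMSSetting
import com.datamountaineer.streamreactor.connect.jms.sink.converters.MapMessageConverter
import javax.jms.{MessageProducer, Session}
import org.apache.kafka.connect.sink.SinkRecord

object MapMessageSinkSketch {
  // Convert each incoming record to a MapMessage and send it on the producer
  // registered for the name returned alongside the converted message.
  def write(records: Iterable[SinkRecord],
            session: Session,
            producers: Map[String, MessageProducer],
            setting: JMSSetting): Unit = {
    val converter = new MapMessageConverter()
    records.foreach { record =>
      val (source, message) = converter.convert(record, session, setting)
      producers(source).send(message)
    }
  }
}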