org.apache.avro.generic.GenericRecord Scala Examples

The following examples show how to use org.apache.avro.generic.GenericRecord. Each example is taken from an open-source Scala project; the source file, originating project, and license are noted above the code.
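
Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the "User" schema and field names are purely illustrative) of the core GenericRecord workflow the examples build on: define a schema, populate a GenericData.Record, and round-trip it through Avro binary encoding.

import java.io.ByteArrayOutputStream

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

object GenericRecordRoundTrip extends App {
  // Build a record schema programmatically (equivalent to parsing the JSON schema definition).
  val schema = SchemaBuilder
    .record("User").namespace("example")
    .fields()
    .requiredString("name")
    .requiredInt("age")
    .endRecord()

  // GenericData.Record is the standard mutable GenericRecord implementation.
  val user: GenericRecord = new GenericData.Record(schema)
  user.put("name", "jane")
  user.put("age", 42)

  // Serialize to Avro binary with a GenericDatumWriter.
  val out = new ByteArrayOutputStream()
  val writer = new GenericDatumWriter[GenericRecord](schema)
  val encoder = EncoderFactory.get().binaryEncoder(out, null)
  writer.write(user, encoder)
  encoder.flush()

  // Deserialize back into a GenericRecord with a GenericDatumReader and read fields by name.
  val reader = new GenericDatumReader[GenericRecord](schema)
  val decoder = DecoderFactory.get().binaryDecoder(out.toByteArray, null)
  val decoded = reader.read(null, decoder)
  println(decoded.get("name")) // Avro strings decode as org.apache.avro.util.Utf8
  println(decoded.get("age"))  // 42
}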
Example 1
Source File: AvroParquetSourceTest.scala    From eel-sdk   with Apache License 2.0
package io.eels.component.parquet

import java.nio.file.Paths

import io.eels.component.parquet.avro.AvroParquetSource
import io.eels.component.parquet.util.ParquetLogMute
import io.eels.schema._
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.avro.AvroParquetWriter
import org.scalatest.{Matchers, WordSpec}

class AvroParquetSourceTest extends WordSpec with Matchers {
  ParquetLogMute()

  private implicit val conf = new Configuration()
  private implicit val fs = FileSystem.get(conf)

  private val personFile = Paths.get(getClass.getResource("/io/eels/component/parquet/person.avro.pq").toURI)
  private val resourcesDir = personFile.getParent

  "AvroParquetSource" should {
    "read schema" in {
      val people = AvroParquetSource(personFile)
      people.schema shouldBe StructType(
        Field("name", StringType, nullable = false),
        Field("job", StringType, nullable = false),
        Field("location", StringType, nullable = false)
      )
    }
    "read parquet files" in {
      val people = AvroParquetSource(personFile.toAbsolutePath()).toDataStream().toSet.map(_.values)
      people shouldBe Set(
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    "read multiple parquet files using file expansion" in {
      import io.eels.FilePattern._
      val people = AvroParquetSource(s"${resourcesDir.toUri.toString}/*.pq").toDataStream().toSet.map(_.values)
      people shouldBe Set(
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner"),
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    // todo add merge to parquet source
    "merge schemas" ignore {

      try {
        fs.delete(new Path("merge1.pq"), false)
      } catch {
        case t: Throwable =>
      }
      try {
        fs.delete(new Path("merge2.pq"), false)
      } catch {
        case t: Throwable =>
      }

      val schema1 = SchemaBuilder.builder().record("schema1").fields().requiredString("a").requiredDouble("b").endRecord()
      val schema2 = SchemaBuilder.builder().record("schema2").fields().requiredInt("a").requiredBoolean("c").endRecord()

      val writer1 = AvroParquetWriter.builder[GenericRecord](new Path("merge1.pq")).withSchema(schema1).build()
      val record1 = new GenericData.Record(schema1)
      record1.put("a", "aaaaa")
      record1.put("b", 124.3)
      writer1.write(record1)
      writer1.close()

      val writer2 = AvroParquetWriter.builder[GenericRecord](new Path("merge2.pq")).withSchema(schema2).build()
      val record2 = new GenericData.Record(schema2)
      record2.put("a", 111)
      record2.put("c", true)
      writer2.write(record2)
      writer2.close()

      ParquetSource(new Path("merge*")).schema shouldBe
        StructType(
          Field("a", StringType, nullable = false),
          Field("b", DoubleType, nullable = false),
          Field("c", BooleanType, nullable = false)
        )

      fs.delete(new Path(".merge1.pq.crc"), false)
      fs.delete(new Path(".merge2.pq.crc"), false)
      fs.delete(new Path("merge1.pq"), false)
      fs.delete(new Path("merge2.pq"), false)
    }
  }
} 
Example 2
Source File: FieldMapperEncoderTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.record.encoder

import com.sksamuel.avro4s.{Encoder, SchemaFor, SnakeCase}
import org.apache.avro.generic.GenericRecord
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

class FieldMapperEncoderTest extends AnyFunSuite with Matchers {

  test("adding an in scope FieldMapper should overide the fields in an encoder") {
    implicit val fieldMapper = SnakeCase
    val schema: SchemaFor[NamingTest] = SchemaFor[NamingTest]
    val encoder = Encoder[NamingTest]
    val record = encoder.encode(NamingTest("Foo")).asInstanceOf[GenericRecord]
    record.get("camel_case")
  }

}

case class NamingTest(camelCase: String) 
Example 3
Source File: Codecs.scala    From embedded-kafka-schema-registry   with MIT License
package net.manub.embeddedkafka.schemaregistry.avro

import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.SpecificRecord
import org.apache.kafka.clients.consumer.ConsumerRecord

@deprecated(
  "Avro-related classes will be removed soon",
  since = "5.5.0"
)
object Codecs {
  implicit def stringKeyAvroValueCrDecoder[V <: SpecificRecord]
      : ConsumerRecord[String, V] => (String, V) =
    cr => (cr.key, cr.value)
  implicit def avroValueCrDecoder[V <: SpecificRecord]
      : ConsumerRecord[String, V] => V =
    _.value
  implicit def stringKeyAvroValueTopicCrDecoder[V <: SpecificRecord]
      : ConsumerRecord[String, V] => (String, String, V) =
    cr => (cr.topic, cr.key, cr.value)

  implicit def stringKeyGenericValueCrDecoder
      : ConsumerRecord[String, GenericRecord] => (String, GenericRecord) =
    cr => (cr.key, cr.value)

  implicit def genericKeyGenericValueCrDecoder
      : ConsumerRecord[GenericRecord, GenericRecord] => (
          GenericRecord,
          GenericRecord
      ) =
    cr => (cr.key, cr.value)
} 
Example 4
Source File: AvroSerdes.scala    From embedded-kafka-schema-registry   with MIT License
package net.manub.embeddedkafka.schemaregistry.avro

import io.confluent.kafka.serializers.{
  AbstractKafkaSchemaSerDeConfig,
  KafkaAvroDeserializerConfig,
  KafkaAvroDeserializer => ConfluentKafkaAvroDeserializer,
  KafkaAvroSerializer => ConfluentKafkaAvroSerializer
}
import net.manub.embeddedkafka.schemaregistry.EmbeddedKafkaConfig
import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.SpecificRecord
import org.apache.kafka.common.serialization.{Serde, Serdes}

import scala.jdk.CollectionConverters._

@deprecated(
  "Avro-related classes will be removed soon",
  since = "5.5.0"
)
object AvroSerdes {

  protected def configForSchemaRegistry(
      implicit config: EmbeddedKafkaConfig
  ): Map[String, Object] =
    Map(
      AbstractKafkaSchemaSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG -> s"http://localhost:${config.schemaRegistryPort}"
    )

  protected def specificAvroReaderConfigForSchemaRegistry(
      implicit config: EmbeddedKafkaConfig
  ): Map[String, Object] =
    configForSchemaRegistry ++ Map(
      KafkaAvroDeserializerConfig.SPECIFIC_AVRO_READER_CONFIG -> true.toString
    )

  def specific[T <: SpecificRecord](
      isKey: Boolean = false,
      extraConfig: Map[String, Object] = Map.empty
  )(
      implicit config: EmbeddedKafkaConfig
  ): Serde[T] =
    serdeFrom[T](
      configForSchemaRegistry ++ extraConfig,
      specificAvroReaderConfigForSchemaRegistry ++ extraConfig, //need this to support SpecificRecord
      isKey
    )

  def generic(
      isKey: Boolean = false,
      extraConfig: Map[String, Object] = Map.empty
  )(
      implicit config: EmbeddedKafkaConfig
  ): Serde[GenericRecord] =
    serdeFrom[GenericRecord](
      configForSchemaRegistry ++ extraConfig,
      configForSchemaRegistry ++ extraConfig,
      isKey
    )

  private def serdeFrom[T](
      serConfig: Map[String, Object],
      deserConfig: Map[String, Object],
      isKey: Boolean
  ): Serde[T] = {
    val ser = new ConfluentKafkaAvroSerializer
    ser.configure(serConfig.asJava, isKey)
    val deser = new ConfluentKafkaAvroDeserializer
    deser.configure(deserConfig.asJava, isKey)

    Serdes.serdeFrom(ser, deser).asInstanceOf[Serde[T]]
  }
} 
Example 5
Source File: AvroCodecsSpecification.scala    From kafka-scala-api   with Apache License 2.0
package com.example.avro

import org.scalatest._
import com.twitter.bijection.Injection
import com.twitter.bijection.avro.GenericAvroCodecs
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}

class GenericAvroCodecsSpecification extends WordSpec with Matchers {
  val testSchema = new Schema.Parser().parse("""{
                                                   "type":"record",
                                                   "name":"FiscalRecord",
                                                   "namespace":"avro",
                                                   "fields":[
                                                      {
                                                         "name":"calendarDate",
                                                         "type":"string"
                                                      },
                                                      {
                                                         "name":"fiscalWeek",
                                                         "type":[
                                                            "int",
                                                            "null"
                                                         ]
                                                      },
                                                      {
                                                         "name":"fiscalYear",
                                                         "type":[
                                                            "int",
                                                            "null"
                                                         ]
                                                      }
                                                   ]
                                                }""")

  "Generic Avro codec" should {

    "Round trip generic record using Generic Injection" in {
      implicit val genericInjection = GenericAvroCodecs[GenericRecord](testSchema)
      val testRecord = buildGenericAvroRecord(("2012-01-01", 1, 12))
      val bytes = Injection[GenericRecord, Array[Byte]](testRecord)
      val attempt = Injection.invert[GenericRecord, Array[Byte]](bytes)
      assert(attempt.get == testRecord)
    }

    "Round trip generic record using Binary Injection" in {
      implicit val genericBinaryInjection = GenericAvroCodecs.toBinary[GenericRecord](testSchema)
      val testRecord = buildGenericAvroRecord(("2012-01-01", 1, 12))
      val bytes = Injection[GenericRecord, Array[Byte]](testRecord)
      val attempt = Injection.invert[GenericRecord, Array[Byte]](bytes)
      assert(attempt.get == testRecord)
    }

    "Round trip generic record using Json Injection" in {
      implicit val genericJsonInjection = GenericAvroCodecs.toJson[GenericRecord](testSchema)
      val testRecord = buildGenericAvroRecord(("2012-01-01", 1, 12))
      val jsonString = Injection[GenericRecord, String](testRecord)
      val attempt = Injection.invert[GenericRecord, String](jsonString)
      assert(attempt.get == testRecord)
    }
  }

  def buildGenericAvroRecord(i: (String, Int, Int)): GenericRecord = {

    val fiscalRecord = new GenericData.Record(testSchema)
    fiscalRecord.put("calendarDate", i._1)
    fiscalRecord.put("fiscalWeek", i._2)
    fiscalRecord.put("fiscalYear", i._3)
    fiscalRecord
  }
} 
Example 6
Source File: AvroSerializer.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.bloomberg.avro

import java.io.ByteArrayOutputStream

import com.datamountaineer.streamreactor.connect.bloomberg.BloombergData
import com.datamountaineer.streamreactor.connect.bloomberg.avro.AvroSchemaGenerator._
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData.Record
import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.EncoderFactory

import scala.collection.JavaConverters._

object AvroSerializer {

  
    // Recursively copies `value` into `record` under `fieldName`, using the field's schema to
    // handle nested lists and maps (maps become nested GenericData.Record instances). Other
    // members of the original object are elided in this excerpt.
    private def recursive(record: GenericData.Record, schema: Schema, fieldName: String, value: Any): Unit = {
      value match {
        case _: Boolean => record.put(fieldName, value)
        case _: Int => record.put(fieldName, value)
        case _: Long => record.put(fieldName, value)
        case _: Double => record.put(fieldName, value)
        case _: Char => record.put(fieldName, value)
        case _: Float => record.put(fieldName, value)
        case _: String =>
          record.put(fieldName, value)
        case list: java.util.List[_] =>
          val tmpSchema = schema.getField(fieldName).schema()
          val itemSchema = if (tmpSchema.getType == Schema.Type.UNION) tmpSchema.getTypes.get(1) else tmpSchema
          require(itemSchema.getType == Schema.Type.ARRAY)
          //we might have a record not a primitive
          if (itemSchema.getElementType.getType == Schema.Type.RECORD) {
            val items = new GenericData.Array[GenericData.Record](list.size(), itemSchema)
            list.asScala.foreach { i =>
              //only map is allowed
              val m = i.asInstanceOf[java.util.Map[String, Any]]
              items.add(m.toAvroRecord(itemSchema.getElementType))
            }
            record.put(fieldName, items)
          } else {
            val items = new GenericData.Array[Any](list.size(), itemSchema)
            items.addAll(list)
            record.put(fieldName, items)
          }

        case map: java.util.LinkedHashMap[String @unchecked, _] =>
          //record schema
          val fieldSchema = schema.getField(fieldName).schema()
          val nestedSchema = if (fieldSchema.getType == Schema.Type.UNION) fieldSchema.getTypes.get(1) else fieldSchema
          val nestedRecord = new Record(nestedSchema)
          map.entrySet().asScala.foreach(e =>
            recursive(nestedRecord, nestedSchema, e.getKey, e.getValue))
          record.put(fieldName, nestedRecord)
      }
    }
}
Example 7
Source File: AvroRecordRowKeyBuilderTest.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.hbase

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import com.datamountaineer.streamreactor.connect.hbase.avro.AvroRecordFieldExtractorMapFn
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class AvroRecordRowKeyBuilderTest extends AnyWordSpec with Matchers with MockitoSugar {
  val schema: Schema = new Schema.Parser().parse(PersonAvroSchema.schema)

  "AvroRecordRowKeyBuilder" should {
    "extract the values from the avro record and create the key" in {
      val keys = Seq("firstName", "lastName", "age")
      val rowKeyBuilder = new AvroRecordRowKeyBuilderBytes(AvroRecordFieldExtractorMapFn(schema, keys), keys)

      val sinkRecord = mock[SinkRecord]
      val firstName = "Jack"
      val lastName = "Smith"
      val age = 29

      val record = new GenericRecord {

        val values: Map[String, AnyRef] = Map("firstName" -> firstName, "lastName" -> lastName, "age" -> Int.box(age))

        override def get(key: String): AnyRef = values(key)

        override def put(key: String, v: scala.Any): Unit = sys.error("not supported")

        override def get(i: Int): AnyRef = sys.error("not supported")


        override def put(i: Int, v: scala.Any): Unit = sys.error("not supported")


        override def getSchema: Schema = sys.error("not supported")
      }

      val expectedValue = Bytes.add(
        Array(
          firstName.fromString(),
          rowKeyBuilder.delimBytes,
          lastName.fromString(),
          rowKeyBuilder.delimBytes,
          age.fromInt()))
      rowKeyBuilder.build(sinkRecord, record) shouldBe expectedValue
    }
  }
} 
Example 8
Source File: SparkAvroDecoder.scala    From cloudflow   with Apache License 2.0
package cloudflow.spark.avro

import org.apache.log4j.Logger

import java.io.ByteArrayOutputStream

import scala.reflect.runtime.universe._

import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord }
import org.apache.avro.io.{ DecoderFactory, EncoderFactory }
import org.apache.spark.sql.{ Dataset, Encoder, Row }
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.StructType
import org.apache.avro.Schema

import cloudflow.spark.sql.SQLImplicits._

case class EncodedKV(key: String, value: Array[Byte])

case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) {

  val encoder: Encoder[T]                           = implicitly[Encoder[T]]
  val sqlSchema: StructType                         = encoder.schema
  val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema)
  @transient lazy val _avroSchema                   = new Schema.Parser().parse(avroSchema)
  @transient lazy val rowConverter                  = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema)
  @transient lazy val datumReader                   = new GenericDatumReader[GenericRecord](_avroSchema)
  @transient lazy val decoder                       = DecoderFactory.get
  def decode(bytes: Array[Byte]): Row = {
    val binaryDecoder = decoder.binaryDecoder(bytes, null)
    val record        = datumReader.read(null, binaryDecoder)
    rowConverter(record).asInstanceOf[GenericRow]
  }

}


case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) {

  @transient lazy val log = Logger.getLogger(getClass.getName)

  val BufferSize = 5 * 1024 // 5 Kb

  val encoder                     = implicitly[Encoder[T]]
  val sqlSchema                   = encoder.schema
  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)

  val recordName                = "topLevelRecord" // ???
  val recordNamespace           = "recordNamespace" // ???
  @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace)

  // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage
  def rowToBytes(row: Row): Array[Byte] = {
    val genRecord = converter(row).asInstanceOf[GenericRecord]
    if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord")
    val datumWriter   = new GenericDatumWriter[GenericRecord](_avroSchema)
    val avroEncoder   = EncoderFactory.get
    val byteArrOS     = new ByteArrayOutputStream(BufferSize)
    val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null)
    datumWriter.write(genRecord, binaryEncoder)
    binaryEncoder.flush()
    byteArrOS.toByteArray
  }

  def encode(dataset: Dataset[T]): Dataset[Array[Byte]] =
    dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]]

  // Note to self: I'm not sure how heavy this chain of transformations is
  def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = {
    val encoder             = encoderFor[T]
    implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind()
    dataset.map { value ⇒
      val key         = keyFun(value)
      val internalRow = encoder.toRow(value)
      val row         = rowEncoder.fromRow(internalRow)
      val bytes       = rowToBytes(row)
      EncodedKV(key, bytes)
    }
  }

} 
Example 9
Source File: Avro4sConsumerImpl.scala    From kafka4s   with Apache License 2.0
package com.banno.kafka.consumer

import cats.implicits._
import java.util.regex.Pattern

import scala.concurrent.duration._
import org.apache.kafka.common._
import org.apache.kafka.clients.consumer._
import org.apache.avro.generic.GenericRecord
import com.sksamuel.avro4s.FromRecord
import cats.Functor
import com.banno.kafka._

//this is a Bifunctor[ConsumerApi]

case class Avro4sConsumerImpl[F[_]: Functor, K: FromRecord, V: FromRecord](
    c: ConsumerApi[F, GenericRecord, GenericRecord]
) extends ConsumerApi[F, K, V] {
  def assign(partitions: Iterable[TopicPartition]): F[Unit] = c.assign(partitions)
  def assignment: F[Set[TopicPartition]] = c.assignment
  def beginningOffsets(partitions: Iterable[TopicPartition]): F[Map[TopicPartition, Long]] =
    c.beginningOffsets(partitions)
  def beginningOffsets(
      partitions: Iterable[TopicPartition],
      timeout: FiniteDuration
  ): F[Map[TopicPartition, Long]] =
    c.beginningOffsets(partitions, timeout)
  def close: F[Unit] = c.close
  def close(timeout: FiniteDuration): F[Unit] = c.close(timeout)
  def commitAsync: F[Unit] = c.commitAsync
  def commitAsync(
      offsets: Map[TopicPartition, OffsetAndMetadata],
      callback: OffsetCommitCallback
  ): F[Unit] = c.commitAsync(offsets, callback)
  def commitAsync(callback: OffsetCommitCallback): F[Unit] = c.commitAsync(callback)
  def commitSync: F[Unit] = c.commitSync
  def commitSync(offsets: Map[TopicPartition, OffsetAndMetadata]): F[Unit] = c.commitSync(offsets)
  def committed(partition: Set[TopicPartition]): F[Map[TopicPartition, OffsetAndMetadata]] =
    c.committed(partition)
  def endOffsets(partitions: Iterable[TopicPartition]): F[Map[TopicPartition, Long]] =
    c.endOffsets(partitions)
  def endOffsets(
      partitions: Iterable[TopicPartition],
      timeout: FiniteDuration
  ): F[Map[TopicPartition, Long]] = c.endOffsets(partitions, timeout)
  def listTopics: F[Map[String, Seq[PartitionInfo]]] = c.listTopics
  def listTopics(timeout: FiniteDuration): F[Map[String, Seq[PartitionInfo]]] =
    c.listTopics(timeout)
  def metrics: F[Map[MetricName, Metric]] = c.metrics
  def offsetsForTimes(
      timestampsToSearch: Map[TopicPartition, Long]
  ): F[Map[TopicPartition, OffsetAndTimestamp]] =
    c.offsetsForTimes(timestampsToSearch)
  def offsetsForTimes(
      timestampsToSearch: Map[TopicPartition, Long],
      timeout: FiniteDuration
  ): F[Map[TopicPartition, OffsetAndTimestamp]] =
    c.offsetsForTimes(timestampsToSearch, timeout)
  def partitionsFor(topic: String): F[Seq[PartitionInfo]] = c.partitionsFor(topic)
  def partitionsFor(topic: String, timeout: FiniteDuration): F[Seq[PartitionInfo]] =
    c.partitionsFor(topic, timeout)
  def pause(partitions: Iterable[TopicPartition]): F[Unit] = c.pause(partitions)
  def paused: F[Set[TopicPartition]] = c.paused
  def poll(timeout: FiniteDuration): F[ConsumerRecords[K, V]] =
    c.poll(timeout).map(_.fromGenericRecords[K, V])
  def position(partition: TopicPartition): F[Long] = c.position(partition)
  def resume(partitions: Iterable[TopicPartition]): F[Unit] = c.resume(partitions)
  def seek(partition: TopicPartition, offset: Long): F[Unit] = c.seek(partition, offset)
  def seekToBeginning(partitions: Iterable[TopicPartition]): F[Unit] = c.seekToBeginning(partitions)
  def seekToEnd(partitions: Iterable[TopicPartition]): F[Unit] = c.seekToEnd(partitions)
  def subscribe(topics: Iterable[String]): F[Unit] = c.subscribe(topics)
  def subscribe(topics: Iterable[String], callback: ConsumerRebalanceListener): F[Unit] =
    c.subscribe(topics, callback)
  def subscribe(pattern: Pattern): F[Unit] = c.subscribe(pattern)
  def subscribe(pattern: Pattern, callback: ConsumerRebalanceListener): F[Unit] =
    c.subscribe(pattern, callback)
  def subscription: F[Set[String]] = c.subscription
  def unsubscribe: F[Unit] = c.unsubscribe
  def wakeup: F[Unit] = c.wakeup
} 
Example 10
Source File: ProducerOps.scala    From kafka4s   with Apache License 2.0
package com.banno.kafka.producer

import cats.{Applicative, Foldable, MonadError, Traverse}
import cats.implicits._
import fs2._
import org.apache.kafka.common._
import org.apache.kafka.common.errors._
import org.apache.kafka.clients.consumer.OffsetAndMetadata
import org.apache.kafka.clients.producer._

case class ProducerOps[F[_], K, V](producer: ProducerApi[F, K, V]) {

  def sendAndForgetBatch[G[_]: Foldable](
      records: G[ProducerRecord[K, V]]
  )(implicit F: Applicative[F]): F[Unit] =
    records.traverse_(producer.sendAndForget)

  def sendSyncBatch[G[_]: Traverse](
      records: G[ProducerRecord[K, V]]
  )(implicit F: Applicative[F]): F[G[RecordMetadata]] =
    records.traverse(producer.sendSync)

  def sendAsyncBatch[G[_]: Traverse](
      records: G[ProducerRecord[K, V]]
  )(implicit F: Applicative[F]): F[G[RecordMetadata]] =
    records.traverse(producer.sendAsync)

  def pipeSync: Pipe[F, ProducerRecord[K, V], RecordMetadata] =
    _.evalMap(producer.sendSync)

  def pipeAsync: Pipe[F, ProducerRecord[K, V], RecordMetadata] =
    _.evalMap(producer.sendAsync)

  def sink: Pipe[F, ProducerRecord[K, V], Unit] =
    _.evalMap(producer.sendAndForget)

  def sinkSync: Pipe[F, ProducerRecord[K, V], Unit] =
    pipeSync.apply(_).void

  def sinkAsync: Pipe[F, ProducerRecord[K, V], Unit] =
    pipeAsync.apply(_).void

  def transaction[G[_]: Foldable](
      records: G[ProducerRecord[K, V]],
      offsets: Map[TopicPartition, OffsetAndMetadata],
      consumerGroupId: String
  )(implicit F: MonadError[F, Throwable]): F[Unit] =
    (for {
      _ <- producer.beginTransaction
      _ <- sendAndForgetBatch(records) //should be no need to wait for RecordMetadatas or errors, since commitTransaction flushes and throws
      _ <- producer.sendOffsetsToTransaction(offsets, consumerGroupId)
      _ <- producer.commitTransaction
    } yield ()).handleErrorWith {
      // Exception-handling described in https://kafka.apache.org/10/javadoc/org/apache/kafka/clients/producer/KafkaProducer.html#send-org.apache.kafka.clients.producer.ProducerRecord-org.apache.kafka.clients.producer.Callback-
      case e: ProducerFencedException => F.raiseError(e)
      case e: OutOfOrderSequenceException => F.raiseError(e)
      case e: UnsupportedVersionException => F.raiseError(e)
      case e: AuthorizationException => F.raiseError(e)
      case _ => producer.abortTransaction
    }
}

import org.apache.avro.generic.GenericRecord
import com.sksamuel.avro4s.ToRecord

case class GenericProducerOps[F[_]](producer: ProducerApi[F, GenericRecord, GenericRecord]) {

  def toAvro4s[K: ToRecord, V: ToRecord]: ProducerApi[F, K, V] =
    Avro4sProducerImpl[F, K, V](producer)

} 
Example 11
Source File: Avro4sProducerImpl.scala    From kafka4s   with Apache License 2.0
package com.banno.kafka.producer

import java.util.concurrent.{Future => JFuture}
import scala.concurrent.duration._
import org.apache.kafka.common._
import org.apache.kafka.clients.consumer.OffsetAndMetadata
import org.apache.kafka.clients.producer._
import org.apache.avro.generic.GenericRecord
import com.sksamuel.avro4s.ToRecord
import com.banno.kafka._

//this is like Bifunctor[ProducerApi] but is contravariant in both arguments; cats does not seem to have anything like ContravariantBifunctor...

case class Avro4sProducerImpl[F[_], K: ToRecord, V: ToRecord](
    p: ProducerApi[F, GenericRecord, GenericRecord]
) extends ProducerApi[F, K, V] {
  def abortTransaction: F[Unit] = p.abortTransaction
  def beginTransaction: F[Unit] = p.beginTransaction
  def close: F[Unit] = p.close
  def close(timeout: FiniteDuration): F[Unit] = p.close(timeout)
  def commitTransaction: F[Unit] = p.commitTransaction
  def flush: F[Unit] = p.flush
  def initTransactions: F[Unit] = p.initTransactions
  def metrics: F[Map[MetricName, Metric]] = p.metrics
  def partitionsFor(topic: String): F[Seq[PartitionInfo]] = p.partitionsFor(topic)
  def sendOffsetsToTransaction(
      offsets: Map[TopicPartition, OffsetAndMetadata],
      consumerGroupId: String
  ): F[Unit] = p.sendOffsetsToTransaction(offsets, consumerGroupId)

  private[producer] def sendRaw(record: ProducerRecord[K, V]): JFuture[RecordMetadata] =
    p.sendRaw(record.toGenericRecord)
  private[producer] def sendRaw(
      record: ProducerRecord[K, V],
      callback: Callback
  ): JFuture[RecordMetadata] = p.sendRaw(record.toGenericRecord, callback)
  private[producer] def sendRaw(
      record: ProducerRecord[K, V],
      callback: Either[Exception, RecordMetadata] => Unit
  ): Unit =
    p.sendRaw(record.toGenericRecord, callback)

  def sendAndForget(record: ProducerRecord[K, V]): F[Unit] = p.sendAndForget(record.toGenericRecord)
  def sendSync(record: ProducerRecord[K, V]): F[RecordMetadata] = p.sendSync(record.toGenericRecord)
  def sendAsync(record: ProducerRecord[K, V]): F[RecordMetadata] =
    p.sendAsync(record.toGenericRecord)
} 
Example 12
Source File: Decoding.scala    From avro4s   with Apache License 2.0
package benchmarks

import java.io.ByteArrayOutputStream
import java.nio.ByteBuffer
import java.util.Collections

import benchmarks.record._
import com.sksamuel.avro4s._
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.util.ByteBufferInputStream
import org.openjdk.jmh.annotations._
import org.openjdk.jmh.infra.Blackhole

object Decoding extends BenchmarkHelpers {
  @State(Scope.Thread)
  class Setup {
    val avroBytes = {
      import benchmarks.record.generated.AttributeValue._
      import benchmarks.record.generated._
      new RecordWithUnionAndTypeField(new ValidInt(255, t)).toByteBuffer
    }

    val avro4sBytes = encode(RecordWithUnionAndTypeField(AttributeValue.Valid[Int](255, t)))

    val (handrolledDecoder, handrolledReader) = {
      import benchmarks.handrolled_codecs._
      implicit val codec: Codec[AttributeValue[Int]] = AttributeValueCodec[Int]
      implicit val schemaFor: SchemaFor[AttributeValue[Int]] = SchemaFor[AttributeValue[Int]](codec.schema)
      val recordSchemaFor = SchemaFor[RecordWithUnionAndTypeField]
      val decoder = Decoder[RecordWithUnionAndTypeField].withSchema(recordSchemaFor)
      val reader = new GenericDatumReader[GenericRecord](recordSchemaFor.schema)
      (decoder, reader)
    }

    val (avro4sDecoder, avro4sReader) = {
      val decoder = Decoder[RecordWithUnionAndTypeField]
      val reader = new GenericDatumReader[GenericRecord](decoder.schema)
      (decoder, reader)
    }
  }

  def encode[T: Encoder: SchemaFor](value: T): ByteBuffer = {
    val outputStream = new ByteArrayOutputStream(512)
    val encoder = Encoder[T]
    val schema = AvroSchema[T]
    val record = encoder.encode(value).asInstanceOf[GenericRecord]
    val writer = new GenericDatumWriter[GenericRecord](schema)
    val enc = EncoderFactory.get().directBinaryEncoder(outputStream, null)
    writer.write(record, enc)
    ByteBuffer.wrap(outputStream.toByteArray)
  }
}

class Decoding extends CommonParams with BenchmarkHelpers {

  import Decoding._

  def decode[T](bytes: ByteBuffer, decoder: Decoder[T], reader: GenericDatumReader[GenericRecord]): T = {
    val dec =
      DecoderFactory.get().binaryDecoder(new ByteBufferInputStream(Collections.singletonList(bytes.duplicate)), null)
    val record = reader.read(null, dec)
    decoder.decode(record)
  }


  @Benchmark
  def avroSpecificRecord(setup: Setup, blackhole: Blackhole) = {
    import benchmarks.record.generated._
    blackhole.consume(RecordWithUnionAndTypeField.fromByteBuffer(setup.avroBytes.duplicate))
  }

  @Benchmark
  def avro4sHandrolled(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(decode(setup.avro4sBytes, setup.handrolledDecoder, setup.handrolledReader))

  @Benchmark
  def avro4sGenerated(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(decode(setup.avro4sBytes, setup.avro4sDecoder, setup.avro4sReader))
} 
Example 13
Source File: Encoding.scala    From avro4s   with Apache License 2.0
package benchmarks

import java.io.ByteArrayOutputStream
import java.nio.ByteBuffer

import benchmarks.record._
import com.sksamuel.avro4s._
import org.apache.avro.generic.{GenericDatumWriter, GenericRecord}
import org.apache.avro.io.EncoderFactory
import org.openjdk.jmh.annotations._
import org.openjdk.jmh.infra.Blackhole

object Encoding extends BenchmarkHelpers {

  @State(Scope.Thread)
  class Setup {
    val record = RecordWithUnionAndTypeField(AttributeValue.Valid[Int](255, t))

    val specificRecord = {
      import benchmarks.record.generated.AttributeValue._
      import benchmarks.record.generated._
      new RecordWithUnionAndTypeField(new ValidInt(255, t))
    }

    val (avro4sEncoder, avro4sWriter) = {
      val schema = AvroSchema[RecordWithUnionAndTypeField]
      val encoder = Encoder[RecordWithUnionAndTypeField]
      val writer = new GenericDatumWriter[GenericRecord](schema)
      (encoder, writer)
    }

    val (handrolledEncoder, handrolledWriter) = {
      import benchmarks.handrolled_codecs._
      implicit val codec: AttributeValueCodec[Int] = AttributeValueCodec[Int]
      implicit val schemaForValid = codec.schemaForValid
      val schema = AvroSchema[RecordWithUnionAndTypeField]
      val encoder = Encoder[RecordWithUnionAndTypeField]
      val writer = new GenericDatumWriter[GenericRecord](schema)
      (encoder, writer)
    }

  }
}

class Encoding extends CommonParams with BenchmarkHelpers {

  import Encoding._

  def encode[T](value: T, encoder: Encoder[T], writer: GenericDatumWriter[GenericRecord]): ByteBuffer = {
    val outputStream = new ByteArrayOutputStream(512)
    val record = encoder.encode(value).asInstanceOf[GenericRecord]
    val enc = EncoderFactory.get().directBinaryEncoder(outputStream, null)
    writer.write(record, enc)
    ByteBuffer.wrap(outputStream.toByteArray)
  }


  @Benchmark
  def avroSpecificRecord(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(setup.specificRecord.toByteBuffer)

  @Benchmark
  def avro4sGenerated(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(encode(setup.record, setup.avro4sEncoder, setup.avro4sWriter))

  @Benchmark
  def avro4sHandrolled(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(encode(setup.record, setup.handrolledEncoder, setup.handrolledWriter))
} 
Example 14
Source File: AvroSchema.scala    From aloha   with MIT License
package com.eharmony.aloha.semantics.compiled.plugin.avro

import com.eharmony.aloha.reflect.RefInfo
import com.eharmony.aloha.semantics.compiled.plugin.schemabased.schema.Schema.FieldRetrievalError
import com.eharmony.aloha.semantics.compiled.plugin.schemabased.schema._
import org.apache.avro
import org.apache.avro.Schema.Type._
import org.apache.avro.generic.GenericRecord

import scala.collection.JavaConversions.asScalaBuffer


  // Note: the enclosing class declaration and its other members (including the `extract` method
  // used below) are elided in this excerpt.
  protected[avro] def unionField(name: String, index: Int, fieldSchema: avro.Schema, reqField: Boolean): Result = {
    val union = fieldSchema.getTypes

    // If there's only one item in the union, treat the union as if it didn't exist.
    if (1 == union.size)
      extract(name, index, union.head, reqField)
    else {
      val nonNull = union.filter(t => t.getType != NULL)
      if (1 == nonNull.size)
        extract(name, index, nonNull.head, nullable = true)
      else
        Left(FieldRetrievalError("Only UNION fields of one type or two types where one is NULL are allowed."))
    }
  }
}

object AvroSchema {
  def apply(rootSchema: avro.Schema): AvroSchema = new AvroSchema(rootSchema, rootSchema)
} 
Example 15
Source File: BasicEncoderTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.record.encoder

import com.sksamuel.avro4s.examples.UppercasePkg.ClassInUppercasePackage
import com.sksamuel.avro4s._
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericFixed, GenericRecord}
import org.apache.avro.util.Utf8
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class BasicEncoderTest extends AnyWordSpec with Matchers {

  "Encoder" should {
    "encode strings as UTF8" in {
      case class Foo(s: String)
      val schema = AvroSchema[Foo]
      val record = Encoder[Foo].encode(Foo("hello"))
      record shouldBe ImmutableRecord(schema, Vector(new Utf8("hello")))
    }
    "encode strings as GenericFixed and pad bytes when schema is fixed" in {
      case class Foo(s: String)

      val fixedSchema = SchemaFor[String](Schema.createFixed("FixedString", null, null, 7))
      implicit val fixedStringEncoder: Encoder[String] = Encoder.StringEncoder.withSchema(fixedSchema)

      val record = Encoder[Foo].encode(Foo("hello")).asInstanceOf[GenericRecord]
      record.get("s").asInstanceOf[GenericFixed].bytes().toList shouldBe Seq(104, 101, 108, 108, 111, 0, 0)
      // the fixed should have the right size
      record.get("s").asInstanceOf[GenericFixed].bytes().length shouldBe 7
    }
    "encode longs" in {
      case class Foo(l: Long)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(123456L)) shouldBe ImmutableRecord(schema, Vector(java.lang.Long.valueOf(123456L)))
    }
    "encode doubles" in {
      case class Foo(d: Double)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(123.435)) shouldBe ImmutableRecord(schema, Vector(java.lang.Double.valueOf(123.435D)))
    }
    "encode booleans" in {
      case class Foo(d: Boolean)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(true)) shouldBe ImmutableRecord(schema, Vector(java.lang.Boolean.valueOf(true)))
    }
    "encode floats" in {
      case class Foo(d: Float)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(123.435F)) shouldBe ImmutableRecord(schema, Vector(java.lang.Float.valueOf(123.435F)))
    }
    "encode ints" in {
      case class Foo(i: Int)
      val schema = AvroSchema[Foo]
      Encoder[Foo].encode(Foo(123)) shouldBe ImmutableRecord(schema, Vector(java.lang.Integer.valueOf(123)))
    }
    "support uppercase packages" in {
      val schema = AvroSchema[ClassInUppercasePackage]
      val t = com.sksamuel.avro4s.examples.UppercasePkg.ClassInUppercasePackage("hello")
      schema.getFullName shouldBe "com.sksamuel.avro4s.examples.UppercasePkg.ClassInUppercasePackage"
      Encoder[ClassInUppercasePackage].encode(t) shouldBe ImmutableRecord(schema, Vector(new Utf8("hello")))
    }
  }
} 
Example 16
Source File: FixedEncoderTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.record.encoder

import com.sksamuel.avro4s.{AvroFixed, Encoder, SchemaFor}
import org.apache.avro.generic.{GenericFixed, GenericRecord}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

@AvroFixed(8)
case class QuarterSHA256(bytes: Array[Byte]) extends AnyVal

case class FixedString(@AvroFixed(7) mystring: String)

case class AvroMessage(q: QuarterSHA256, payload: Array[Byte])

@AvroFixed(8)
case class FixedValueType(z: String) extends AnyVal
case class OptionFixedWrapper(opt: Option[FixedValueType])

class FixedEncoderTest extends AnyFunSuite with Matchers {

  val m = AvroMessage(
    QuarterSHA256(Array[Byte](0, 1, 2, 3, 4, 5, 6)),
    Array[Byte](0, 1, 2, 3)
  )

  test("encode fixed when used on a value type") {
    val schema = SchemaFor[AvroMessage]
    val record = Encoder[AvroMessage].encode(m).asInstanceOf[GenericRecord]
    record.get("q").asInstanceOf[GenericFixed].bytes().toVector shouldBe Vector(0, 1, 2, 3, 4, 5, 6, 0)
  }

  test("encode fixed when used on a field in a case class") {
    val schema = SchemaFor[FixedString]
    val record = Encoder[FixedString].encode(FixedString("sam")).asInstanceOf[GenericRecord]
    record.get("mystring").asInstanceOf[GenericFixed].bytes.toVector shouldBe Vector(115, 97, 109, 0, 0, 0, 0)
  }

  test("support options of fixed") {
    val schema = SchemaFor[OptionFixedWrapper]
    val record = Encoder[OptionFixedWrapper].encode(OptionFixedWrapper(Some(FixedValueType("sam")))).asInstanceOf[GenericRecord]
    record.get("opt").asInstanceOf[GenericFixed].bytes.toVector shouldBe Vector(115, 97, 109, 0, 0, 0, 0, 0)
  }
} 
Example 17
Source File: ByteArrayEncoderTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.record.encoder

import java.nio.ByteBuffer

import com.sksamuel.avro4s.{AvroSchema, Encoder, SchemaFor}
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericFixed, GenericRecord}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

class ByteArrayEncoderTest extends AnyFunSuite with Matchers {

  test("encode byte arrays as BYTES type") {
    case class Test(z: Array[Byte])
    val schema = AvroSchema[Test]
    Encoder[Test].encode(Test(Array[Byte](1, 4, 9)))
      .asInstanceOf[GenericRecord]
      .get("z")
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("encode byte vectors as BYTES type") {
    case class Test(z: Vector[Byte])
    val schema = AvroSchema[Test]
    Encoder[Test].encode(Test(Vector[Byte](1, 4, 9)))
      .asInstanceOf[GenericRecord]
      .get("z")
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("encode byte seq as BYTES type") {
    case class Test(z: Seq[Byte])
    val schema = AvroSchema[Test]
    Encoder[Test].encode(Test(Seq[Byte](1, 4, 9)))
      .asInstanceOf[GenericRecord]
      .get("z")
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("encode byte list as BYTES type") {
    case class Test(z: List[Byte])
    val schema = AvroSchema[Test]
    Encoder[Test].encode(Test(List[Byte](1, 4, 9)))
      .asInstanceOf[GenericRecord]
      .get("z")
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("encode top level byte arrays") {
    val schema = AvroSchema[Array[Byte]]
    Encoder[Array[Byte]].encode(Array[Byte](1, 4, 9))
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("encode ByteBuffers as BYTES type") {
    case class Test(z: ByteBuffer)
    val schema = AvroSchema[Test]
    Encoder[Test].encode(Test(ByteBuffer.wrap(Array[Byte](1, 4, 9))))
      .asInstanceOf[GenericRecord]
      .get("z")
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("encode top level ByteBuffers") {
    val schema = AvroSchema[ByteBuffer]
    Encoder[ByteBuffer].encode(ByteBuffer.wrap(Array[Byte](1, 4, 9)))
      .asInstanceOf[ByteBuffer]
      .array().toList shouldBe List[Byte](1, 4, 9)
  }

  test("support FIXED") {
    val schema = SchemaBuilder.fixed("foo").size(7)
    val fixed = Encoder.ByteArrayEncoder.withSchema(SchemaFor(schema)).encode("hello".getBytes).asInstanceOf[GenericFixed]
    fixed.bytes().toList shouldBe Seq(104, 101, 108, 108, 111, 0, 0)
    fixed.bytes().length shouldBe 7
  }
} 
Example 18
Source File: TupleEncoderTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.record.encoder

import com.sksamuel.avro4s.{AvroSchema, Encoder}
import org.apache.avro.generic.GenericRecord
import org.apache.avro.util.Utf8
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

class TupleEncoderTest extends AnyFunSuite with Matchers {

  test("encode tuple2") {
    case class Test(z: (String, Option[Int]))
    val schema = AvroSchema[Test]
    val record = Encoder[Test].encode(Test("hello", Some(55))).asInstanceOf[GenericRecord]
    val z = record.get("z").asInstanceOf[GenericRecord]
    z.get("_1") shouldBe new Utf8("hello")
    z.get("_2") shouldBe 55
  }

  test("encode tuple3") {
    case class Test(z: (String, Option[Int], Long))
    val schema = AvroSchema[Test]
    val record = Encoder[Test].encode(Test("hello", Some(55), 9999999L)).asInstanceOf[GenericRecord]
    val z = record.get("z").asInstanceOf[GenericRecord]
    z.get("_1") shouldBe new Utf8("hello")
    z.get("_2") shouldBe 55
    z.get("_3") shouldBe 9999999L
  }

  test("encode tuple4") {
    case class Test(z: (String, Option[Int], Boolean, Double))
    val schema = AvroSchema[Test]
    val record = Encoder[Test].encode(Test("hello", Some(55), true, 0.24)).asInstanceOf[GenericRecord]
    val z = record.get("z").asInstanceOf[GenericRecord]
    z.get("_1") shouldBe new Utf8("hello")
    z.get("_2") shouldBe 55
    z.get("_3") shouldBe true
    z.get("_4") shouldBe 0.24
  }

  test("encode tuple5") {
    case class Test(z: (String, Option[Int], String, Boolean, String))
    val schema = AvroSchema[Test]
    val record = Encoder[Test].encode(Test("a", Some(55), "b", true, "c")).asInstanceOf[GenericRecord]
    val z = record.get("z").asInstanceOf[GenericRecord]
    z.get("_1") shouldBe new Utf8("a")
    z.get("_2") shouldBe 55
    z.get("_3") shouldBe new Utf8("b")
    z.get("_4") shouldBe true
    z.get("_5") shouldBe new Utf8("c")
  }
} 
Example 19
Source File: OptionOutputStreamTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.streams.output

import org.apache.avro.generic.GenericRecord
import org.apache.avro.util.Utf8

class OptionOutputStreamTest extends OutputStreamTest {

  test("options of booleans") {
    case class Test(z: Option[Boolean])
    writeRead(Test(Some(true))) { record =>
      record.get("z") shouldBe true
    }
    writeRead(Test(None)) { record =>
      record.get("z") shouldBe null
    }
  }

  test("options of ints") {
    case class Test(z: Option[Int])
    writeRead(Test(Some(43242))) { record =>
      record.get("z") shouldBe 43242
    }
    writeRead(Test(None)) { record =>
      record.get("z") shouldBe null
    }
  }

  test("options of longs") {
    case class Test(z: Option[Long])
    writeRead(Test(Some(43242L))) { record =>
      record.get("z") shouldBe 43242L
    }
    writeRead(Test(None)) { record =>
      record.get("z") shouldBe null
    }
  }

  test("options of doubles") {
    case class Test(z: Option[Double])
    writeRead(Test(Some(123.34))) { record =>
      record.get("z") shouldBe java.lang.Double.valueOf(123.34)
    }
    writeRead(Test(None)) { record =>
      record.get("z") shouldBe null
    }
  }

  test("options of strings") {
    case class Test(z: Option[String])
    writeRead(Test(Some("hello"))) { record =>
      record.get("z") shouldBe new Utf8("hello")
    }
    writeRead(Test(None)) { record =>
      record.get("z") shouldBe null
    }
  }

  test("options of classes") {
    case class Foo(s: String)
    case class Test(z: Option[Foo])
    writeRead(Test(Some(Foo("hello")))) { record =>
      record.get("z").asInstanceOf[GenericRecord].get("s") shouldBe new Utf8("hello")
    }
    writeRead(Test(None)) { record =>
      record.get("z") shouldBe null
    }
  }
} 
Example 20
Source File: OutputStreamTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.streams.output

import java.io.ByteArrayOutputStream

import com.sksamuel.avro4s._
import org.apache.avro.file.{DataFileReader, SeekableByteArrayInput}
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

trait OutputStreamTest extends AnyFunSuite with Matchers {

  def readData[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readData(out.toByteArray)
  def readData[T: SchemaFor](bytes: Array[Byte]): GenericRecord = {
    val datumReader = new GenericDatumReader[GenericRecord](AvroSchema[T])
    val dataFileReader = new DataFileReader[GenericRecord](new SeekableByteArrayInput(bytes), datumReader)
    dataFileReader.next
  }

  def writeData[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = {
    val out = new ByteArrayOutputStream
    val avro = AvroOutputStream.data[T].to(out).build()
    avro.write(t)
    avro.close()
    out
  }

  def readBinary[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readBinary(out.toByteArray)
  def readBinary[T: SchemaFor](bytes: Array[Byte]): GenericRecord = {
    val datumReader = new GenericDatumReader[GenericRecord](AvroSchema[T])
    val decoder = DecoderFactory.get().binaryDecoder(new SeekableByteArrayInput(bytes), null)
    datumReader.read(null, decoder)
  }

  def writeBinary[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = {
    val out = new ByteArrayOutputStream
    val avro = AvroOutputStream.binary[T].to(out).build()
    avro.write(t)
    avro.close()
    out
  }

  def readJson[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readJson(out.toByteArray)
  def readJson[T: SchemaFor](bytes: Array[Byte]): GenericRecord = {
    val schema = AvroSchema[T]
    val datumReader = new GenericDatumReader[GenericRecord](schema)
    val decoder = DecoderFactory.get().jsonDecoder(schema, new SeekableByteArrayInput(bytes))
    datumReader.read(null, decoder)
  }

  def writeJson[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = {
    val out = new ByteArrayOutputStream
    val avro = AvroOutputStream.json[T].to(out).build()
    avro.write(t)
    avro.close()
    out
  }

  def writeRead[T: Encoder : SchemaFor](t: T)(fn: GenericRecord => Any): Unit = {
    {
      val out = writeData(t)
      val record = readData(out)
      fn(record)
    }
    {
      val out = writeBinary(t)
      val record = readBinary(out)
      fn(record)
    }
    {
      val out = writeJson(t)
      val record = readJson(out)
      fn(record)
    }
  }
} 
Example 21
Source File: EitherOutputStreamTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.streams.output

import java.util

import com.sksamuel.avro4s.schema.Wine
import org.apache.avro.AvroRuntimeException
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.avro.util.Utf8

class EitherOutputStreamTest extends OutputStreamTest {

  import scala.collection.JavaConverters._

  test("write out either of primitives") {
    case class Test(z: Either[String, Int])
    writeRead(Test(Left("hello"))) { record =>
      record.get("z") shouldBe new Utf8("hello")
    }
    writeRead(Test(Right(45))) { record =>
      record.get("z") shouldBe 45
    }
  }

  test("write out either of Array") {
    case class Test(z: Either[Array[Int], String])
    writeRead(Test(Left(Array(1, 3, 4)))) { record =>
      record.get("z").asInstanceOf[GenericData.Array[Int]].asScala shouldBe List(1, 3, 4)
    }
  }

  test("write out either of Seq") {
    case class Test(z: Either[String, Seq[String]])
    writeRead(Test(Right(Seq("c", "d")))) { record =>
      record.get("z").asInstanceOf[GenericData.Array[String]].asScala shouldBe List(new Utf8("c"), new Utf8("d"))
    }
  }

  test("write out either of enum") {
    case class Test(z: Either[Wine, Seq[String]])
    writeRead(Test(Left(Wine.Malbec))) { record =>
      record.get("z").asInstanceOf[GenericData.EnumSymbol].toString shouldBe "Malbec"
    }
  }

  test("write out either of Maps") {
    case class Test(z: Either[Array[Int], Map[String, Boolean]])
    writeRead(Test(Right(Map("a" -> true, "b" -> false)))) { record =>
      record.get("z").asInstanceOf[util.HashMap[String, Boolean]].asScala shouldBe Map(new Utf8("a") -> true, new Utf8("b") -> false)
    }
  }

  test("write out case classes") {
    case class Foo(a: String)
    case class Bar(b: Boolean)
    case class Test(z: Either[Foo, Bar])
    writeRead(Test(Left(Foo("hello")))) { record =>
      record.get("z").asInstanceOf[GenericRecord].get("a") shouldBe new Utf8("hello")
    }
    writeRead(Test(Right(Bar(true)))) { record =>
      record.get("z").asInstanceOf[GenericRecord].get("b") shouldBe true
    }
  }

  test("throw an exception if trying to use two collection types in an either") {
    intercept[AvroRuntimeException] {
      case class Test(z: Either[Seq[String], List[Int]])
      writeRead(Test(Left(Seq("hello")))) { record =>
      }
    }
  }
} 
Example 22
Source File: BasicOutputStreamTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.streams.output

import com.sksamuel.avro4s.{Encoder, SchemaFor}
import org.apache.avro.Schema.Parser
import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder}
import org.apache.avro.util.Utf8

class BasicOutputStreamTest extends OutputStreamTest {

  test("write out booleans") {
    case class Test(z: Boolean)
    writeRead(Test(true)) { record =>
      record.get("z") shouldBe true
    }
  }

  test("write out strings") {
    case class Test(z: String)
    writeRead(Test("Hello world")) { record =>
      record.get("z") shouldBe new Utf8("Hello world")
    }
  }

  test("write out longs") {
    case class Test(z: Long)
    writeRead(Test(65653L)) { record =>
      record.get("z") shouldBe 65653L
    }
  }

  test("write out ints") {
    case class Test(z: Int)
    writeRead(Test(44)) { record =>
      record.get("z") shouldBe 44
    }
  }

  test("write out doubles") {
    case class Test(z: Double)
    writeRead(Test(3.235)) { record =>
      record.get("z") shouldBe 3.235
    }
  }

  test("write out floats") {
    case class Test(z: Float)
    writeRead(Test(3.4F)) { record =>
      record.get("z") shouldBe 3.4F
    }
  }

  test("write out generic record") {
    val recordSchema = new Parser().parse(
      """{"type":"record","name":"Test","fields":[{"name":"field","type":"string"}]}"""
    )
    implicit val recordSchemaFor: SchemaFor[GenericRecord] = SchemaFor(recordSchema)

    implicit val encoder: Encoder[GenericRecord] = new Encoder[GenericRecord] {
      def schemaFor = recordSchemaFor

      def encode(value: GenericRecord): AnyRef = value
    }


    val record: GenericRecord = new GenericRecordBuilder(recordSchema).set("field", "value").build()

    writeRead(record) { rec =>
      rec.get("field") shouldBe new Utf8("value")
    }
  }
} 
Example 23
Source File: GithubIssue235.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.github

import java.io.ByteArrayOutputStream

import com.sksamuel.avro4s.{Decoder, Encoder, RecordFormat, SchemaFor}
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

case class Label(value: String) extends AnyVal
case class Value[A](label: Label, value: A)

sealed trait OneOrTwo[A]
case class One[A](value: Value[A]) extends OneOrTwo[A]
case class Two[A](first: Value[A], second: Value[A]) extends OneOrTwo[A]
case class OneOrTwoWrapper[A](t: OneOrTwo[A])

object Bug {
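  // apply round-trips `a` through Avro binary encoding, via its GenericRecord representation,
  // and requires that decoding yields a value equal to the input.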

  def apply[T <: Product](a: T)(
    implicit schemaFor: SchemaFor[T],
    encoder: Encoder[T],
    decoder: Decoder[T]
  ): Unit = {

    val format = RecordFormat[T]
    val schema = schemaFor.schema
    val datumReader = new GenericDatumReader[GenericRecord](schema)
    val datumWriter = new GenericDatumWriter[GenericRecord](schema)

    val stream = new ByteArrayOutputStream()
    val bEncoder = EncoderFactory.get().binaryEncoder(stream, null)

    datumWriter.write(format.to(a), bEncoder)
    bEncoder.flush()

    val bytes = stream.toByteArray
    val bDecoder = DecoderFactory.get().binaryDecoder(bytes, null)
    val record = datumReader.read(null, bDecoder)
    require(format.from(record) == a)
  }

}

class GithubIssue235 extends AnyFunSuite with Matchers {
  test("Broken typeclass derivation upgrading from 1.9.0 to 2.0.1 #235") {
    val o = OneOrTwoWrapper(One(Value(Label("lbl"), "foo")))
    Bug(o)
  }
} 
Example 24
Source File: GithubIssue191.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.github

import java.io.ByteArrayOutputStream

import com.sksamuel.avro4s.{AvroOutputStream, AvroSchema}
import org.apache.avro.file.{DataFileReader, SeekableByteArrayInput}
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.util.Utf8
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

final case class SN(value: String) extends AnyVal
final case class SimpleUser(name: String, sn: Option[SN])

class GithubIssue191 extends AnyFunSuite with Matchers {

  test("writing out AnyVal in an option") {
    implicit val schema = AvroSchema[SimpleUser]
    val bytes = new ByteArrayOutputStream
    val out = AvroOutputStream.data[SimpleUser].to(bytes).build()
    out.write(SimpleUser("Tom", Some(SN("123"))))
    out.close()

    val datumReader = new GenericDatumReader[GenericRecord](schema)
    val dataFileReader = new DataFileReader[GenericRecord](new SeekableByteArrayInput(bytes.toByteArray), datumReader)
    val record = new Iterator[GenericRecord] {
      override def hasNext: Boolean = dataFileReader.hasNext
      override def next(): GenericRecord = dataFileReader.next
    }.toList.head
    record.getSchema shouldBe schema
    record.get("name") shouldBe new Utf8("Tom")
    record.get("sn") shouldBe new Utf8("123")
  }
} 
Example 25
Source File: AvroSerializer.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.coders.instances.kryo

import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.{Input, Output}
import com.twitter.chill.KSerializer
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.SpecificRecordBase
import org.apache.beam.sdk.coders.AvroCoder

import scala.collection.mutable.{Map => MMap}
import scala.util.Try

private[coders] class GenericAvroSerializer extends KSerializer[GenericRecord] {
  private lazy val cache: MMap[String, AvroCoder[GenericRecord]] = MMap()

  private def getCoder(schemaStr: String): AvroCoder[GenericRecord] =
    cache.getOrElseUpdate(schemaStr, AvroCoder.of(new Schema.Parser().parse(schemaStr)))
  private def getCoder(schemaStr: String, schema: Schema): AvroCoder[GenericRecord] =
    cache.getOrElseUpdate(schemaStr, AvroCoder.of(schema))

  override def write(kryo: Kryo, out: Output, obj: GenericRecord): Unit = {
    val schemaStr = obj.getSchema.toString
    val coder = this.getCoder(schemaStr, obj.getSchema)
    // write schema before every record in case it's not in reader serializer's cache
    out.writeString(schemaStr)
    coder.encode(obj, out)
  }

  override def read(kryo: Kryo, in: Input, cls: Class[GenericRecord]): GenericRecord = {
    val coder = this.getCoder(in.readString())
    coder.decode(in)
  }
}

private[coders] class SpecificAvroSerializer[T <: SpecificRecordBase] extends KSerializer[T] {
  private lazy val cache: MMap[Class[T], AvroCoder[T]] = MMap()

  private def getCoder(cls: Class[T]): AvroCoder[T] =
    cache.getOrElseUpdate(
      cls,
      Try(cls.getConstructor().newInstance().getSchema)
        .map(AvroCoder.of(cls, _))
        .getOrElse(AvroCoder.of(cls))
    )

  override def write(kser: Kryo, out: Output, obj: T): Unit =
    this.getCoder(obj.getClass.asInstanceOf[Class[T]]).encode(obj, out)

  override def read(kser: Kryo, in: Input, cls: Class[T]): T =
    this.getCoder(cls).decode(in)
} 
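Because write() prefixes every record with its schema JSON, the reading side can decode even when its own schema cache is cold, at the cost of a larger payload per element. A rough sketch of the registration pattern these serializers target (they are package-private, so the real wiring lives inside com.spotify.scio.coders; Kryo's addDefaultSerializer is used here purely for illustration):

import com.esotericsoftware.kryo.Kryo
import org.apache.avro.generic.GenericRecord

val kryo = new Kryo()
// route every GenericRecord implementation through the schema-prefixing serializer
kryo.addDefaultSerializer(classOf[GenericRecord], new GenericAvroSerializer)
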
Example 26
Source File: TestUtilsBase.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect

import java.util
import java.util.Collections

import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.kafka.connect.source.SourceTaskContext
import org.apache.kafka.connect.storage.OffsetStorageReader
import org.mockito.Mockito._
import org.mockito.MockitoSugar
import org.scalatest.BeforeAndAfter
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import scala.collection.JavaConverters._



    //set up partition
    val partition: util.Map[String, String] = Collections.singletonMap(lookupPartitionKey, table)
    //as a list to search for
    val partitionList: util.List[util.Map[String, String]] = List(partition).asJava
    //set up the offset
    val offset: util.Map[String, Object] = Collections.singletonMap(offsetColumn, offsetValue)
    //create offsets to initialize from
    val offsets: util.Map[util.Map[String, String], util.Map[String, Object]] = Map(partition -> offset).asJava

    //mock out reader and task context
    val taskContext = mock[SourceTaskContext]
    val reader = mock[OffsetStorageReader]
    when(reader.offsets(partitionList)).thenReturn(offsets)
    when(taskContext.offsetStorageReader()).thenReturn(reader)

    taskContext
  }
} 
Example 27
Source File: AvroConverter.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.converters.source

import java.io.File
import java.util.Collections

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import io.confluent.connect.avro.AvroData
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import org.apache.avro.{Schema => AvroSchema}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.source.SourceRecord
import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException


class AvroConverter extends Converter {
  private val avroData = new AvroData(8)
  private var sourceToSchemaMap: Map[String, AvroSchema] = Map.empty
  private var avroReadersMap: Map[String, GenericDatumReader[GenericRecord]] = Map.empty

  override def convert(kafkaTopic: String,
                       sourceTopic: String,
                       messageId: String,
                       bytes: Array[Byte],
                       keys: Seq[String] = Seq.empty,
                       keyDelimiter: String = "."): SourceRecord = {
    Option(bytes) match {
      case None =>
        new SourceRecord(Collections.singletonMap(Converter.TopicKey, sourceTopic),
          null,
          kafkaTopic,
          avroData.toConnectSchema(sourceToSchemaMap(sourceTopic)),
          null)
      case Some(_) =>
        val reader = avroReadersMap.getOrElse(sourceTopic.toLowerCase, throw new ConfigException(s"Invalid ${AvroConverter.SCHEMA_CONFIG} is not configured for $sourceTopic"))
        val decoder = DecoderFactory.get().binaryDecoder(bytes, null)
        val record = reader.read(null, decoder)
        val schemaAndValue = avroData.toConnectData(sourceToSchemaMap(sourceTopic.toLowerCase), record)
        val value = schemaAndValue.value()
        value match {
          case s: Struct if keys.nonEmpty =>
            val keysValue = keys.flatMap { key =>
              Option(KeyExtractor.extract(s, key.split('.').toVector)).map(_.toString)
            }.mkString(keyDelimiter)
            new SourceRecord(
              Collections.singletonMap(Converter.TopicKey, sourceTopic),
              null,
              kafkaTopic,
              Schema.STRING_SCHEMA,
              keysValue,
              schemaAndValue.schema(),
              schemaAndValue.value())
          case _ =>
            new SourceRecord(
              Collections.singletonMap(Converter.TopicKey, sourceTopic),
              null,
              kafkaTopic,
              MsgKey.schema,
              MsgKey.getStruct(sourceTopic, messageId),
              schemaAndValue.schema(),
              schemaAndValue.value())
        }

    }
  }

  override def initialize(config: Map[String, String]): Unit = {
    sourceToSchemaMap = AvroConverter.getSchemas(config)
    avroReadersMap = sourceToSchemaMap.map { case (key, schema) =>
      key -> new GenericDatumReader[GenericRecord](schema)
    }
  }
}

object AvroConverter {
  val SCHEMA_CONFIG = "connect.source.converter.avro.schemas"

  def getSchemas(config: Map[String, String]): Map[String, AvroSchema] = {
    config.getOrElse(SCHEMA_CONFIG, throw new ConfigException(s"$SCHEMA_CONFIG is not provided"))
      .toString
      .split(';')
      .filter(_.trim.nonEmpty)
      .map(_.split("="))
      .map {
        case Array(source, path) =>
          val file = new File(path)
          if (!file.exists()) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The file $path doesn't exist!")
          }
          val s = source.trim.toLowerCase()
          if (s.isEmpty) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The topic is not valid for entry containing $path")
          }
          s -> new AvroSchema.Parser().parse(file)
        case other => throw new ConfigException(s"$SCHEMA_CONFIG is not properly set. The format is Mqtt_Source->AVRO_FILE")
      }.toMap
  }
} 
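The SCHEMA_CONFIG value is parsed as semicolon-separated source=avro-file pairs; source names are lower-cased and each schema file must exist on disk. A minimal initialization sketch with hypothetical topic names and paths:

val converter = new AvroConverter()
converter.initialize(Map(
  AvroConverter.SCHEMA_CONFIG -> "/mqtt/sensor_a=/etc/schemas/sensor_a.avsc;/mqtt/sensor_b=/etc/schemas/sensor_b.avsc"
))
// afterwards, convert("kafka-sensors", "/mqtt/sensor_a", messageId, payloadBytes) decodes
// payloadBytes with sensor_a.avsc and emits a SourceRecord
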
Example 28
Source File: AvroConverter.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.converters.sink

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import io.confluent.connect.avro.AvroData
import java.io.ByteArrayOutputStream
import java.io.File
import org.apache.avro.{Schema => AvroSchema}
import org.apache.avro.generic.GenericRecord
import org.apache.avro.io.EncoderFactory
import org.apache.avro.reflect.ReflectDatumWriter
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException


class AvroConverter extends Converter {
  private val avroData = new AvroData(8)
  private var sinkToSchemaMap: Map[String, AvroSchema] = Map.empty
  private var avroWritersMap: Map[String, ReflectDatumWriter[Object]] = Map.empty

  override def convert(sinkTopic: String,
                       data: SinkRecord): SinkRecord = {
    Option(data) match {
      case None =>
        new SinkRecord(
          sinkTopic,
          0,
          null,
          null,
          avroData.toConnectSchema(sinkToSchemaMap(sinkTopic)),
          null,
          0
        )
      case Some(_) =>
        val kafkaTopic = data.topic()
        val writer = avroWritersMap.getOrElse(kafkaTopic.toLowerCase, throw new ConfigException(s"Invalid ${AvroConverter.SCHEMA_CONFIG} is not configured for $kafkaTopic"))

        val output = new ByteArrayOutputStream()
        val encoder = EncoderFactory.get().binaryEncoder(output, null)

        // AvroData yields a GenericRecord for struct values; ReflectDatumWriter serializes it directly
        val avro = avroData.fromConnectData(data.valueSchema(), data.value())
        writer.write(avro, encoder)
        encoder.flush()
        val arr = output.toByteArray

        new SinkRecord(
          kafkaTopic,
          data.kafkaPartition(),
          MsgKey.schema,
          MsgKey.getStruct(sinkTopic, data.key().toString()),
          data.valueSchema(),
          arr,
          0
        )


    }
  }

  override def initialize(config: Map[String, String]): Unit = {
    sinkToSchemaMap = AvroConverter.getSchemas(config)
    avroWritersMap = sinkToSchemaMap.map { case (key, schema) =>
      key -> new ReflectDatumWriter[Object](schema)
    }
  }
}

object AvroConverter {
  val SCHEMA_CONFIG = "connect.converter.avro.schemas"

  def getSchemas(config: Map[String, String]): Map[String, AvroSchema] = {
    config.getOrElse(SCHEMA_CONFIG, throw new ConfigException(s"$SCHEMA_CONFIG is not provided"))
      .toString
      .split(';')
      .filter(_.trim.nonEmpty)
      .map(_.split("="))
      .map {
        case Array(sink, path) =>
          val file = new File(path)
          if (!file.exists()) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The file $path doesn't exist!")
          }
          val s = sink.trim.toLowerCase()
          if (s.isEmpty) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The topic is not valid for entry containing $path")
          }
          s -> new AvroSchema.Parser().parse(file)
        case other => throw new ConfigException(s"$SCHEMA_CONFIG is not properly set. The format is Mqtt_Sink->AVRO_FILE")
      }.toMap
  }
} 
Example 29
Source File: AvroSerializer.scala    From kafka-connect-common   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.serialization

import java.io.{ByteArrayOutputStream, InputStream, OutputStream}

import com.sksamuel.avro4s.{RecordFormat, SchemaFor}
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

object AvroSerializer {
  def write[T <: Product](t: T)(implicit os: OutputStream, formatter: RecordFormat[T], schemaFor: SchemaFor[T]): Unit = write(apply(t), schemaFor())

  def write(record: GenericRecord, schema: Schema)(implicit os: OutputStream) = {
    val writer = new GenericDatumWriter[GenericRecord](schema)
    val encoder = EncoderFactory.get().binaryEncoder(os, null)

    writer.write(record, encoder)
    encoder.flush()
    os.flush()
  }

  def getBytes[T <: Product](t: T)(implicit recordFormat: RecordFormat[T], schemaFor: SchemaFor[T]): Array[Byte] = getBytes(recordFormat.to(t), schemaFor())

  def getBytes(record: GenericRecord, schema: Schema): Array[Byte] = {
    implicit val output = new ByteArrayOutputStream()
    write(record, schema)
    output.toByteArray
  }

  def read(is: InputStream, schema: Schema): GenericRecord = {
    val reader = new GenericDatumReader[GenericRecord](schema)
    val decoder = DecoderFactory.get().binaryDecoder(is, null)
    reader.read(null, decoder)
  }

  def read[T <: Product](is: InputStream)(implicit schemaFor: SchemaFor[T], recordFormat: RecordFormat[T]): T = recordFormat.from(read(is, schemaFor()))

  def apply[T <: Product](t: T)(implicit formatter: RecordFormat[T]): GenericRecord = formatter.to(t)
} 
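A round-trip sketch for these helpers, assuming the avro4s 1.x-style RecordFormat/SchemaFor implicits this object was written against and a hypothetical case class:

import java.io.ByteArrayInputStream
import com.sksamuel.avro4s.{RecordFormat, SchemaFor}

case class Visit(userId: String, count: Int) // hypothetical record type

implicit val format: RecordFormat[Visit] = RecordFormat[Visit]
implicit val schemaFor: SchemaFor[Visit] = SchemaFor[Visit]

val bytes = AvroSerializer.getBytes(Visit("u1", 3))
val back = AvroSerializer.read[Visit](new ByteArrayInputStream(bytes)) // back == Visit("u1", 3)
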
Example 30
Source File: FixAvroIO.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package fix
package v0_7_0

import com.spotify.scio.ContextAndArgs
import org.apache.avro.generic.GenericRecord
import com.spotify.scio.testing.{AvroIO, BigQueryIO, PipelineSpec, TextIO}

case class InputClass(s: String, i: Int) extends GenericRecord {
  def getSchema(): org.apache.avro.Schema = ???
  def get(x$1: String): Object = ???
  def put(x$1: String,x$2: Any): Unit = ???
  def get(x$1: Int): Object = ???
  def put(x$1: Int,x$2: Any): Unit = ???
}

case class OutputClass(result: String) extends GenericRecord {
  def getSchema(): org.apache.avro.Schema = ???
  def get(x$1: String): Object = ???
  def put(x$1: String,x$2: Any): Unit = ???
  def get(x$1: Int): Object = ???
  def put(x$1: Int,x$2: Any): Unit = ???
}

object TestJob

class ValidationJobTest extends PipelineSpec {
  val inputs: List[InputClass] = (1 to 10).toList.map{ i => InputClass(s"s$i", i) }
  val inputs2 = (1 to 10).zip(inputs).toMap
  val inputs3 = inputs2.values
  val expected = List(OutputClass("result"))

  "TestJob" should "run" in {
    JobTest[TestJob.type]
      .input(AvroIO("current"), inputs)
      .input(AvroIO("reference"), inputs2.values)
      .input(AvroIO("reference2"), inputs3)
      .input(AvroIO[InputClass]("donttouch"), inputs)
      .output[OutputClass](AvroIO("foo")){ coll =>
coll should containInAnyOrder(expected)
()
}
      .run()
  }
} 
Example 31
Source File: FixAvroIO.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package fix
package v0_7_0

import com.spotify.scio.ContextAndArgs
import org.apache.avro.generic.GenericRecord
import com.spotify.scio.testing.PipelineSpec
import com.spotify.scio.avro._
import com.spotify.scio.bigquery._
import com.spotify.scio.io._

case class InputClass(s: String, i: Int) extends GenericRecord {
  def getSchema(): org.apache.avro.Schema = ???
  def get(x$1: String): Object = ???
  def put(x$1: String,x$2: Any): Unit = ???
  def get(x$1: Int): Object = ???
  def put(x$1: Int,x$2: Any): Unit = ???
}

case class OutputClass(result: String) extends GenericRecord {
  def getSchema(): org.apache.avro.Schema = ???
  def get(x$1: String): Object = ???
  def put(x$1: String,x$2: Any): Unit = ???
  def get(x$1: Int): Object = ???
  def put(x$1: Int,x$2: Any): Unit = ???
}

object TestJob

class ValidationJobTest extends PipelineSpec {
  val inputs: List[InputClass] = (1 to 10).toList.map{ i => InputClass(s"s$i", i) }
  val inputs2 = (1 to 10).zip(inputs).toMap
  val inputs3 = inputs2.values
  val expected = List(OutputClass("result"))

  "TestJob" should "run" in {
    JobTest[TestJob.type]
      .input(AvroIO[InputClass]("current"), inputs)
      .input(AvroIO[GenericRecord]("reference"), inputs2.values)
      .input(AvroIO[InputClass]("reference2"), inputs3)
      .input(AvroIO[InputClass]("donttouch"), inputs)
      .output[OutputClass](AvroIO("foo")){ coll =>
coll should containInAnyOrder(expected)
()
}
      .run()
  }
} 
Example 32
Source File: CoderTestUtils.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.coders

import com.spotify.scio.avro.TestRecord
import org.apache.beam.sdk.coders.{Coder => BCoder}
import org.apache.avro.generic.GenericRecord
import org.apache.beam.sdk.util.CoderUtils

object CoderTestUtils {
  case class Pair(name: String, size: Int)
  case class CaseClassWithGenericRecord(name: String, size: Int, record: GenericRecord)
  case class CaseClassWithSpecificRecord(name: String, size: Int, record: TestRecord)

  def testRoundTrip[T](coder: BCoder[T], value: T): Boolean =
    testRoundTrip(coder, coder, value)

  def testRoundTrip[T](writer: BCoder[T], reader: BCoder[T], value: T): Boolean = {
    val bytes = CoderUtils.encodeToByteArray(writer, value)
    val result = CoderUtils.decodeFromByteArray(reader, bytes)
    result == value
  }
} 
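For example, a quick check against a built-in Beam coder (any BCoder[T] works the same way):

import org.apache.beam.sdk.coders.StringUtf8Coder

CoderTestUtils.testRoundTrip(StringUtf8Coder.of(), "hello") // true when encode/decode is lossless
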
Example 33
Source File: ProtobufUtilTest.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.util

import java.io.File
import java.nio.channels.Channels
import java.nio.file.Files

import com.spotify.scio.ScioContext
import com.spotify.scio.avro._
import com.spotify.scio.coders.Coder
import com.spotify.scio.proto.Track.TrackPB
import org.apache.avro.file.DataFileStream
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.beam.sdk.io.{FileSystems, LocalResources}
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

import scala.jdk.CollectionConverters._

class ProtobufUtilTest extends AnyFlatSpec with Matchers {

  "ProtobufUtil" should "convert Message -> GenericRecords that can be written and read" in {
    val sc = ScioContext()

    val dir = Files.createTempDirectory("protobuf-util-")
    val (path1, path2) = (new File(s"$dir/1"), new File(s"$dir/2"))
    path1.deleteOnExit()
    path2.deleteOnExit()
    dir.toFile.deleteOnExit()

    implicit val grCoder: Coder[GenericRecord] = ProtobufUtil.AvroMessageCoder

    val messages = sc
      .parallelize(1 to 10)
      .map(i => TrackPB.newBuilder().setTrackId(i.toString).build())

    messages
      .map(ProtobufUtil.toAvro[TrackPB])
      .saveAsAvroFile(
        path1.getPath,
        suffix = ".protobuf",
        metadata = ProtobufUtil.schemaMetadataOf[TrackPB],
        schema = ProtobufUtil.AvroMessageSchema,
        numShards = 1
      )

    val protoWriteTap = messages.saveAsProtobufFile(path2.getPath, numShards = 1)

    val result = sc.run().waitUntilDone()

    val (tapFromAvroWrite, tapFromProtoWrite) = (
      ObjectFileTap[TrackPB](ScioUtil.addPartSuffix(path1.getPath)),
      protoWriteTap.get(result)
    )

    tapFromAvroWrite.value.toList should contain theSameElementsAs tapFromProtoWrite.value.toList
    getMetadata(path1) should contain theSameElementsAs getMetadata(path2)
  }

  private def getMetadata(dir: File): Map[String, AnyRef] = {
    val files = dir.listFiles()
    if (files.length != 1) {
      fail(s"Directory $dir should contain 1 Avro file. Instead, found ${files.toList}")
    }

    val dfs = new DataFileStream[GenericRecord](
      Channels.newInputStream(FileSystems.open(LocalResources.fromFile(files(0), false))),
      new GenericDatumReader[GenericRecord]
    )

    dfs.getMetaKeys.asScala.map(k => (k, dfs.getMetaString(k))).toMap
  }
} 
Example 34
Source File: Pretty.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.testing

import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.SpecificRecordBase
import scala.jdk.CollectionConverters._
import com.spotify.scio.{registerSysProps, SysProp}
import scala.util.Try

@registerSysProps
object PrettySysProps {
  val PrettyPrint =
    SysProp("tests.prettyprint.colors", "Should pretty printed values be rendered with colors")
}

object Pretty {
  import pprint.Tree
  import fansi.{Color, Str}

  private def renderFieldName(n: String) =
    Tree.Lazy(ctx => List(Color.LightBlue(n).toString).iterator)

  private def renderGenericRecord: PartialFunction[GenericRecord, Tree] = {
    case g =>
      val renderer =
        new pprint.Renderer(
          printer.defaultWidth,
          printer.colorApplyPrefix,
          printer.colorLiteral,
          printer.defaultIndent
        )
      def render(tree: Tree): Str =
        Str.join(renderer.rec(tree, 0, 0).iter.toSeq: _*)
      Tree.Lazy { ctx =>
        val fields =
          for {
            f <- g.getSchema().getFields().asScala
          } yield Str.join(
            render(renderFieldName(f.name)),
            ": ",
            render(treeifyAvro(g.get(f.name())))
          )
        List(
          Color.LightGray("{ ").toString +
            fields.reduce((a, b) => Str.join(a, ", ", b)) +
            Color.LightGray(" }")
        ).iterator
      }
  }

  private def renderSpecificRecord: PartialFunction[SpecificRecordBase, Tree] = {
    case x =>
      val fs =
        for {
          f <- x.getSchema().getFields().asScala
        } yield Tree.Infix(renderFieldName(f.name), "=", treeifyAvro(x.get(f.name())))
      Tree.Apply(x.getClass().getSimpleName(), fs.iterator)
  }

  private def treeifyAvro: PartialFunction[Any, Tree] = {
    case x: SpecificRecordBase =>
      renderSpecificRecord(x)
    case g: GenericRecord =>
      renderGenericRecord(g)
    case x =>
      printer.treeify(x)
  }

  private val handlers: PartialFunction[Any, Tree] = {
    case x: GenericRecord => treeifyAvro(x)
  }

  private val useColors =
    PrettySysProps.PrettyPrint.valueOption
      .flatMap(x => Try(x.toBoolean).toOption)
      .getOrElse {
        // Crude test to check if the terminal seems to support colors
        (System.console() != null) && (System.getenv().get("TERM") != null)
      }

  val printer =
    if (useColors) {
      pprint.PPrinter(
        additionalHandlers = handlers
      )
    } else {
      pprint.PPrinter(
        additionalHandlers = handlers,
        colorLiteral = fansi.Attrs.Empty,
        colorApplyPrefix = fansi.Attrs.Empty
      )
    }
} 
Example 35
Source File: AvroInstances.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.schemas.instances

import com.spotify.scio.schemas.{RawRecord, Schema}
import org.apache.avro.specific.SpecificRecord
import org.apache.avro.generic.{GenericRecord, IndexedRecord}
import org.apache.beam.sdk.schemas.utils.AvroUtils
import org.apache.beam.sdk.schemas.{AvroRecordSchema, Schema => BSchema}
import org.apache.beam.sdk.transforms.SerializableFunction
import org.apache.beam.sdk.values.{Row, TypeDescriptor}

import scala.jdk.CollectionConverters._
import scala.reflect.{classTag, ClassTag}

trait AvroInstances {
  implicit def avroSchema[T <: SpecificRecord: ClassTag]: Schema[T] = {
    // TODO: broken because of a bug upstream https://issues.apache.org/jira/browse/BEAM-6742
    // RawRecord[T](new AvroRecordSchema())
    import org.apache.avro.reflect.ReflectData
    val rc = classTag[T].runtimeClass.asInstanceOf[Class[T]]
    val provider = new AvroRecordSchema()
    val td = TypeDescriptor.of(rc)
    val schema = provider.schemaFor(td)
    val avroSchema =
      new AvroInstances.SerializableSchema(ReflectData.get().getSchema(td.getRawType))

    def fromRow = provider.fromRowFunction(td)

    val toRow: SerializableFunction[T, Row] =
      new SerializableFunction[T, Row] {
        def apply(t: T): Row =
          AvroInstances.recordtoRow(schema, avroSchema, t)
      }
    RawRecord[T](schema, fromRow, toRow)
  }

  def fromAvroSchema(schema: org.apache.avro.Schema): Schema[GenericRecord] = {
    val beamSchema = AvroUtils.toBeamSchema(schema)
    val avroSchema = new AvroInstances.SerializableSchema(schema)
    val toRow = new SerializableFunction[GenericRecord, Row] {
      def apply(t: GenericRecord): Row =
        AvroInstances.recordtoRow[GenericRecord](beamSchema, avroSchema, t)
    }

    val fromRow = new SerializableFunction[Row, GenericRecord] {
      def apply(t: Row): GenericRecord =
        AvroUtils.toGenericRecord(t, avroSchema.get)
    }

    RawRecord[GenericRecord](beamSchema, fromRow, toRow)
  }
}

object AvroInstances {
  private class SerializableSchema(@transient private val schema: org.apache.avro.Schema)
      extends Serializable {
    private[this] val stringSchema = schema.toString
    def get: org.apache.avro.Schema = new org.apache.avro.Schema.Parser().parse(stringSchema)
  }

  // Workaround BEAM-6742
  private def recordtoRow[T <: IndexedRecord](
    schema: BSchema,
    avroSchema: SerializableSchema,
    t: T
  ): Row = {
    val row = Row.withSchema(schema)
    schema.getFields.asScala.zip(avroSchema.get.getFields.asScala).zipWithIndex.foreach {
      case ((f, a), i) =>
        val value = t.get(i)
        val v = AvroUtils.convertAvroFieldStrict(value, a.schema, f.getType)
        row.addValue(v)
    }
    row.build()
  }
} 
Example 36
Source File: AvroCoders.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.coders.instances

import java.io.{InputStream, OutputStream}

import com.spotify.scio.coders.{AvroCoderMacros, Coder}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.{SpecificData, SpecificFixed}
import org.apache.beam.sdk.coders.Coder.NonDeterministicException
import org.apache.beam.sdk.coders.{AtomicCoder, AvroCoder, StringUtf8Coder}
import org.apache.beam.sdk.util.common.ElementByteSizeObserver

import scala.reflect.{classTag, ClassTag}

final private class SlowGenericRecordCoder extends AtomicCoder[GenericRecord] {
  // TODO: can we find something more efficient than String ?
  private[this] val sc = StringUtf8Coder.of()

  override def encode(value: GenericRecord, os: OutputStream): Unit = {
    val schema = value.getSchema
    val coder = AvroCoder.of(schema)
    sc.encode(schema.toString, os)
    coder.encode(value, os)
  }

  override def decode(is: InputStream): GenericRecord = {
    val schemaStr = sc.decode(is)
    val schema = new Schema.Parser().parse(schemaStr)
    val coder = AvroCoder.of(schema)
    coder.decode(is)
  }

  // delegate methods for determinism and equality checks
  override def verifyDeterministic(): Unit =
    throw new NonDeterministicException(
      this,
      "Coder[GenericRecord] without schema is non-deterministic"
    )
  override def consistentWithEquals(): Boolean = false
  override def structuralValue(value: GenericRecord): AnyRef =
    AvroCoder.of(value.getSchema).structuralValue(value)

  // delegate methods for byte size estimation
  override def isRegisterByteSizeObserverCheap(value: GenericRecord): Boolean =
    AvroCoder.of(value.getSchema).isRegisterByteSizeObserverCheap(value)
  override def registerByteSizeObserver(
    value: GenericRecord,
    observer: ElementByteSizeObserver
  ): Unit =
    AvroCoder.of(value.getSchema).registerByteSizeObserver(value, observer)
}


  // TODO: Use a coder that does not serialize the schema
  def avroGenericRecordCoder(schema: Schema): Coder[GenericRecord] =
    Coder.beam(AvroCoder.of(schema))

  // XXX: similar to GenericAvroSerializer
  def avroGenericRecordCoder: Coder[GenericRecord] =
    Coder.beam(new SlowGenericRecordCoder)

  import org.apache.avro.specific.SpecificRecordBase
  implicit def genAvro[T <: SpecificRecordBase]: Coder[T] =
    macro AvroCoderMacros.staticInvokeCoder[T]

  implicit def avroSpecificFixedCoder[T <: SpecificFixed: ClassTag]: Coder[T] =
    SpecificFixedCoder[T]
} 
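When the schema is known up front, the schema-aware coder is the better choice: SlowGenericRecordCoder re-serializes the schema string with every element and declares itself non-deterministic. A small sketch, assuming schemaJson holds the Avro schema as a string:

import com.spotify.scio.coders.Coder
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord

val schema: Schema = new Schema.Parser().parse(schemaJson) // schemaJson assumed available
implicit val grCoder: Coder[GenericRecord] = Coder.avroGenericRecordCoder(schema)
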
Example 37
Source File: StdAvroModelFactory.scala    From aloha   with MIT License 5 votes vote down vote up
package com.eharmony.aloha.factory.avro

import java.io.File

import org.apache.commons.{vfs => vfs1, vfs2}
import com.eharmony.aloha.io.vfs.{Vfs1, Vfs2}
import com.eharmony.aloha.audit.impl.avro.Score
import com.eharmony.aloha.factory.ModelFactory
import org.apache.avro.generic.GenericRecord

import scala.util.Try




  @deprecated(message = "Prefer StdAvroModelFactory.fromConfig(conf: FactoryConfig)", since = "4.0.1")
  def apply(modelDomainSchemaVfsUrl: String,
            modelCodomainRefInfoStr: String,
            imports: Seq[String] = Nil,
            classCacheDir: Option[File] = None,
            dereferenceAsOptional: Boolean = true,
            useVfs2: Boolean = true): Try[ModelFactory[GenericRecord, Score]] = {

    val vfs = url(modelDomainSchemaVfsUrl, useVfs2)

    vfs.flatMap { u =>
      UrlConfig(
        u,
        modelCodomainRefInfoStr,
        imports,
        classCacheDir,
        dereferenceAsOptional
      )()
    }
  }

  private[this] def url(modelDomainSchemaVfsUrl: String, useVfs2: Boolean) = {
    val u =
      if (useVfs2)
        Try { Vfs2(vfs2.VFS.getManager.resolveFile(modelDomainSchemaVfsUrl)) }
      else Try { Vfs1(vfs1.VFS.getManager.resolveFile(modelDomainSchemaVfsUrl)) }
    FactoryConfig.wrapException(u)
  }
} 
Example 38
Source File: AvroBytesUtil.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.coders

import java.nio.ByteBuffer

import org.apache.avro.{Schema => ASchema}
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.beam.sdk.coders.{Coder => BCoder}
import org.apache.beam.sdk.util.CoderUtils

import scala.jdk.CollectionConverters._

private[scio] object AvroBytesUtil {
  val schema: ASchema = {
    val s = ASchema.createRecord("AvroBytesRecord", null, null, false)
    s.setFields(
      List(
        new ASchema.Field(
          "bytes",
          ASchema.create(ASchema.Type.BYTES),
          null,
          null.asInstanceOf[Object]
        )
      ).asJava
    )
    s
  }

  def encode[T](coder: BCoder[T], obj: T): GenericRecord = {
    val bytes = CoderUtils.encodeToByteArray(coder, obj)
    val record = new GenericData.Record(schema)
    record.put("bytes", ByteBuffer.wrap(bytes))
    record
  }

  def decode[T](coder: BCoder[T], record: GenericRecord): T = {
    val bb = record.get("bytes").asInstanceOf[ByteBuffer]
    val bytes =
      java.util.Arrays.copyOfRange(bb.array(), bb.position(), bb.limit())
    CoderUtils.decodeFromByteArray(coder, bytes)
  }
} 
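A minimal round-trip sketch that wraps an arbitrary Beam coder's output in the single-field record above:

import org.apache.beam.sdk.coders.StringUtf8Coder

val coder = StringUtf8Coder.of()
val record = AvroBytesUtil.encode(coder, "hello") // GenericRecord with one "bytes" field
val decoded = AvroBytesUtil.decode(coder, record) // returns "hello"
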
Example 39
Source File: GroupByBenchmark.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.jmh

import com.spotify.scio.{ScioContext, ScioExecutionContext}
import com.spotify.scio.avro._
import com.spotify.scio.coders._
import org.apache.beam.sdk.coders.{KvCoder, Coder => BCoder}
import org.apache.beam.sdk.values.KV
import org.apache.beam.sdk.transforms.GroupByKey
import org.apache.beam.sdk.options.{PipelineOptions, PipelineOptionsFactory}
import java.util.concurrent.TimeUnit

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.openjdk.jmh.annotations._

import scala.jdk.CollectionConverters._
@BenchmarkMode(Array(Mode.AverageTime))
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Thread)
class GroupByBenchmark {
  val schema =
    """
      {
        "type": "record",
        "name": "Event",
        "namespace": "smbjoin",
        "fields": [
          {
            "name": "id",
            "type": "string"
          },
          {
            "name": "value",
            "type": "double"
          }
        ]
      }
    """

  val avroSchema =
    new Schema.Parser().parse(schema)

  private def runWithContext[T](fn: ScioContext => T): ScioExecutionContext = {
    val opts = PipelineOptionsFactory.as(classOf[PipelineOptions])
    val sc = ScioContext(opts)
    fn(sc)
    sc.run()
  }

  val source = "src/test/resources/events-10000-0.avro"
  implicit val coderGenericRecord: Coder[GenericRecord] =
    Coder.avroGenericRecordCoder(avroSchema)

  val charCoder = CoderMaterializer.beamWithDefault(Coder[Char])
  val doubleCoder = CoderMaterializer.beamWithDefault(Coder[Double])
  val kvCoder: BCoder[KV[Char, Double]] = KvCoder.of(charCoder, doubleCoder)

  @Benchmark
  def testScioGroupByKey: ScioExecutionContext =
    runWithContext { sc =>
      sc.avroFile(source, schema = avroSchema)
        .map(rec => (rec.get("id").toString.head, rec.get("value").asInstanceOf[Double]))
        .groupByKey
    }

  @Benchmark
  def testBeamGroupByKey: ScioExecutionContext =
    runWithContext { sc =>
      sc.wrap {
        sc.avroFile(source, schema = avroSchema)
          .map { rec =>
            KV.of(rec.get("id").toString.head, rec.get("value").asInstanceOf[Double])
          }
          .internal
          .setCoder(kvCoder)
          .apply(GroupByKey.create[Char, Double])
      }.map(kv => (kv.getKey, kv.getValue.asScala))
    }
} 
Example 40
Source File: BigQueryIT.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.extra.bigquery

import java.{util => ju}

import com.google.protobuf.ByteString
import com.spotify.scio.avro.types.AvroType
import com.spotify.scio.bigquery.client.BigQuery
import com.spotify.scio.bigquery.Table
import com.spotify.scio.bigquery.TableRow
import com.spotify.scio.coders._
import com.spotify.scio.ContextAndArgs
import org.apache.avro.generic.GenericRecord
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryAvroUtilsWrapper
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

object BigQueryIT {
  @AvroType.fromSchema("""{
      | "type":"record",
      | "name":"Account",
      | "namespace":"com.spotify.scio.avro",
      | "doc":"Record for an account",
      | "fields":[
      |   {"name":"id","type":"long"},
      |   {"name":"type","type":"string"},
      |   {"name":"name","type":"string"},
      |   {"name":"amount","type":"double"},
      |   {"name":"secret","type":"bytes"}]}
    """.stripMargin)
  class Account

  implicit def genericCoder = Coder.avroGenericRecordCoder(Account.schema)

}

final class BigQueryIT extends AnyFlatSpec with Matchers {
  import BigQueryIT._

  it should "save avro to BigQuery" in {
    val args = Array(
      "--project=data-integration-test",
      "--tempLocation=gs://data-integration-test-eu/temp"
    )
    val (sc, _) = ContextAndArgs(args)
    val prefix = ju.UUID.randomUUID().toString.replaceAll("-", "")
    val table = Table.Spec(s"data-integration-test:bigquery_avro_it.${prefix}_accounts")

    val data: Seq[GenericRecord] = (1 to 100).map { i =>
      Account.toGenericRecord(
        Account(i, "checking", s"account$i", i.toDouble, ByteString.copyFromUtf8("%20cフーバー"))
      )
    }

    val tap = sc
      .parallelize(data)
      .saveAvroAsBigQuery(
        table.ref,
        Account.schema,
        writeDisposition = WriteDisposition.WRITE_EMPTY,
        createDisposition = CreateDisposition.CREATE_IF_NEEDED
      )

    val result = sc.run().waitUntilDone()

    val ts = BigQuery.defaultInstance().tables.schema(table.ref)
    val expected: Seq[TableRow] = data.map { gr =>
      BigQueryAvroUtilsWrapper.convertGenericRecordToTableRow(gr, ts)
    }

    result.tap(tap).value.toSet shouldEqual expected.toSet
  }

} 
Example 41
Source File: AvroUtils.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.avro

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}

import scala.jdk.CollectionConverters._

object AvroUtils {
  private def f(name: String, tpe: Schema.Type) =
    new Schema.Field(
      name,
      Schema.createUnion(List(Schema.create(Schema.Type.NULL), Schema.create(tpe)).asJava),
      null: String,
      null: AnyRef
    )

  private def fArr(name: String, tpe: Schema.Type) =
    new Schema.Field(name, Schema.createArray(Schema.create(tpe)), null: String, null: AnyRef)

  val schema = Schema.createRecord("GenericTestRecord", null, null, false)
  schema.setFields(
    List(
      f("int_field", Schema.Type.INT),
      f("long_field", Schema.Type.LONG),
      f("float_field", Schema.Type.FLOAT),
      f("double_field", Schema.Type.DOUBLE),
      f("boolean_field", Schema.Type.BOOLEAN),
      f("string_field", Schema.Type.STRING),
      fArr("array_field", Schema.Type.STRING)
    ).asJava
  )

  def newGenericRecord(i: Int): GenericRecord = {
    val r = new GenericData.Record(schema)
    r.put("int_field", 1 * i)
    r.put("long_field", 1L * i)
    r.put("float_field", 1f * i)
    r.put("double_field", 1.0 * i)
    r.put("boolean_field", true)
    r.put("string_field", "hello")
    r.put("array_field", List[CharSequence]("a", "b", "c").asJava)
    r
  }

  def newSpecificRecord(i: Int): TestRecord =
    new TestRecord(
      i,
      i.toLong,
      i.toFloat,
      i.toDouble,
      true,
      "hello",
      List[CharSequence]("a", "b", "c").asJava
    )
} 
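A quick usage sketch of the generic helper (each scalar field is declared as a union with NULL, so unset fields simply stay null):

val r = AvroUtils.newGenericRecord(2)
r.get("int_field")    // 2
r.get("string_field") // "hello"
r.getSchema.getName   // "GenericTestRecord"
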
Example 42
Source File: MagnolifyAvroExampleTest.scala    From scio   with Apache License 2.0 5 votes vote down vote up
package com.spotify.scio.examples.extra

import com.spotify.scio.avro.AvroIO
import com.spotify.scio.io._
import com.spotify.scio.testing._
import org.apache.avro.generic.{GenericData, GenericRecord}

class MagnolifyAvroExampleTest extends PipelineSpec {
  import MagnolifyAvroExample._

  val textIn = Seq("a b c d e", "a b a b")
  val wordCount = Seq(("a", 3L), ("b", 3L), ("c", 1L), ("d", 1L), ("e", 1L))
  val records: Seq[GenericRecord] = wordCount.map { kv =>
    val r = new GenericData.Record(wordCountType.schema)
    r.put("word", kv._1)
    r.put("count", kv._2)
    r
  }
  val textOut = wordCount.map(kv => kv._1 + ": " + kv._2)

  "MagnolifyAvroWriteExample" should "work" in {
    JobTest[com.spotify.scio.examples.extra.MagnolifyAvroWriteExample.type]
      .args("--input=in.txt", "--output=wc.avro")
      .input(TextIO("in.txt"), textIn)
      .output(AvroIO[GenericRecord]("wc.avro"))(coll => coll should containInAnyOrder(records))
      .run()
  }

  "MagnolifyAvroReadExample" should "work" in {
    JobTest[com.spotify.scio.examples.extra.MagnolifyAvroReadExample.type]
      .args("--input=wc.avro", "--output=out.txt")
      .input(AvroIO[GenericRecord]("wc.avro"), records)
      .output(TextIO("out.txt"))(coll => coll should containInAnyOrder(textOut))
      .run()
  }
} 
Example 43
Source File: Utils.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.hbase

import java.util
import java.util.Comparator

import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.catalyst.expressions.MutableRow
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.execution.SparkSqlSerializer
import org.apache.spark.sql.types._

import scala.collection.mutable.ArrayBuffer
import scala.math.Ordering

object Utils {

  def setRowCol(
      row: MutableRow,
      field: (Field, Int),
      src: HBaseType,
      offset: Int,
      length: Int): Unit = {
    val index = field._2
    val f = field._1
    if (f.sedes.isDefined) {
      // If we already have sedes defined, use it.
      val m = f.sedes.get.deserialize(src, offset, length)
      row.update(index, m)
    } else if (f.exeSchema.isDefined) {
      // println("avro schema is defined to do deserialization")
      // If we have avro schema defined, use it to get record, and then covert them to catalyst data type
      val m = AvroSedes.deserialize(src, f.exeSchema.get)
      // println(m)
      val n = f.avroToCatalyst.map(_(m))
      row.update(index, n.get)
    } else  {
      // Fall back to atomic type
      f.dt match {
        case BooleanType => row.setBoolean(index, toBoolean(src, offset))
        case ByteType => row.setByte(index, src(offset))
        case DoubleType => row.setDouble(index, Bytes.toDouble(src, offset))
        case FloatType => row.setFloat(index, Bytes.toFloat(src, offset))
        case IntegerType => row.setInt(index, Bytes.toInt(src, offset))
        case LongType => row.setLong(index, Bytes.toLong(src, offset))
        case ShortType => row.setShort(index, Bytes.toShort(src, offset))
        case StringType => row.update(index, toUTF8String(src, offset, length))
        case BinaryType =>
          val newArray = new Array[Byte](length)
          System.arraycopy(src, offset, newArray, 0, length)
          row.update(index, newArray)
        case _ => row.update(index, SparkSqlSerializer.deserialize[Any](src)) //TODO
      }
    }
  }

  // convert input to data type
  def toBytes(input: Any, field: Field): Array[Byte] = {
    if (field.sedes.isDefined) {
      field.sedes.get.serialize(input)
    } else if (field.schema.isDefined) {
      // Here we assume the top level type is structType
      val record = field.catalystToAvro(input)
      AvroSedes.serialize(record, field.schema.get)
    } else {
      input match {
        case data: Boolean => Bytes.toBytes(data)
        case data: Byte => Array(data)
        case data: Array[Byte] => data
        case data: Double => Bytes.toBytes(data)
        case data: Float => Bytes.toBytes(data)
        case data: Int => Bytes.toBytes(data)
        case data: Long => Bytes.toBytes(data)
        case data: Short => Bytes.toBytes(data)
        case data: UTF8String => data.getBytes
        case data: String => Bytes.toBytes(data)
          //Bytes.toBytes(input.asInstanceOf[String])//input.asInstanceOf[UTF8String].getBytes
        case _ => throw new Exception(s"unsupported data type ${field.dt}") //TODO
      }
    }
  }

  def toBoolean(input: HBaseType, offset: Int): Boolean = {
    input(offset) != 0
  }

  def toUTF8String(input: HBaseType, offset: Int, length: Int): UTF8String = {
    UTF8String(input.slice(offset, offset + length))
  }
} 
Example 44
Source File: Sedes.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.hbase

import java.io.ByteArrayInputStream

import org.apache.avro.Schema
import org.apache.avro.Schema.Type._
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io._
import org.apache.commons.io.output.ByteArrayOutputStream
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.types._

trait Sedes {
  def serialize(value: Any): Array[Byte]
  def deserialize(bytes: Array[Byte], start: Int, end: Int): Any
}

class DoubleSedes extends Sedes {
  override def serialize(value: Any): Array[Byte] = Bytes.toBytes(value.asInstanceOf[Double])
  override def deserialize(bytes: Array[Byte], start: Int, end: Int): Any = {
    Bytes.toLong(bytes, start)
  }
} 
Example 45
Source File: StdAvroModelFactoryTest.scala    From aloha   with MIT License 5 votes vote down vote up
package com.eharmony.aloha.factory.avro

import com.eharmony.aloha.audit.impl.avro.Score
import com.eharmony.aloha.factory.ModelFactory
import com.eharmony.aloha.io.vfs.Vfs1
import com.eharmony.aloha.models.Model
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.commons.io.IOUtils
import org.junit.Assert.assertEquals
import org.junit.Test
import org.junit.runner.RunWith
import org.junit.runners.BlockJUnit4ClassRunner

import scala.util.Try


  private[this] def record = {
    val r = new GenericData.Record(TheSchema)
    r.put("req_str_1", "smart handsome stubborn")
    r
  }
}

object StdAvroModelFactoryTest {
  private lazy val TheSchema = {
    val is = getClass.getClassLoader.getResourceAsStream(SchemaUrlResource)
    try new Schema.Parser().parse(is) finally IOUtils.closeQuietly(is)
  }

  private val ExpectedResult = 7d

  private val SchemaUrlResource = "avro/class7.avpr"

  private val SchemaUrl = s"res:$SchemaUrlResource"

  private val SchemaFile = new java.io.File(getClass.getClassLoader.getResource(SchemaUrlResource).getFile)

  private val SchemaVfs1FileObject = org.apache.commons.vfs.VFS.getManager.resolveFile(SchemaUrl)

  private val SchemaVfs2FileObject = org.apache.commons.vfs2.VFS.getManager.resolveFile(SchemaUrl)

  private val Imports = Seq("com.eharmony.aloha.feature.BasicFunctions._", "scala.math._")

  private val ReturnType = "Double"

  private val ModelJson =
    """
      |{
      |  "modelType": "Regression",
      |  "modelId": { "id": 0, "name": "" },
      |  "features" : {
      |    "my_attributes": "${req_str_1}.split(\"\\\\W+\").map(v => (s\"=$v\", 1.0))"
      |  },
      |  "weights": {
      |    "my_attributes=handsome": 1,
      |    "my_attributes=smart": 2,
      |    "my_attributes=stubborn": 4
      |  }
      |}
    """.stripMargin
} 
Example 46
Source File: ImplicitsTest.scala    From aloha   with MIT License 5 votes vote down vote up
package com.eharmony.aloha.audit.impl.avro

import com.google.common.collect.Lists
import org.junit.Assert.assertEquals
import org.junit.Test
import org.junit.runner.RunWith
import org.junit.runners.BlockJUnit4ClassRunner

import scala.collection.JavaConverters.seqAsJavaListConverter
import com.eharmony.aloha.audit.impl.avro.Implicits.{RichFlatScore, RichScore}
import java.{lang => jl, util => ju}

import org.apache.avro.generic.GenericRecord


  @Test def testAllFieldsAppear(): Unit = {
    val s = filledInScore
    assertEquals(s, s.toFlatScore.toScore)
  }

  @Test def testSameFieldsInGenericRecord(): Unit = {
    val s = filledInScore
    val s1 = s.asInstanceOf[GenericRecord]
    val s2 = s.toFlatScore.asInstanceOf[GenericRecord]

    testStuff(s1, s2, Map(
      "model" -> modelId,
      "value" -> value,
      "errorMsgs" -> errors,
      "missingVarNames" -> missing,
      "prob" -> prob
    ))
  }

  private[this] def testStuff(r1: GenericRecord, r2: GenericRecord, data: Map[String, Any]): Unit = {
    data.foreach { case (k, v) =>
      val v1 = r1.get(k)
      val v2 = r2.get(k)
      assertEquals(s"for r1('$k') = $v1.  Expected $v", v, r1.get(k))
      assertEquals(s"for r2('$k') = $v2.  Expected $v", v, r2.get(k))
    }
  }
}


object ImplicitsTest {
  private def filledInScore = new Score(modelId, value, subvalues, errors, missing, prob)
  private def modelId = new ModelId(5L, "five")
  private def value: jl.Double = 13d
  private def subvalues = Lists.newArrayList(scr(12L, 8))
  private def errors: ju.List[CharSequence] = Lists.newArrayList("one error", "two errors")
  private def missing: ju.List[CharSequence] =
    Lists.newArrayList("some feature", "another feature", "yet another feature")
  private def prob: jl.Float = 1f

  private lazy val score: Score =
    scr(1, 1,
      scr(2L, 2,
        scr(4f, 4),
        scr(5,  5)
      ),
      scr(3d, 3,
        scr(6d, 6),
        scr(7L, 7)
      )
    )

  private lazy val irregularTree: Score =
    scr(1, 1,
      scr(2L, 2),
      scr(3d, 3,
        scr(5d, 5),
        scr(6L, 6)
      ),
      scr(4d, 4,
        scr(7L, 7)
      )
    )

  private[this] def scr(value: Any, id: Long, children: Score*): Score = {
    new Score(
      new ModelId(id, ""),
      value,
      Lists.newArrayList(children.asJava),
      java.util.Collections.emptyList(),
      java.util.Collections.emptyList(),
      null
    )
  }
} 
Example 47
Source File: AvroDataInputStream.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s

import java.io.InputStream

import org.apache.avro.Schema
import org.apache.avro.file.DataFileStream
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.avro.io.DatumReader

import scala.util.Try

class AvroDataInputStream[T](in: InputStream,
                             writerSchema: Option[Schema])
                            (implicit decoder: Decoder[T]) extends AvroInputStream[T] {

  val resolved = decoder.resolveDecoder()

  // if no reader or writer schema is specified, then we create a reader that uses what's present in the files
  private val datumReader = writerSchema match {
    case Some(writer) => GenericData.get.createDatumReader(writer, resolved.schema)
    case None => GenericData.get.createDatumReader(null, resolved.schema)
  }

  private val dataFileReader = new DataFileStream[GenericRecord](in, datumReader.asInstanceOf[DatumReader[GenericRecord]])

  override def iterator: Iterator[T] = new Iterator[T] {
    override def hasNext: Boolean = dataFileReader.hasNext
    override def next(): T = {
      val record = dataFileReader.next
      resolved.decode(record)
    }
  }

  override def tryIterator: Iterator[Try[T]] = new Iterator[Try[T]] {
    override def hasNext: Boolean = dataFileReader.hasNext
    override def next(): Try[T] = Try {
      val record = dataFileReader.next
      resolved.decode(record)
    }
  }

  override def close(): Unit = in.close()
} 
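A minimal read sketch, assuming bytes were produced by avro4s's data-file writer (as in the GithubIssue191 test above) and that a Decoder for the case class can be derived:

import java.io.ByteArrayInputStream

case class User(name: String, age: Int) // hypothetical type mirroring the data that was written

val in = new AvroDataInputStream[User](new ByteArrayInputStream(bytes), writerSchema = None)
val users: List[User] = in.iterator.toList
in.close()
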
Example 48
Source File: AvroRecord.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.kafka.producer

import com.pluralsight.hydra.avro.JsonConverter
import hydra.core.transport.AckStrategy
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.commons.lang3.StringUtils


case class AvroRecord(
    destination: String,
    schema: Schema,
    key: String,
    payload: GenericRecord,
    ackStrategy: AckStrategy
) extends KafkaRecord[String, GenericRecord]

object AvroRecord {

  def apply(
      destination: String,
      schema: Schema,
      key: Option[String],
      json: String,
      ackStrategy: AckStrategy,
      useStrictValidation: Boolean = false
  ): AvroRecord = {

    val payload: GenericRecord = {
      val converter: JsonConverter[GenericRecord] =
        new JsonConverter[GenericRecord](schema, useStrictValidation)
      converter.convert(json)
    }

    AvroRecord(destination, schema, key.orNull, payload, ackStrategy)
  }

  def apply(
      destination: String,
      schema: Schema,
      key: Option[String],
      record: GenericRecord,
      ackStrategy: AckStrategy
  ): AvroRecord = {
    AvroRecord(destination, schema, key.orNull, record, ackStrategy)
  }
} 
Example 49
Source File: SpecificDefautValuesSpec.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
import test._
import org.specs2.mutable.Specification
import java.io.File
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.file.DataFileReader

import DefaultEnum._

class SpecificDefaultValuesSpec extends Specification {

  "A case class with default values" should {
    "deserialize correctly" in {
      val record = DefaultTest()
      val records = List(record)
      
      val fileName = s"${records.head.getClass.getName}"
      val fileEnding = "avro"
      val file = File.createTempFile(fileName, fileEnding)
      file.deleteOnExit()
      SpecificTestUtil.write(file, records)
      
      val dummyRecord = new GenericDatumReader[GenericRecord]
      val schema = new DataFileReader(file, dummyRecord).getSchema
      val userDatumReader = new SpecificDatumReader[DefaultTest](schema)
      val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader)
      val sameRecord = dataFileReader.next

      sameRecord.suit === SPADES
      sameRecord.number === 0
      sameRecord.str === "str"
      sameRecord.optionString === None
      sameRecord.optionStringValue === Some("default")
      sameRecord.embedded === Embedded(1)
      sameRecord.defaultArray === List(1,3,4,5)
      sameRecord.optionalEnum === None
      sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas")
      sameRecord.byt === "\u00FF".getBytes
    }
  }

} 
Example 50
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]()
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 51
Source File: SpecificDefautValuesSpec.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
import test._
import org.specs2.mutable.Specification
import java.io.File
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.file.DataFileReader

import DefaultEnum._

class SpecificDefaultValuesSpec extends Specification {

  "A case class with default values" should {
    "deserialize correctly" in {
      val record = DefaultTest()
      val records = List(record)
      
      val fileName = s"${records.head.getClass.getName}"
      val fileEnding = "avro"
      val file = File.createTempFile(fileName, fileEnding)
      file.deleteOnExit()
      SpecificTestUtil.write(file, records)
      
      val dummyRecord = new GenericDatumReader[GenericRecord]
      val schema = new DataFileReader(file, dummyRecord).getSchema
      val userDatumReader = new SpecificDatumReader[DefaultTest](schema)
      val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader)
      val sameRecord = dataFileReader.next

      sameRecord.suit === SPADES
      sameRecord.number === 0
      sameRecord.str === "str"
      sameRecord.optionString === None
      sameRecord.optionStringValue === Some("default")
      sameRecord.embedded === Embedded(1)
      sameRecord.defaultArray === List(1,3,4,5)
      sameRecord.optionalEnum === None
      sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas")
      sameRecord.byt === "\u00FF".getBytes
    }
  }

} 
Example 52
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush()
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close()
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 53
Source File: SpecificDefaultValuesSpec.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
import test._
import org.specs2.mutable.Specification
import java.io.File
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.file.DataFileReader

class SpecificDefaultValuesSpec extends Specification {

  "A case class with default values" should {
    "deserialize correctly" in {
      val record = DefaultTest()
      val records = List(record)
      
      val fileName = s"${records.head.getClass.getName}"
      val fileEnding = "avro"
      val file = File.createTempFile(fileName, fileEnding)
      file.deleteOnExit()
      SpecificTestUtil.write(file, records)
      
      val dummyRecord = new GenericDatumReader[GenericRecord]
      val schema = new DataFileReader(file, dummyRecord).getSchema
      val userDatumReader = new SpecificDatumReader[DefaultTest](schema)
      val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader)
      val sameRecord = dataFileReader.next

      sameRecord.suit === "SPADES"
      sameRecord.number === 0
      sameRecord.str === "str"
      sameRecord.optionString === None
      sameRecord.optionStringValue === Some("default")
      sameRecord.embedded === Embedded(1)
      sameRecord.defaultArray === Array(1,3,4,5)
      sameRecord.optionalEnum === None
      sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas")
      sameRecord.byt === "\u00FF".getBytes
    }
  }

} 
Example 54
Source File: BytesWithSchemaToObject.scala    From trucking-iot   with Apache License 2.0 5 votes vote down vote up
package com.orendainx.trucking.storm.bolts

import java.io.ByteArrayInputStream
import java.nio.charset.StandardCharsets
import java.util

import com.hortonworks.registries.schemaregistry.SchemaMetadata
import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer
import com.orendainx.trucking.commons.models.{EnrichedTruckData, TrafficData}
import com.typesafe.scalalogging.Logger
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.storm.task.{OutputCollector, TopologyContext}
import org.apache.storm.topology.OutputFieldsDeclarer
import org.apache.storm.topology.base.BaseRichBolt
import org.apache.storm.tuple.{Fields, Tuple, Values}

import scala.collection.JavaConversions._

// Note: the original class declaration and its prepare/execute/declareOutputFields methods are
// truncated in this listing; a plausible header, assumed from the imports and the sibling bolts
// below, is restored so the helper methods have an enclosing class.
class BytesWithSchemaToObject extends BaseRichBolt {
  // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object
  private def recordToEnrichedTruckData(r: GenericRecord): EnrichedTruckData =
    EnrichedTruckData(
      r.get("eventTime").toString.toLong,
      r.get("truckId").toString.toInt,
      r.get("driverId").toString.toInt,
      r.get("driverName").toString,
      r.get("routeId").toString.toInt,
      r.get("routeName").toString,
      r.get("latitude").toString.toDouble,
      r.get("longitude").toString.toDouble,
      r.get("speed").toString.toInt,
      r.get("eventType").toString,
      r.get("foggy").toString.toInt,
      r.get("rainy").toString.toInt,
      r.get("windy").toString.toInt)

  // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object
  private def recordToTrafficData(r: GenericRecord): TrafficData =
    TrafficData(r.get("eventTime").toString.toLong, r.get("routeId").toString.toInt, r.get("congestionLevel").toString.toInt)
} 
Example 55
Source File: NiFiPacketWithSchemaToObject.scala    From trucking-iot   with Apache License 2.0 5 votes vote down vote up
package com.orendainx.trucking.storm.bolts

import java.io.ByteArrayInputStream
import java.util

import com.hortonworks.registries.schemaregistry.SchemaMetadata
import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer
import com.orendainx.trucking.commons.models.{EnrichedTruckData, TrafficData}
import com.typesafe.scalalogging.Logger
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.nifi.storm.NiFiDataPacket
import org.apache.storm.task.{OutputCollector, TopologyContext}
import org.apache.storm.topology.OutputFieldsDeclarer
import org.apache.storm.topology.base.BaseRichBolt
import org.apache.storm.tuple.{Fields, Tuple, Values}

import scala.collection.JavaConversions._


class NiFiPacketWithSchemaToObject extends BaseRichBolt {

  private lazy val log = Logger(this.getClass)
  private var outputCollector: OutputCollector = _

  // Declare schema-related fields to be initialized when this component's prepare() method is called
  private var schemaRegistryClient: SchemaRegistryClient = _
  private var deserializer: AvroSnapshotDeserializer = _
  private var truckDataSchemaMetadata: SchemaMetadata = _
  private var trafficDataSchemaMetadata: SchemaMetadata = _

  override def prepare(stormConf: util.Map[_, _], context: TopologyContext, collector: OutputCollector): Unit = {

    outputCollector = collector

    val schemaRegistryUrl = stormConf.get(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name()).toString
    val clientConfig = Map(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name() -> schemaRegistryUrl)

    schemaRegistryClient = new SchemaRegistryClient(clientConfig)
    truckDataSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("EnrichedTruckData").getSchemaMetadata
    trafficDataSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("TrafficData").getSchemaMetadata
    deserializer = schemaRegistryClient.getDefaultDeserializer(AvroSchemaProvider.TYPE).asInstanceOf[AvroSnapshotDeserializer]
    deserializer.init(clientConfig)
  }

  override def execute(tuple: Tuple): Unit = {
    val dp = tuple.getValueByField("nifiDataPacket").asInstanceOf[NiFiDataPacket]

    // Deserialize each tuple and convert it into its proper case class (e.g. EnrichedTruckData or TrafficData)
    val (dataType, data) = dp.getAttributes.get("dataType") match {
      case typ @ "EnrichedTruckData" => (typ, recordToEnrichedTruckData(deserializer.deserialize(new ByteArrayInputStream(dp.getContent), null).asInstanceOf[GenericData.Record]))
      case typ @ "TrafficData" => (typ, recordToTrafficData(deserializer.deserialize(new ByteArrayInputStream(dp.getContent), null).asInstanceOf[GenericData.Record]))
    }

    outputCollector.emit(new Values(data, dataType))
    outputCollector.ack(tuple)
  }

  override def declareOutputFields(declarer: OutputFieldsDeclarer): Unit = declarer.declare(new Fields("data", "dataType"))

  // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object
  private def recordToEnrichedTruckData(r: GenericRecord): EnrichedTruckData =
    EnrichedTruckData(
      r.get("eventTime").toString.toLong,
      r.get("truckId").toString.toInt,
      r.get("driverId").toString.toInt,
      r.get("driverName").toString,
      r.get("routeId").toString.toInt,
      r.get("routeName").toString,
      r.get("latitude").toString.toDouble,
      r.get("longitude").toString.toDouble,
      r.get("speed").toString.toInt,
      r.get("eventType").toString,
      r.get("foggy").toString.toInt,
      r.get("rainy").toString.toInt,
      r.get("windy").toString.toInt)

  // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object
  private def recordToTrafficData(r: GenericRecord): TrafficData =
    TrafficData(r.get("eventTime").toString.toLong, r.get("routeId").toString.toInt, r.get("congestionLevel").toString.toInt)
} 
Example 56
Source File: SerializedWithSchemaToObject.scala    From trucking-iot   with Apache License 2.0 5 votes vote down vote up
package com.orendainx.trucking.storm.bolts

import java.io.ByteArrayInputStream
import java.nio.charset.StandardCharsets
import java.util

import com.hortonworks.registries.schemaregistry.SchemaMetadata
import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer
import com.orendainx.trucking.commons.models.{EnrichedTruckData, TrafficData}
import com.typesafe.scalalogging.Logger
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.storm.task.{OutputCollector, TopologyContext}
import org.apache.storm.topology.OutputFieldsDeclarer
import org.apache.storm.topology.base.BaseRichBolt
import org.apache.storm.tuple.{Fields, Tuple, Values}

import scala.collection.JavaConversions._


class SerializedWithSchemaToObject extends BaseRichBolt {

  private lazy val log = Logger(this.getClass)
  private var outputCollector: OutputCollector = _

  // Declare schema-related fields to be initialized when this component's prepare() method is called
  private var schemaRegistryClient: SchemaRegistryClient = _
  private var deserializer: AvroSnapshotDeserializer = _
  private var truckDataSchemaMetadata: SchemaMetadata = _
  private var trafficDataSchemaMetadata: SchemaMetadata = _

  override def prepare(stormConf: util.Map[_, _], context: TopologyContext, collector: OutputCollector): Unit = {

    outputCollector = collector

    val schemaRegistryUrl = stormConf.get(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name()).toString
    val clientConfig = Map(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name() -> schemaRegistryUrl)

    schemaRegistryClient = new SchemaRegistryClient(clientConfig)
    truckDataSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("EnrichedTruckData").getSchemaMetadata
    trafficDataSchemaMetadata = schemaRegistryClient.getSchemaMetadataInfo("TrafficData").getSchemaMetadata
    deserializer = schemaRegistryClient.getDefaultDeserializer(AvroSchemaProvider.TYPE).asInstanceOf[AvroSnapshotDeserializer]
    deserializer.init(clientConfig)
  }

  override def execute(tuple: Tuple): Unit = {

    // Deserialize each tuple and convert it into its proper case class (e.g. EnrichedTruckData or TrafficData)
    val str = tuple.getStringByField("data").getBytes(StandardCharsets.UTF_8)
    log.info(s"str2: ${tuple.getStringByField("data")}")
    val bytes = new ByteArrayInputStream(str)
    log.info(s"bytes: $bytes")
    val (dataType, data) = tuple.getStringByField("dataType") match {
      case typ @ "EnrichedTruckData" =>
        // Deserialize once: the ByteArrayInputStream cannot be read a second time.
        val record = deserializer.deserialize(bytes, null).asInstanceOf[GenericData.Record]
        log.info(s"des: $record")
        (typ, recordToEnrichedTruckData(record))
      case typ @ "TrafficData" =>
        val record = deserializer.deserialize(bytes, null).asInstanceOf[GenericData.Record]
        log.info(s"des: $record")
        (typ, recordToTrafficData(record))
    }

    outputCollector.emit(new Values(data, dataType))
    outputCollector.ack(tuple)
  }

  override def declareOutputFields(declarer: OutputFieldsDeclarer): Unit = declarer.declare(new Fields("data", "dataType"))

  // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object
  private def recordToEnrichedTruckData(r: GenericRecord): EnrichedTruckData =
    EnrichedTruckData(
      r.get("eventTime").toString.toLong,
      r.get("truckId").toString.toInt,
      r.get("driverId").toString.toInt,
      r.get("driverName").toString,
      r.get("routeId").toString.toInt,
      r.get("routeName").toString,
      r.get("latitude").toString.toDouble,
      r.get("longitude").toString.toDouble,
      r.get("speed").toString.toInt,
      r.get("eventType").toString,
      r.get("foggy").toString.toInt,
      r.get("rainy").toString.toInt,
      r.get("windy").toString.toInt)

  // Helper function to convert GenericRecord (result of deserializing via Schema Registry) into JVM object
  private def recordToTrafficData(r: GenericRecord): TrafficData =
    TrafficData(r.get("eventTime").toString.toLong, r.get("routeId").toString.toInt, r.get("congestionLevel").toString.toInt)
} 
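For orientation, here is a minimal, hypothetical sketch of wiring this bolt into a Storm topology. The spout id, bolt id, registry URL, and parallelism are illustrative assumptions and not part of the original project; a real topology would also register the upstream spout before submission.

import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.orendainx.trucking.storm.bolts.SerializedWithSchemaToObject
import org.apache.storm.Config
import org.apache.storm.topology.TopologyBuilder

object SerializedWithSchemaTopologySketch extends App {
  val builder = new TopologyBuilder()

  // "serializedDataSpout" is a hypothetical upstream component emitting "data"/"dataType" string fields.
  builder.setBolt("serializedWithSchemaToObject", new SerializedWithSchemaToObject(), 1)
    .shuffleGrouping("serializedDataSpout")

  // The bolt looks up the Schema Registry URL from the topology config in prepare().
  val stormConf = new Config()
  stormConf.put(SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name(), "http://localhost:7788/api/v1")

  // Submit with StormSubmitter or LocalCluster once the spout is registered.
}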
Example 57
Source File: MetadataAlgebraSpec.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.kafka.algebras

import java.time.Instant

import cats.data.NonEmptyList
import cats.effect.{Concurrent, ContextShift, IO, Sync, Timer}
import cats.implicits._
import hydra.avro.registry.SchemaRegistry
import hydra.core.marshallers.History
import hydra.kafka.algebras.MetadataAlgebra.TopicMetadataContainer
import hydra.kafka.model.ContactMethod.Slack
import hydra.kafka.model.TopicMetadataV2Request.Subject
import hydra.kafka.model.{Public, StreamTypeV2, TopicMetadataV2, TopicMetadataV2Key, TopicMetadataV2Request, TopicMetadataV2Value}
import io.chrisdavenport.log4cats.SelfAwareStructuredLogger
import io.chrisdavenport.log4cats.slf4j.Slf4jLogger
import org.apache.avro.generic.GenericRecord
import org.scalatest.Assertion
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpecLike
import retry.RetryPolicies._
import retry.syntax.all._
import retry.{RetryPolicy, _}

import scala.concurrent.ExecutionContext
import scala.concurrent.duration._

class MetadataAlgebraSpec extends AnyWordSpecLike with Matchers {

  implicit private val contextShift: ContextShift[IO] = IO.contextShift(ExecutionContext.global)
  private implicit val concurrentEffect: Concurrent[IO] = IO.ioConcurrentEffect

  private implicit val policy: RetryPolicy[IO] = limitRetries[IO](5) |+| exponentialBackoff[IO](500.milliseconds)
  private implicit val timer: Timer[IO] = IO.timer(ExecutionContext.global)
  private implicit def noop[A]: (A, RetryDetails) => IO[Unit] = retry.noop[IO, A]

  implicit private def unsafeLogger[F[_]: Sync]: SelfAwareStructuredLogger[F] =
    Slf4jLogger.getLogger[F]

  private implicit class RetryAndAssert[A](boolIO: IO[A]) {
    def retryIfFalse(check: A => Boolean): IO[Assertion] =
      boolIO.map(check).retryingM(identity, policy, noop).map(assert(_))
  }


  private val metadataTopicName = "_internal.metadataTopic"
  private val consumerGroup = "Consumer Group"

  (for {
    kafkaClient <- KafkaClientAlgebra.test[IO]
    schemaRegistry <- SchemaRegistry.test[IO]
    metadata <- MetadataAlgebra.make(metadataTopicName, consumerGroup, kafkaClient, schemaRegistry, consumeMetadataEnabled = true)
  } yield {
    runTests(metadata, kafkaClient)
  }).unsafeRunSync()

  private def runTests(metadataAlgebra: MetadataAlgebra[IO], kafkaClientAlgebra: KafkaClientAlgebra[IO]): Unit = {
    "MetadataAlgebraSpec" should {

      "retrieve none for non-existant topic" in {
        val subject = Subject.createValidated("Non-existantTopic").get
        metadataAlgebra.getMetadataFor(subject).unsafeRunSync() shouldBe None
      }

      "retrieve metadata" in {
        val subject = Subject.createValidated("subject1").get
        val (genericRecordsIO, key, value) = getMetadataGenericRecords(subject)

        (for {
          record <- genericRecordsIO
          _ <- kafkaClientAlgebra.publishMessage(record, metadataTopicName)
          _ <- metadataAlgebra.getMetadataFor(subject).retryIfFalse(_.isDefined)
          metadata <- metadataAlgebra.getMetadataFor(subject)
        } yield metadata shouldBe Some(TopicMetadataContainer(key, value, None, None))).unsafeRunSync()
      }

      "retrieve all metadata" in {
        val subject = Subject.createValidated("subject2").get
        val (genericRecordsIO, key, value) = getMetadataGenericRecords(subject)
        (for {
          record <- genericRecordsIO
          _ <- kafkaClientAlgebra.publishMessage(record, metadataTopicName)
          _ <- metadataAlgebra.getMetadataFor(subject).retryIfFalse(_.isDefined)
          allMetadata <- metadataAlgebra.getAllMetadata
        } yield allMetadata should have length 2).unsafeRunSync()
      }
    }
  }

  private def getMetadataGenericRecords(subject: Subject): (IO[(GenericRecord, Option[GenericRecord])], TopicMetadataV2Key, TopicMetadataV2Value) = {
    val key = TopicMetadataV2Key(subject)
    val value = TopicMetadataV2Value(
        StreamTypeV2.Entity,
        deprecated = false,
        Public,
        NonEmptyList.one(Slack.create("#channel").get),
        Instant.now,
        List(),
        None)
    (TopicMetadataV2.encode[IO](key, Some(value)), key, value)
  }
} 
Example 58
Source File: AvroKeyRecord.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.kafka.producer

import com.pluralsight.hydra.avro.JsonConverter
import hydra.core.transport.AckStrategy
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord

final case class AvroKeyRecord(
    destination: String,
    keySchema: Schema,
    valueSchema: Schema,
    key: GenericRecord,
    payload: GenericRecord,
    ackStrategy: AckStrategy
) extends KafkaRecord[GenericRecord, GenericRecord]

object AvroKeyRecord {

  def apply(
      destination: String,
      keySchema: Schema,
      valueSchema: Schema,
      keyJson: String,
      valueJson: String,
      ackStrategy: AckStrategy
  ): AvroKeyRecord = {

    val (key, value): (GenericRecord, GenericRecord) = {
      val keyConverter: String => GenericRecord =
        new JsonConverter[GenericRecord](keySchema).convert
      val valueConverter: String => GenericRecord =
        new JsonConverter[GenericRecord](valueSchema).convert
      (keyConverter(keyJson), valueConverter(valueJson))
    }

    AvroKeyRecord(destination, keySchema, valueSchema, key, value, ackStrategy)
  }

  def apply(
      destination: String,
      keySchema: Schema,
      valueSchema: Schema,
      key: GenericRecord,
      value: GenericRecord,
      ackStrategy: AckStrategy
  ): AvroKeyRecord = {
    new AvroKeyRecord(
      destination,
      keySchema,
      valueSchema,
      key,
      value,
      ackStrategy
    )
  }
} 
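A minimal usage sketch of the JSON-based factory method above. The topic name, schemas, and JSON payloads are illustrative assumptions; AckStrategy.Replicated is one of the strategies used elsewhere in this project, and the payloads are assumed to match the schemas so that JsonConverter succeeds.

import hydra.core.transport.AckStrategy
import hydra.kafka.producer.AvroKeyRecord
import org.apache.avro.SchemaBuilder

object AvroKeyRecordSketch extends App {
  // Hypothetical key/value schemas, only for illustration.
  val keySchema = SchemaBuilder.record("ExampleKey").fields().requiredString("id").endRecord()
  val valueSchema = SchemaBuilder.record("ExampleValue").fields().requiredString("name").endRecord()

  val record = AvroKeyRecord(
    destination = "example-topic",
    keySchema = keySchema,
    valueSchema = valueSchema,
    keyJson = """{"id": "123"}""",
    valueJson = """{"name": "jane"}""",
    ackStrategy = AckStrategy.Replicated
  )

  // Both key and payload are now GenericRecords produced by JsonConverter.
  println(record.key.get("id"))
  println(record.payload.get("name"))
}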
Example 59
Source File: AvroRecordFactory.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.kafka.producer

import akka.actor.ActorRef
import akka.pattern.ask
import akka.util
import com.pluralsight.hydra.avro.JsonConverter
import hydra.avro.registry.ConfluentSchemaRegistry
import hydra.avro.resource.SchemaResource
import hydra.avro.util.AvroUtils
import hydra.common.config.ConfigSupport
import hydra.common.logging.LoggingAdapter
import hydra.core.akka.SchemaRegistryActor.{FetchSchemaRequest, FetchSchemaResponse}
import hydra.core.ingest.HydraRequest
import hydra.core.transport.ValidationStrategy.Strict
import org.apache.avro.generic.GenericRecord

import scala.concurrent.duration._
import scala.concurrent.{ExecutionContext, Future}


class AvroRecordFactory(schemaResourceLoader: ActorRef)
    extends KafkaRecordFactory[String, GenericRecord]
    with ConfigSupport with LoggingAdapter {

  private implicit val timeout = util.Timeout(3.seconds)

  override def build(
      request: HydraRequest
  )(implicit ec: ExecutionContext): Future[AvroRecord] = {
    for {
      (topic, subject) <- Future.fromTry(getTopicAndSchemaSubject(request))
      schemaResource <- (schemaResourceLoader ? FetchSchemaRequest(subject))
        .mapTo[FetchSchemaResponse]
        .map(_.schemaResource)
      record <- convert(schemaResource, request)
    } yield AvroRecord(
      topic,
      schemaResource.schema,
      getKey(request, record),
      record,
      request.ackStrategy
    )
  }

  private def convert(schemaResource: SchemaResource, request: HydraRequest)(
      implicit ec: ExecutionContext
  ): Future[GenericRecord] = {
    val converter = new JsonConverter[GenericRecord](
      schemaResource.schema,
      request.validationStrategy == Strict
    )
    Future(converter.convert(request.payload)).recover {
      case ex => throw AvroUtils.improveException(ex, schemaResource,
        ConfluentSchemaRegistry.registryUrl(applicationConfig))
    }
  }
} 
Example 60
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush()
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close()
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 61
Source File: KafkaRecordFactory.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.kafka.producer

import com.fasterxml.jackson.databind.JsonNode
import hydra.avro.util.SchemaWrapper
import hydra.core.ingest.RequestParams._
import hydra.core.ingest.{HydraRequest, RequestParams}
import hydra.core.protocol.MissingMetadataException
import hydra.core.transport.RecordFactory
import hydra.kafka.producer.KafkaRecordFactory.RecordKeyExtractor
import org.apache.avro.generic.GenericRecord

import scala.util.{Failure, Success, Try}

// Note: the enclosing trait declaration and its other members (e.g. the key-extraction helper
// that uses RecordKeyExtractor) are truncated in this listing; a plausible header, assumed from
// the imports, is restored so the method below has an enclosing scope.
trait KafkaRecordFactory[K, V] extends RecordFactory[K, V] {
  def getTopicAndSchemaSubject(request: HydraRequest): Try[(String, String)] = {
    val subject = request.metadataValue(RequestParams.HYDRA_SCHEMA_PARAM)
    request.metadataValue(HYDRA_KAFKA_TOPIC_PARAM) match {
      case Some(topic) => Success(topic -> subject.getOrElse(topic))
      case None =>
        Failure(
          MissingMetadataException(
            HYDRA_KAFKA_TOPIC_PARAM,
            "No kafka topic present in the request."
          )
        )
    }
  }
}

object KafkaRecordFactory {

  trait RecordKeyExtractor[K, V] {

    def extractKeyValue(request: HydraRequest, record: V): Option[K]
  }

  object RecordKeyExtractor {

    implicit object StringRecordKeyExtractor
        extends RecordKeyExtractor[String, String] {

      override def extractKeyValue(
          request: HydraRequest,
          record: String
      ): Option[String] = {
        request
          .metadataValue(HYDRA_RECORD_KEY_PARAM)
          .map(key => JsonPathKeys.getKey(key, record))
      }
    }

    implicit object JsonRecordKeyExtractor
        extends RecordKeyExtractor[String, JsonNode] {

      override def extractKeyValue(
          request: HydraRequest,
          record: JsonNode
      ): Option[String] = {
        request
          .metadataValue(HYDRA_RECORD_KEY_PARAM)
          .map(key => JsonPathKeys.getKey(key, record.toString))
      }
    }

    implicit object SchemaKeyExtractor
        extends RecordKeyExtractor[String, GenericRecord] {

      override def extractKeyValue(
          request: HydraRequest,
          payload: GenericRecord
      ): Option[String] = {
        request
          .metadataValue(HYDRA_RECORD_KEY_PARAM)
          .map { key => JsonPathKeys.getKey(key, request.payload) }
          .orElse {
            val schema = payload.getSchema
            val wrapper = SchemaWrapper.from(schema)
            wrapper
              .validate()
              .get //we're throwing the exception here so that the request ends with a 400
            wrapper.primaryKeys.map(payload.get) match {
              case Nil  => None
              case keys => Some(keys.mkString("|"))
            }
          }
      }
    }

  }

} 
Example 62
Source File: IngestionFlowV2.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.ingest.services

import java.io.IOException

import cats.MonadError
import cats.implicits._
import hydra.avro.registry.SchemaRegistry
import hydra.avro.resource.SchemaResourceLoader.SchemaNotFoundException
import hydra.avro.util.SchemaWrapper
import hydra.core.transport.ValidationStrategy
import hydra.kafka.algebras.KafkaClientAlgebra
import hydra.kafka.algebras.KafkaClientAlgebra.PublishResponse
import hydra.kafka.model.TopicMetadataV2Request.Subject
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import scalacache._
import scalacache.guava._
import scalacache.memoization._

import scala.concurrent.duration._
import scala.util.{Failure, Try}

final class IngestionFlowV2[F[_]: MonadError[*[_], Throwable]: Mode](
                                                                    schemaRegistry: SchemaRegistry[F],
                                                                    kafkaClient: KafkaClientAlgebra[F],
                                                                    schemaRegistryBaseUrl: String) {

  import IngestionFlowV2._
  import hydra.avro.convert.StringToGenericRecord._

  implicit val guavaCache: Cache[SchemaWrapper] = GuavaCache[SchemaWrapper]

  private def getSchema(subject: String): F[Schema] = {
    schemaRegistry.getLatestSchemaBySubject(subject)
      .flatMap { maybeSchema =>
        val schemaNotFound = SchemaNotFoundException(subject)
        MonadError[F, Throwable].fromOption(maybeSchema, SchemaNotFoundAugmentedException(schemaNotFound, subject))
      }
  }

  private def getSchemaWrapper(subject: Subject, isKey: Boolean): F[SchemaWrapper] = memoizeF[F, SchemaWrapper](Some(2.minutes)) {
    val suffix = if (isKey) "-key" else "-value"
    getSchema(subject.value + suffix).map { sch =>
      SchemaWrapper.from(sch)
    }
  }

  private def recover[A](subject: Subject, isKey: Boolean): PartialFunction[Throwable, Try[A]] = {
    val suffix = if (isKey) "-key" else "-value"
    val location = s"$schemaRegistryBaseUrl/subjects/${subject.value}$suffix/versions/latest/schema"
    val pf: PartialFunction[Throwable, Try[A]] = {
      case e: ValidationExtraFieldsError =>
        Failure(AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]"))
      case e: InvalidLogicalTypeError =>
        Failure(AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]"))
      case e: IOException =>
        Failure(AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]"))
      case e => Failure(e)
    }
    pf
  }

  private def getSchemas(request: V2IngestRequest, topic: Subject): F[(GenericRecord, Option[GenericRecord])] = {
    val useStrictValidation = request.validationStrategy.getOrElse(ValidationStrategy.Strict) == ValidationStrategy.Strict
    def getRecord(payload: String, schema: Schema): Try[GenericRecord] =
      payload.toGenericRecord(schema, useStrictValidation)
    for {
      kSchema <- getSchemaWrapper(topic, isKey = true)
      vSchema <- getSchemaWrapper(topic, isKey = false)
      k <- MonadError[F, Throwable].fromTry(
        getRecord(request.keyPayload, kSchema.schema).recoverWith(recover(topic, isKey = true)))
      v <- MonadError[F, Throwable].fromTry(
        request.valPayload.traverse(getRecord(_, vSchema.schema)).recoverWith(recover(topic, isKey = false)))
    } yield (k, v)
  }

  def ingest(request: V2IngestRequest, topic: Subject): F[PublishResponse] = {
    getSchemas(request, topic).flatMap { case (key, value) =>
      kafkaClient.publishMessage((key, value), topic.value).rethrow
    }
  }
}

object IngestionFlowV2 {
  final case class V2IngestRequest(keyPayload: String, valPayload: Option[String], validationStrategy: Option[ValidationStrategy])

  final case class AvroConversionAugmentedException(message: String) extends RuntimeException(message)
  final case class SchemaNotFoundAugmentedException(schemaNotFoundException: SchemaNotFoundException, topic: String)
    extends RuntimeException(s"Schema '$topic' cannot be loaded. Cause: ${schemaNotFoundException.getClass.getName}: Schema not found for $topic")
} 
Example 63
Source File: IngestionFlow.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.ingest.services

import java.io.IOException

import cats.MonadError
import cats.implicits._
import com.pluralsight.hydra.avro.JsonToAvroConversionException
import hydra.avro.registry.SchemaRegistry
import hydra.avro.resource.SchemaResourceLoader.SchemaNotFoundException
import hydra.avro.util.SchemaWrapper
import hydra.core.ingest.HydraRequest
import hydra.core.ingest.RequestParams.{HYDRA_KAFKA_TOPIC_PARAM, HYDRA_RECORD_KEY_PARAM}
import hydra.core.transport.{AckStrategy, ValidationStrategy}
import hydra.kafka.algebras.KafkaClientAlgebra
import hydra.kafka.producer.AvroRecord
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import scalacache._
import scalacache.guava._
import scalacache.memoization._

import scala.concurrent.duration._
import scala.util.{Failure, Success, Try}

final class IngestionFlow[F[_]: MonadError[*[_], Throwable]: Mode](
                                                                    schemaRegistry: SchemaRegistry[F],
                                                                    kafkaClient: KafkaClientAlgebra[F],
                                                                    schemaRegistryBaseUrl: String
                                                                  ) {

  import IngestionFlow._

  implicit val guavaCache: Cache[SchemaWrapper] = GuavaCache[SchemaWrapper]

  private def getValueSchema(topicName: String): F[Schema] = {
    schemaRegistry.getLatestSchemaBySubject(topicName + "-value")
      .flatMap { maybeSchema =>
        val schemaNotFound = SchemaNotFoundException(topicName)
        MonadError[F, Throwable].fromOption(maybeSchema, SchemaNotFoundAugmentedException(schemaNotFound, topicName))
      }
  }

  private def getValueSchemaWrapper(topicName: String): F[SchemaWrapper] = memoizeF[F, SchemaWrapper](Some(2.minutes)) {
    getValueSchema(topicName).map { valueSchema =>
      SchemaWrapper.from(valueSchema)
    }
  }

  def ingest(request: HydraRequest): F[Unit] = {
    request.metadataValue(HYDRA_KAFKA_TOPIC_PARAM) match {
      case Some(topic) => getValueSchemaWrapper(topic).flatMap { schemaWrapper =>
        val useStrictValidation = request.validationStrategy == ValidationStrategy.Strict
        val payloadTryMaybe: Try[Option[GenericRecord]] = Option(request.payload) match {
          case Some(p) => convertToAvro(topic, schemaWrapper, useStrictValidation, p).map(avroRecord => Some(avroRecord.payload))
          case None => Success(None)
        }
        val v1Key = getV1RecordKey(schemaWrapper, payloadTryMaybe, request)
        MonadError[F, Throwable].fromTry(payloadTryMaybe).flatMap { payloadMaybe =>
          kafkaClient.publishStringKeyMessage((v1Key, payloadMaybe), topic).void
        }
      }
      case None => MonadError[F, Throwable].raiseError(MissingTopicNameException(request))
    }
  }

  private def getV1RecordKey(schemaWrapper: SchemaWrapper, payloadTryMaybe: Try[Option[GenericRecord]], request: HydraRequest): Option[String] = {
    val headerV1Key = request.metadata.get(HYDRA_RECORD_KEY_PARAM)
    val optionString = schemaWrapper.primaryKeys.toList match {
      case Nil => None
      case l => l.flatMap(pkName => payloadTryMaybe match {
        case Success(payloadMaybe) =>
          payloadMaybe.flatMap(p => Try(p.get(pkName)).toOption)
        case Failure(_) => None
      }).mkString("|").some
    }
    headerV1Key.orElse(optionString)
  }

  private def convertToAvro(topic: String, schemaWrapper: SchemaWrapper, useStrictValidation: Boolean, payloadString: String): Try[AvroRecord] = {
    Try(AvroRecord(topic, schemaWrapper.schema, None, payloadString, AckStrategy.Replicated, useStrictValidation)).recoverWith {
      case e: JsonToAvroConversionException =>
        val location = s"$schemaRegistryBaseUrl/subjects/$topic-value/versions/latest/schema"
        Failure(new AvroConversionAugmentedException(s"${e.getClass.getName}: ${e.getMessage} [$location]"))
      case e: IOException =>
        val location = s"$schemaRegistryBaseUrl/subjects/$topic-value/versions/latest/schema"
        Failure(new AvroConversionAugmentedException(s"${e.getMessage} [$location]"))
      case e => Failure(e)
    }
  }
}

object IngestionFlow {
  final case class MissingTopicNameException(request: HydraRequest)
    extends Exception(s"Missing the topic name in request with correlationId ${request.correlationId}")
  final case class AvroConversionAugmentedException(message: String) extends RuntimeException(message)
  final case class SchemaNotFoundAugmentedException(schemaNotFoundException: SchemaNotFoundException, topic: String)
    extends RuntimeException(s"Schema '$topic' cannot be loaded. Cause: ${schemaNotFoundException.getClass.getName}: Schema not found for $topic")
} 
Example 64
Source File: StringToGenericRecord.scala    From hydra   with Apache License 2.0 5 votes vote down vote up
package hydra.avro.convert

import java.util.UUID

import org.apache.avro.{LogicalTypes, Schema}
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import cats.implicits._
import org.apache.avro.util.Utf8

import scala.util.{Failure, Success, Try}

object StringToGenericRecord {

  final case class ValidationExtraFieldsError(fields: Set[String]) extends RuntimeException(
    s"Extra fields ${fields.mkString(",")} found with Strict Validation Strategy"
  )

  final case class InvalidLogicalTypeError(expected: String, received: AnyRef) extends RuntimeException(
    s"Invalid logical type. Expected $expected but received $received"
  )

  implicit class ConvertToGenericRecord(s: String) {

    private def isUuidValid(s: String): Boolean =
      Try(UUID.fromString(s)).isSuccess

    private def checkLogicalTypes(record: GenericRecord): Try[Unit] = {
      import collection.JavaConverters._
      def checkAll(avroField: AnyRef, fieldSchema: Option[Schema]): Try[Unit] = avroField match {
        case g: GenericRecord => g.getSchema.getFields.asScala.toList
          .traverse(f => checkAll(g.get(f.name), f.schema.some)).void
        case u: Utf8 if fieldSchema.exists(f => Option(f.getLogicalType).exists(_.getName == LogicalTypes.uuid.getName)) =>
          if (isUuidValid(u.toString)) Success(()) else Failure(InvalidLogicalTypeError("UUID", u.toString))
        case _ => Success(())
      }
      val fields = record.getSchema.getFields.asScala.toList
      fields.traverse(f => checkAll(record.get(f.name), f.schema.some)).void
    }

    private def getAllPayloadFieldNames: Set[String] = {
      import spray.json._
      def loop(cur: JsValue, extraName: Option[String]): Set[String] = cur match {
        case JsObject(f) => f.flatMap { case (k: String, v: JsValue) =>
          loop(v, k.some) ++ Set(extraName.getOrElse("") + k)
        }.toSet
        case _ => Set.empty
      }
      loop(s.parseJson, None)
    }

    private def getAllSchemaFieldNames(schema: Schema): Set[String] = {
      import Schema.Type._
      import collection.JavaConverters._
      def loop(sch: Schema, extraName: Option[String]): Set[String] = sch.getType match {
        case RECORD => sch.getFields.asScala.toSet.flatMap { f: Schema.Field =>
          loop(f.schema, f.name.some) ++ Set(extraName.getOrElse("") + f.name)
        }
        case _ => Set.empty
      }
      loop(schema, None)
    }

    def toGenericRecord(schema: Schema, useStrictValidation: Boolean): Try[GenericRecord] = Try {
      if (useStrictValidation) {
        val diff = getAllPayloadFieldNames diff getAllSchemaFieldNames(schema)
        if (diff.nonEmpty) throw ValidationExtraFieldsError(diff)
      }
      val decoderFactory = new DecoderFactory
      val decoder = decoderFactory.jsonDecoder(schema, s)
      val reader = new GenericDatumReader[GenericRecord](schema)
      reader.read(null, decoder)
    }.flatTap(checkLogicalTypes)
  }

} 
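A minimal sketch of the implicit conversion above in use. The schema and JSON payload are illustrative assumptions; toGenericRecord returns a Try, so failures such as extra fields under strict validation surface as a Failure.

import hydra.avro.convert.StringToGenericRecord._
import org.apache.avro.SchemaBuilder

object StringToGenericRecordSketch extends App {
  // Hypothetical schema, only for illustration.
  val schema = SchemaBuilder.record("Person").fields()
    .requiredString("name")
    .requiredLong("age")
    .endRecord()

  val json = """{"name": "jane", "age": 30}"""

  // Strict validation rejects payload fields that do not appear in the schema.
  val result = json.toGenericRecord(schema, useStrictValidation = true)
  result.foreach(record => println(record.get("name")))
}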
Example 65
Source File: AvroParquetReaderFnTest.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.parquet

import java.util.UUID

import io.eels.component.avro.AvroSchemaFns
import io.eels.component.parquet.avro.AvroParquetReaderFn
import io.eels.schema.{DoubleType, Field, LongType, StructType}
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.avro.util.Utf8
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.avro.AvroParquetWriter
import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpec}

class AvroParquetReaderFnTest extends WordSpec with Matchers with BeforeAndAfterAll {

  private implicit val conf = new Configuration()
  private implicit val fs = FileSystem.get(new Configuration())

  private val path = new Path(UUID.randomUUID().toString())

  override def afterAll(): Unit = {
    val fs = FileSystem.get(new Configuration())
    fs.delete(path, false)
  }

  private val avroSchema = SchemaBuilder.record("com.chuckle").fields()
    .requiredString("str").requiredLong("looong").requiredDouble("dooble").endRecord()

  private val writer = AvroParquetWriter.builder[GenericRecord](path)
    .withSchema(avroSchema)
    .build()

  private val record = new GenericData.Record(avroSchema)
  record.put("str", "wibble")
  record.put("looong", 999L)
  record.put("dooble", 12.34)
  writer.write(record)
  writer.close()

  val schema = StructType(Field("str"), Field("looong", LongType(true), true), Field("dooble", DoubleType, true))

  "AvroParquetReaderFn" should {
    "support projections on doubles" in {

      val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("looong"))))
      val record = reader.read()
      reader.close()

      record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble"
      record.get("dooble") shouldBe 12.34
    }
    "support projections on longs" in {

      val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("str"))))
      val record = reader.read()
      reader.close()

      record.get("looong") shouldBe 999L
    }
    "support full projections" in {

      val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema)))
      val record = reader.read()
      reader.close()

      record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble"
      record.get("looong") shouldBe 999L
      record.get("dooble") shouldBe 12.34

    }
    "support non projections" in {

      val reader = AvroParquetReaderFn(path, None, None)
      val group = reader.read()
      reader.close()

      group.get("str").asInstanceOf[Utf8].toString shouldBe "wibble"
      group.get("looong") shouldBe 999L
      group.get("dooble") shouldBe 12.34

    }
  }
} 
Example 66
Source File: AvroParquetReaderFn.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.parquet.avro

import io.eels.Predicate
import io.eels.component.parquet.{ParquetPredicateBuilder, ParquetReaderConfig}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.{AvroParquetReader, AvroReadSupport}
import org.apache.parquet.filter2.compat.FilterCompat
import org.apache.parquet.hadoop.ParquetReader

// Note: the enclosing object declaration and its reader config value are truncated in this
// listing; a plausible reconstruction, assumed from the imports and the `config` reference below:
object AvroParquetReaderFn {

  private val config = ParquetReaderConfig()
  def apply(path: Path,
            predicate: Option[Predicate],
            projectionSchema: Option[Schema])(implicit conf: Configuration): ParquetReader[GenericRecord] = {

    // The parquet reader can use a projection by setting a projected schema onto a conf object
    def configuration(): Configuration = {
      val newconf = new Configuration(conf)
      projectionSchema.foreach { it =>
        AvroReadSupport.setAvroReadSchema(newconf, it)
        AvroReadSupport.setRequestedProjection(newconf, it)
      }
      //conf.set(ParquetInputFormat.DICTIONARY_FILTERING_ENABLED, "true")
      newconf.set(org.apache.parquet.hadoop.ParquetFileReader.PARQUET_READ_PARALLELISM, config.parallelism.toString)
      newconf
    }

    // a filter is set when we have a predicate for the read
    def filter(): FilterCompat.Filter = predicate.map(ParquetPredicateBuilder.build)
      .map(FilterCompat.get)
      .getOrElse(FilterCompat.NOOP)

    AvroParquetReader.builder[GenericRecord](path)
      .withCompatibility(false)
      .withConf(configuration())
      .withFilter(filter())
      .build()
      .asInstanceOf[ParquetReader[GenericRecord]]
  }
} 
Example 67
Source File: AvroParquetWriterFn.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import io.eels.component.parquet.ParquetWriterConfig
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}


object AvroParquetWriterFn extends Logging {
  def apply(path: Path, avroSchema: Schema): ParquetWriter[GenericRecord] = {
    val config = ParquetWriterConfig()
    AvroParquetWriter.builder[GenericRecord](path)
      .withSchema(avroSchema)
      .withCompressionCodec(config.compressionCodec)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withDictionaryEncoding(config.enableDictionary)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withValidation(config.validating)
      .build()
  }
} 
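A short sketch of calling the helper above. The schema, record values, and output path are illustrative assumptions; the writer is created in CREATE mode, so the target file must not already exist.

import io.eels.component.parquet.avro.AvroParquetWriterFn
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericData
import org.apache.hadoop.fs.Path

object AvroParquetWriterFnSketch extends App {
  val schema = SchemaBuilder.record("person").fields().requiredString("name").endRecord()

  val writer = AvroParquetWriterFn(new Path("person.pq"), schema)
  val record = new GenericData.Record(schema)
  record.put("name", "grace hopper")
  writer.write(record)
  writer.close()
}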
Example 68
Source File: AvroParquetRowWriter.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import com.typesafe.config.{Config, ConfigFactory}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.{FileSystem, Path}


class AvroParquetRowWriter(path: Path,
                           avroSchema: Schema)(implicit fs: FileSystem) extends Logging {

  private val config: Config = ConfigFactory.load()
  private val skipCrc = config.getBoolean("eel.parquet.skipCrc")
  logger.info(s"Parquet writer will skipCrc = $skipCrc")

  private val writer = AvroParquetWriterFn(path, avroSchema)

  def write(record: GenericRecord): Unit = {
    writer.write(record)
  }

  def close(): Unit = {
    writer.close()
    if (skipCrc) {
      val crc = new Path("." + path.toString() + ".crc")
      logger.debug("Deleting crc $crc")
      if (fs.exists(crc))
        fs.delete(crc, false)
    }
  }
} 
Example 69
Source File: AvroWriter.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.avro

import java.io.OutputStream
import java.util.concurrent.atomic.AtomicInteger

import io.eels.Row
import io.eels.schema.StructType
import org.apache.avro.file.DataFileWriter
import org.apache.avro.generic
import org.apache.avro.generic.GenericRecord

class AvroWriter(structType: StructType, out: OutputStream) {
  
  private val schema = AvroSchemaFns.toAvroSchema(structType)
  private val datumWriter = new generic.GenericDatumWriter[GenericRecord](schema)
  private val dataFileWriter = new DataFileWriter[GenericRecord](datumWriter)
  private val serializer = new RowSerializer(schema)
  private val _records = new AtomicInteger(0)

  dataFileWriter.create(schema, out)

  def write(row: Row): Unit = {
    val record = serializer.serialize(row)
    dataFileWriter.append(record)
    _records.incrementAndGet()
  }

  def records: Int = _records.get()

  def close(): Unit = {
    dataFileWriter.flush()
    dataFileWriter.close()
  }
} 
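A minimal sketch of writing rows with the class above, assuming eel's reference configuration is on the classpath. The schema, row values, and in-memory output stream are illustrative assumptions.

import java.io.ByteArrayOutputStream

import io.eels.Row
import io.eels.component.avro.AvroWriter
import io.eels.schema.{Field, StringType, StructType}

object AvroWriterSketch extends App {
  val structType = StructType(Field("name", StringType), Field("job", StringType))
  val out = new ByteArrayOutputStream()

  val writer = new AvroWriter(structType, out)
  writer.write(Row(structType, Vector("grace hopper", "engineer")))
  writer.close()

  println(s"wrote ${writer.records} record(s), ${out.size()} bytes")
}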
Example 70
Source File: AvroDeserializer.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.avro

import com.typesafe.config.ConfigFactory
import io.eels.Row
import io.eels.schema.StructType
import org.apache.avro.Schema.Field
import org.apache.avro.generic.GenericRecord
import org.apache.avro.util.Utf8

import scala.collection.JavaConverters._


class AvroDeserializer(useJavaString: Boolean = ConfigFactory.load().getBoolean("eel.avro.java.string")) {

  val config = ConfigFactory.load()
  val deserializeAsNullable = config.getBoolean("eel.avro.deserializeAsNullable")
  var schema: StructType = null
  var fields: Array[Field] = null
  var range: Range = null

  def toScala(value: Any): Any = {
    value match {
      case record: GenericRecord => toValues(record)
      case utf8: Utf8 if useJavaString => utf8.toString
      case col: java.util.Collection[Any] => col.asScala.toVector.map(toScala)
      case map: java.util.Map[_, _] => map.asScala.toMap.map { case (k, v) => toScala(k) -> toScala(v) }
      case other => other
    }
  }

  def toValues(record: GenericRecord): Vector[Any] = {
    val vector = Vector.newBuilder[Any]
    for (k <- 0 until record.getSchema.getFields.size) {
      val value = record.get(k)
      vector += toScala(value)
    }
    vector.result
  }

  def toRow(record: GenericRecord): Row = {
    // take the schema from the first record
    if (schema == null) {
      schema = AvroSchemaFns.fromAvroSchema(record.getSchema, deserializeAsNullable)
    }
    Row(schema, toValues(record))
  }
} 
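A minimal sketch of converting a GenericRecord into an eel Row with the class above, assuming eel's reference configuration (which supplies the eel.avro.* keys) is on the classpath. The schema and values are illustrative assumptions.

import io.eels.component.avro.AvroDeserializer
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericData

object AvroDeserializerSketch extends App {
  val avroSchema = SchemaBuilder.record("person").fields()
    .requiredString("name")
    .requiredString("job")
    .endRecord()

  val record = new GenericData.Record(avroSchema)
  record.put("name", "grace hopper")
  record.put("job", "engineer")

  // The schema of the first record seen is reused for all subsequent rows.
  val deserializer = new AvroDeserializer()
  val row = deserializer.toRow(record)
  println(row.values) // Vector(grace hopper, engineer)
}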
Example 71
Source File: IndexWithKeyFields.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.data

import com.fasterxml.jackson.databind.JsonNode
import com.typesafe.config.ConfigFactory
import org.apache.avro.{LogicalTypes, Schema, SchemaBuilder}
import org.apache.avro.generic.GenericRecord
import org.apache.log4j.LogManager
import org.joda.time.format.ISODateTimeFormat

import scala.util.control.NonFatal


case class IndexWithKeyFields(uuid: String,
                              lastModified: java.sql.Timestamp,
                              path: String) extends GenericRecord with CsvGenerator {

  override def put(key: String, v: scala.Any): Unit = ???

  override def get(key: String): AnyRef = key match {
    case "uuid" => uuid
    case "lastModified" => java.lang.Long.valueOf(lastModified.getTime)
    case "path" => path
  }

  override def put(i: Int, v: scala.Any): Unit = ???

  override def get(i: Int): AnyRef = i match {
    case 0 => uuid
    case 1 => java.lang.Long.valueOf(lastModified.getTime)
    case 2 => path
    case _ => throw new IllegalArgumentException
  }

  override def getSchema: Schema = IndexWithSystemFields.schema

  override def csv: String =
    (if (uuid == null) "" else uuid) + "," +
      (if (lastModified == null) "" else ISODateTimeFormat.dateTime.print(lastModified.getTime)) + "," +
      (if (path == null) "" else path)
}

object IndexWithKeyFields extends ObjectExtractor[IndexWithKeyFields] {

  private val logger = LogManager.getLogger(IndexWithSystemFields.getClass)

  // AVRO-2065 - doesn't allow union over logical type, so we can't make timestamp column nullable.
  val timestampMilliType: Schema = LogicalTypes.timestampMillis.addToSchema(Schema.create(Schema.Type.LONG))

  val schema: Schema = SchemaBuilder
    .record("IndexWithSystemFields").namespace("cmwell.analytics")
    .fields
    .name("uuid").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .name("lastModified").`type`(timestampMilliType).noDefault
    .name("path").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .endRecord

  private val config = ConfigFactory.load
  val infotonSize: Int = config.getInt("extract-index-from-es.fetch-size-index-with-uuid-lastModified-path")

  def includeFields: String = {
    // Note that 'quad' is not included in this list
    val fields = "uuid,lastModified,path"
      .split(",")
      .map(name => s""""system.$name"""")
      .mkString(",")

    s""""_source": [$fields]"""
  }

  def extractFromJson(hit: JsonNode): IndexWithKeyFields = {

    val system = hit.findValue("_source").findValue("system")

    def extractString(name: String): String = system.findValue(name) match {
      case x: JsonNode => x.asText
      case _ => null
    }

    // Extract date values as java.sql.Timestamp - a java.sql.Date might be better
    def extractDate(name: String): java.sql.Timestamp = system.findValue(name) match {
      case x: JsonNode =>
        try {
          new java.sql.Timestamp(ISODateTimeFormat.dateTime.parseDateTime(x.asText).getMillis)
        }
        catch {
          case NonFatal(ex) =>
            logger.warn(s"Failed conversion of date value: $x", ex)
            throw ex
        }
      case _ => null
    }

    IndexWithKeyFields(
      uuid = extractString("uuid"),
      lastModified = extractDate("lastModified"),
      path = extractString("path"))
  }
} 
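A small sketch of driving extractFromJson above from a raw Elasticsearch hit parsed with Jackson. The JSON document is an illustrative assumption shaped to match the fields requested by includeFields.

import cmwell.analytics.data.IndexWithKeyFields
import com.fasterxml.jackson.databind.ObjectMapper

object IndexWithKeyFieldsSketch extends App {
  // Hypothetical hit, only for illustration.
  val hitJson =
    """{"_source": {"system": {
      |  "uuid": "abc123",
      |  "lastModified": "2020-01-01T00:00:00.000Z",
      |  "path": "/example/path"
      |}}}""".stripMargin

  val hit = new ObjectMapper().readTree(hitJson)
  val row = IndexWithKeyFields.extractFromJson(hit)
  println(row.csv) // uuid,lastModified,path as a CSV line
}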
Example 72
Source File: AvroSEBasicTest.scala    From akka-serialization-test   with Apache License 2.0 5 votes vote down vote up
package com.github.dnvriend.serializer.avro4s

import com.github.dnvriend.TestSpec
import com.github.dnvriend.domain.BookStore.{ ChangedBookV1, ChangedBookV2, ChangedBookV3, ChangedBookV4 }
import com.github.dnvriend.serializer.avro.{ BookSerializerV1, BookSerializerV2, BookSerializerV3 }
import com.sksamuel.avro4s.{ AvroSchema, RecordFormat }
import org.apache.avro.Schema
import org.apache.avro.file.SeekableByteArrayInput
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord }
import org.apache.avro.io.DecoderFactory

// SE stands for Schema Evolution
class AvroSEBasicTest extends TestSpec {

  def fromBytes(bytes: Array[Byte], schema: Schema): GenericRecord = {
    val serveReader = new GenericDatumReader[GenericRecord](schema)
    serveReader.read(null, DecoderFactory.get().binaryDecoder(bytes, null))
  }

  val title = "Moby-Dick; or, The Whale"
  val year = 1851
  val editor = "Scala Books"

  "AvroSEBasicTest" should "deserialize old class with renamed field" in {
    // in this case, two different serializers can be used

    val obj = ChangedBookV1(title, year)
    val serializerV1 = new BookSerializerV1
    val bytes: Array[Byte] = serializerV1.toBinary(obj)
    val serializerV2 = new BookSerializerV2

    serializerV2.fromBinary(bytes) should matchPattern {
      case ChangedBookV2(`title`, `year`) ⇒
    }
  }

  it should "deserialize old class without new field" in {

    val obj = ChangedBookV2(title, year)
    val serializerV2 = new BookSerializerV2
    val bytes: Array[Byte] = serializerV2.toBinary(obj)

    val in = new SeekableByteArrayInput(bytes)

    val schema2 = AvroSchema[ChangedBookV2]
    val schema3 = AvroSchema[ChangedBookV3]

    val gdr = new GenericDatumReader[GenericRecord](schema2, schema3)
    val binDecoder = DecoderFactory.get().binaryDecoder(in, null)
    val record: GenericRecord = gdr.read(null, binDecoder)
    val format = RecordFormat[ChangedBookV3]
    val r = format.from(record)

    r should matchPattern {
      case ChangedBookV3(`title`, `year`, "") ⇒
    }

  }

  it should "deserialize old class with dropped field" in {

    val obj = ChangedBookV3(title, year, editor)
    val serializerV3 = new BookSerializerV3
    val bytes: Array[Byte] = serializerV3.toBinary(obj)

    val in = new SeekableByteArrayInput(bytes)

    val schema3 = AvroSchema[ChangedBookV3]
    val schema4 = AvroSchema[ChangedBookV4]

    val gdr = new GenericDatumReader[GenericRecord](schema3, schema4)
    val binDecoder = DecoderFactory.get().binaryDecoder(in, null)
    val record: GenericRecord = gdr.read(null, binDecoder)
    val format = RecordFormat[ChangedBookV4]
    val r = format.from(record)

    r should matchPattern {
      case ChangedBookV4(`title`, `editor`) ⇒
    }

  }

} 
Example 73
Source File: AvroIOTest.scala    From ratatool   with Apache License 2.0 5 votes vote down vote up
package com.spotify.ratatool.io

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, File}

import com.spotify.ratatool.Schemas
import com.spotify.ratatool.avro.specific.TestRecord
import org.apache.avro.generic.GenericRecord
import com.spotify.ratatool.scalacheck._
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class AvroIOTest extends AnyFlatSpec with Matchers {

  private val genericSchema = Schemas.avroSchema
  private val genericGen = genericRecordOf(genericSchema)
  private val genericData = (1 to 100).flatMap(_ => genericGen.sample)

  private val specificSchema = TestRecord.getClassSchema
  private val specificGen = specificRecordOf[TestRecord]
  private val specificData = (1 to 100).flatMap(_ => specificGen.sample)

  "AvroIO" should "work with generic record and stream" in {
    val out = new ByteArrayOutputStream()
    AvroIO.writeToOutputStream(genericData, genericSchema, out)
    val in = new ByteArrayInputStream(out.toByteArray)
    val result = AvroIO.readFromInputStream[GenericRecord](in).toList
    result should equal (genericData)
  }

  it should "work with generic record and file" in {
    val file = File.createTempFile("ratatool-", ".avro")
    file.deleteOnExit()
    AvroIO.writeToFile(genericData, genericSchema, file)
    val result = AvroIO.readFromFile[GenericRecord](file).toList
    result should equal (genericData)
  }

  it should "work with specific record and stream" in {
    val out = new ByteArrayOutputStream()
    AvroIO.writeToOutputStream(specificData, specificSchema, out)
    val in = new ByteArrayInputStream(out.toByteArray)
    val result = AvroIO.readFromInputStream[TestRecord](in).toList
    result.map(FixRandomData(_)) should equal (specificData.map(FixRandomData(_)))
  }

  it should "work with specific record and file" in {
    val file = File.createTempFile("ratatool-", ".avro")
    file.deleteOnExit()
    AvroIO.writeToFile(specificData, specificSchema, file)
    val result = AvroIO.readFromFile[TestRecord](file).toList
    result.map(FixRandomData(_)) should equal (specificData.map(FixRandomData(_)))
  }
} 
Example 74
Source File: AvroIO.scala    From ratatool   with Apache License 2.0 5 votes vote down vote up
package com.spotify.ratatool.io

import java.io.{File, InputStream, OutputStream}
import java.nio.ByteBuffer
import java.nio.channels.SeekableByteChannel

import com.google.common.io.ByteStreams
import org.apache.avro.Schema
import org.apache.avro.file.{DataFileReader, DataFileWriter, SeekableByteArrayInput, SeekableInput}
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DatumReader, DatumWriter}
import org.apache.avro.reflect.{ReflectDatumReader, ReflectDatumWriter}
import org.apache.avro.specific.{SpecificDatumReader, SpecificDatumWriter, SpecificRecord}
import org.apache.beam.sdk.io.FileSystems
import org.apache.beam.sdk.io.fs.MatchResult.Metadata

import scala.jdk.CollectionConverters._
import scala.reflect.ClassTag


object AvroIO {

  def writeToOutputStream[T: ClassTag](data: Iterable[T],
                                       schema: Schema,
                                       os: OutputStream): Unit = {
    val fileWriter = new DataFileWriter(createDatumWriter[T]).create(schema, os)
    data.foreach(fileWriter.append)
    fileWriter.close()
  }
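
  // createDatumWriter (used above) is elided from this excerpt. A plausible sketch,
  // assuming the usual split between specific and generic records; this is not
  // necessarily the project's exact implementation:
  private def createDatumWriter[T: ClassTag]: DatumWriter[T] =
    if (classOf[SpecificRecord].isAssignableFrom(implicitly[ClassTag[T]].runtimeClass)) {
      new SpecificDatumWriter[T]()
    } else {
      new GenericDatumWriter[T]()
    }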

  def getAvroSchemaFromFile(path: String): Schema = {
    require(FileStorage(path).exists, s"File `$path` does not exist!")
    val files = FileStorage(path).listFiles.filter(_.resourceId.getFilename.endsWith(".avro"))
    require(files.nonEmpty, s"File `$path` does not contain avro files")
    val reader = new GenericDatumReader[GenericRecord]()
    val dfr = new DataFileReader[GenericRecord](AvroIO.getAvroSeekableInput(files.head), reader)
    dfr.getSchema
  }

  private def getAvroSeekableInput(meta: Metadata): SeekableInput = new SeekableInput {
    require(meta.isReadSeekEfficient)
    private val in = FileSystems.open(meta.resourceId()).asInstanceOf[SeekableByteChannel]
    override def read(b: Array[Byte], off: Int, len: Int): Int =
      in.read(ByteBuffer.wrap(b, off, len))
    override def tell(): Long = in.position()
    override def length(): Long = in.size()
    override def seek(p: Long): Unit = in.position(p)
    override def close(): Unit = in.close()
  }

} 
Example 75
Source File: ParquetSampler.scala    From ratatool   with Apache License 2.0 5 votes vote down vote up
package com.spotify.ratatool.samplers

import com.spotify.ratatool.io.ParquetIO
import org.apache.avro.generic.GenericRecord
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.mutable.ListBuffer


class ParquetSampler(path: String, protected val seed: Option[Long] = None)
  extends Sampler[GenericRecord] {

  private val logger: Logger = LoggerFactory.getLogger(classOf[ParquetSampler])

  override def sample(n: Long, head: Boolean): Seq[GenericRecord] = {
    require(n > 0, "n must be > 0")
    require(head, "Parquet can only be used with --head")
    logger.info("Taking a sample of {} from Parquet {}", n, path)

    val result = ListBuffer.empty[GenericRecord]
    val iterator = ParquetIO.readFromFile(path)
    while (result.length < n && iterator.hasNext) {
      result.append(iterator.next())
    }
    result.toList
  }

} 
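
A minimal usage sketch for the sampler above; the file path is made up for illustration:

val sampler = new ParquetSampler("data/events.parquet")
val sample: Seq[GenericRecord] = sampler.sample(100, head = true)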
Example 76
Source File: CSVAutoReadersTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.readers

import com.salesforce.op.test.PassengerSparkFixtureTest
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

import scala.collection.JavaConverters._


@RunWith(classOf[JUnitRunner])
class CSVAutoReadersTest extends FlatSpec with PassengerSparkFixtureTest {

  private val expectedSchema = new Schema.Parser().parse(resourceFile(name = "PassengerAuto.avsc"))
  private val allFields = expectedSchema.getFields.asScala.map(_.name())
  private val keyField: String = allFields.head

  Spec[CSVAutoReader[_]] should "read in data correctly and infer schema" in {
    val dataReader = DataReaders.Simple.csvAuto[GenericRecord](
      path = Some(passengerCsvWithHeaderPath),
      key = _.get(keyField).toString
    )
    val data = dataReader.readRDD().collect()
    data.foreach(_ shouldBe a[GenericRecord])
    data.length shouldBe 8

    val inferredSchema = data.head.getSchema
    inferredSchema shouldBe expectedSchema
  }

  it should "read in data correctly and infer schema based with headers provided" in {
    val dataReader = DataReaders.Simple.csvAuto[GenericRecord](
      path = Some(passengerCsvPath),
      key = _.get(keyField).toString,
      headers = allFields
    )
    val data = dataReader.readRDD().collect()
    data.foreach(_ shouldBe a[GenericRecord])
    data.length shouldBe 8

    val inferredSchema = data.head.getSchema
    inferredSchema shouldBe expectedSchema

  }

} 
Example 77
Source File: CSVReaders.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.readers

import com.salesforce.op.OpParams
import com.salesforce.op.utils.io.csv.{CSVInOut, CSVOptions, CSVToAvro}
import org.apache.avro.generic.GenericRecord
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}

import scala.reflect.ClassTag
import scala.reflect.runtime.universe.WeakTypeTag


class ConditionalCSVReader[T <: GenericRecord : ClassTag : WeakTypeTag]
(
  readPath: Option[String],
  key: T => String,
  schema: String,
  options: CSVOptions = CSVDefaults.CSVOptions,
  timeZone: String = CSVDefaults.TimeZone,
  val conditionalParams: ConditionalParams[T]
) extends CSVReader[T](readPath = readPath, key = key,
  schema = schema, options = options, timeZone = timeZone) with ConditionalDataReader[T] 
Example 78
Source File: AvroInOutTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.utils.io.avro

import java.io.{File, FileNotFoundException, FileWriter}
import java.nio.file.Paths

import com.salesforce.op.test.TestSparkContext
import com.salesforce.op.utils.io.avro.AvroInOut._
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.rdd.RDD
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class AvroInOutTest extends FlatSpec with TestSparkContext {
  val avroSchemaPath = s"$testDataDir/PassengerDataAll.avsc"
  val avroFilePath = s"$testDataDir/PassengerDataAll.avro"
  val avroFileRecordCount = 891
  val hdfs: FileSystem = FileSystem.get(sc.hadoopConfiguration)
  lazy val avroTemp: String = tempDir + "/avro-inout-test"

  Spec(AvroInOut.getClass) should "creates RDD from an avro file" in {
    val res = readPathSeq(avroFilePath, withCount = true, deepCopy = true, persist = false)
    res shouldBe a[RDD[_]]
    res.count shouldBe avroFileRecordCount
  }

  it should "creates RDD from a sequence of avro files" in {
    val res = readPathSeq(s"$avroFilePath,$avroFilePath")
    res.count shouldBe avroFileRecordCount*2
  }

  it should "create RDD from a mixed sequence of valid and invalid avro files" in {
    val res = readPathSeq(s"badfile/path1,$avroFilePath,badfile/path2,$avroFilePath,badfile/path3")
    res.count shouldBe avroFileRecordCount*2
  }

  it should "throw an error if passed in avro files are invalid" in {
    val error = intercept[IllegalArgumentException](readPathSeq("badfile/path1,badfile/path2"))
    error.getMessage shouldBe "No valid directory found in path 'badfile/path1,badfile/path2'"
  }

  it should "creates Some(RDD) from an avro file" in {
    val res = read(avroFilePath)
    res.size shouldBe 1
    res.get shouldBe an[RDD[_]]
    res.get.count shouldBe avroFileRecordCount
  }

  it should "create None from an invalid avro file" in {
    val res = read("badfile/path")
    res shouldBe None
  }

  Spec[AvroWriter[_]] should "writeAvro to filesystem" in {
    val avroData = readPathSeq(avroFilePath).asInstanceOf[RDD[GenericRecord]]
    val avroSchema = loadFile(avroSchemaPath)

    val error = intercept[FileNotFoundException](hdfs.listStatus(new Path(avroTemp)))
    error.getMessage shouldBe s"File $avroTemp does not exist"

    AvroWriter(avroData).writeAvro(avroTemp, avroSchema)
    val hdfsFiles = hdfs.listStatus(new Path(avroTemp)) filter (x => x.getPath.getName.contains("part"))
    val res = readPathSeq((for { x <- hdfsFiles } yield avroTemp + "/" + x.getPath.getName).mkString(","))
    res.count shouldBe avroFileRecordCount
  }

  it should "checkPathsExist" in {
    val tmpDir = Paths.get(File.separator, "tmp").toFile
    val f1 = new File(tmpDir, "avroinouttest")
    f1.delete()
    val w = new FileWriter(f1)
    w.write("just checking")
    w.close()
    val f2 = new File(tmpDir, "thisfilecannotexist")
    f2.delete()
    val f3 = new File(tmpDir, "this file cannot exist")
    f3.delete()
    assume(f1.exists && !f2.exists && !f3.exists)

    // check for one dir being invalid in the path amongst two
    selectExistingPaths(s"$f1,$f2") shouldBe f1.toString

    // check if all dirs in the path are invalid then we get an exception
    intercept[IllegalArgumentException] { selectExistingPaths(f2.toString) }

    // also, check if all dirs in the path are invalid ( in a different way ) then we get an exception
    intercept[IllegalArgumentException] { selectExistingPaths(f3.toString) }

    // check for one dir being invalid ( in a different way ) in the path amongst the two dirs in it
    selectExistingPaths(s"$f1,$f3") shouldBe f1.toString

    // check for paths order insensitivity
    selectExistingPaths(s"$f3,$f1") shouldBe f1.toString

    // check for an exception if the path is an empty string
    intercept[IllegalArgumentException] { selectExistingPaths("") }
  }

} 
Example 79
Source File: RichGenericRecordTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.utils.avro

import com.salesforce.op.test.{TestCommon, TestSparkContext}
import com.salesforce.op.utils.io.avro.AvroInOut
import org.apache.avro.generic.GenericRecord
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{FlatSpec, Matchers}


@RunWith(classOf[JUnitRunner])
class RichGenericRecordTest extends FlatSpec with Matchers with TestSparkContext with TestCommon {

  import com.salesforce.op.utils.avro.RichGenericRecord._

  val dataPath = resourceFile(parent = "../test-data", name = s"PassengerData.avro").getPath
  val passengerData = AvroInOut.read[GenericRecord](dataPath).getOrElse(throw new Exception("Couldn't read data"))
  val firstRow = passengerData.sortBy(_.get("passengerId").toString.toInt).first

  Spec[RichGenericRecord] should "get value of Int" in {
    val id = firstRow.getValue[Int]("passengerId")
    id shouldBe Some(1)
  }

  it should "get value of Double" in {
    val survived = firstRow.getValue[Double]("survived")
    survived shouldBe Some(0.0)
  }

  it should "get value of Long" in {
    val height = firstRow.getValue[Long]("height")
    height shouldBe Some(168L)
  }

  it should "get value of String" in {
    val gender = firstRow.getValue[String]("gender")
    gender shouldBe Some("Female")
  }

  it should "get value of Char" in {
    val gender = firstRow.getValue[Char]("gender")
    gender shouldBe Some("Female")
  }

  it should "get value of Float" in {
    val age = firstRow.getValue[Float]("age")
    age shouldBe Some(32.0)
  }

  it should "get value of Short" in {
    val weight = firstRow.getValue[Short]("weight")
    weight shouldBe Some(67)
  }

  it should "throw error for invalid field" in {
    val error = intercept[IllegalArgumentException](firstRow.getValue[Short]("invalidField"))
    error.getMessage shouldBe "requirement failed: invalidField is not found in Avro schema!"
  }
} 
Example 80
Source File: RichGenericRecord.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.utils.avro

import org.apache.avro.generic.GenericRecord

object RichGenericRecord {

  
  private def javaConvert(in: Any): Any = {
    in match {
      case s: java.lang.String => s
      case s: org.apache.avro.util.Utf8 => s.toString
      case i: java.lang.Integer => i.toInt
      case d: java.lang.Double => d.toDouble
      case l: java.lang.Long => l.toLong
      case b: java.lang.Boolean => b
      case f: java.lang.Float => f.toFloat
      case s: java.lang.Short => s.toShort
      case c: java.lang.Character => c.toChar
      case x => throw new NotImplementedError(s"${x.getClass} is not an implemented type")
    }
  }
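
  // The implicit wrapper exercised by RichGenericRecordTest above is elided from
  // this excerpt. A plausible sketch, not necessarily the project's exact code:
  // wrap a GenericRecord and expose a typed, Option-returning field accessor.
  implicit class RichGenericRecord(val record: GenericRecord) extends AnyVal {
    def getValue[T](fieldName: String): Option[T] = {
      require(record.getSchema.getField(fieldName) != null, s"$fieldName is not found in Avro schema!")
      Option(record.get(fieldName)).map(v => javaConvert(v).asInstanceOf[T])
    }
  }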

} 
Example 81
Source File: IndexWithCompleteDocument.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.data

import com.fasterxml.jackson.databind.JsonNode
import com.typesafe.config.ConfigFactory
import org.apache.avro.generic.GenericRecord
import org.apache.avro.{Schema, SchemaBuilder}

case class IndexWithCompleteDocument(uuid: String, document: String) extends GenericRecord with CsvGenerator {

  override def put(key: String, v: scala.Any): Unit = ???

  override def get(key: String): AnyRef = key match {
    case "uuid" => uuid
    case "document" => document
    case _ => throw new IllegalArgumentException
  }

  override def put(i: Int, v: scala.Any): Unit = ???

  override def get(i: Int): AnyRef = i match {
    case 0 => uuid
    case 1 => document
    case _ => throw new IllegalArgumentException
  }

  override def getSchema: Schema = IndexWithCompleteDocument.schema

  // Specifically don't implement CsvGenerator.csv since it is guaranteed to be invalid CSV - force use of Parquet.
}

object IndexWithCompleteDocument extends ObjectExtractor[IndexWithCompleteDocument] {

  val schema: Schema = SchemaBuilder
    .record("IndexWithCompleteDocument").namespace("cmwell.analytics")
    .fields
    .name("uuid").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .name("document").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .endRecord

  private val config = ConfigFactory.load
  val infotonSize: Int = config.getInt("extract-index-from-es.fetch-size-index-with-complete-document")

  def includeFields: String = s""""_source": "*""""

  def extractFromJson(hit: JsonNode): IndexWithCompleteDocument =
    IndexWithCompleteDocument(
      uuid = hit.findValue("_id").asText,
      document = hit.findValue("_source").toString)
} 
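
A small usage sketch showing how extractFromJson maps an Elasticsearch-style hit onto the record above; the JSON values are made up:

import com.fasterxml.jackson.databind.ObjectMapper

val hit = new ObjectMapper().readTree(
  """{"_id": "0f5a9c1d", "_source": {"system.path": "/example/infoton"}}""")
val row = IndexWithCompleteDocument.extractFromJson(hit)
// row.get("uuid") == "0f5a9c1d"; row.get("document") is the raw _source JSON as text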
Example 82
Source File: DataWriterFactory.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.data

import java.io.File
import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.util.Shard
import org.apache.avro.generic.GenericRecord
import org.apache.commons.io.FileUtils
import org.apache.parquet.hadoop.metadata.CompressionCodecName

import scala.concurrent.ExecutionContextExecutor

trait DataWriterFactory[T <: GenericRecord] {
  def apply(shard: Shard): DataWriter[T]
}

object DataWriterFactory {

  private val compressionCodec = CompressionCodecName.SNAPPY


  def file[T <: GenericRecord with CsvGenerator](format: String,
                                                 objectExtractor: ObjectExtractor[T],
                                                 outDirectory: String): Shard => DataWriter[T] = {

    val extension = s".$format" + (if (format == "parquet") s"${compressionCodec.getExtension}" else "")

    // Generate a meaningful name for the target file, based on the source shard's index name and shard number.
    (sourceShard: Shard) => {
      val outFile: File = Paths.get(outDirectory, s"part-r-${sourceShard.indexName}.${sourceShard.shard}$extension").toFile

      if (outFile.exists)
        FileUtils.forceDelete(outFile)

      new File(outFile.getParent).mkdirs()

      FileDataWriter[T](format, objectExtractor.schema, outFile.toString, compressionCodec)
    }
  }

  
  def index[T <: GenericRecord](indexMap: Map[String, String], // source-index -> target-index
                                esEndpoint: String)
                               (implicit system: ActorSystem,
                                executionContext: ExecutionContextExecutor,
                                actorMaterializer: ActorMaterializer
                               ): Shard => DataWriter[T] = {

    (sourceShard: Shard) => {
      val targetIndex = indexMap(sourceShard.indexName)
      new IndexDataWriter[T](indexName = targetIndex, esEndpoint = esEndpoint)
    }
  }
} 
Example 83
Source File: AvroDataOutputStream.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s

import java.io.OutputStream

import org.apache.avro.Schema
import org.apache.avro.file.{CodecFactory, DataFileWriter}
import org.apache.avro.generic.{GenericDatumWriter, GenericRecord}


case class AvroDataOutputStream[T](os: OutputStream,
                                   codec: CodecFactory)
                                  (implicit encoder: Encoder[T]) extends AvroOutputStream[T] {

  val resolved = encoder.resolveEncoder()

  val (writer, writeFn) = resolved.schema.getType match {
    case Schema.Type.DOUBLE | Schema.Type.LONG | Schema.Type.BOOLEAN | Schema.Type.STRING | Schema.Type.INT | Schema.Type.FLOAT =>
      val datumWriter = new GenericDatumWriter[T](resolved.schema)
      val dataFileWriter = new DataFileWriter[T](datumWriter)
      dataFileWriter.setCodec(codec)
      dataFileWriter.create(resolved.schema, os)
      (dataFileWriter, (t: T) => dataFileWriter.append(t))
    case _ =>
      val datumWriter = new GenericDatumWriter[GenericRecord](resolved.schema)
      val dataFileWriter = new DataFileWriter[GenericRecord](datumWriter)
      dataFileWriter.setCodec(codec)
      dataFileWriter.create(resolved.schema, os)
      (dataFileWriter, (t: T) => {
        val record = resolved.encode(t).asInstanceOf[GenericRecord]
        dataFileWriter.append(record)
      })
  }

  override def close(): Unit = {
    flush()
    writer.close()
  }

  override def write(t: T): Unit = {
    writeFn(t)
  }

  override def flush(): Unit = writer.flush()
  override def fSync(): Unit = writer.fSync()
} 
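
A usage sketch for the stream above, assuming this avro4s version derives Encoder instances for case classes automatically; the case class and values are made up:

import java.io.ByteArrayOutputStream

import org.apache.avro.file.CodecFactory

case class Pizza(name: String, vegetarian: Boolean)

val baos = new ByteArrayOutputStream()
val out = AvroDataOutputStream[Pizza](baos, CodecFactory.nullCodec())
out.write(Pizza("margherita", vegetarian = true))
out.close()
// baos.toByteArray now contains an Avro data file with a single Pizza record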
Example 84
Source File: Job.scala    From spark-avro-compactor   with Apache License 2.0 5 votes vote down vote up
package ie.ianduffy.spark.avro.compactor

import ie.ianduffy.spark.avro.compactor.Utils._
import io.confluent.kafka.schemaregistry.client.{SchemaMetadata, SchemaRegistryClient}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.avro.mapred.AvroKey
import org.apache.avro.mapreduce.AvroKeyOutputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.NullWritable
import org.apache.spark.sql.SparkSession
import org.slf4j.LoggerFactory

object Job {

  private val log = LoggerFactory.getLogger(Job.getClass.getName.replace("$", ""))

  def run(spark: SparkSession, schemaRegistry: SchemaRegistryClient, jobConfig: JobConfig): Unit = {
    val schema: Schema = {
      val latestSchemaMetadata: SchemaMetadata = schemaRegistry.getLatestSchemaMetadata(jobConfig.schemaRegistrySubject)
      val id: Int = latestSchemaMetadata.getId
      schemaRegistry.getById(id)
    }

    implicit val sparkConfig: Configuration = spark.sparkContext.hadoopConfiguration
    sparkConfig.set("avro.schema.input.key", schema.toString())
    sparkConfig.set("avro.schema.output.key", schema.toString())

    val inputPath: Path = new Path(jobConfig.input)
    val outputPath: Path = new Path(jobConfig.output)

    val fs: FileSystem = inputPath.getFileSystem(sparkConfig)

    // avoid raising org.apache.hadoop.mapred.FileAlreadyExistsException
    if (jobConfig.overrideOutput) fs.delete(outputPath, true)

    // for file systems with the s3 prefix the default block size is 64MB and can be overridden with fs.s3.block.size
    // for file systems with the s3a prefix the default block size is 32MB and can be overridden with fs.s3a.block.size
    val outputBlocksize: Long = fs.getDefaultBlockSize(outputPath)

    // Where inputPath is of the form s3://some/path
    val inputPathSize: Long = fs.getContentSummary(inputPath).getSpaceConsumed

    val numPartitions: Int = Math.max(1, Math.floor((inputPathSize / CompressionRatio.AVRO_SNAPPY) / outputBlocksize).toInt)

    log.debug(
      s"""outputBlocksize: $outputBlocksize
         | inputPathSize: $inputPathSize
         | splitSize: $numPartitions
       """.stripMargin)

    val rdd = readHadoopFile(spark, inputPath.toString)

    rdd.coalesce(numPartitions)
      .saveAsNewAPIHadoopFile(
        outputPath.toString,
        classOf[AvroKey[GenericRecord]],
        classOf[NullWritable],
        classOf[AvroKeyOutputFormat[GenericRecord]],
        sparkConfig
      )
  }
} 
Example 85
Source File: Utils.scala    From spark-avro-compactor   with Apache License 2.0 5 votes vote down vote up
package ie.ianduffy.spark.avro.compactor

import org.apache.avro.generic.GenericRecord
import org.apache.avro.mapred.AvroKey
import org.apache.avro.mapreduce.AvroKeyInputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.NullWritable
import org.apache.spark.sql.SparkSession

object Utils {

  def createSparkSession: SparkSession =
    SparkSession
      .builder
      .appName("avro-compactor")
      .getOrCreate


  def readHadoopFile(spark: SparkSession, path: String)(implicit sparkConfig: Configuration) = {
    spark.sparkContext.newAPIHadoopFile(
      path,
      classOf[AvroKeyInputFormat[GenericRecord]],
      classOf[AvroKey[GenericRecord]],
      classOf[NullWritable],
      sparkConfig
    )
  }

} 
Example 86
Source File: AvroToParquetWriter.scala    From etl-light   with MIT License 5 votes vote down vote up
package yamrcraft.etlite.writers

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.slf4j.LoggerFactory
import yamrcraft.etlite.utils.FileUtils

class AvroToParquetWriter(tempFile: String, outputFile: String) extends Writer[GenericRecord] {

  val logger = LoggerFactory.getLogger(this.getClass)

  // the writer is created lazily on the first write, once the event's schema is known
  var writer: Option[AvroParquetWriter[GenericRecord]] = None

  val tempPath = new Path(tempFile + ".parquet")
  val outputPath = new Path(outputFile + ".parquet")
  logger.info(s"creating writer for working file: ${tempPath.toString}, outputFile: ${outputPath.toString}")

  override def write(event: GenericRecord): Unit = {
    logger.info(s"ParquetWriter.write, event type: ${event.getSchema.getName}")
    if (writer.isEmpty) {
      writer = Some(createWriter(tempPath.toString, event.getSchema))
    }

    writer.get.write(event)
  }

  override def commit(): Unit = {
    writer.get.close()

    val fs = FileUtils.getFS(outputPath.toString)
    fs.mkdirs(outputPath.getParent)
    if (fs.exists(outputPath)) {
      fs.rename(outputPath, new Path(outputPath.getParent, s"__${outputPath.getName}.${System.currentTimeMillis()}.old.__"))
    }
    // copy temp file to output file (typically temp file would be on local file system).
    if (tempFile.startsWith("file")) {
      logger.info(s"copy file from: ${tempPath.toString} to $outputPath")
      fs.copyFromLocalFile(true, true, tempPath, outputPath)
    } else {
      logger.info(s"renaming file from: ${tempPath.toString} to $outputPath")
      fs.rename(tempPath, outputPath)
    }
  }

  private def createWriter(file: String, schema: Schema) = {
    val fs = FileUtils.getFS(file)
    val path = new Path(file)
    if (fs.exists(path)) {
      fs.delete(path, true)
    }
    fs.mkdirs(path.getParent)
    new AvroParquetWriter[GenericRecord](path, schema)
  }

} 
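
A usage sketch for the writer above; the schema, record and paths are made up for illustration:

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericData

val schema = SchemaBuilder.record("Event").fields().requiredString("id").endRecord()
val event = new GenericData.Record(schema)
event.put("id", "e-1")

val writer = new AvroToParquetWriter("file:///tmp/etl/work/events-0", "hdfs:///data/events/events-0")
writer.write(event)
writer.commit()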
Example 87
Source File: JsonToParquetPipelineFactory.scala    From etl-light   with MIT License 5 votes vote down vote up
package yamrcraft.etlite.pipeline

import org.apache.avro.generic.GenericRecord
import yamrcraft.etlite.PipelineSettings
import yamrcraft.etlite.transformers.{JsonToAvroTransformer, Message}
import yamrcraft.etlite.writers.{AvroToParquetWriter, TimePartitioningWriter}

class JsonToParquetPipelineFactory extends PipelineFactory[Message[GenericRecord]] {

  def createPipeline(settings: PipelineSettings, jobId: Long, partitionId: Int): Pipeline[Message[GenericRecord]] =
    new Pipeline(
      new JsonToAvroTransformer(settings.transformerConfig),
      new TimePartitioningWriter(
        settings.writerConfig,
        jobId,
        partitionId,
        (tempFile, outputFile) => new AvroToParquetWriter(tempFile, outputFile))
    )

} 
Example 88
Source File: JsonToAvroTransformer.scala    From etl-light   with MIT License 5 votes vote down vote up
package yamrcraft.etlite.transformers

import com.typesafe.config.Config
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import play.api.libs.json.Json
import yamrcraft.etlite.utils.ConfigConversions._
import yamrcraft.etlite.utils.{FileUtils, JsonAvroConverter, TimeUtils}
import yamrcraft.etlite.{ErrorType, EtlException}

class JsonToAvroTransformer(config: Config) extends Transformer[Message[GenericRecord]] {

  val converter = new JsonAvroConverter()

  // config settings
  val timestampField = config.getString("timestamp-field")
  val timestampFieldFormat = config.getString("timestamp-field-format")
  val defaultSchemaFileName = config.getString("default-schema-file")
  val (schemaSelectionField, schemas) = {
    config.hasPath("schema-selection") match {
      case true =>
        (Some(config.getString("schema-selection.field")),
          Some(config.getConfig("schema-selection.schemas").asMap.map {case (k,v) => (k, createSchema(v))}) )
      case false => (None, None)
    }
  }

  val defaultSchema: Schema = createSchema(defaultSchemaFileName)

  @throws(classOf[EtlException])
  override def transform(inbound: InboundMessage): Message[GenericRecord] = {

    try {
      val schema = getSchema(inbound.msg)
      val record = converter.convertToGenericDataRecord(inbound.msg, schema)

      Message[GenericRecord](
        record,
        schema.getName,
        extractTimestamp(record)
      )

    } catch {
      case e: EtlException => throw e
      case e: Exception => throw new EtlException(ErrorType.TransformationError, e)
    }
  }

  private def createSchema(path: String): Schema = new Schema.Parser().parse(FileUtils.readContent(path))

  private def getSchema(msg: Array[Byte]): Schema = {
    if (schemaSelectionField.isEmpty) {
      defaultSchema
    } else {
      val msgAsString = new String(msg, "UTF8")
      val msgJson = Json.parse(msgAsString)
      val selectionValue = (msgJson \ schemaSelectionField.get).asOpt[String]
      schemas.get.getOrElse(selectionValue.get, defaultSchema)
    }
  }

  @throws(classOf[EtlException])
  private def extractTimestamp(event: GenericRecord): Long = {
    try {
      (event.get(timestampField): Any) match {
        case ts: Long => ts
        case ts: String => TimeUtils.stringTimeToLong(ts, timestampFieldFormat)
        case _ => throw new RuntimeException("timestamp field is neither a Long nor a String.")
      }
    } catch {
      case e: Exception => throw new EtlException(ErrorType.PartitionTimestampError, e)
    }
  }
} 
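
The transformer above is driven entirely by its Typesafe config. A hypothetical configuration covering the keys it reads; the field names, format and schema paths are made up:

import com.typesafe.config.ConfigFactory

val transformerConfig = ConfigFactory.parseString(
  """
    |timestamp-field = "timestamp"
    |timestamp-field-format = "yyyy-MM-dd'T'HH:mm:ss"
    |default-schema-file = "file:///etc/etl/schemas/default.avsc"
    |schema-selection {
    |  field = "type"
    |  schemas {
    |    click = "file:///etc/etl/schemas/click.avsc"
    |    impression = "file:///etc/etl/schemas/impression.avsc"
    |  }
    |}
  """.stripMargin)

val transformer = new JsonToAvroTransformer(transformerConfig)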
Example 89
Source File: AvroDecoder.scala    From cuesheet   with Apache License 2.0 5 votes vote down vote up
package com.kakao.cuesheet.convert

import java.util.Arrays.copyOfRange

import kafka.serializer.Decoder
import kafka.utils.VerifiableProperties
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}


sealed trait AvroDecoder[T] extends Decoder[T] {

  def props: VerifiableProperties

  protected val schema = new Schema.Parser().parse(props.getString(Avro.SCHEMA))
  protected val skipBytes = props.getInt(Avro.SKIP_BYTES, 0)

  protected val reader = new GenericDatumReader[GenericRecord](schema)
  protected val decoder = Avro.recordDecoder(reader)

  private def skip(bytes: Array[Byte], size: Int): Array[Byte] = {
    val length = bytes.length
    length - size match {
      case remaining if remaining > 0 => copyOfRange(bytes, size, length)
      case _ => new Array[Byte](0)
    }
  }

  def parse(bytes: Array[Byte]): GenericRecord = {
    val data = if (skipBytes == 0) bytes else skip(bytes, skipBytes)
    decoder(data)
  }
}

class AvroRecordDecoder(val props: VerifiableProperties) extends AvroDecoder[GenericRecord] {
  override def fromBytes(bytes: Array[Byte]): GenericRecord = parse(bytes)
}

class AvroMapDecoder(val props: VerifiableProperties) extends AvroDecoder[Map[String, Any]] {
  override def fromBytes(bytes: Array[Byte]): Map[String, Any] = Avro.toMap(parse(bytes))
}

class AvroJsonDecoder(val props: VerifiableProperties) extends AvroDecoder[String] {
  override def fromBytes(bytes: Array[Byte]): String = Avro.toJson(parse(bytes))
} 
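
The Avro companion helpers referenced above (Avro.SCHEMA, Avro.recordDecoder, Avro.toMap, Avro.toJson) are not part of this excerpt. A minimal sketch of what the record decoder could look like, assuming the standard binary decoder:

import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory

def recordDecoder(reader: GenericDatumReader[GenericRecord]): Array[Byte] => GenericRecord =
  (bytes: Array[Byte]) => reader.read(null, DecoderFactory.get().binaryDecoder(bytes, null))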
Example 90
Source File: AvroTypeSpec.scala    From shapeless-datatype   with Apache License 2.0 5 votes vote down vote up
package shapeless.datatype.avro

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.net.URI
import java.nio.ByteBuffer

import com.google.protobuf.ByteString
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.joda.time.Instant
import org.scalacheck.Prop.forAll
import org.scalacheck.ScalacheckShapeless._
import org.scalacheck._
import shapeless._
import shapeless.datatype.record._

import scala.reflect.runtime.universe._

object AvroTypeSpec extends Properties("AvroType") {
  import shapeless.datatype.test.Records._
  import shapeless.datatype.test.SerializableUtils._

  implicit def compareByteArrays(x: Array[Byte], y: Array[Byte]) = java.util.Arrays.equals(x, y)
  implicit def compareIntArrays(x: Array[Int], y: Array[Int]) = java.util.Arrays.equals(x, y)

  def roundTrip[A: TypeTag, L <: HList](m: A)(implicit
    gen: LabelledGeneric.Aux[A, L],
    fromL: FromAvroRecord[L],
    toL: ToAvroRecord[L],
    mr: MatchRecord[L]
  ): Boolean = {
    val t = ensureSerializable(AvroType[A])
    val f1: SerializableFunction[A, GenericRecord] =
      new SerializableFunction[A, GenericRecord] {
        override def apply(m: A): GenericRecord = t.toGenericRecord(m)
      }
    val f2: SerializableFunction[GenericRecord, Option[A]] =
      new SerializableFunction[GenericRecord, Option[A]] {
        override def apply(m: GenericRecord): Option[A] = t.fromGenericRecord(m)
      }
    val toFn = ensureSerializable(f1)
    val fromFn = ensureSerializable(f2)
    val copy = fromFn(roundTripRecord(toFn(m)))
    val rm = RecordMatcher[A]
    copy.exists(rm(_, m))
  }

  def roundTripRecord(r: GenericRecord): GenericRecord = {
    val writer = new GenericDatumWriter[GenericRecord](r.getSchema)
    val baos = new ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(baos, null)
    writer.write(r, encoder)
    encoder.flush()
    baos.close()
    val bytes = baos.toByteArray

    val reader = new GenericDatumReader[GenericRecord](r.getSchema)
    val bais = new ByteArrayInputStream(bytes)
    val decoder = DecoderFactory.get().binaryDecoder(bais, null)
    reader.read(null, decoder)
  }

  implicit val byteStringAvroType = AvroType.at[ByteString](Schema.Type.BYTES)(
    v => ByteString.copyFrom(v.asInstanceOf[ByteBuffer]),
    v => ByteBuffer.wrap(v.toByteArray)
  )
  implicit val instantAvroType =
    AvroType.at[Instant](Schema.Type.LONG)(v => new Instant(v.asInstanceOf[Long]), _.getMillis)
  property("required") = forAll { m: Required => roundTrip(m) }
  property("optional") = forAll { m: Optional => roundTrip(m) }
  property("repeated") = forAll { m: Repeated => roundTrip(m) }
  property("mixed") = forAll { m: Mixed => roundTrip(m) }
  property("nested") = forAll { m: Nested => roundTrip(m) }
  property("seqs") = forAll { m: Seqs => roundTrip(m) }

  implicit val uriAvroType =
    AvroType.at[URI](Schema.Type.STRING)(v => URI.create(v.toString), _.toString)
  property("custom") = forAll { m: Custom => roundTrip(m) }
} 
Example 91
Source File: AvroType.scala    From shapeless-datatype   with Apache License 2.0 5 votes vote down vote up
package shapeless.datatype.avro

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import shapeless._

import scala.reflect.runtime.universe._

class AvroType[A] extends Serializable {
  def fromGenericRecord[L <: HList](
    m: GenericRecord
  )(implicit gen: LabelledGeneric.Aux[A, L], fromL: FromAvroRecord[L]): Option[A] =
    fromL(Right(m)).map(gen.from)
  def toGenericRecord[L <: HList](
    a: A
  )(implicit gen: LabelledGeneric.Aux[A, L], toL: ToAvroRecord[L], tt: TypeTag[A]): GenericRecord =
    toL(gen.to(a)).left.get.build(AvroSchema[A])
}

object AvroType {
  def apply[A: TypeTag]: AvroType[A] = new AvroType[A]

  def at[V: TypeTag](
    schemaType: Schema.Type
  )(fromFn: Any => V, toFn: V => Any): BaseAvroMappableType[V] = {
    AvroSchema.register(implicitly[TypeTag[V]].tpe, schemaType)
    new BaseAvroMappableType[V] {
      override def from(value: Any): V = fromFn(value)
      override def to(value: V): Any = toFn(value)
    }
  }
} 
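
A round-trip sketch using the AvroType API exercised in the spec above; the case class and values are made up, and the required type class instances are assumed to be derivable by shapeless-datatype:

import org.apache.avro.generic.GenericRecord
import shapeless.datatype.avro.AvroType

case class User(name: String, favoriteNumber: Int)

val avroT = AvroType[User]
val record: GenericRecord = avroT.toGenericRecord(User("Alyssa", 256))
val back: Option[User] = avroT.fromGenericRecord(record) // Some(User("Alyssa", 256))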
Example 92
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 93
Source File: SpecificDefautValuesSpec.scala    From sbt-avrohugger   with Apache License 2.0 5 votes vote down vote up
import test._
import org.specs2.mutable.Specification
import java.io.File
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.file.DataFileReader

import DefaultEnum._

class SpecificDefaultValuesSpec extends Specification {

  "A case class with default values" should {
    "deserialize correctly" in {
      val record = DefaultTest()
      val records = List(record)
      
      val fileName = s"${records.head.getClass.getName}"
      val fileEnding = "avro"
      val file = File.createTempFile(fileName, fileEnding)
      file.deleteOnExit()
      SpecificTestUtil.write(file, records)
      
      val dummyRecord = new GenericDatumReader[GenericRecord]
      val schema = new DataFileReader(file, dummyRecord).getSchema
      val userDatumReader = new SpecificDatumReader[DefaultTest](schema)
      val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader)
      val sameRecord = dataFileReader.next

      sameRecord.suit === SPADES
      sameRecord.number === 0
      sameRecord.str === "str"
      sameRecord.optionString === None
      sameRecord.optionStringValue === Some("default")
      sameRecord.embedded === Embedded(1)
      sameRecord.defaultArray === Vector(1,3,4,5)
      sameRecord.optionalEnum === None
      sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas")
      sameRecord.byt === "\u00FF".getBytes
    }
  }

}